Example #1
def run_prediction(args):
    # tuple parameters are not valid in Python 3, so unpack the argument explicitly
    train_data, train_tags, test_data, test_tags, idx = args
    logger.info('training sequential model...')
    all_values = flatten(train_data)
    # binarize
    binarizers = fit_binarizers(all_values)
    test_data = call_for_each_element(test_data, binarize, [binarizers], data_type='sequential')
    train_data = call_for_each_element(train_data, binarize, [binarizers], data_type='sequential')

    x_train = np.array([np.array(xi) for xi in train_data])
    y_train = np.array([np.array(xi) for xi in train_tags])
    x_test = np.array([np.array(xi) for xi in test_data])
    y_test = np.array([np.array(xi) for xi in test_tags])
    
    sequence_learner = PystructSequenceLearner()
    sequence_learner.fit(x_train, y_train)
    structured_hyp = sequence_learner.predict(x_test)
    
    logger.info('scoring sequential model...')
    flattened_hyp = flatten(structured_hyp)
    
    flattened_ref = flatten(y_test)
    test_tags = flattened_ref
    
    logger.info('Structured prediction f1: ')
    cur_res = f1_score(flattened_ref, flattened_hyp, average=None)
    logger.info('[ {}, {} ], {}'.format(cur_res[0], cur_res[1], f1_score(flattened_ref, flattened_hyp, pos_label=None)))

    return (cur_res, idx)
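
The extra `idx` element in the argument tuple suggests that run_prediction is meant to be mapped over several train/test splits at once. The driver below is a minimal sketch of that idea; make_folds and the fold count are hypothetical and only illustrate how the tuples could be fed to a worker pool.

# Hypothetical driver: map run_prediction over cross-validation folds.
# make_folds() is an assumed helper that yields
# (train_data, train_tags, test_data, test_tags, idx) tuples; it is not shown above.
from multiprocessing import Pool

def run_all_folds(data, tags, n_folds=5, workers=4):
    jobs = list(make_folds(data, tags, n_folds))  # assumed helper
    with Pool(workers) as pool:
        results = pool.map(run_prediction, jobs)
    # each result is a (per-class F1 array, fold index) tuple
    return sorted(results, key=lambda r: r[1])
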
Example #2
def run_prediction(args):
    # tuple parameters are not valid in Python 3, so unpack the argument explicitly
    train_data, train_tags, test_data, test_tags, idx = args
    logger.info('training sequential model...')
    all_values = flatten(train_data)
    # binarize
    binarizers = fit_binarizers(all_values)
    test_data = call_for_each_element(test_data,
                                      binarize, [binarizers],
                                      data_type='sequential')
    train_data = call_for_each_element(train_data,
                                       binarize, [binarizers],
                                       data_type='sequential')

    x_train = np.array([np.array(xi) for xi in train_data])
    y_train = np.array([np.array(xi) for xi in train_tags])
    x_test = np.array([np.array(xi) for xi in test_data])
    y_test = np.array([np.array(xi) for xi in test_tags])

    sequence_learner = PystructSequenceLearner()
    sequence_learner.fit(x_train, y_train)
    structured_hyp = sequence_learner.predict(x_test)

    logger.info('scoring sequential model...')
    flattened_hyp = flatten(structured_hyp)

    flattened_ref = flatten(y_test)
    test_tags = flattened_ref

    logger.info('Structured prediction f1: ')
    cur_res = f1_score(flattened_ref, flattened_hyp, average=None)
    logger.info('[ {}, {} ], {}'.format(
        cur_res[0], cur_res[1],
        f1_score(flattened_ref, flattened_hyp, pos_label=None)))

    return (cur_res, idx)
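
Both examples above depend on fit_binarizers and binarize to turn raw feature values into numbers. The sketch below is purely illustrative of that idea (fitting one scikit-learn LabelBinarizer per string-valued column) and is not the actual implementation these examples import.

# Illustrative sketch only, NOT the real fit_binarizers/binarize implementation:
# fit one LabelBinarizer per categorical (string-valued) column and expand those
# columns into indicator features at transform time.
from sklearn.preprocessing import LabelBinarizer

def fit_binarizers_sketch(all_values):
    binarizers = {}
    for col in range(len(all_values[0])):
        column = [row[col] for row in all_values]
        if any(isinstance(v, str) for v in column):
            binarizers[col] = LabelBinarizer().fit(column)
    return binarizers

def binarize_sketch(feature_vector, binarizers):
    numeric = []
    for col, value in enumerate(feature_vector):
        if col in binarizers:
            numeric.extend(binarizers[col].transform([value])[0])
        else:
            numeric.append(value)
    return numeric
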
Example #3
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generator = build_object(config['datasets']['training'][0])
    train_data = train_data_generator.generate()
    #    train_data = {}
    #    for gen in train_data_generators:
    #        data = gen.generate()
    #        for key in data:
    #            if key not in train_data:
    #                train_data[key] = []
    #            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format(
        [w.encode('utf-8') for w in train_data['target'][0]]))
    #    logger.info("Sample sequence: {}".format(train_data['similarity'][0]))
    #    sys.exit()
    #logger.info("Alignment file: {}".format(train_data['alignments_file']))
    #logger.info("Alignment file: {}".format(test_data['alignments_file']))

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    borders = config['borders'] if 'borders' in config else False

    if 'multiply_data_train' not in config:
        pass
    elif config['multiply_data_train'] == 'ngrams':
        logger.info("Multiply data: {} and {}".format(
            config['multiply_data_train'], config['multiply_data_test']))
        train_data = multiply_data_ngrams(train_data, borders=borders)
        logger.info("Sequences: {}, tag sequences: {}".format(
            len(train_data['target']), len(train_data['tags'])))
    elif config['multiply_data_train'] == '1ton':
        logger.info("Multiply data: {} and {}".format(
            config['multiply_data_train'], config['multiply_data_test']))
        train_data = multiply_data(train_data, borders=borders)
    elif config['multiply_data_train'] == 'duplicate':
        train_data = multiply_data_base(train_data)
    elif config['multiply_data_train'] == 'all':
        train_data = multiply_data_all(train_data, borders=borders)
    else:
        print("Unknown 'multiply data train' value: {}".format(
            config['multiply_data_train']))
    logger.info("Train data example: {}".format(train_data['target'][:10]))
    logger.info("Train tags example: {}".format(train_data['tags'][:10]))
    logger.info("Extended train representations: {}".format(
        len(train_data['target'])))
    #    print(train_data[:2])
    logger.info("Simple test representations: {}".format(
        len(test_data['target'])))
    if 'multiply_data_test' not in config:
        pass
    elif config['multiply_data_test'] == 'ngrams':
        test_data = multiply_data_ngrams(test_data, borders=borders)
    elif config['multiply_data_test'] == '1ton':
        test_data = multiply_data(test_data, borders=borders)
    else:
        print("Unknown 'multiply data test' value: {}".format(
            config['multiply_data_test']))
    logger.info("Extended test representations: {}".format(
        len(test_data['target'])))

    logger.info('here are the keys in your representations: {}'.format(
        train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    #    data_type = config['contexts'] if 'contexts' in config else 'plain'
    data_type = config['data_type'] if 'data_type' in config else 'sequential'

    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'],
                                    test_data['target']]))

    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts,
                                       tags_from_contexts,
                                       data_type=data_type)
    test_tags = call_for_each_element(test_contexts,
                                      tags_from_contexts,
                                      data_type=data_type)
    test_tags_seq = call_for_each_element(test_contexts_seq,
                                          tags_from_contexts,
                                          data_type='sequential')

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts,
                                          contexts_to_features,
                                          [feature_extractors, workers],
                                          data_type=data_type)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts,
                                           contexts_to_features,
                                           [feature_extractors, workers],
                                           data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))
    logger.info('train features sample: {}'.format(train_features[:5]))
    logger.info('train tags sample: {}'.format(train_tags[:5]))

    logger.info(
        'All of your features now exist in their raw representation, but they may not be numbers yet'
    )
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        feature_names = [
            f for extractor in feature_extractors
            for f in extractor.get_feature_names()
        ]
        features_num = len(feature_names)
        true_features_num = len(all_values[0])

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features,
                                              binarize, [binarizers],
                                              data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features,
                                               binarize, [binarizers],
                                               data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [{
            'name': 'test',
            'features': test_features,
            'tags': test_tags
        }, {
            'name': 'train',
            'features': train_features,
            'tags': train_tags
        }]
        feature_names = [
            f for extractor in feature_extractors
            for f in extractor.get_feature_names()
        ]

        if config.get('persist_dir'):
            persist_dir = config['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'],
                             dataset_obj['features'],
                             persist_dir,
                             feature_names=feature_names,
                             tags=dataset_obj['tags'],
                             file_format=persist_format)
    # NOTE: the script exits here, so the learning code below is never reached
    sys.exit()

    # BEGIN LEARNING

    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np
    tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1}
    # 'long_test' is needed in both branches below, so define it before branching
    long_test = config.get('multiply_data_test') in ('ngrams', '1ton')
    if data_type == 'sequential':
        logger.info('training sequential model...')

        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)
        # only the last word in every sequence should be counted
        flattened_hyp = []
        flattened_ref = []
        if long_test:
            for idx, seq in enumerate(structured_hyp):
                flattened_hyp.append(seq[-1])
                flattened_ref.append(y_test[idx][-1])
        else:
            flattened_hyp = flatten(structured_hyp)
            flattened_ref = flatten(y_test)

        logger.info('scoring sequential model...')

        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
        #        flattened_hyp = flatten(structured_hyp)

        # end pystruct
        #        for idx, seq in enumerate(test_tags_seq):
        #            cnt += len(seq)
        #            if cnt >= len(test_predictions):
        #                print("long predictions: {}, sequential: {}, sequence #{}".format(len(test_predictions), len(flatten(test_tags_seq)), idx))
        #                print("Sequence: ", test_contexts_seq[idx])
        #        if long_test:
        #            cnt = -1
        #            new_predictions = []
        #            new_true = []
        #            for seq in test_tags_seq:
        #                cnt += len(seq)
        #                new_predictions.append(tag_map[test_predictions[cnt]])
        #               new_true.append(tag_map[seq[-1]])
        #            test_predictions = new_predictions
        #            test_tags = new_true
        #

        #        print(f1_score(test_predictions, test_tags, average=None))

        print("Ref, hyp: ", len(flattened_ref), len(flattened_hyp))
        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))
        print(
            f1_score(flattened_ref,
                     flattened_hyp,
                     average='weighted',
                     pos_label=None))
        logger.info("Sequence correlation: ")
        print(
            sequence_correlation_weighted(y_test, structured_hyp,
                                          verbose=True)[1])

    else:
        train_tags = [tag_map[tag] for tag in train_tags]
        test_tags = [tag_map[tag] for tag in test_tags]

        # data_type is 'token' or 'plain'
        logger.info('start training...')
        classifier_type = import_class(
            config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features,
                                         train_tags,
                                         classifier_type,
                                         data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features,
                                       classifier_map,
                                       data_type=data_type)
        #        assert(len(test_predictions) == len(flatten(test_tags_seq))), "long predictions: {}, sequential: {}".format(len(test_predictions), len(flatten(test_tags_seq)))
        cnt = 0
        test_predictions_seq = []
        test_tags_seq_num = []
        tag_map = {'OK': 1, 'BAD': 0, 1: 1, 0: 0}
        long_test = config.get('multiply_data_test') in ('ngrams', '1ton')
        for idx, seq in enumerate(test_tags_seq):
            test_predictions_seq.append([])
            test_tags_seq_num.append([])
            for w in seq:
                test_predictions_seq[-1].append(tag_map[test_predictions[cnt]])
                test_tags_seq_num[-1].append(tag_map[w])
                cnt += 1
#            cnt += len(seq)
#            if cnt >= len(test_predictions):
#                print("long predictions: {}, sequential: {}, sequence #{}".format(len(test_predictions), len(flatten(test_tags_seq)), idx))
#                print("Sequence: ", test_contexts_seq[idx])
        if long_test:
            cnt = -1
            new_predictions = []
            new_true = []
            for seq in test_tags_seq:
                cnt += len(seq)
                new_predictions.append(tag_map[test_predictions[cnt]])
                new_true.append(tag_map[seq[-1]])
            test_predictions = new_predictions
            test_tags = new_true

        print(f1_score(test_predictions, test_tags, average=None))
        print(
            f1_score(test_predictions,
                     test_tags,
                     average='weighted',
                     pos_label=None))
        print("Precision: {}, recall: {}".format(
            precision_score(test_predictions, test_tags, average=None),
            recall_score(test_predictions, test_tags, average=None)))
        logger.info("Sequence correlation: ")
        print(
            sequence_correlation_weighted(test_tags_seq_num,
                                          test_predictions_seq,
                                          verbose=True)[1])
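
main() in this example reads a number of keys from config. The dict below sketches the overall shape it expects; every module path and value is a placeholder for illustration and is not taken from the original code.

# Rough shape of the config consumed by main() above; all values are placeholders.
config_sketch = {
    'workers': 4,
    'datasets': {
        'training': [{'module': 'some.corpus.Generator', 'args': ['train.txt']}],
        'test': [{'module': 'some.corpus.Generator', 'args': ['test.txt']}],
    },
    'representations': [],            # optional extra representation generators
    'borders': False,                 # optional, passed to the multiply_data_* helpers
    'multiply_data_train': 'ngrams',  # optional: 'ngrams', '1ton', 'duplicate', 'all'
    'multiply_data_test': 'ngrams',   # optional: 'ngrams' or '1ton'
    'data_type': 'sequential',        # 'sequential', 'plain', or 'token'
    'feature_extractors': [{'module': 'some.features.Extractor'}],
    'features': {'binarize': True, 'persist': True, 'persist_format': 'crf++'},
    'persist_dir': '/tmp/features',   # optional; falls back to the current directory
    'learning': {'classifier': {'module': 'sklearn.svm.LinearSVC'}},
}
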
Example #4
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generators = build_objects(config['datasets']['training'])
    train_data = {}
    for gen in train_data_generators:
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]]))
#    logger.info("Sample sequence: {}".format(train_data['similarity'][0]))
#    sys.exit()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    borders = config['borders'] if 'borders' in config else False

    if 'multiply_data_train' not in config:
        pass
    elif config['multiply_data_train'] == 'ngrams':
        train_data = multiply_data_ngrams(train_data, borders=borders)
    elif config['multiply_data_train'] == '1ton':
        train_data = multiply_data(train_data, borders=borders)
    elif config['multiply_data_train'] == 'duplicate':
        train_data = multiply_data_base(train_data)
    elif config['multiply_data_train'] == 'all':
        train_data = multiply_data_all(train_data, borders=borders)
    else:
        print("Unknown 'multiply data train' value: {}".format(config['multiply_data_train']))
    logger.info("Extended train representations: {}".format(len(train_data['target'])))
    #    print(train_data[:2])
    logger.info("Simple test representations: {}".format(len(test_data['target'])))
    if 'multiply_data_test' not in config:
        pass
    elif config['multiply_data_test'] == 'ngrams':
        test_data = multiply_data_ngrams(test_data, borders=borders)
    elif config['multiply_data_test'] == '1ton':
        test_data = multiply_data(test_data, borders=borders)
    else:
        print("Unknown 'multiply data test' value: {}".format(config['multiply_data_test']))
    logger.info("Extended test representations: {}".format(len(test_data['target'])))
    
    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'

    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
 
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential')

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
        features_num = len(feature_names)
        true_features_num = len(all_values[0])

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags}, {'name': 'train', 'features': train_features, 'tags': train_tags}]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]

        if config['features'].get('persist_dir'):
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)

    # BEGIN LEARNING

    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np
    tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1}
    # 'long_test' is needed in both branches below, so define it before branching
    long_test = config.get('multiply_data_test') in ('ngrams', '1ton')
    if data_type == 'sequential':
        logger.info('training sequential model...')

        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)
        # only the last word in every sequence should be counted
        flattened_hyp = []
        flattened_ref = []
        if long_test:
            for idx, seq in enumerate(structured_hyp):
                flattened_hyp.append(seq[-1])
                flattened_ref.append(y_test[idx][-1])
        else:
            flattened_hyp = flatten(structured_hyp)
            flattened_ref = flatten(y_test)

        logger.info('scoring sequential model...')

        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
#        flattened_hyp = flatten(structured_hyp)

        # end pystruct
#        for idx, seq in enumerate(test_tags_seq):
#            cnt += len(seq)
#            if cnt >= len(test_predictions):
#                print("long predictions: {}, sequential: {}, sequence #{}".format(len(test_predictions), len(flatten(test_tags_seq)), idx))
#                print("Sequence: ", test_contexts_seq[idx])
#        if long_test:
#            cnt = -1
#            new_predictions = []
#            new_true = []
#            for seq in test_tags_seq:
#                cnt += len(seq)
#                new_predictions.append(tag_map[test_predictions[cnt]])
#               new_true.append(tag_map[seq[-1]])
#            test_predictions = new_predictions
#            test_tags = new_true
#

#        print(f1_score(test_predictions, test_tags, average=None))


        print("Ref, hyp: ", len(flattened_ref), len(flattened_hyp))
        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))
        print(f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None))
        logger.info("Sequence correlation: ")
        print(sequence_correlation_weighted(y_test, structured_hyp, verbose=True)[1])

    else:
        train_tags = [tag_map[tag] for tag in train_tags]
        test_tags = [tag_map[tag] for tag in test_tags]

        # data_type is 'token' or 'plain'
        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)
#        assert(len(test_predictions) == len(flatten(test_tags_seq))), "long predictions: {}, sequential: {}".format(len(test_predictions), len(flatten(test_tags_seq)))
        cnt = 0
        test_predictions_seq = []
        test_tags_seq_num = []
        tag_map = {'OK': 1, 'BAD': 0, 1: 1, 0: 0}
        long_test = config.get('multiply_data_test') in ('ngrams', '1ton')
        for idx, seq in enumerate(test_tags_seq):
            test_predictions_seq.append([])
            test_tags_seq_num.append([])
            for w in seq:
                test_predictions_seq[-1].append(tag_map[test_predictions[cnt]])
                test_tags_seq_num[-1].append(tag_map[w])
                cnt += 1
#            cnt += len(seq)
#            if cnt >= len(test_predictions):
#                print("long predictions: {}, sequential: {}, sequence #{}".format(len(test_predictions), len(flatten(test_tags_seq)), idx))
#                print("Sequence: ", test_contexts_seq[idx])
        if long_test:
            cnt = -1
            new_predictions = []
            new_true = []
            for seq in test_tags_seq:
                cnt += len(seq)
                new_predictions.append(tag_map[test_predictions[cnt]])
                new_true.append(tag_map[seq[-1]])
            test_predictions = new_predictions
            test_tags = new_true

        print(f1_score(test_predictions, test_tags, average=None))
        print(f1_score(test_predictions, test_tags, average='weighted', pos_label=None))
        print("Precision: {}, recall: {}".format(precision_score(test_predictions, test_tags, average=None), recall_score(test_predictions, test_tags, average=None)))
        logger.info("Sequence correlation: ")
        print(sequence_correlation_weighted(test_tags_seq_num, test_predictions_seq, verbose=True)[1])
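
Both branches of the learning code above convert string tags to integers with tag_map and flatten the per-sequence lists before calling f1_score. A tiny worked example on made-up data:

# Worked example (made-up data) of the tag mapping and flattening used above.
tag_map = {u'OK': 1, u'BAD': 0, 0: 0, 1: 1}
reference = [[u'OK', u'BAD', u'OK'], [u'BAD']]

numeric = [[tag_map[t] for t in seq] for seq in reference]  # [[1, 0, 1], [0]]
flat = [t for seq in numeric for t in seq]                  # [1, 0, 1, 0]

# f1_score(flat_reference, flat_hypothesis, average=None) then returns one
# F1 value per class: index 0 is BAD, index 1 is OK.
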
Example #5
def main(config):
    workers = config['workers']

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generators = build_objects(config['datasets']['training'])
    train_data = {}
    for gen in train_data_generators:
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test
    test_data_generator = build_object(config['datasets']['test'][0])
    test_data = test_data_generator.generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'

    test_contexts = create_contexts(test_data, data_type=data_type)
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
 
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    # make sure the test_context and train_context keys are in sync
    # TODO: this is important when we are learning token-level classifiers
#    experiment_utils.sync_keys(train_contexts, test_contexts)

    # TODO: this is important when we are learning token-level classifiers
    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
#    assert set(test_contexts.keys()) == set(train_contexts.keys())

    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # { 'token': <token>, index: <idx>, 'source': [<source toks>]', 'target': [<target toks>], 'tag': <tag>}
    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS

    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    # flatten so that we can properly binarize the features
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')

    logger.info('training and test sets successfully generated')

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'
        experiment_datasets = [{'name': 'test', 'features': test_features, 'tags': test_tags}, {'name': 'train', 'features': train_features, 'tags': train_tags}]
        feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]

        if config['features'].get('persist_dir'):
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)



    # TODO: we should only learn and evaluate the model if this is what the user wants
    # TODO: we should be able to dump the features for each of the user's datasets to a file specified by the user

    # BEGIN LEARNING

    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score
    import numpy as np
    if data_type == 'sequential':
        logger.info('training sequential model...')

        # TODO: move the tag and array conversion code to the utils of this module
        # TODO: check if X and y are in the format we expect
        # TODO: don't hardcode the dictionary
        tag_map = {u'OK': 1, u'BAD': 0}
        train_tags = [[tag_map[tag] for tag in seq] for seq in train_tags]
        test_tags = [[tag_map[tag] for tag in seq] for seq in test_tags]

        # make sure that everything is numpy
        # cast the dataset to numpy array (ndarrays)
        # note that these are _NOT_ matrices, because the inner sequences have different lengths
        x_train = np.array([np.array(xi) for xi in train_features])
        y_train = np.array([np.array(xi) for xi in train_tags])
        x_test = np.array([np.array(xi) for xi in test_features])
        y_test = np.array([np.array(xi) for xi in test_tags])

        # SEQLEARN
        # from seqlearn.perceptron import StructuredPerceptron
        #
        # # seqlearn needs a flat list of instances
        # x_train = np.array([i for seq in x_train for i in seq])
        # y_train = np.array([i for seq in y_train for i in seq])
        # x_test = np.array([i for seq in x_test for i in seq])
        # y_test = np.array([i for seq in y_test for i in seq])
        #
        # # seqlearn requires the lengths of each sequence
        # lengths_train = [len(seq) for seq in train_features]
        # lengths_test = [len(seq) for seq in test_features]
        #
        # clf = StructuredPerceptron(verbose=True, max_iter=400)
        # clf.fit(x_train, y_train, lengths_train)
        #
        # structured_predictions = clf.predict(x_test, lengths_test)
        # logger.info('f1 from seqlearn: {}'.format(f1_score(y_test, structured_predictions, average=None)))

        # END SEQLEARN

        # pystruct
        from marmot.learning.pystruct_sequence_learner import PystructSequenceLearner
        sequence_learner = PystructSequenceLearner()
        sequence_learner.fit(x_train, y_train)
        structured_hyp = sequence_learner.predict(x_test)

        logger.info('scoring sequential model...')
        # print('score: ' + str(structured_predictor.score(x_test, y_test)))

        # TODO: implement this in the config
        # classifier_type = import_class(config['learning']['classifier']['module'])

        # TODO: the flattening is currently a hack to let us use the same evaluation code for structured and plain tasks
        flattened_hyp = flatten(structured_hyp)

        # end pystruct

        test_predictions = flattened_hyp
        flattened_ref = flatten(y_test)
        test_tags = flattened_ref

        logger.info('Structured prediction f1: ')
        print(f1_score(flattened_ref, flattened_hyp, average=None))

    else:
        # data_type is 'token' or 'plain'
        logger.info('start training...')
        classifier_type = import_class(config['learning']['classifier']['module'])
        # train the classifier(s)
        classifier_map = map_classifiers(train_features, train_tags, classifier_type, data_type=data_type)
        logger.info('classifying the test instances')
        test_predictions = predict_all(test_features, classifier_map, data_type=data_type)

    # TODO: this section only works for 'plain'

    print(f1_score(test_predictions, test_tags, average=None))

    # EVALUATION
    logger.info('evaluating your results')

    # TODO: remove the hard coding of the tags here
    bad_count = sum(1 for t in test_tags if t == u'BAD' or t == 0)
    good_count = sum(1 for t in test_tags if t == u'OK' or t == 1)

    total = len(test_tags)
    assert (total == bad_count+good_count), 'tag counts should be correct'
    percent_good = good_count / total
    logger.info('percent good in test set: {}'.format(percent_good))
    logger.info('percent bad in test set: {}'.format(1 - percent_good))

    import numpy as np

    random_class_results = []
    random_weighted_results = []
    for i in range(20):
        random_tags = list(np.random.choice([1, 0], total, p=[percent_good, 1 - percent_good]))
        # random_tags = [u'GOOD' for i in range(total)]
        random_class_f1 = f1_score(test_tags, random_tags, average=None)
        random_class_results.append(random_class_f1)
        logger.info('two class f1 random score ({}): {}'.format(i, random_class_f1))
        # random_average_f1 = f1_score(random_tags, test_tags, average='weighted')
        random_average_f1 = weighted_fmeasure(test_tags, random_tags)
        random_weighted_results.append(random_average_f1)
        # logger.info('average f1 random score ({}): {}'.format(i, random_average_f1))

    avg_random_class = np.average(random_class_results, axis=0)
    avg_weighted = np.average(random_weighted_results)
    logger.info('two class f1 random average score: {}'.format(avg_random_class))
    logger.info('weighted f1 random average score: {}'.format(avg_weighted))

    actual_class_f1 = f1_score(test_tags, test_predictions, average=None)
    actual_average_f1 = weighted_fmeasure(test_tags, test_predictions)
    logger.info('two class f1 ACTUAL SCORE: {}'.format(actual_class_f1))
    logger.info('weighted f1 ACTUAL SCORE: {}'.format(actual_average_f1))

    if data_type == 'token':
        f1_map = {}
        for token, predicted in test_predictions.items():
            logger.info("Evaluating results for token = " + token)
            actual = test_tags_actual[token]
#            print('Actual: ', actual)
#            print('Predicted: ', predicted)
#            logger.info("\ttotal instances: " + str(len(predicted)))
            f1_map[token] = weighted_fmeasure(actual, predicted)
        logger.info('Printing the map of f1 scores by token: ')
        print(f1_map)
    elif data_type == 'plain':
        f1 = weighted_fmeasure(test_tags, test_predictions)
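
The evaluation in Example #5 relies on weighted_fmeasure. If that helper is not available, a reasonable stand-in (an assumption, not its actual definition) is scikit-learn's weighted-average F1:

# Stand-in for weighted_fmeasure, assuming it is a weighted-average F1 score;
# this is an assumption, not the helper's actual definition.
from sklearn.metrics import f1_score

def weighted_fmeasure_sketch(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted')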