Esempio n. 1
0
    def test_token_classifiers(self):
        interesting_tokens = set(['the','it', 'a'])
        context_creators = import_utils.build_objects(self.config['context_creators'])
        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

        feature_extractors = import_utils.build_objects(self.config['feature_extractors'])
        token_context_features = experiment_utils.token_contexts_to_features(token_contexts, feature_extractors)
        binarizers = experiment_utils.fit_binarizers(experiment_utils.flatten(token_context_features.values()))
        token_context_features = {k: [experiment_utils.binarize(v, binarizers) for v in val] for k, val in token_context_features.items()}

        token_context_tags = experiment_utils.tags_from_contexts(token_contexts)

        # train the classifier for each token
        classifier_type = experiment_utils.import_class(self.config['learning']['classifier']['module'])

        classifier_map = learning_utils.token_classifiers(token_context_features, token_context_tags, classifier_type)
        self.assertEqual(set(token_context_tags.keys()), set(classifier_map.keys()))
        for tok, classifier in classifier_map.items():
            self.assertTrue(hasattr(classifier, 'predict'))
Esempio n. 2
0
def main(config):
    # load ContextCreators from config file, run their input functions, and pass the result into the initialization function
    # init() all context creators specified by the user with their arguments
    # import them according to their fully-specified class names in the config file
    # it's up to the user to specify context creators which extract both negative and positive examples (if that's what they want)

    # Chris - working - we want to hit every token
    interesting_tokens = experiment_utils.import_and_call_function(
        config['interesting_tokens'])
    print "INTERESTING TOKENS: ", interesting_tokens
    logger.info('The number of interesting tokens is: ' +
                str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens internally (interesting tokens controls the index of the context creator)
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(
        config['context_creators'])

    # get the contexts for all of our interesting words (may be +,- or, multi-class)
    logger.info(
        'mapping the training contexts over the interesting tokens in train...'
    )
    train_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                   train_context_creators,
                                                   workers=workers)

    # load and parse the test data
    logger.info(
        'mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                  [test_context_creator])

    min_total = config['filters']['min_total']
    # filter token contexts based on the user-specified filter criteria
    logger.info(
        'filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts,
                                                      min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts,
                                                     min_total=min_total)

    # make sure the test_context and train_context keys are in sync
    experiment_utils.sync_keys(train_contexts, test_contexts)

    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # extract the 'tag' attribute into the y-value for classification
    # tags may need to be converted to be consistent with the training data
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {
        k: np.array([wmt_binary_classes[v] for v in val])
        for k, val in train_context_tags.items()
    }

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes,
                                                    test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # { 'token': <token>, index: <idx>, 'source': [<source toks>]', 'target': [<target toks>], 'tag': <tag>}
    feature_extractors = experiment_utils.build_feature_extractors(
        config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(
        test_contexts, feature_extractors, workers=workers)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(
        train_contexts, feature_extractors, workers=workers)

    # flatten so that we can properly binarize the features
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(
        train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in test_context_features.items()
    }
    train_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in train_context_features.items()
    }

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(
        config['learning']['classifier']['module'])
    # train the classifier for each token
    classifier_map = learning_utils.token_classifiers(train_context_features,
                                                      train_context_tags,
                                                      classifier_type)

    # classify the test instances
    # TODO: output a file in WMT format
    # WORKING - dump the output in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.iteritems():
        try:
            classifier = classifier_map[key]
            predictions = classifier.predict(features)
            test_predictions[key] = predictions
        except KeyError as e:
            print(key + " - is NOT in the classifier map")
            raise

    #### put the rest of the code into a separate 'evaluate' function that reads the WMT files

    # create the performance report for each word in the test data that we had a classifier for
    # TODO: Working - evaluate based on the format
    f1_map = {}
    for token, predicted in test_predictions.iteritems():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print 'Actual: ', actual
        print 'Predicted: ', predicted
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)
def main(config):
    # load ContextCreators from config file, run their input functions, and pass the result into the initialization function
    # init() all context creators specified by the user with their arguments
    # import them according to their fully-specified class names in the config file
    # it's up to the user to specify context creators which extract both negative and positive examples (if that's what they want)

    # Chris - working - we want to hit every token
    interesting_tokens = experiment_utils.import_and_call_function(config['interesting_tokens'])
    print "INTERESTING TOKENS: ", interesting_tokens
    logger.info('The number of interesting tokens is: ' + str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens internally (interesting tokens controls the index of the context creator)
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(config['context_creators'])

    # get the contexts for all of our interesting words (may be +,- or, multi-class)
    logger.info('mapping the training contexts over the interesting tokens in train...')
    train_contexts = experiment_utils.map_contexts(interesting_tokens, train_context_creators, workers=workers)

    # load and parse the test data
    logger.info('mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens, [test_context_creator])

    min_total = config['filters']['min_total']
    # filter token contexts based on the user-specified filter criteria
    logger.info('filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts, min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts, min_total=min_total)

    # make sure the test_context and train_context keys are in sync
    experiment_utils.sync_keys(train_contexts, test_contexts)

    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # extract the 'tag' attribute into the y-value for classification
    # tags may need to be converted to be consistent with the training data
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {k: np.array([wmt_binary_classes[v] for v in val]) for k, val in train_context_tags.items()}

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes, test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # { 'token': <token>, index: <idx>, 'source': [<source toks>]', 'target': [<target toks>], 'tag': <tag>}
    feature_extractors = experiment_utils.build_feature_extractors(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(test_contexts, feature_extractors, workers=workers)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(train_contexts, feature_extractors, workers=workers)

    # flatten so that we can properly binarize the features
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {k: [experiment_utils.binarize(v, binarizers) for v in val] for k, val in test_context_features.items()}
    train_context_features = {k: [experiment_utils.binarize(v, binarizers) for v in val] for k, val in train_context_features.items()}

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(config['learning']['classifier']['module'])
    # train the classifier for each token
    classifier_map = learning_utils.token_classifiers(train_context_features, train_context_tags, classifier_type)

    # classify the test instances
    # TODO: output a file in WMT format
    # WORKING - dump the output in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.iteritems():
        try:
            classifier = classifier_map[key]
            predictions = classifier.predict(features)
            test_predictions[key] = predictions
        except KeyError as e:
            print(key + " - is NOT in the classifier map")
            raise

    #### put the rest of the code into a separate 'evaluate' function that reads the WMT files

    # create the performance report for each word in the test data that we had a classifier for
    # TODO: Working - evaluate based on the format
    f1_map = {}
    for token, predicted in test_predictions.iteritems():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print 'Actual: ', actual
        print 'Predicted: ', predicted
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)