Example #1
    def test_filter_contexts(self):
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)

        fake_token = '_z_z_z'
        interesting_tokens = set(['the','it', 'a', fake_token])
        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)
        self.assertTrue(fake_token in token_contexts)

        filtered_contexts = experiment_utils.filter_contexts(token_contexts, min_total=1)
        self.assertFalse(fake_token in filtered_contexts, 'a token that does not exist should not have len(contexts) >= 1')
Example #2
    def test_map_contexts(self):
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)
        interesting_tokens = set(['the','it'])

        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)
        for token in token_contexts.keys():
            self.assertTrue(token in interesting_tokens)
            self.assertTrue(len(token_contexts[token]) > 0)
            for context in token_contexts[token][:10]:
                self.assertIsNotNone(context['token'])
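Each context returned by map_contexts is a plain dict; the test above only checks that the 'token' field is populated. Going by the context exchange format noted in the comments of main() further down, a single entry might look roughly like this (all values here are invented for illustration):

# hypothetical context entry in the exchange format; every value is a placeholder
context = {
    'token': 'the',                      # the interesting token itself
    'index': 0,                          # position of the token in the target sentence
    'source': ['le', 'chat', 'dort'],    # source-side tokens
    'target': ['the', 'cat', 'sleeps'],  # target-side tokens
    'tag': u'OK',                        # quality tag attached to this token
}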
Example #3
    def test_filter_context_class(self):
        context_creator_list = self.config['context_creators2']
        context_creators = import_utils.build_objects(context_creator_list)

        interesting_tokens = set(['del','pescado'])
        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)
        self.assertTrue('pescado' in token_contexts)

        filtered_contexts = experiment_utils.filter_contexts_class(token_contexts, min_total=self.config['filters']['min_total'], min_class_count=self.config['filters']['min_class_count'], proportion=self.config['filters']['proportion'])
        self.assertTrue('del' in filtered_contexts)
        self.assertFalse('pescado' in filtered_contexts)
Example #4
    def test_tags_from_contexts(self):
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)
        interesting_tokens = set(['the','it', 'a'])

        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

        token_tags = experiment_utils.tags_from_contexts(token_contexts)

        self.assertEqual(set(token_tags.keys()), set(token_contexts.keys()))
        for tok, tag_vector in token_tags.items():
            self.assertTrue(tag_vector.shape[0] == len(token_contexts[tok]))
Example #5
    def test_convert_tagset(self):
        wmt_binary_classes = {0: u'BAD', 1: u'OK'}
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)

        interesting_tokens = set(['the','it', 'a'])
        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)
        new_token_contexts = experiment_utils.convert_tagset(wmt_binary_classes, token_contexts)
        reverse_map = {v:k for k,v in wmt_binary_classes.items()}
        for tok, contexts in new_token_contexts.iteritems():
            for idx, context in enumerate(contexts):
                self.assertEqual(reverse_map[context['tag']], token_contexts[tok][idx]['tag'])
Example #6
    def test_map_feature_extractor(self):
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)
        interesting_tokens = set(['the','it', 'a'])

        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

        feature_extractor_list = self.config['feature_extractors'][:1]
        feature_extractors = import_utils.build_objects(feature_extractor_list)

        mapped_context = np.hstack([experiment_utils.map_feature_extractor((token_contexts['the'][0], extractor)) for extractor in feature_extractors])
        self.assertTrue(isinstance(mapped_context, np.ndarray))
        # uses the TokenCountFeatureExtractor, which returns 3 features
        self.assertTrue(len(mapped_context) == 3)
Example #7
    def test_token_contexts_to_features(self):
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)
        interesting_tokens = set(['the','it', 'a'])

        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

        feature_extractor_list = self.config['feature_extractors'][:1]
        feature_extractors = import_utils.build_objects(feature_extractor_list)

        workers = 8
        mapped_contexts = experiment_utils.token_contexts_to_features(token_contexts, feature_extractors, workers=workers)

        self.assertEqual(set(mapped_contexts.keys()), set(token_contexts.keys()))
        for tok, feature_vecs in mapped_contexts.items():
            self.assertTrue(feature_vecs.shape[0] == len(token_contexts[tok]))
Example #8
    def test_time_token_contexts_to_features(self):
        context_creator_list = self.config['context_creators']
        context_creators = import_utils.build_objects(context_creator_list)
        interesting_tokens = set(['the','it', 'a'])

        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

        feature_extractor_list = self.config['feature_extractors'][:1]
        feature_extractors = import_utils.build_objects(feature_extractor_list)

        start = time.time()
        mapped_contexts = experiment_utils.token_contexts_to_features(token_contexts, feature_extractors)
        finish = time.time() - start
        print "Single: ", finish

        start = time.time()
        mapped_contexts = experiment_utils.token_contexts_to_features(token_contexts, feature_extractors, workers=10)
        finish = time.time() - start
        print "Multiple: ", finish
Example #9
    def test_token_classifiers(self):
        interesting_tokens = set(['the','it', 'a'])
        context_creators = import_utils.build_objects(self.config['context_creators'])
        token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

        feature_extractors = import_utils.build_objects(self.config['feature_extractors'])
        token_context_features = experiment_utils.token_contexts_to_features(token_contexts, feature_extractors)
        binarizers = experiment_utils.fit_binarizers(experiment_utils.flatten(token_context_features.values()))
        token_context_features = {k: [experiment_utils.binarize(v, binarizers) for v in val] for k, val in token_context_features.items()}

        token_context_tags = experiment_utils.tags_from_contexts(token_contexts)

        # train the classifier for each token
        classifier_type = experiment_utils.import_class(self.config['learning']['classifier']['module'])

        classifier_map = learning_utils.token_classifiers(token_context_features, token_context_tags, classifier_type)
        self.assertEqual(set(token_context_tags.keys()), set(classifier_map.keys()))
        for tok, classifier in classifier_map.items():
            self.assertTrue(hasattr(classifier, 'predict'))
Example #10
def main(config):
    # load ContextCreators from config file, run their input functions, and pass the result into the initialization function
    # init() all context creators specified by the user with their arguments
    # import them according to their fully-specified class names in the config file
    # it's up to the user to specify context creators which extract both negative and positive examples (if that's what they want)

    # Chris - working - we want to hit every token
    interesting_tokens = experiment_utils.import_and_call_function(
        config['interesting_tokens'])
    print "INTERESTING TOKENS: ", interesting_tokens
    logger.info('The number of interesting tokens is: ' +
                str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens internally (interesting tokens controls the index of the context creator)
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(
        config['context_creators'])

    # get the contexts for all of our interesting words (may be +,- or, multi-class)
    logger.info(
        'mapping the training contexts over the interesting tokens in train...'
    )
    train_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                   train_context_creators,
                                                   workers=workers)

    # load and parse the test data
    logger.info(
        'mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens,
                                                  [test_context_creator])

    min_total = config['filters']['min_total']
    # filter token contexts based on the user-specified filter criteria
    logger.info(
        'filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts,
                                                      min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts,
                                                     min_total=min_total)

    # make sure the test_context and train_context keys are in sync
    experiment_utils.sync_keys(train_contexts, test_contexts)

    # test_contexts = filter_contexts(test_contexts, min_total=min_total)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # extract the 'tag' attribute into the y-value for classification
    # tags may need to be converted to be consistent with the training data
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {
        k: np.array([wmt_binary_classes[v] for v in val])
        for k, val in train_context_tags.items()
    }

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes,
                                                    test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # all of the feature extraction should be parallelizable
    # note that a feature extractor MUST be able to parse the context exchange format, or it should throw an error:
    # { 'token': <token>, 'index': <idx>, 'source': [<source toks>], 'target': [<target toks>], 'tag': <tag> }
    feature_extractors = experiment_utils.build_feature_extractors(
        config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(
        test_contexts, feature_extractors, workers=workers)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(
        train_contexts, feature_extractors, workers=workers)

    # flatten so that we can properly binarize the features
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(
        train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in test_context_features.items()
    }
    train_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in train_context_features.items()
    }

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(
        config['learning']['classifier']['module'])
    # train the classifier for each token
    classifier_map = learning_utils.token_classifiers(train_context_features,
                                                      train_context_tags,
                                                      classifier_type)

    # classify the test instances
    # TODO: output a file in WMT format
    # WORKING - dump the output in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.iteritems():
        try:
            classifier = classifier_map[key]
            predictions = classifier.predict(features)
            test_predictions[key] = predictions
        except KeyError as e:
            print(key + " - is NOT in the classifier map")
            raise

    #### put the rest of the code into a separate 'evaluate' function that reads the WMT files

    # create the performance report for each word in the test data that we had a classifier for
    # TODO: Working - evaluate based on the format
    f1_map = {}
    for token, predicted in test_predictions.iteritems():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print 'Actual: ', actual
        print 'Predicted: ', predicted
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)
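
The config passed to main() is assumed to be parsed from a user-supplied configuration file; a minimal skeleton of the keys the code above actually reads (all values and module paths below are placeholders, not the project's real settings) might be:

# hypothetical skeleton of the config dict read by main(); every value is a placeholder
config = {
    'workers': 8,                       # parallelism for map_contexts and feature extraction
    'interesting_tokens': None,         # spec consumed by import_and_call_function
    'context_creators': [],             # object specs consumed by build_objects (training data)
    'testing': None,                    # spec for the test-set context creator
    'feature_extractors': [],           # specs consumed by build_feature_extractors
    'filters': {'min_total': 1},        # minimum number of contexts required per token
    'learning': {'classifier': {'module': 'path.to.SomeClassifier'}},
}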