def test_filter_contexts(self):
    """A token that has no real contexts must be removed by ``filter_contexts``.

    The made-up token is expected to still have a key after
    ``map_contexts``; filtering with ``min_total=1`` must then drop it.
    """
    creators = import_utils.build_objects(self.config['context_creators'])
    fake_token = '_z_z_z'
    tokens = set(['the', 'it', 'a', fake_token])

    mapped = experiment_utils.map_contexts(tokens, creators)
    # Before filtering, the placeholder token is present as a key.
    self.assertTrue(fake_token in mapped)

    kept = experiment_utils.filter_contexts(mapped, min_total=1)
    self.assertFalse(
        fake_token in kept,
        'a token that does not exist should not have len(contexts) >= 1')
def test_map_contexts(self):
    """Check the basic shape of what ``map_contexts`` returns.

    Every returned key must be one of the requested tokens, every key must
    have at least one context, and each of the first ten contexts per token
    must carry a ``'token'`` entry that is not None.
    """
    context_creator_list = self.config['context_creators']
    context_creators = import_utils.build_objects(context_creator_list)
    interesting_tokens = set(['the', 'it'])
    token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

    for token in token_contexts:
        contexts = token_contexts[token]
        self.assertIn(token, interesting_tokens)
        self.assertGreater(len(contexts), 0)
        # Only spot-check the first ten contexts of each token.
        for context in contexts[:10]:
            # Identity check: `!= None` can be fooled by objects that
            # overload comparison operators.
            self.assertIsNotNone(context['token'])
def test_filter_context_class(self):
    """Exercise ``filter_contexts_class`` with the thresholds from the config.

    Using the ``context_creators2`` creators, 'pescado' must be present
    after mapping; after filtering, 'del' must remain and 'pescado' must
    be gone.
    """
    creators = import_utils.build_objects(self.config['context_creators2'])
    tokens = set(['del', 'pescado'])
    mapped = experiment_utils.map_contexts(tokens, creators)
    self.assertTrue('pescado' in mapped)

    # All three thresholds come from the 'filters' section of the config.
    filter_args = self.config['filters']
    kept = experiment_utils.filter_contexts_class(
        mapped,
        min_total=filter_args['min_total'],
        min_class_count=filter_args['min_class_count'],
        proportion=filter_args['proportion'])

    self.assertTrue('del' in kept)
    self.assertFalse('pescado' in kept)
def test_tags_from_contexts(self):
    """``tags_from_contexts`` must yield one tag per context for every token.

    The returned mapping must have exactly the same keys as the contexts
    mapping, and the first dimension of each tag vector must equal the
    number of contexts for that token.
    """
    creators = import_utils.build_objects(self.config['context_creators'])
    tokens = set(['the', 'it', 'a'])
    contexts_by_token = experiment_utils.map_contexts(tokens, creators)
    tags_by_token = experiment_utils.tags_from_contexts(contexts_by_token)

    self.assertEqual(set(tags_by_token.keys()), set(contexts_by_token.keys()))

    for tok, tag_vector in tags_by_token.items():
        tag_count = tag_vector.shape[0]
        context_count = len(contexts_by_token[tok])
        self.assertTrue(tag_count == context_count)
def test_convert_tagset(self):
    """Check that ``convert_tagset`` output maps back to the original tags.

    For every token, each converted context's tag is looked up in the
    inverse of ``wmt_binary_classes`` and compared with the tag at the same
    position in the contexts returned by ``map_contexts``.
    """
    wmt_binary_classes = {0: u'BAD', 1: u'OK'}
    context_creator_list = self.config['context_creators']
    context_creators = import_utils.build_objects(context_creator_list)
    interesting_tokens = set(['the', 'it', 'a'])
    token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

    new_token_contexts = experiment_utils.convert_tagset(wmt_binary_classes, token_contexts)

    # Inverse mapping: converted tag -> original tag.
    reverse_map = {v: k for k, v in wmt_binary_classes.items()}

    # .items() works on both Python 2 and 3. The former outer enumerate()
    # index was never used and was shadowed by the inner loop's idx.
    for tok, contexts in new_token_contexts.items():
        for idx, context in enumerate(contexts):
            self.assertEqual(reverse_map[context['tag']],
                             token_contexts[tok][idx]['tag'])
def test_map_feature_extractor(self):
    """Apply the first configured feature extractor to a single context.

    The first context of the token 'the' is passed, together with each
    extractor, to ``map_feature_extractor``; the stacked result must be a
    numpy array of length 3.
    """
    creators = import_utils.build_objects(self.config['context_creators'])
    tokens = set(['the', 'it', 'a'])
    contexts_by_token = experiment_utils.map_contexts(tokens, creators)

    # Only the first extractor from the config is used here.
    extractor_specs = self.config['feature_extractors'][:1]
    extractors = import_utils.build_objects(extractor_specs)

    per_extractor = []
    for extractor in extractors:
        # map_feature_extractor takes one (context, extractor) tuple.
        per_extractor.append(
            experiment_utils.map_feature_extractor((contexts_by_token['the'][0], extractor)))
    mapped_context = np.hstack(per_extractor)

    self.assertTrue(isinstance(mapped_context, np.ndarray))
    # uses the TokenCountFeatureExtractor, which returns 3 features
    self.assertTrue(len(mapped_context) == 3)
def test_token_contexts_to_features(self):
    """Run ``token_contexts_to_features`` with several workers and check its shape.

    The result must have the same keys as the contexts mapping, and for
    each token the first dimension of the feature array must equal the
    number of contexts for that token.
    """
    context_creator_list = self.config['context_creators']
    context_creators = import_utils.build_objects(context_creator_list)
    interesting_tokens = set(['the', 'it', 'a'])
    token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

    # Only the first extractor from the config is used here.
    feature_extractor_list = self.config['feature_extractors'][:1]
    feature_extractors = import_utils.build_objects(feature_extractor_list)

    # Pass the local through instead of repeating the literal, so the
    # worker count is defined in exactly one place.
    workers = 8
    mapped_contexts = experiment_utils.token_contexts_to_features(
        token_contexts, feature_extractors, workers=workers)

    self.assertEqual(set(mapped_contexts.keys()), set(token_contexts.keys()))
    for tok, feature_vecs in mapped_contexts.items():
        self.assertEqual(feature_vecs.shape[0], len(token_contexts[tok]))
def test_time_token_contexts_to_features(self):
    """Print wall-clock timings for ``token_contexts_to_features``.

    The call is timed once with the default arguments and once with
    ``workers=10``. This test makes no assertions; it only prints the two
    elapsed times.
    """
    context_creator_list = self.config['context_creators']
    context_creators = import_utils.build_objects(context_creator_list)
    interesting_tokens = set(['the', 'it', 'a'])
    token_contexts = experiment_utils.map_contexts(interesting_tokens, context_creators)

    feature_extractor_list = self.config['feature_extractors'][:1]
    feature_extractors = import_utils.build_objects(feature_extractor_list)

    # Single-argument print(...) is valid on both Python 2 and Python 3 and
    # emits the same text the old `print a, b` statement did.
    start = time.time()
    experiment_utils.token_contexts_to_features(token_contexts, feature_extractors)
    finish = time.time() - start
    print("%s %s" % ("Single: ", finish))

    start = time.time()
    experiment_utils.token_contexts_to_features(token_contexts, feature_extractors, workers=10)
    finish = time.time() - start
    print("%s %s" % ("Multiple: ", finish))
def test_token_classifiers(self):
    """Train one classifier per token and check the resulting map.

    Contexts are mapped, turned into features, binarized, and paired with
    their tags; ``token_classifiers`` must return a classifier for exactly
    the tokens that have tags, and each classifier must expose ``predict``.
    """
    tokens = set(['the', 'it', 'a'])
    creators = import_utils.build_objects(self.config['context_creators'])
    contexts_by_token = experiment_utils.map_contexts(tokens, creators)

    extractors = import_utils.build_objects(self.config['feature_extractors'])
    raw_features = experiment_utils.token_contexts_to_features(contexts_by_token, extractors)

    # Fit the binarizers on the features of all tokens at once, then apply
    # them to every feature vector of every token.
    binarizers = experiment_utils.fit_binarizers(
        experiment_utils.flatten(raw_features.values()))
    binarized_features = {}
    for tok, vectors in raw_features.items():
        binarized_features[tok] = [experiment_utils.binarize(vec, binarizers) for vec in vectors]

    tags_by_token = experiment_utils.tags_from_contexts(contexts_by_token)

    # train the classifier for each token
    classifier_type = experiment_utils.import_class(
        self.config['learning']['classifier']['module'])
    classifier_map = learning_utils.token_classifiers(
        binarized_features, tags_by_token, classifier_type)

    self.assertEqual(set(tags_by_token.keys()), set(classifier_map.keys()))
    for classifier in classifier_map.values():
        self.assertTrue(hasattr(classifier, 'predict'))
def main(config):
    """Train one classifier per interesting token and print per-token F-measure.

    Entries read from ``config``:
      * ``interesting_tokens`` -- passed to ``import_and_call_function`` to
        obtain the tokens to model
      * ``workers`` -- worker count forwarded to the mapping helpers
      * ``context_creators`` -- specs for the training context creators
      * ``testing`` -- spec for the test context creator
      * ``filters.min_total`` -- threshold passed to ``filter_contexts``
      * ``feature_extractors`` -- specs for the feature extractors
      * ``learning.classifier.module`` -- classifier class to import

    Returns None; results are logged and printed.

    Raises:
        AssertionError: if train and test contexts do not share the same
            keys after ``sync_keys``.
        KeyError: if a test token has no trained classifier (re-raised
            after printing the offending token).
    """
    # It's up to the user to specify context creators which extract both
    # negative and positive examples (if that's what they want).
    interesting_tokens = experiment_utils.import_and_call_function(config['interesting_tokens'])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('%s %s' % ("INTERESTING TOKENS: ", interesting_tokens))
    logger.info('The number of interesting tokens is: ' + str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens
    # internally (interesting tokens controls the index of the context creator).
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(config['context_creators'])

    # Get the contexts for all of our interesting words (may be +, - or multi-class).
    logger.info('mapping the training contexts over the interesting tokens in train...')
    train_contexts = experiment_utils.map_contexts(interesting_tokens, train_context_creators, workers=workers)

    # Load and parse the test data.
    logger.info('mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens, [test_context_creator])

    # Filter token contexts based on the user-specified filter criteria.
    min_total = config['filters']['min_total']
    logger.info('filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts, min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts, min_total=min_total)

    # Make sure the test_context and train_context keys are in sync.
    experiment_utils.sync_keys(train_contexts, test_contexts)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # Extract the 'tag' attribute into the y-value for classification.
    # Tags may need to be converted to be consistent with the training data.
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {
        k: np.array([wmt_binary_classes[v] for v in val])
        for k, val in train_context_tags.items()
    }

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes, test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # Note that a feature extractor MUST be able to parse the context
    # exchange format, or it should throw an error:
    # {'token': <token>, index: <idx>, 'source': [<source toks>],
    #  'target': [<target toks>], 'tag': <tag>}
    feature_extractors = experiment_utils.build_feature_extractors(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(
        test_contexts, feature_extractors, workers=workers)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(
        train_contexts, feature_extractors, workers=workers)

    # Fit the binarizers on test and train features together, then apply
    # them to both sets.
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in test_context_features.items()
    }
    train_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in train_context_features.items()
    }

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(config['learning']['classifier']['module'])

    # Train the classifier for each token.
    classifier_map = learning_utils.token_classifiers(train_context_features, train_context_tags, classifier_type)

    # Classify the test instances.
    # TODO: output a file in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.items():
        # Guard only the lookup, so a KeyError raised inside predict() is
        # not misreported as a missing classifier.
        try:
            classifier = classifier_map[key]
        except KeyError:
            print(key + " - is NOT in the classifier map")
            raise
        test_predictions[key] = classifier.predict(features)

    # TODO: put the rest of the code into a separate 'evaluate' function
    # that reads the WMT files.
    # Create the performance report for each word in the test data that we
    # had a classifier for.
    f1_map = {}
    for token, predicted in test_predictions.items():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print('%s %s' % ('Actual: ', actual))
        print('%s %s' % ('Predicted: ', predicted))
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)
def main(config):
    """Train one classifier per interesting token and print per-token F-measure.

    Entries read from ``config``:
      * ``interesting_tokens`` -- passed to ``import_and_call_function`` to
        obtain the tokens to model
      * ``workers`` -- worker count forwarded to the mapping helpers
      * ``context_creators`` -- specs for the training context creators
      * ``testing`` -- spec for the test context creator
      * ``filters.min_total`` -- threshold passed to ``filter_contexts``
      * ``feature_extractors`` -- specs for the feature extractors
      * ``learning.classifier.module`` -- classifier class to import

    Returns None; results are logged and printed.

    Raises:
        AssertionError: if train and test contexts do not share the same
            keys after ``sync_keys``.
        KeyError: if a test token has no trained classifier (re-raised
            after printing the offending token).
    """
    # It's up to the user to specify context creators which extract both
    # negative and positive examples (if that's what they want).
    interesting_tokens = experiment_utils.import_and_call_function(config['interesting_tokens'])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('%s %s' % ("INTERESTING TOKENS: ", interesting_tokens))
    logger.info('The number of interesting tokens is: ' + str(len(interesting_tokens)))
    workers = config['workers']

    # Note: context creators currently create their own interesting tokens
    # internally (interesting tokens controls the index of the context creator).
    logger.info('building the context creators...')
    train_context_creators = experiment_utils.build_objects(config['context_creators'])

    # Get the contexts for all of our interesting words (may be +, - or multi-class).
    logger.info('mapping the training contexts over the interesting tokens in train...')
    train_contexts = experiment_utils.map_contexts(interesting_tokens, train_context_creators, workers=workers)

    # Load and parse the test data.
    logger.info('mapping the training contexts over the interesting tokens in test...')
    test_context_creator = experiment_utils.build_objects(config['testing'])
    test_contexts = experiment_utils.map_contexts(interesting_tokens, [test_context_creator])

    # Filter token contexts based on the user-specified filter criteria.
    min_total = config['filters']['min_total']
    logger.info('filtering the contexts by the total number of available instances...')
    train_contexts = experiment_utils.filter_contexts(train_contexts, min_total=min_total)
    test_contexts = experiment_utils.filter_contexts(test_contexts, min_total=min_total)

    # Make sure the test_context and train_context keys are in sync.
    experiment_utils.sync_keys(train_contexts, test_contexts)
    assert set(test_contexts.keys()) == set(train_contexts.keys())

    # Extract the 'tag' attribute into the y-value for classification.
    # Tags may need to be converted to be consistent with the training data.
    wmt_binary_classes = {u'BAD': 0, u'OK': 1}
    train_context_tags = experiment_utils.tags_from_contexts(train_contexts)
    train_context_tags = {
        k: np.array([wmt_binary_classes[v] for v in val])
        for k, val in train_context_tags.items()
    }

    test_contexts = experiment_utils.convert_tagset(wmt_binary_classes, test_contexts)
    test_tags_actual = experiment_utils.tags_from_contexts(test_contexts)

    # Note that a feature extractor MUST be able to parse the context
    # exchange format, or it should throw an error:
    # {'token': <token>, index: <idx>, 'source': [<source toks>],
    #  'target': [<target toks>], 'tag': <tag>}
    feature_extractors = experiment_utils.build_feature_extractors(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_context_features = experiment_utils.token_contexts_to_features_categorical(
        test_contexts, feature_extractors, workers=workers)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_context_features = experiment_utils.token_contexts_to_features_categorical(
        train_contexts, feature_extractors, workers=workers)

    # Fit the binarizers on test and train features together, then apply
    # them to both sets.
    all_values = experiment_utils.flatten(test_context_features.values())
    all_values.extend(experiment_utils.flatten(train_context_features.values()))
    binarizers = experiment_utils.fit_binarizers(all_values)
    test_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in test_context_features.items()
    }
    train_context_features = {
        k: [experiment_utils.binarize(v, binarizers) for v in val]
        for k, val in train_context_features.items()
    }

    # BEGIN LEARNING
    classifier_type = experiment_utils.import_class(config['learning']['classifier']['module'])

    # Train the classifier for each token.
    classifier_map = learning_utils.token_classifiers(train_context_features, train_context_tags, classifier_type)

    # Classify the test instances.
    # TODO: output a file in WMT format
    logger.info('classifying the test instances')
    test_predictions = {}
    for key, features in test_context_features.items():
        # Guard only the lookup, so a KeyError raised inside predict() is
        # not misreported as a missing classifier.
        try:
            classifier = classifier_map[key]
        except KeyError:
            print(key + " - is NOT in the classifier map")
            raise
        test_predictions[key] = classifier.predict(features)

    # TODO: put the rest of the code into a separate 'evaluate' function
    # that reads the WMT files.
    # Create the performance report for each word in the test data that we
    # had a classifier for.
    f1_map = {}
    for token, predicted in test_predictions.items():
        logger.info("Evaluating results for token = " + token)
        actual = test_tags_actual[token]
        print('%s %s' % ('Actual: ', actual))
        print('%s %s' % ('Predicted: ', predicted))
        logger.info("\ttotal instances: " + str(len(predicted)))
        f1_map[token] = weighted_fmeasure(actual, predicted)

    logger.info('Printing the map of f1 scores by token: ')
    print(f1_map)