def test_extract_system_mentions(self):
    expected_spans = sorted([
        spans.Span(0, 1), spans.Span(0, 5), spans.Span(3, 5),
        spans.Span(5, 5), spans.Span(8, 10), spans.Span(8, 11),
        spans.Span(13, 16), spans.Span(13, 20), spans.Span(14, 14),
        spans.Span(18, 20), spans.Span(22, 23), spans.Span(25, 25),
        spans.Span(33, 34)
    ])

    self.assertEqual(
        expected_spans,
        [mention.span for mention
         in mention_extractor.extract_system_mentions(
             self.real_document, filter_mentions=False)[1:]])

    expected_spans = sorted([
        spans.Span(2, 2), spans.Span(4, 4), spans.Span(6, 7),
        spans.Span(6, 11), spans.Span(9, 10), spans.Span(9, 11)
    ])

    self.assertEqual(
        expected_spans,
        [mention.span for mention
         in mention_extractor.extract_system_mentions(
             self.another_real_document, filter_mentions=False)[1:]])

    expected_spans = sorted([
        spans.Span(2, 2), spans.Span(4, 4), spans.Span(6, 11),
        spans.Span(9, 10), spans.Span(9, 11)
    ])

    self.assertEqual(
        expected_spans,
        [mention.span for mention
         in mention_extractor.extract_system_mentions(
             self.another_real_document, filter_mentions=True)[1:]])
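# Note: the `[1:]` slices in the assertions above skip index 0, which
# extract_system_mentions appears to reserve for a dummy mention (the same
# slice shows up throughout this code). A minimal usage sketch; the helper
# name is hypothetical and `doc` is assumed to be an already-preprocessed
# cort document (fixture setup is not shown in this snippet):
def _print_system_mention_spans(doc):
    mentions = mention_extractor.extract_system_mentions(doc)
    for mention in mentions[1:]:  # index 0 holds the dummy mention
        print(mention.span)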
def do_coreference(self):
    testing_corpus = corpora.Corpus(
        "input",
        [self.p.run_on_doc(io.StringIO(self.txt.get("0.0", tki.END)),
                           "input")])

    logging.info("Extracting system mentions.")
    for doc in testing_corpus:
        doc.system_mentions = mention_extractor.extract_system_mentions(doc)

    mention_entity_mapping, antecedent_mapping = experiments.predict(
        testing_corpus, self.extractor, self.perceptron, clusterer.all_ante)

    testing_corpus.read_coref_decisions(mention_entity_mapping,
                                        antecedent_mapping)

    logging.info("Visualize")
    for doc in testing_corpus:
        max_id = 0
        for mention in doc.system_mentions[1:]:
            set_id = mention.attributes["set_id"]
            if set_id:
                max_id = max(set_id, max_id)
        max_id += 1

        doc.annotated_mentions = []
        for i, mention in enumerate(doc.system_mentions[1:]):
            if mention.attributes["set_id"]:
                mention.attributes["annotated_set_id"] = \
                    mention.attributes["set_id"]
            else:
                mention.attributes["annotated_set_id"] = max_id + i
            doc.annotated_mentions.append(mention)

    ex = error_extractors.ErrorExtractor(
        testing_corpus,
        spanning_tree_algorithms.recall_accessibility,
        spanning_tree_algorithms.precision_system_output)
    ex.add_system(testing_corpus)

    decisions = ex.get_errors()

    visualizer = visualization.Visualizer(decisions, "input",
                                          for_raw_input=True)
    visualizer.run()
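# The renumbering loop above keeps the predicted set_id where one exists and
# gives every unresolved mention (falsy set_id) a unique fresh singleton id
# via max_id + i. A standalone sketch of the same idea on hypothetical ids:
def _demo_singleton_ids():
    set_ids = [3, None, 7, None]  # hypothetical per-mention cluster ids
    max_id = max((s for s in set_ids if s), default=0) + 1
    return [s if s else max_id + i for i, s in enumerate(set_ids)]
    # -> [3, 9, 7, 11]: unresolved mentions get distinct fresh ids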
perceptron = import_helper.import_from_path(args.perceptron)(
    priors=priors,
    weights=weights,
    cost_scaling=0)

extractor = instance_extractors.InstanceExtractor(
    import_helper.import_from_path(args.extractor),
    mention_features,
    pairwise_features,
    cost_functions.null_cost,
    perceptron.get_labels())

logging.info("Reading in data.")
testing_corpus = corpora.Corpus.from_file(
    "testing", codecs.open(args.input_filename, "r", "utf-8"))

logging.info("Extracting system mentions.")
for doc in testing_corpus:
    doc.system_mentions = mention_extractor.extract_system_mentions(doc)

mention_entity_mapping, antecedent_mapping = experiments.predict(
    testing_corpus,
    extractor,
    perceptron,
    import_helper.import_from_path(args.clusterer))

testing_corpus.read_coref_decisions(mention_entity_mapping,
                                    antecedent_mapping)

logging.info("Write corpus to file.")
testing_corpus.write_to_file(codecs.open(args.output_filename, "w", "utf-8"))

if args.ante:
    logging.info("Write antecedent decisions to file")
    testing_corpus.write_antecedent_decisions_to_file(open(args.ante, "w"))

if args.gold:
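# The snippet above reads args.perceptron, args.extractor, args.clusterer,
# args.input_filename, args.output_filename, args.ante and args.gold. A
# minimal argparse sketch covering just those names; the option strings are
# assumptions inferred from the attribute accesses, not the script's real
# definitions:
import argparse

parser = argparse.ArgumentParser(description="Predict coreference chains.")
parser.add_argument("-in", dest="input_filename", required=True)
parser.add_argument("-out", dest="output_filename", required=True)
parser.add_argument("-perceptron", dest="perceptron", required=True)
parser.add_argument("-extractor", dest="extractor", required=True)
parser.add_argument("-clusterer", dest="clusterer", required=True)
parser.add_argument("-ante", dest="ante")
parser.add_argument("-gold", dest="gold")
args = parser.parse_args()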
def call_cort(text_blob):
    mention_features = [
        features.fine_type,
        features.gender,
        features.number,
        features.sem_class,
        features.deprel,
        features.head_ner,
        features.length,
        features.head,
        features.first,
        features.last,
        features.preceding_token,
        features.next_token,
        features.governor,
        features.ancestry
    ]

    pairwise_features = [
        features.exact_match,
        features.head_match,
        features.same_speaker,
        features.alias,
        features.sentence_distance,
        features.embedding,
        features.modifier,
        features.tokens_contained,
        features.head_contained,
        features.token_distance
    ]

    # TODO: make sure these are exact!
    model_abs = '/Users/ryanpanos/Documents/code/cort_experiments/models/model-pair-train+dev.obj'  # OMG evil!
    corenlp_path = '/Users/ryanpanos/Documents/code/StanfordNLP/stanford-corenlp-full-2016-10-31/'  # OMG evil!

    # Import paths kept for reference; the classes and functions are used
    # directly below instead of going through import_helper.
    perceptron_path = 'cort.coreference.approaches.mention_ranking.RankingPerceptron'
    extractor_path = 'cort.coreference.approaches.mention_ranking.extract_substructures'
    clusterer_path = 'cort.coreference.clusterer.all_ante'

    print("Loading model ... (this takes a while)")
    priors, weights = pickle.load(open(model_abs, "rb"))
    print("Model loaded.")

    perceptron = RankingPerceptron(
        priors=priors,
        weights=weights,
        cost_scaling=0)

    extractor = instance_extractors.InstanceExtractor(
        extract_substructures,
        mention_features,
        pairwise_features,
        cost_functions.null_cost,
        perceptron.get_labels())

    logging.info("Reading in and preprocessing data.")
    p = pipeline.Pipeline(corenlp_path)
    testing_corpus = p.run_on_blob("corpus", text_blob)

    logging.info("Extracting system mentions.")
    for doc in testing_corpus:
        doc.system_mentions = mention_extractor.extract_system_mentions(doc)

    mention_entity_mapping, antecedent_mapping = experiments.predict(
        testing_corpus,
        extractor,
        perceptron,
        all_ante)

    testing_corpus.read_coref_decisions(mention_entity_mapping,
                                        antecedent_mapping)

    logging.info("Collecting output.")
    output_ls = []
    for doc in testing_corpus:
        output = doc.to_simple_output()
        print(" output: \n" + output)
        output_ls.append(output)

    logging.info("Done.")
    return output_ls
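# Hypothetical usage of call_cort, assuming the hard-coded model and
# CoreNLP paths above exist on the local machine; the example sentence is
# illustrative only:
if __name__ == "__main__":
    outputs = call_cort("Barack Obama met Angela Merkel. He greeted her.")
    for doc_output in outputs:
        print(doc_output)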
#     cost_functions.null_cost,
#     perceptron.get_labels()
# )

logging.info("Reading in data.")

training_corpus = corpora.Corpus.from_file(
    "training", codecs.open(args.input_filename, "r", "utf-8"))

logging.info("Extracting system mentions.")

dummy_counter_for_train = 0
for doc in training_corpus:
    if dummy_counter_for_train % 100 == 99:
        logging.info("We are extracting doc " + str(dummy_counter_for_train)
                     + ": " + doc.identifier)
    dummy_counter_for_train += 1
    doc.system_mentions = mention_extractor.extract_system_mentions(doc)

# logging.info("\tVerifying attributes.")
# for doc in training_corpus:
#     doc.antecedent_decisions = {}
#     print(doc, doc.antecedent_decisions)
#     for mention in doc.system_mentions:
#         if not "antecedent" in mention.attributes.keys():
#             mention.attributes["antecedent"] = None
#         if not "set_id" in mention.attributes.keys():
#             mention.attributes["set_id"] = None
#
# logging.info("\tExtracting instances and features.")
# substructures, arc_information = extractor.extract(training_corpus)
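# The hand-rolled counter above can be expressed with enumerate; a
# behavior-equivalent sketch of the same progress loop (logs every 100th
# document), shown commented out so it is not run twice:
#
#     for i, doc in enumerate(training_corpus):
#         if i % 100 == 99:
#             logging.info("We are extracting doc %d: %s", i, doc.identifier)
#         doc.system_mentions = mention_extractor.extract_system_mentions(doc)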