def read(self): """ :returns: nalaf.structures.data.Dataset """ xmls = [] if os.path.isdir(self.path): xmls = [ os.path.join(root, file) for root, _, files in os.walk(self.path) for file in files if file.startswith('medline') and file.endswith('xml') ] elif self.path.startswith('medline') and self.path.endswith('xml'): xmls = [self.path] dataset = Dataset() for xml in xmls: for child in ET.parse(xml).getroot(): pmid = next(child.iter('PMID')).text document = Document() article = next(child.iter('Article')) title = next(article.iter('ArticleTitle')).text document.parts['title'] = Part(title, is_abstract=False) try: abstract = next(article.iter('AbstractText')).text document.parts['abstract'] = Part(abstract) except StopIteration: pass dataset.documents[pmid] = document return dataset
def setUpClass(cls):
    cls.dataset = Dataset()

    doc1 = Document()
    cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1
    for s in TEST_SENTENCES_SINGLE_ROOT:
        part = Part(s)
        doc1.parts[s] = part

    doc2 = Document()
    cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2
    for s in TEST_SENTENCES_MULTI_ROOT:
        part = Part(s)
        doc2.parts[s] = part

    cls.nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(cls.nlp)
    cls.splitter = NLTKSplitter()
    cls.tokenizer = GenericTokenizer(
        lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)
    cls.parser.parse(cls.dataset)

    cls.computed_sentences = []
    for sentence in cls.dataset.sentences():
        dist, then = compute_shortest_paths(sentence)
        cls.computed_sentences.append((dist, then, sentence))
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    # TEXT = "123 45678"
    # POS  = "012345678"
    # ANN1 = " X       "
    # ANN2 = "     XXX "
    # PAR1 = "XXX      "
    # PAR2 = "    XXXXX"

    part1 = Part('123')
    part2 = Part('45678')
    ann1 = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=1, text='2', confidence=0)
    ann2 = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=1, text='567', confidence=1)
    ann1.subclass = 0
    ann2.subclass = 2
    part1.annotations.append(ann1)
    part2.annotations.append(ann2)
    cls.doc.parts['s1h1'] = part1
    cls.doc.parts['s2p1'] = part2

    doc2 = Document()
    doc3 = Document()
    doc3.parts['someid'] = Part('marmor stein und eisen')

    cls.dataset2 = Dataset()
    cls.dataset2.documents['newid'] = doc3
    cls.dataset2.documents['testid'] = doc2
def test_DocumentLevelRelationEvaluator_order_irrelevant(self):
    evaluator = DocumentLevelRelationEvaluator(rel_type=STUB_R_ID_1)

    dataset = Dataset()
    doc_1 = Document()
    part_1 = Part('_irrelevant_')
    dataset.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    part_1.relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_1, 0, "TOOL"),
                 Entity(STUB_E_ID_2, 0, "maynard")),
    ]

    # The predicted relation has the same entities, only in reversed order
    part_1.predicted_relations = [
        Relation(STUB_R_ID_1,
                 Entity(STUB_E_ID_2, 0, "maynard"),
                 Entity(STUB_E_ID_1, 0, "TOOL")),
    ]

    self._apply_pipeline(dataset)

    evals = evaluator.evaluate(dataset)
    evaluation = evals(STUB_R_ID_1)

    self.assertEqual(evaluation.tp, 1)
    self.assertEqual(evaluation.fn, 0)
    self.assertEqual(evaluation.fp, 0)

    computation = evals(STUB_R_ID_1).compute(strictness="exact")
    self.assertEqual(computation.f_measure, 1.0)
def setUp(self):
    part = Part('Make making made. Try tried tries.')
    part.sentences = [[Token('Make', 0), Token('making', 5), Token('made', 12)],
                      [Token('Try', 18), Token('tried', 22), Token('tries', 28)]]

    self.dataset = Dataset()
    self.dataset.documents['doc_1'] = Document()
    self.dataset.documents['doc_1'].parts['part_1'] = part

    self.generator = PorterStemFeatureGenerator()
def generate_abstracts(self, list_of_pmids):
    """
    Generates a dataset of documents from a list of PMIDs, using the REST API
    interface of tmTools.
    Source: "http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/tmTools/"
    :param list_of_pmids: list of PMID strings
    :return nalaf.structures.data.Dataset: dataset
    """
    url_tmvar = 'http://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/Mutation/{0}/JSON/'
    url_converter = 'http://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'

    # load cache.json if it exists
    if os.path.exists('cache.json'):
        with open('cache.json', 'r', encoding='utf-8') as f:
            tm_var = json.load(f)
    else:
        tm_var = {}

    for pmid in list_of_pmids:
        if pmid not in tm_var:  # only download pmids not already cached from tmTools
            req = requests.get(url_tmvar.format(pmid))
            try:
                tm_var[pmid] = req.json()
            except ValueError:
                pass

    # cache the tmVar annotations so we don't pull them every time
    with open('cache.json', 'w') as file:
        json.dump(tm_var, file, indent=4)

    dataset = Dataset()
    for doc_id in list_of_pmids:
        if doc_id in tm_var:
            doc = Document()
            text = tm_var[doc_id]['text']
            part = Part(text)
            denotations = tm_var[doc_id]['denotations']
            annotations = []
            for deno in denotations:
                ann = Entity(class_id=self.mut_class_id,
                             offset=int(deno['span']['begin']),
                             text=text[deno['span']['begin']:deno['span']['end']])
                annotations.append(ann)
            # note: should the annotations from tmVar go to
            # predicted_annotations or annotations?
            part.annotations = annotations
            doc.parts['abstract'] = part
            dataset.documents[doc_id] = doc
    return dataset
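# A minimal usage sketch (hypothetical: assumes this method lives on a reader
# class, here called `TmVarReader`, constructed with the mutation entity class
# id; adjust the names and the class id to the actual ones):
def _example_generate_abstracts_usage():
    reader = TmVarReader(mut_class_id='e_2')  # assumed class name and argument
    dataset = reader.generate_abstracts(['23441112', '23462544'])
    for doc_id, doc in dataset.documents.items():
        print(doc_id, len(doc.parts['abstract'].annotations), 'mutation annotations')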
def setUpClass(cls):
    # create a sample dataset1 (1) to test
    cls.dataset1 = Dataset()
    doc_1 = Document()
    text = '.... aaaa .... bbbb .... cccc .... dddd .... eeee .... ffff .... gggg .... hhhh .... jjjj'
    part_1 = Part(text)
    cls.dataset1.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    exact_1 = Entity(STUB_E_ID_1, 5, 'aaaa')
    exact_1.subclass = 1
    exact_2 = Entity(STUB_E_ID_1, 55, 'ffff')
    exact_2.subclass = 2
    exact_3 = Entity(STUB_E_ID_1, 75, 'hhhh')
    exact_3.subclass = 2

    overlap_1_1 = Entity(STUB_E_ID_1, 25, 'cccc')
    overlap_1_1.subclass = 1
    overlap_1_2 = Entity(STUB_E_ID_1, 26, 'cc')
    overlap_1_2.subclass = 1

    overlap_2_1 = Entity(STUB_E_ID_1, 32, '.. ddd')
    overlap_2_1.subclass = 2
    overlap_2_2 = Entity(STUB_E_ID_1, 36, 'ddd ...')
    overlap_2_2.subclass = 2

    overlap_3_1 = Entity(STUB_E_ID_1, 65, 'gggg')
    overlap_3_1.subclass = 1
    overlap_3_2 = Entity(STUB_E_ID_1, 62, '.. gggg ..')
    overlap_3_2.subclass = 2

    missing_1 = Entity('e2', 45, 'eeee')
    missing_1.subclass = 1
    missing_2 = Entity('e2', 84, 'jjjj')
    missing_2.subclass = 1

    spurious = Entity('e2', 15, 'bbbb')
    spurious.subclass = 1

    part_1.annotations = [exact_1, exact_2, exact_3,
                          overlap_1_1, overlap_2_1, overlap_3_1,
                          missing_1, missing_2]
    part_1.predicted_annotations = [exact_1, exact_2, exact_3,
                                    overlap_1_2, overlap_2_2, overlap_3_2,
                                    spurious]
def setUp(self):
    part = Part('Make making made. Try tried tries.')
    part.sentences = [[Token('Make', 0), Token('making', 5), Token('made', 12)],
                      [Token('Try', 18), Token('tried', 22), Token('tries', 28)]]

    self.dataset = Dataset()
    self.dataset.documents['doc_1'] = Document()
    self.dataset.documents['doc_1'].parts['part_1'] = part

    for token in self.dataset.tokens():
        token.features['a'] = 'a'
        token.features['b'] = 'b'
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    cls.part = Part('Here is a random sentence for the benefit of your mamma')
    cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=10,
                        text='random sentence', confidence=0)
    cls.part.annotations.append(cls.entity)
    cls.doc.parts['s1h1'] = cls.part

    # Apply through pipeline
    NLTKSplitter().split(cls.dataset)
    NLTK_TOKENIZER.tokenize(cls.dataset)
    nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(nlp)
    cls.parser.parse(cls.dataset)
    # cls.part.percolate_tokens_to_entities()

    cls.sentence = cls.part.sentences[0]
def setUp(self):
    part = Part('Word1 word2 word3. Word4 word5 word6.')
    part.sentences = [[Token('Word1', 0), Token('word2', 6), Token('word3', 12)],
                      [Token('Word4', 19), Token('word5', 25), Token('word6', 31)]]

    self.dataset = Dataset()
    self.dataset.documents['doc_1'] = Document()
    self.dataset.documents['doc_1'].parts['part_1'] = part

    self.simple_generator = SimpleFeatureGenerator()
    self.sentence_generator = SentenceMarkerFeatureGenerator()
def read(self): """ :returns: nalaf.structures.data.Dataset """ dataset = Dataset() with open(self.corpus_file, encoding='utf-8') as file: for row in file: columns = row.split("\t") docid = columns[0] typ = columns[1] start = columns[2] end = columns[3] entity_text = columns[7] class_id = None if typ == 'Mutation': class_id = self.mut_class_id elif typ == 'AminoacidResidue': class_id = self.residue_class_id if class_id: document = dataset.documents.get(docid, Document()) part = Part(entity_text) document.parts[typ + '|' + start + '|' + end] = part part.annotations.append( Entity(class_id, int(start), entity_text)) dataset.documents[docid] = document return dataset
def read_file(a_file, filename, dataset=None, whole_basename_as_docid=False):
    if dataset is None:
        dataset = Dataset()

    soup = BeautifulSoup(a_file, "html.parser")
    document = Document()

    for part in soup.find_all(id=re.compile('^s')):
        if re.match(r'^s[3-9]', part['id']):
            is_abstract = False
        else:
            is_abstract = True
        document.parts[part['id']] = Part(str(part.string), is_abstract=is_abstract)

    doc_id = os.path.basename(filename).replace('.plain.html', '').replace('.html', '').replace('.xml', '')
    if not whole_basename_as_docid and '-' in doc_id:
        doc_id = doc_id.split('-')[-1]

    dataset.documents[doc_id] = document
    return dataset
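# A hedged sketch of the HTML this reader expects, inferred from the selectors
# above: elements whose id starts with 's', where ids matching '^s[3-9]' are
# treated as body text (is_abstract=False) and the rest (e.g. 's1...', 's2...')
# as title/abstract. The tag names and exact ids here are assumptions:
#
#   <html><body>
#     <p id="s1h1">Some title.</p>
#     <p id="s2p1">Some abstract sentence.</p>
#     <p id="s3p1">Some body sentence.</p>
#   </body></html>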
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    part1 = Part('Sentence 1: e_1_yolo may be related to e_2_tool plus hey, e_2_coco. Sentence 2: e_1_nin. Sentence 3: e_2_musk. Sentence 4: nothing')

    entities = [
        # Sent 1
        Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=12, text='e_1_yolo', confidence=0),
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=39, text='e_2_tool', confidence=0),
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=58, text='e_2_coco', confidence=0),
        # Sent 2
        Entity(class_id=STUB_ENTITY_CLASS_ID_1, offset=80, text='e_1_nin', confidence=0),
        # Sent 3
        Entity(class_id=STUB_ENTITY_CLASS_ID_2, offset=101, text='e_2_musk', confidence=0),
        # Sent 4: no entities
    ]

    for e in entities:
        part1.annotations.append(e)

    cls.doc.parts['s1h1'] = part1

    cls.splitter = NLTKSplitter()
    cls.tokenizer = NLTK_TOKENIZER
    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)

    assert 4 == len(list(cls.dataset.sentences())), str(list(cls.dataset.sentences()))
def setUpClass(cls):
    cls.dataset = Dataset()
    doc = Document()
    part = Part('This is one sentence. This is another one.\n This is the third one; here continues.')
    cls.dataset.documents['doc_1'] = doc
    doc.parts['part_1'] = part
def _create_basic_dataset(self):
    dataset = Dataset()
    doc_1 = Document()
    part_1 = Part('_irrelevant_')
    dataset.documents['doc_1'] = doc_1
    doc_1.parts['part_1'] = part_1

    self._apply_pipeline(dataset)

    return (dataset, part_1)
def download(self, pmids):
    for pmid in pmids:
        if pmid in self.cache:
            xml = ET.fromstring(self.cache[pmid])
        else:
            req = requests.get(self.pubmed_url,
                               {'db': 'pubmed', 'retmode': 'xml', 'id': pmid})
            text = req.text
            xml = ET.fromstring(text)
            self.cache[pmid] = text

        doc = Document()

        if self.one_part:
            joined_text = '\n'.join(
                element.text
                for element in chain(xml.findall('.//ArticleTitle'),
                                     xml.findall('.//AbstractText')))
            doc.parts['title_and_abstract'] = Part(joined_text)
        else:
            # for now only include title and abstract
            title_elem = xml.find('.//ArticleTitle')
            if title_elem is not None:
                doc.parts['title'] = Part(title_elem.text)

            # findall returns a list (never None), so test for emptiness
            abstract_elems = xml.findall('.//AbstractText')
            if abstract_elems:
                abstract_texts = []
                for elem in abstract_elems:
                    if 'Label' in elem.attrib and elem.attrib['Label'] != 'UNLABELLED':
                        abstract_texts.append('{}: {}'.format(elem.attrib['Label'], elem.text))
                    else:
                        abstract_texts.append(elem.text)
                abstract_texts = filter(None, abstract_texts)
                doc.parts['abstract'] = Part(' '.join(abstract_texts))

        # yield the document, but only if we found anything
        if len(doc.parts) > 0:
            yield pmid, doc
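# A minimal usage sketch (hypothetical: assumes this generator lives on a
# downloader class, here called `DownloadArticle`, that holds the PMID cache
# and the PubMed efetch URL; adjust the names to the actual ones):
def _example_download_usage():
    downloader = DownloadArticle()  # assumed class name and constructor
    dataset = Dataset()
    for pmid, doc in downloader.download(['23441112', '23462544']):
        dataset.documents[pmid] = doc
    return dataset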
def test_main_verbs(self):
    for _, _, sentence in self.computed_sentences:
        print()
        print(sentence)
        verbs = set(Part.get_main_verbs(sentence,
                                        token_map=lambda t: t.features["lemma"]))
        print("\t", verbs)
def __process_file(filename):
    document = Document()
    with open(filename) as file:
        part_id = 1
        for part in re.split('\n\n', file.read()):
            if part.strip():
                document.parts['{}'.format(part_id)] = Part(part)
                part_id += 1
    return os.path.split(filename)[-1], document
def setUpClass(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()
    doc_id1 = Document()
    # 2 tokens in 1 sentence
    doc_id1.parts['p1'] = Part('insertionefsA dup23.23')
    doc_id1.parts['p1'].sentences = [[Token('insertionefsA', 0), Token('dup23.23', 14)]]
    cls.dataset.documents['doc_id1'] = doc_id1

    cls.feature = TmVarFeatureGenerator()
    cls.feature.generate(dataset=cls.dataset)
def read(self): """ :returns: nalaf.structures.data.Dataset """ part = Part(self.string) document = Document() dataset = Dataset() dataset.documents['doc_1'] = document document.parts['part_1'] = part return dataset
def read(self): """ :returns: nalaf.structures.data.Dataset """ dataset = Dataset() with open(self.corpus_file, encoding='utf-8') as file: documents = file.read().strip().split('\n\n') for document_text in documents: lines = document_text.strip().splitlines() first_line = re.search('(\d+)\|t\|(.*)', lines[0]) doc_id = first_line.group(1) tmvar_title = first_line.group(2) tmvar_abstract = re.search('(\d+)\|a\|(.*)', lines[1]).group(2) document = Document() title = Part(tmvar_title) abstract = Part(tmvar_abstract) document.parts['title'] = title document.parts['abstract'] = abstract for line in lines[2:]: _, start, end, _, _, _ = line.split('\t') start = int(start) end = int(end) if 0 <= start < end <= len(tmvar_title): part = title else: part = abstract start -= len(tmvar_title) + 1 end -= len(tmvar_title) + 1 part.annotations.append( Entity(self.mut_class_id, start, part.text[start:end])) dataset.documents[doc_id] = document return dataset
def setUpClass(cls): text1 = "Flowers in the Rain. Are absolutely marvellous. Though i would say this text is stupid. Cheers!" part1 = Part(text1) doc = Document() doc.parts['firstpart'] = part1 dataset = Dataset() dataset.documents['firstdocument'] = doc NLTKSplitter().split(dataset) # TmVarTokenizer().tokenize(dataset) cls.data = dataset cls.testpart = dataset.documents['firstdocument'].parts['firstpart']
def setUpClass(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()
    doc_id1 = Document()
    doc_id1.parts['t1'] = Part('This title blows your mind')
    text = str('This magic only exists in your dreams. To become reality, you have to work at it. '
               'Thr is only available with the residue threonine and a mutation, '
               'though things can change positions '
               'when adding some more replacements. Between me being sorry '
               'and you being an insertion.')
    doc_id1.parts['p1'] = Part(text.replace('\n', ''))
    cls.dataset.documents['doc_id1'] = doc_id1

    NLTKSplitter().split(cls.dataset)
    TmVarTokenizer().tokenize(cls.dataset)

    cls.feature = NLMentionFeatureGenerator(thr=4)
    cls.feature.generate(dataset=cls.dataset)
def read(self): """ read each .txt file in the directory, parse it and create and instance of Document form a dataset consisting of every document parsed and return it :returns structures.data.Dataset """ dataset = Dataset() with open(self.corpus_folder, encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: docid, title, abstract = row title = title.strip() abstract = abstract.strip() document = Document() if title: document.parts['title'] = Part(title) if abstract and abstract != 'null': document.parts['abstract'] = Part(abstract) dataset.documents[docid] = document return dataset
def read(self): """ read each .txt file in the directory, parse it and create and instance of Document form a dataset consisting of every document parsed and return it :returns structures.data.Dataset """ dataset = Dataset() with open(self.corpus_file, encoding='utf-8') as file: reader = csv.reader(file, delimiter='\t') for row in reader: document = Document() document.parts['abstract'] = Part(row[1]) dataset.documents[row[0]] = document return dataset
def setup_class(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()
    doc_id1 = Document()
    # 15 tokens in 2 sentences
    doc_id1.parts['p1'] = Part('This is some sample text. This is another, sample sentence with coma.')
    doc_id1.parts['p1'].sentences_ = ['This is some sample text.',
                                      'This is another, sample sentence with coma.']
    cls.dataset.documents['doc_id1'] = doc_id1

    cls.tokenizer = NLTK_TOKENIZER
    cls.tokenizer.tokenize(cls.dataset)
def setup_class(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()
    doc_id1 = Document()
    # 2 sentences
    doc_id1.parts['p1'] = Part('this is some sample text. it contains this c.2708_2711delTTAG mutation.')
    doc_id1.parts['p1'].sentences_ = ['this is some sample text.',
                                      'it contains this c.2708_2711delTTAG mutation.']
    cls.dataset.documents['doc_id1'] = doc_id1

    cls.tokenizer = TmVarTokenizer()
    cls.tokenizer.tokenize(cls.dataset)
def setup_class(cls):
    # create a sample dataset to test
    cls.dataset = Dataset()

    part = Part('some text c.A100G p.V100Q some text')
    part.sentences = [[Token('some', 0), Token('text', 5), Token('c', 10),
                       Token('.', 11), Token('A', 12), Token('100', 13),
                       Token('G', 16), Token('p', 18), Token('.', 19),
                       Token('V', 20), Token('100', 21), Token('Q', 24),
                       Token('some', 26), Token('text', 31)]]
    predicted_labels = ['O', 'O', 'B', 'I', 'I', 'I', 'E', 'A', 'I', 'I', 'I', 'E', 'O', 'O']
    for index, label in enumerate(predicted_labels):
        part.sentences[0][index].predicted_labels = [Label(label)]

    cls.dataset.documents['doc_1'] = Document()
    cls.dataset.documents['doc_1'].parts['p1'] = part

    part = Part('test edge case DNA A927B test')
    part.sentences = [[Token('test', 0), Token('edge', 5), Token('case', 10),
                       Token('DNA', 15), Token('A', 19), Token('927', 20),
                       Token('B', 23), Token('test', 25)]]
    predicted_labels = ['O', 'O', 'O', 'O', 'M', 'P', 'M', 'O']
    for index, label in enumerate(predicted_labels):
        part.sentences[0][index].predicted_labels = [Label(label)]

    cls.dataset.documents['doc_1'].parts['p2'] = part
def _get_test_data(self, entity_sentence, assumed_tokens_words=None):
    if assumed_tokens_words is None:
        assumed_tokens_words = entity_sentence.split(' ')

    # Create dataset
    dataset = StringReader(entity_sentence).read()
    part = next(dataset.parts())
    entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=0, text=entity_sentence)
    part.annotations.append(entity)

    # Apply through pipeline
    NLTKSplitter().split(dataset)
    NLTK_TOKENIZER.tokenize(dataset)
    self.parser.parse(dataset)

    # Rest
    sentences = part.sentences
    assert len(sentences) == 1
    sentence = sentences[0]

    assert len(assumed_tokens_words) == len(sentence)
    for (assumed_token_word, actual_token) in zip(assumed_tokens_words, sentence):
        assert assumed_token_word == actual_token.word

    part.compute_tokens_depth()
    roots = Part.get_sentence_roots(sentence)
    for r in roots:
        self._assert_depth_eq(r, 0)

    part.set_entities_head_tokens()

    return (dataset, sentence, entity, roots)
def generate(self, corpus, f_set, use_gold, use_pred):
    assert not (use_gold and use_pred), "No support for both"

    self.extract_abbreviation_synonyms(corpus, use_gold, use_pred)

    for docid, document in corpus.documents.items():
        for edge in document.edges():
            sentence = edge.get_combined_sentence()

            entities_in_sentences = edge.get_any_entities_in_sentences(predicted=use_pred)
            total_count = 0
            # We sort to have a deterministic order of creation of the features
            for e_class_id in sorted(entities_in_sentences):
                entities = entities_in_sentences[e_class_id]
                # TODO this is wrong for other entity types not appearing in the edge
                # TODO also, what if the same entity type appears in both ends of the same
                # edge, as in a protein-protein relation? --> just subtract the edge's counts
                individual_count = len(entities) - 1  # subtract 1, as one entity is already one of the edge's entities
                assert individual_count >= 0
                total_count += individual_count
                self.add_with_value(f_set, edge, 'f_counts_individual', individual_count, 'int', 'individual', e_class_id)

            self.add_with_value(f_set, edge, 'f_counts_total', total_count, 'int', 'total (all classes)')

            entities_between_entities = edge.get_any_entities_between_entities(predicted=use_pred)
            total_count = 0
            # We sort to have a deterministic order of creation of the features
            for e_class_id in sorted(entities_between_entities):
                entities = entities_between_entities[e_class_id]
                individual_count = len(entities)
                total_count += individual_count
                self.add_with_value(f_set, edge, 'f_counts_in_between_individual', individual_count, 'int', 'individual', e_class_id)

            self.add_with_value(f_set, edge, 'f_counts_in_between_total', total_count, 'int', 'total (all classes)')

            order = edge.entity1.class_id < edge.entity2.class_id
            if order:
                self.add(f_set, edge, 'f_order')

            for token in sentence:
                self.add(f_set, edge, 'f_bow',
                         masked_text(token, edge.same_part, use_gold, use_pred,
                                     token_map=lambda t: t.features['lemma'],
                                     token_is_number_fun=lambda _: "NUM"))
                self.add(f_set, edge, 'f_pos', token.features['coarsed_pos'])

            self.add_with_value(f_set, edge, 'f_tokens_count', len(sentence))

            # Remember: the edge's entities are sorted, i.e. e1.offset < e2.offset
            _e1_first_token_index = edge.entity1.tokens[0].features['tmp_id']
            _e2_last_token_index = edge.entity2.tokens[-1].features['tmp_id']
            assert _e1_first_token_index < _e2_last_token_index, \
                (docid, sentence, edge.entity1.text, edge.entity2.text,
                 _e1_first_token_index, _e2_last_token_index)

            self.add_with_value(f_set, edge, 'f_tokens_count_before', len(sentence[:_e1_first_token_index]))
            self.add_with_value(f_set, edge, 'f_tokens_count_after', len(sentence[(_e2_last_token_index + 1):]))

            if Part.is_negated(sentence):
                self.add(f_set, edge, "f_sentence_is_negated")

            verbs = set(Part.get_main_verbs(sentence, token_map=lambda t: t.features["lemma"]))
            if len(verbs) == 0:
                self.add(f_set, edge, "f_main_verbs", "NO_MAIN_VERB")
            else:
                for v in verbs:
                    self.add(f_set, edge, "f_main_verbs", v)

            counters = {}
            for part in document:
                for entity in (part.annotations if use_gold else part.predicted_annotations):
                    ent_type_counter = counters.get(entity.class_id, Counter())
                    ent_key = __class__.entity2key(entity)
                    ent_type_counter.update([ent_key])
                    counters[entity.class_id] = ent_type_counter

            e1_key = __class__.entity2key(edge.entity1)
            e1_count = counters[edge.entity1.class_id][e1_key]
            self.add_with_value(f_set, edge, 'f_entity1_count', e1_count)

            e2_key = __class__.entity2key(edge.entity2)
            e2_count = counters[edge.entity2.class_id][e2_key]
            self.add_with_value(f_set, edge, 'f_entity2_count', e2_count)

            together_counter = Counter()
            diff_sentences = {}
            for aux_edge in document.edges():
                if aux_edge.e1_sentence_id == aux_edge.e2_sentence_id:
                    together_key = __class__.edge2key(aux_edge)
                    sents = diff_sentences.get(together_key, [])
                    # count each co-occurring sentence only once per entity pair
                    if aux_edge.e1_sentence_id not in sents:
                        sents.append(aux_edge.e1_sentence_id)
                        diff_sentences[together_key] = sents
                        together_counter.update([together_key])

            together_key = __class__.edge2key(edge)
            together_count = together_counter[together_key]
            if together_count > 0:
                self.add_with_value(f_set, edge, 'f_diff_sents_together_count', together_count)