def read_text(pathname):
    """Read a text file and convert every line into a tokenized BioC sentence.

    The file is split on ``'\\n'``; each line becomes one ``bioc.BioCSentence``
    whose ``offset`` is the character position of the line in the file. Every
    whitespace-delimited token of the line is attached as an annotation with
    id ``a0``, ``a1``, ... (numbering restarts on each line).

    Args:
        pathname: path of the file to read. ``pathname.stem`` is stored in
            ``infons['filename']``, so a ``pathlib.Path``-like object is
            required.

    Returns:
        list of ``bioc.BioCSentence``, one per line of the file.
    """
    with open(pathname) as fp:
        text = fp.read()

    # Raw string: '\S' in a normal string literal is an invalid escape sequence.
    token_pattern = re.compile(r'\S+')

    sentences = []
    offset = 0
    for sent in text.split('\n'):
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = pathname.stem
        sentence.offset = offset
        sentence.text = sent
        sentences.append(sentence)

        i = 0
        for m in token_pattern.finditer(sent):
            if i == 0 and m.start() != 0:
                # The line starts with whitespace: add an empty placeholder
                # annotation anchored at the very beginning of the sentence.
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = ''
                ann.add_location(bioc.BioCLocation(offset, 0))
                sentence.add_annotation(ann)
                i += 1
            token = m.group()
            ann = bioc.BioCAnnotation()
            ann.id = f'a{i}'
            ann.text = token
            ann.add_location(bioc.BioCLocation(m.start() + offset, len(token)))
            sentence.add_annotation(ann)
            i += 1

        # +1 accounts for the newline removed by split('\n').
        offset += len(sent) + 1
    return sentences
def test_extend():
    """Check that ``_extend`` propagates the negation flag to the expected annotations."""

    def new_ann(text, offset, length):
        # Small factory for an annotation with a single location.
        ann = bioc.BioCAnnotation()
        ann.text = text
        ann.add_location(bioc.BioCLocation(offset, length))
        return ann

    doc = text_to_bioc(['findings: no pneumothorax.'], type='d/p/s')
    doc.passages[0].add_annotation(new_ann('pneumothorax', 13, 12))
    detector.__call__(doc)

    # fake ann
    doc.passages[0].add_annotation(new_ann('eumothor', 15, 8))
    doc.passages[0].add_annotation(new_ann('foo', 27, 3))

    _extend(doc, 'negation')
    annotations = doc.passages[0].annotations
    assert annotations[1].infons['negation'] == 'True'
    assert 'negation' not in annotations[2].infons

    # Give the first and third annotation the same CUI and extend again.
    for position in (0, 2):
        doc.passages[0].annotations[position].infons['CUI'] = 'xxx'
    _extend(doc, 'negation')
    assert 'negation' not in doc.passages[0].annotations[2].infons
def save_as_collection(list_of_pymedext_documents: List[Document]):
    """Convert a list of pymedext documents into a BioC collection.

    Each pymedext document becomes one ``bioc.BioCDocument``. Annotations whose
    ``source`` is ``"BioCPassage"`` become passages; annotations whose
    ``source`` is ``"BioCAnnotation"`` are attached to the most recently added
    passage. The collection ``source`` is taken from the first ``raw_text``
    annotation seen while it is still empty.

    :param list_of_pymedext_documents: a list of Document
    :returns: a bioc collection object
    :raises IndexError: if a ``BioCAnnotation`` entry appears before any
        ``BioCPassage`` entry in a document
    """
    # Local import: diagnostics previously went to stdout through print().
    import logging

    log = logging.getLogger(__name__)

    this_bioc_collection = bioc.BioCCollection()
    for this_pymedext_doc in list_of_pymedext_documents:
        this_bioc_doc = bioc.BioCDocument()
        for annot in this_pymedext_doc.annotations:
            log.debug("annotation source: %s", annot.source)
            if annot.type == "raw_text" and this_bioc_collection.source == '':
                this_bioc_collection.source = annot.source

            if annot.source == "BioCPassage":
                log.debug("passage ngram: %s", annot.ngram)
                log.debug("passage value: %s", annot.value)
                this_passage = bioc.BioCPassage()
                this_passage.text = annot.ngram
                this_passage.offset = annot.span[0]
                this_bioc_doc.add_passage(this_passage)
                # passageAttributes to add
            elif annot.source == "BioCAnnotation":
                this_annotation = bioc.BioCAnnotation()
                # NOTE: infons shares the dict object with annot.attributes.
                this_annotation.infons = annot.attributes
                this_annotation.id = annot.attributes["id"]
                this_annotation.text = annot.ngram
                span_start, span_end = annot.span[0], annot.span[1]
                this_annotation.add_location(
                    bioc.BioCLocation(span_start, span_end - span_start))
                # Annotations belong to the last passage emitted so far.
                this_bioc_doc.passages[-1].add_annotation(this_annotation)
        this_bioc_collection.add_document(this_bioc_doc)
    return this_bioc_collection
def convertKindredCorpusToBioCCollection(corpus):
    """Build a BioC collection that mirrors a kindred corpus.

    Every kindred document becomes a BioC document holding a single passage
    (offset 0) with the document text. Entities are stored as annotations on
    that passage and relations as BioC relations whose nodes reference the
    annotation ids.

    Args:
        corpus: a ``kindred.Corpus``.

    Returns:
        a ``bioc.BioCCollection``.
    """
    assert isinstance(corpus,kindred.Corpus)

    bioc_collection = bioc.BioCCollection()
    for kindred_doc in corpus.documents:
        assert isinstance(kindred_doc,kindred.Document)

        bioc_doc = bioc.BioCDocument()
        bioc_collection.add_document(bioc_doc)

        if 'id' in kindred_doc.metadata:
            bioc_doc.id = kindred_doc.metadata['id']
        bioc_doc.infons = kindred_doc.metadata

        text_passage = bioc.BioCPassage()
        text_passage.text = kindred_doc.text
        text_passage.offset = 0
        bioc_doc.add_passage(text_passage)

        used_ids = set()
        bioc_id_by_entity_id = {}
        for entity in kindred_doc.entities:
            assert isinstance(entity,kindred.Entity)

            ann = bioc.BioCAnnotation()
            ann.text = entity.text
            ann.infons = {'type':entity.entityType}
            ann.infons.update(entity.metadata)

            # Prefer the id from the original source when one is recorded.
            ann.id = str(entity.entityID) if entity.sourceEntityID is None else entity.sourceEntityID

            assert ann.id not in used_ids, "Multiple entities with the same ID (%s) found" % ann.id
            used_ids.add(ann.id)
            bioc_id_by_entity_id[entity.entityID] = ann.id

            ann.locations.extend(
                bioc.BioCLocation(offset=begin, length=(stop-begin))
                for begin,stop in entity.position)
            text_passage.annotations.append(ann)

        for relation in kindred_doc.relations:
            assert isinstance(relation,kindred.Relation)

            bioc_rel = bioc.BioCRelation()
            bioc_rel.infons = {'type':relation.relationType}

            members = relation.entities
            roles = relation.argNames
            if roles is None:
                # Unnamed arguments get positional role names arg0, arg1, ...
                roles = [ "arg%d" % i for i,_ in enumerate(members) ]

            for role,member in zip(roles,members):
                bioc_rel.nodes.append(
                    bioc.BioCNode(role=role, refid=bioc_id_by_entity_id[member.entityID]))
            text_passage.relations.append(bioc_rel)

    return bioc_collection
def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):
    """Tag every passage of a BioC XML file and write the annotated collection.

    ``bioTag`` is run on the text of each passage; every returned element is
    stored as a ``Phenotype`` annotation on that passage.

    Args:
        infile: path of the BioC file to read (UTF-8).
        outfile: path of the BioC file to write (UTF-8).
        biotag_dic: dictionary resources forwarded to ``bioTag``.
        nn_model: model forwarded to ``bioTag``.
        para_set: mapping providing ``'onlyLongest'``, ``'abbrRecog'`` and
            ``'ML_Threshold'``.

    Each ``bioTag`` result element is read as
    ``(start, end, identifier, score)`` by index.
    """
    # Parse the input completely before opening the output, so a malformed
    # input no longer leaves a truncated output file behind.
    with open(infile, 'r', encoding='utf-8') as fin:
        collection = bioc.load(fin)

    for document in collection.documents:
        # Annotation ids are numbered per document so they stay unique when a
        # document has several passages.
        mention_num = 0
        for passage in document.passages:
            # bioTag positions are relative to passage.text, while BioC
            # locations are document offsets: shift by the passage offset.
            passage_offset = passage.offset or 0
            tag_result = bioTag(passage.text,
                                biotag_dic,
                                nn_model,
                                onlyLongest=para_set['onlyLongest'],
                                abbrRecog=para_set['abbrRecog'],
                                Threshold=para_set['ML_Threshold'])
            for ele in tag_result:
                bioc_note = bioc.BioCAnnotation()
                bioc_note.id = str(mention_num)
                mention_num += 1
                bioc_note.infons['identifier'] = ele[2]
                bioc_note.infons['type'] = "Phenotype"
                bioc_note.infons['score'] = ele[3]
                start = int(ele[0])
                last = int(ele[1])
                loc = bioc.BioCLocation(offset=str(passage_offset + start),
                                        length=str(last - start))
                bioc_note.locations.append(loc)
                bioc_note.text = passage.text[start:last]
                passage.annotations.append(bioc_note)

    with open(outfile, 'w', encoding='utf8') as fout:
        bioc.dump(collection, fout, pretty_print=True)
def dump_results(doc_name, entities, opt):
    """Write the entities of one document to ``<opt.predict>/<doc_name>.bioc.xml``.

    A collection with a single document and a single passage (offset 0) is
    created. Each entity becomes an annotation numbered from 1 that covers the
    entity's first span and carries its type plus the first normalization id
    and name (``'N/A'`` when the entity has no normalization ids).
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = doc_name

    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    for number, entity in enumerate(entities, start=1):
        ann = bioc.BioCAnnotation()
        passage.add_annotation(ann)
        ann.id = str(number)
        ann.infons['type'] = entity.type

        # Only the first span of the entity is exported.
        first_span = entity.spans[0]
        ann.add_location(
            bioc.BioCLocation(first_span[0], first_span[1] - first_span[0]))
        ann.text = entity.name

        normalized = len(entity.norm_ids) > 0
        ann.infons['UMLS code'] = entity.norm_ids[0] if normalized else 'N/A'
        ann.infons['UMLS term'] = entity.norm_names[0] if normalized else 'N/A'

    target = os.path.join(opt.predict, doc_name + ".bioc.xml")
    with codecs.open(target, 'w', 'UTF-8') as fp:
        bioc.dump(collection, fp)
def annotation(this, json_note):
    """Build a ``bioc.BioCAnnotation`` from its dict form.

    ``json_note`` must provide the keys ``'id'``, ``'infons'``, ``'text'`` and
    ``'locations'``; each location entry is converted with ``this.location``.
    """
    result = bioc.BioCAnnotation()
    result.id = json_note['id']
    result.infons = json_note['infons']
    result.text = json_note['text']

    converted_locations = []
    for raw_location in json_note['locations']:
        converted_locations.append(this.location(raw_location))
    result.locations = converted_locations
    return result
def pubtator2bioc_ann(ann: PubTatorAnn) -> bioc.BioCAnnotation:
    """Convert one PubTator annotation into a BioC annotation.

    The BioC id is ``T<start>``; the PubTator type and id are kept in the
    infons ``type`` and ``concept_id``.
    """
    converted = bioc.BioCAnnotation()
    converted.id = f'T{ann.start}'
    converted.infons['type'] = ann.type
    converted.infons['concept_id'] = ann.id
    span = bioc.BioCLocation(ann.start, ann.end - ann.start)
    converted.add_location(span)
    converted.text = ann.text
    return converted
def run_metamap_col(collection, mm, cuis=None):
    """
    Get CUIs from metamap.

    All sentences of the collection are sent to MetaMap in a single batch,
    keyed by ``"<document id with '.' replaced by '-'>-<sentence offset>"``.
    Every returned concept whose ``pos_info`` starts with ``start/length`` is
    added as an annotation to the passage that owns the sentence.

    Failures are logged and never propagated: an error on one concept skips
    that concept, any other error stops processing and the collection is
    returned as it is at that point.

    Args:
        collection(BioCCollection): collection to annotate in place
        mm(MetaMap): MetaMap instance
        cuis: optional container of CUIs; when given, concepts without a CUI
            or with a CUI not in it are skipped

    Returns:
        BioCCollection
    """
    try:
        annIndex = itertools.count()
        sentence_map = collections.OrderedDict()
        for document in collection.documents:
            for passage in document.passages:
                for sentence in passage.sentences:
                    key = '{}-{}'.format(document.id.replace('.', '-'), sentence.offset)
                    sentence_map[key] = (passage, sentence)

        ids = list(sentence_map)
        sents = [remove_newline(sentence_map[k][1].text) for k in ids]

        concepts, error = mm.extract_concepts(sents, ids)
        if error is None:
            for concept in concepts:
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None:
                        # if no CUI is returned for this concept - skip it
                        concept_cui = getattr(concept, 'cui', None)
                        if concept_cui not in cuis:
                            continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage, sentence = sentence_map[concept_index]
                        # The reported start is treated as 1-based.
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        # Drop the first and last character of the semtypes string.
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit still terminate the run.
                except Exception:
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", collection.source)
    return collection
def convert_dg(dependency_graph, text: str, offset: int, ann_index: int = 0, rel_index: int = 0) \
        -> Tuple[List[bioc.BioCAnnotation], List[bioc.BioCRelation]]:
    """
    Convert dependency graph to annotations and relations

    Each node of the graph is located in ``text`` (searching left to right)
    and becomes an annotation ``T<n>``; each non-root node whose own token and
    head token were both located becomes a relation ``R<n>`` with a
    ``dependant`` and a ``governor`` node.

    Args:
        dependency_graph: iterable of nodes exposing ``index``, ``form``,
            ``pos``, ``head``, ``deprel`` and ``extra``
        text: sentence text the node forms are searched in
        offset: offset added to every token position
        ann_index: number used for the first annotation id
        rel_index: number used for the first relation id

    Returns:
        the created annotations and relations

    Raises:
        KeyError: if a root node (``head == 0``) could not be located in ``text``
    """
    annotations = []
    relations = []
    annotation_id_map = {}
    # annotation_id_map stores the global annotation number; remember where
    # numbering started so it can be turned back into a list position.
    first_ann_index = ann_index
    start = 0
    for node in dependency_graph:
        if node.index in annotation_id_map:
            continue
        node_form = node.form
        index = text.find(node_form, start)
        if index == -1:
            # Retry with the adapted form of the token.
            node_form = adapt_value(node.form)
            index = text.find(node_form, start)
            if index == -1:
                logging.debug(
                    'Cannot convert parse tree to dependency graph at %d\n%d\n%s',
                    start, offset, str(dependency_graph))
                continue

        ann = bioc.BioCAnnotation()
        ann.id = 'T{}'.format(ann_index)
        ann.text = node_form
        ann.infons['tag'] = node.pos

        start = index
        ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
        annotations.append(ann)
        annotation_id_map[node.index] = ann_index
        ann_index += 1
        # Continue searching after the token just matched.
        start += len(node_form)

    for node in dependency_graph:
        if node.head == 0:
            # Index the local list relative to the first id; the raw id is
            # only a valid list position when numbering starts at 0.
            ann = annotations[annotation_id_map[node.index] - first_ann_index]
            ann.infons['ROOT'] = True
            continue
        relation = bioc.BioCRelation()
        relation.id = 'R{}'.format(rel_index)
        relation.infons['dependency'] = node.deprel
        if node.extra:
            relation.infons['extra'] = node.extra
        if node.index in annotation_id_map and node.head in annotation_id_map:
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.index]), 'dependant'))
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.head]), 'governor'))
            relations.append(relation)
            rel_index += 1

    return annotations, relations
def test_neg_regex():
    """The negation regex and the detector must both flag 'no pneumothorax'."""
    report = 'findings: no pneumothorax.'
    assert is_neg_regex(report)

    doc = text_to_bioc([report], type='d/p/s')

    finding = bioc.BioCAnnotation()
    finding.text = 'pneumothorax'
    finding.add_location(bioc.BioCLocation(13, 12))
    doc.passages[0].add_annotation(finding)

    detector.__call__(doc)

    flagged = doc.passages[0].annotations[0]
    assert flagged.infons['negation'] == 'True'
def run_metamap(document, mm, cuis=None):
    """
    Get CUIs from metamap.

    All sentences of the document are sent to MetaMap in a single batch, keyed
    by their offset. Every returned concept whose ``pos_info`` starts with
    ``start/length`` is added as an annotation to the passage that owns the
    sentence.

    Failures are logged and never propagated: an error on one concept skips
    that concept, any other error stops processing and the document is
    returned as it is at that point.

    Args:
        document(BioCDocument): document to annotate in place
        mm(MetaMap): MetaMap instance
        cuis: optional container of CUIs; when given, concepts without a CUI
            or with a CUI not in it are skipped

    Returns:
        BioCDocument
    """
    try:
        annIndex = itertools.count()
        sentence_map = collections.OrderedDict()
        for passage in document.passages:
            for sentence in passage.sentences:
                sentence_map[str(sentence.offset)] = (passage, sentence)

        ids = list(sentence_map)
        sents = [remove_newline(sentence_map[k][1].text) for k in ids]

        concepts, error = mm.extract_concepts(sents, ids)
        # Progress goes through logging instead of stdout.
        logging.debug('Done')
        if error is None:
            for concept in concepts:
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None:
                        # Same rule as run_metamap_col: a concept without a
                        # CUI is skipped quietly instead of raising.
                        concept_cui = getattr(concept, 'cui', None)
                        if concept_cui not in cuis:
                            continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage, sentence = sentence_map[concept_index]
                        # The reported start is treated as 1-based.
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        # Drop the first and last character of the semtypes string.
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit still terminate the run.
                except Exception:
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", document.id)
    return document
def strip_annotation_markers(
        text: str,
        annotations_map: Dict[str, str]) -> Tuple[str, List[bioc.BioCAnnotation]]:
    """
    Given a set of annotations, remove any which are found in the current text and return
    the new string as well as the positions of the annotations in the transformed string

    Only the first occurrence of each marker is removed. The returned
    annotations are zero-length ``citation`` annotations ordered by their
    position in the text.

    Args:
        text: text that may contain annotation markers
        annotations_map: maps each marker to its citation text

    Returns:
        the text without the matched markers and the list of annotations
    """
    matched_annotations: List[Tuple[int, int, str]] = []

    for ann_marker in annotations_map:
        escaped_marker = re.escape(ann_marker)
        # (pattern, number of trailing characters of the match to keep)
        patterns = [
            (r'[^\S\t]?[\(\[\{]' + escaped_marker + r'[\)\]\}]', 0),  # citation in brackets
            (r'[^\S\t]' + escaped_marker + r'\.', 1),  # citation at end of sentence, remove extra whitespace
            (r'[^\S\t]' + escaped_marker + r'[^\S\t]', 1),  # citation surrounded by whitespace
            (escaped_marker, 0),  # citation by itself
        ]
        for pattern, end_offset in patterns:
            match = re.search(pattern, text)
            if match:
                matched_annotations.append(
                    (match.start(), match.end() - end_offset, ann_marker))
                break

    # The running offset below assumes matches are visited left to right, but
    # they were collected in dictionary order: sort them by position first.
    # NOTE: overlapping matches (one marker contained in another) are not handled.
    matched_annotations.sort(key=lambda found: (found[0], found[1]))

    transformed_annotations: List[bioc.BioCAnnotation] = []
    transformed_text = text
    offset = 0  # number of characters removed so far

    for start, end, marker in matched_annotations:
        ann = bioc.BioCAnnotation()
        ann.id = marker
        ann.infons['citation_text'] = annotations_map[marker]
        ann.infons['type'] = 'citation'
        transformed_text = transformed_text[:start - offset] + transformed_text[end - offset:]
        # since the token place-holder is removed, must be start - 1 (and previous offset) for the new position;
        # clamp at 0 so a marker at the very beginning does not get offset -1
        ann.add_location(bioc.BioCLocation(max(start - offset - 1, 0), 0))
        offset += end - start
        transformed_annotations.append(ann)

    return transformed_text, transformed_annotations
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    """Turn per-token label predictions into entities and dump them as BioC XML.

    Labels starting with ``S`` or ``B`` open a new entity whose type is
    ``label[2:]``; labels starting with ``M`` or ``E`` extend the most recent
    entity when ``checkWrongState`` accepts the label sequence seen so far in
    the sentence. The result is written to ``<file_name>.bioc.xml``.

    Args:
        doc_token: table of tokens with the columns ``sent_idx``, ``start``,
            ``end`` and ``text`` (indexed like a pandas DataFrame).
        predict_results: per sentence, a sequence whose first item is the list
            of predicted labels.
        file_name: document id and prefix of the output file name.

    Raises:
        AssertionError: if a sentence has a different number of tokens than
            predicted labels.
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    entity_id = 1
    for idx, sentence_result in enumerate(predict_results):
        labels = sentence_result[0]
        sent_length = len(labels)
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]
        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)

        labelSequence = []
        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = labels[idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1
            elif label[0] == 'M' or label[0] == 'E':
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]
                    location = anno_entity.locations[0]
                    # Pad with one space per character between the end of the
                    # entity so far and this token (nothing if not positive).
                    whitespacetoAdd = token['start'] - location.end
                    anno_entity.text += " " * int(whitespacetoAdd)
                    anno_entity.text += token['text']
                    location.length = token['end'] - location.offset

    # "with" guarantees the file is closed even when bioc.dump raises.
    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
def add_match(self, impression, sentence, ann_index, phrase, observation, start, end):
    """Add the match data and metadata to the impression object in place.

    ``start`` and ``end`` are positions inside ``sentence.text``; the stored
    location is shifted by ``sentence.offset``.
    """
    span_length = end - start

    match_ann = bioc.BioCAnnotation()
    match_ann.id = ann_index
    for key, value in (('CUI', None),
                       ('semtype', None),
                       ('term', phrase),
                       (OBSERVATION, observation),
                       ('annotator', 'Phrase')):
        match_ann.infons[key] = value

    match_ann.add_location(
        bioc.BioCLocation(sentence.offset + start, span_length))
    match_ann.text = sentence.text[start:start + span_length]

    impression.annotations.append(match_ann)
def tokenize_text(text, id):
    """Split ``text`` into BioC sentences and annotate their tokens.

    Sentences come from the ``nlp`` pipeline. Every token is passed through
    ``split_punct`` and each resulting piece becomes an annotation ``a0``,
    ``a1``, ... (numbering restarts in each sentence). ``id`` is stored in
    each sentence under ``infons['filename']``.

    Returns:
        list of ``bioc.BioCSentence``.
    """
    result = []
    for span in nlp(text).sents:
        bioc_sentence = bioc.BioCSentence()
        bioc_sentence.infons['filename'] = id
        bioc_sentence.offset = span.start_char
        bioc_sentence.text = text[span.start_char:span.end_char]
        result.append(bioc_sentence)

        # Flatten "token -> pieces" lazily so pieces keep their original order.
        pieces = (piece
                  for token in span
                  for piece in split_punct(token.text, token.idx))
        for number, (piece_text, begin, stop) in enumerate(pieces):
            piece_ann = bioc.BioCAnnotation()
            piece_ann.id = f'a{number}'
            piece_ann.text = piece_text
            piece_ann.add_location(bioc.BioCLocation(begin, stop - begin))
            bioc_sentence.add_annotation(piece_ann)
    return result
def test_clean_sentences():
    """CleanUp must drop the sentences, keep the annotations and sort them on request."""
    cleanup = CleanUp()
    doc = text_to_bioc(['No pneumothorax.', 'No pneumothorax.'], type='d/p/s')

    # Add ten annotations in descending offset order: 10, 9, ..., 1.
    first_passage = doc.passages[0]
    for position in reversed(range(1, 11)):
        marker = bioc.BioCAnnotation()
        marker.add_location(bioc.BioCLocation(position, 1))
        first_passage.add_annotation(marker)

    assert len(doc.passages[0].sentences) == 2

    doc = cleanup.__call__(doc)
    assert len(doc.passages[0].sentences) == 0
    assert len(doc.passages[0].annotations) == 10
    # Without sorting, the insertion order is kept.
    for rank in range(10):
        assert doc.passages[0].annotations[rank].total_span.offset == 10 - rank

    doc = cleanup.__call__(doc, sort_anns=True)
    # With sorting, offsets are ascending.
    for rank in range(10):
        assert doc.passages[0].annotations[rank].total_span.offset == rank + 1
def create_bioc_document_from_document_json(document):
    """Build a BioC document from a document dict.

    The document gets a single passage (offset 0) with the full text. Only
    denotations with ``userId == 0`` are exported as annotations, and only
    relations whose subject and object both have ``userId == 0`` are exported
    as relations.

    Args:
        document: dict with the keys ``sourceid``, ``text``, ``denotations``
            and ``relations``.

    Returns:
        a ``bioc.BioCDocument``.
    """
    full_text = document['text']

    result = bioc.BioCDocument()
    result.id = document['sourceid']

    passage = bioc.BioCPassage()
    passage.text = full_text
    passage.offset = 0

    # Remember the owner of every denotation; relations are filtered with it.
    owner_by_denotation = {}
    for denotation in document['denotations']:
        owner_by_denotation[denotation['id']] = denotation['userId']
        if denotation['userId'] == 0:
            begin = denotation['span']['begin']
            stop = denotation['span']['end']

            ann = bioc.BioCAnnotation()
            ann.id = denotation['id']
            ann.locations.append(bioc.BioCLocation(begin, stop - begin))
            ann.text = full_text[begin:stop]
            ann.infons = denotation['obj']
            passage.add_annotation(ann)

    for relation in document['relations']:
        subject_is_ours = owner_by_denotation[relation['subj']] == 0
        object_is_ours = owner_by_denotation[relation['obj']] == 0
        if subject_is_ours and object_is_ours:
            bioc_relation = bioc.BioCRelation()
            bioc_relation.id = relation['id']
            # Nodes carry only the referenced id; the role stays empty.
            bioc_relation.add_node(bioc.BioCNode(relation['subj'], ''))
            bioc_relation.add_node(bioc.BioCNode(relation['obj'], ''))
            bioc_relation.infons = relation['pred']
            passage.add_relation(bioc_relation)

    result.add_passage(passage)
    return result
def __call__(self, document, *args, **kwargs):
    """Annotate every regex match of the known mention phrases in ``document``.

    For each sentence, each phrase of each observation is compiled with
    ``self.compile_pattern`` and searched in the sentence text. Matches for
    which ``self.overlaps_with_unmention`` returns true are ignored; all
    others are appended to the passage as ``RegEx`` annotations with
    document-wide increasing ids.

    Returns:
        the same ``document``, modified in place.
    """
    id_counter = itertools.count()
    for passage in document.passages:
        for sentence in passage.sentences:
            for observation, phrases in self.observation2mention_phrases.items():
                for phrase in phrases:
                    regex = self.compile_pattern(phrase)
                    for hit in regex.finditer(sentence.text):
                        begin, stop = hit.span(0)
                        if not self.overlaps_with_unmention(sentence, observation, begin, stop):
                            mention = bioc.BioCAnnotation()
                            mention.id = str(next(id_counter))
                            mention.infons['term'] = phrase
                            mention.infons["observation"] = observation
                            mention.infons['annotator'] = 'RegEx'
                            mention.infons['vocab'] = self.vocab_name
                            # Match positions are sentence-relative.
                            mention.add_location(
                                bioc.BioCLocation(sentence.offset + begin, stop - begin))
                            mention.text = sentence.text[begin:stop]
                            passage.annotations.append(mention)
    return document
if start1 > end2: pass elif start2 > end1: pass else: overlapping = True break if not overlapping: nonoverlapping.append ((start1,end1)) for start,end in nonoverlapping: for annotationType,conceptids in candidates[(start,end)].items(): conceptid = conceptids = ";".join(sorted(list(set(conceptids)))) a = bioc.BioCAnnotation() a.text = passage.text[start:end] a.infons = {'type':annotationType, 'conceptid': conceptid} a.id = 'T%d' % currentID currentID += 1 if end <= start: continue biocLoc = bioc.BioCLocation(offset=passage.offset+start, length=(end-start)) a.locations.append(biocLoc) passage.annotations.append(a) writer.write_document(doc) print ('Done!')
def predict(opt, data):
    """Run NER and relation extraction on every file in ``opt.input``.

    The four checkpoints (``ner_model.pkl``, ``ner_wordseq.pkl``,
    ``re_model.pkl``, ``re_wordseq.pkl``) are loaded from ``opt.output``. For
    each non-hidden regular file in ``opt.input`` the predicted entities and
    relations are written to ``<opt.predict>/<file name>.bioc.xml``.
    """

    def restore(module, checkpoint_name):
        # Load a checkpoint into ``module``: on CPU when requested, otherwise
        # remapped from the GPU it was saved on to the GPU in use.
        if opt.test_in_cpu:
            location = 'cpu'
        else:
            cuda_src = 'cuda:{}'.format(opt.old_gpu)
            cuda_dst = 'cuda:{}'.format(opt.gpu)
            location = {cuda_src:cuda_dst}
        module.load_state_dict(
            torch.load(os.path.join(opt.output, checkpoint_name), map_location=location))
        return module

    seq_model = restore(SeqModel(data), 'ner_model.pkl')
    seq_wordseq = restore(WordSequence(data, False, True, True, True), 'ner_wordseq.pkl')
    classify_model = restore(ClassifyModel(data), 're_model.pkl')
    classify_wordseq = restore(WordSequence(data, True, False, True, False), 're_wordseq.pkl')

    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input,f)) and f[0]!='.']

    for doc_name in input_files:
        started = time.time()

        doc_token = processOneFile(join(opt.input, doc_name))
        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                           data.feature_alphabets, data.label_alphabet,
                                           data.number_normalized, data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)
        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        # One collection / document / passage per input file.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            entity_ann = bioc.BioCAnnotation()
            passage.add_annotation(entity_ann)
            entity_ann.id = entity.id
            entity_ann.infons['type'] = entity.type
            entity_ann.add_location(bioc.BioCLocation(entity.start, entity.getlength()))
            entity_ann.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)
        relation_alphabet = data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']]
        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other, relation_alphabet)

        for relation in relations:
            relation_ann = bioc.BioCRelation()
            passage.add_relation(relation_ann)
            relation_ann.id = relation.id
            relation_ann.infons['type'] = relation.type
            relation_ann.add_node(bioc.BioCNode(relation.node1.id, 'argument 1'))
            relation_ann.add_node(bioc.BioCNode(relation.node2.id, 'argument 2'))

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        finished = time.time()
        logging.info("process %s complete with %.2fs" % (doc_name, finished-started))

    logging.info("test finished")
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    """Write gold entities and stored relation predictions as BioC XML files.

    The relation vocabulary is loaded from ``opt.pretrain`` and the stored
    prediction results from ``opt.output`` (``opt`` is a module-level name).
    For every document one file ``<result_dumpdir>/<doc_name>.bioc.xml`` is
    written; it holds the document's entities and every stored relation of
    that document whose type is known and passes
    ``my_utils.relationConstraint1``.

    Args:
        test_token: not used by this function.
        test_entity: per document, a table of entities with the columns
            ``id``, ``type``, ``start``, ``end`` and ``text``.
        test_relation: only its length is used (number of documents).
        test_name: per document, the document name.
        result_dumpdir: directory the BioC files are written to.
    """
    # NOTE: pickle must only be used on trusted files; loading a pickle can
    # execute arbitrary code.
    logging.info("loading ... vocab")
    # "with" closes the file handles that were previously leaked.
    with open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb') as vocab_file:
        relation_vocab = pickle.load(vocab_file)

    logging.info("loading ... result")
    with open(os.path.join(opt.output, 'results.pkl'), "rb") as result_file:
        results = pickle.load(result_file)

    for i in tqdm(range(len(test_relation))):
        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:
            if doc_name != result['doc_name']:
                continue

            former = doc_entity[(doc_entity['id'] == result['former_id'])].iloc[0]
            latter = doc_entity[(doc_entity['id'] == result['latter_id'])].iloc[0]

            relation_type = relation_vocab.lookup_id2str(result['type'])
            if relation_type == '<unk>':
                continue
            # "== False" is kept on purpose: only an explicit False rejects.
            if my_utils.relationConstraint1(relation_type, former['type'],
                                            latter['type']) == False:  # noqa: E712
                continue

            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = str(relation_id)
            relation_id += 1
            bioc_relation.infons['type'] = relation_type
            bioc_relation.add_node(bioc.BioCNode(former['id'], 'annotation 1'))
            bioc_relation.add_node(bioc.BioCNode(latter['id'], 'annotation 2'))

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)
def evaluate_via_bioc(test_docs, crf, extractor, prediction_dir, made_base_dir=None):
    """Predict labels for every document, write them as BioC XML and optionally score them.

    ``prediction_dir`` is created if needed and emptied first. For each test
    document the CRF predictions are merged into spans (a non-``O`` label
    followed by any number of ``I-<same label>`` tokens) and written to
    ``<prediction_dir>/<filename>.bioc.xml``.

    Args:
        test_docs: documents exposing ``filename``, ``text`` and
            ``tokenized_doc.sentences`` (sentences are sequences of
            ``(start, end)`` offset pairs).
        crf: model whose ``predict`` takes a list of sentence features.
        extractor: object providing ``sent2features``.
        prediction_dir: directory the prediction files are written to.
        made_base_dir: when given, ``get_f_scores`` is invoked with the
            reference files under ``<made_base_dir>/annotations`` and the
            texts under ``<made_base_dir>/corpus``.
    """
    print('Total documents for evaluation : {}'.format(len(test_docs)))

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(prediction_dir, exist_ok=True)

    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1

    print('Existing files removed : {}'.format(existing_files_removed))

    prediction_documents_written = 0
    reference_filenames = []

    for test_doc in test_docs:
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)

        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            # gather tokens in a sentence
            sentence_tokens = [test_doc.text[pair[0]:pair[1]] for pair in sentence]
            if len(sentence_tokens) == 0:
                continue

            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]

            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(
                    len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and sentence_pred[
                            token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()
                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace('\n', '\\n').replace(
                        '\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)
                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                token_idx += 1

        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))

        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            prediction_documents_written += 1

    print('Total prediction documents written : {}'.format(
        prediction_documents_written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
def test(data, opt, predict_dir):
    """Run NER and relation extraction on the test set and dump BioC XML files.

    Preprocessed test data is read from ``data.test_dir``; the NER components
    are restored from ``opt.ner_dir`` and the relation-extraction components
    from ``opt.re_dir``. For every test document one file
    ``<predict_dir>/<doc_name>.bioc.xml`` is written with its entities and
    predicted relations.

    Args:
        data: shared data/configuration object; it is reloaded from
            ``opt.data_file`` and modified in place (``MAX_SENTENCE_LENGTH``,
            ``raw_texts``, ``raw_Ids``).
        opt: options providing ``data_file``, ``ner_dir``, ``re_dir``,
            ``hidden_num`` and ``use_gold_ner``.
        predict_dir: directory the BioC files are written to.
    """
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file
    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()
    data.fix_alphabet()

    # --- NER components, restored from opt.ner_dir ---
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))

    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            # The first hidden layer takes the concatenated word, char,
            # [Cap] and [POS] embedding sizes as its input size.
            input_size = data.word_emb_dim+data.HP_char_hidden_dim+data.feature_emb_dims[data.feature_name2id['[Cap]']]+ \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    # --- relation-extraction components, restored from opt.re_dir ---
    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))

    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            # The first hidden layer takes the word and [POS] embedding sizes
            # plus twice the [POSITION] embedding size as its input size.
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']]+\
                         2*data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            # Use the gold entities from the preprocessed data.
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:
            # Predict the entities with the NER model.
            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)

        # One collection / document / passage per test document.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)
def add_dependency(self, obj):
    """Annotate the tokens of ``obj`` in place with dependency information.

    A temporary BioC sentence is built from ``obj['text']`` and
    ``obj['parse tree']`` and handed to ``self.p2d.convert_s``. Afterwards
    each token in ``obj['toks']`` gets an ``'id'`` (its position in the list)
    and, where available, ``'ROOT'``, ``'governor'`` (id of the head token)
    and ``'dependency'`` entries.

    Args:
        obj: dict with the keys ``'text'``, ``'parse tree'``, ``'toks'`` (list
            of dicts with ``'start'`` and ``'end'``) and ``'id'`` (used in
            log messages only).
    """
    # create bioc sentence
    sentence = bioc.BioCSentence()
    sentence.offset = 0
    sentence.text = obj['text']
    annotation = bioc.BioCAnnotation()
    annotation.infons['parse tree'] = obj['parse tree']
    sentence.add_annotation(annotation)

    # NOTE(review): presumably this adds token annotations and dependency
    # relations to the sentence in place - confirm against self.p2d.
    self.p2d.convert_s(sentence)

    # m maps a BioC annotation id to the index of the token it overlaps.
    m = {}
    for i, tok in enumerate(obj['toks']):
        tok['id'] = i
        # find bioc annotation
        found = False
        for ann in sentence.annotations:
            loc = ann.total_span
            if utils.intersect((tok['start'], tok['end']),
                               (loc.offset, loc.offset + loc.length)):
                if ann.id in m:
                    logging.debug('Duplicated id mapping: %s', ann.id)
                m[ann.id] = i
                if 'ROOT' in ann.infons:
                    tok['ROOT'] = True
                found = True
                break
        if not found:
            logging.debug('Cannot find %s in \n%s', tok, obj['id'])

    for rel in sentence.relations:
        node0 = rel.nodes[0]
        node1 = rel.nodes[1]
        if node0.refid in m and node1.refid in m:
            # Work out which of the two nodes is the governor.
            if node0.role == 'governor':
                gov = m[node0.refid]
                dep = m[node1.refid]
            else:
                gov = m[node1.refid]
                dep = m[node0.refid]
            if gov == dep:
                logging.debug('Discard self loop')
                continue
            tok = obj['toks'][dep]
            if 'governor' in tok:
                # The token already has a head: the first one is kept. The two
                # "pass" branches are no-ops; a second head is only logged
                # when the relation has no 'extra' infon.
                if tok['governor'] == gov:
                    pass
                if 'extra' in rel.infons:
                    pass
                else:
                    logging.debug('%s: Two heads: %s', obj['id'], str(rel))
            else:
                tok['governor'] = gov
                tok['dependency'] = rel.infons['dependency']
        else:
            # At least one endpoint has no matching token: look up the
            # annotations for the debug output only.
            ann0 = None
            ann1 = None
            for annotation in sentence.annotations:
                if annotation.id == node0.refid:
                    ann0 = annotation
                if annotation.id == node1.refid:
                    ann1 = annotation
            logging.debug('Cannot find %s or %s in sentence: %s', node0, node1, obj['id'])
            logging.debug('%s', ann0)
            logging.debug('%s', ann1)