def test_extend():
    """Exercise ``_extend`` with the ``negation`` key on a one-sentence report.

    Three annotations are attached to the first passage: ``pneumothorax``
    (before the detector runs), then ``eumothor`` (a span inside the first
    one) and ``foo`` (located past the end of the text). The test asserts
    that annotation 1 ends up with ``negation == 'True'`` and that
    annotation 2 never gets a ``negation`` infon, even after ``CUI`` infons
    are set on annotations 0 and 2.
    """
    report = 'findings: no pneumothorax.'
    doc = text_to_bioc([report], type='d/p/s')

    def attach(surface, offset, length):
        # Build an annotation with a single location and add it to passage 0.
        mention = bioc.BioCAnnotation()
        mention.text = surface
        mention.add_location(bioc.BioCLocation(offset, length))
        doc.passages[0].add_annotation(mention)

    def infons_of(position):
        # Always re-read from the document so no stale reference is used.
        return doc.passages[0].annotations[position].infons

    attach('pneumothorax', 13, 12)
    detector(doc)

    # fake ann
    attach('eumothor', 15, 8)
    attach('foo', 27, 3)

    _extend(doc, 'negation')
    assert infons_of(1)['negation'] == 'True'
    assert 'negation' not in infons_of(2)

    for position in (0, 2):
        infons_of(position)['CUI'] = 'xxx'
    _extend(doc, 'negation')
    assert 'negation' not in infons_of(2)
def read_text(pathname):
    """Read a text file and turn every line into a ``BioCSentence``.

    Each line of the file becomes one sentence whose ``offset`` is the
    character position of the line start in the file content (lines are
    assumed to be separated by a single ``'\\n'``). Every whitespace-delimited
    token of the line is attached to the sentence as an annotation with id
    ``a0``, ``a1``, ... and a location expressed in file coordinates. When a
    line starts with whitespace, a zero-length placeholder annotation with
    empty text is inserted first at the sentence offset.

    Args:
        pathname: path of the file to read; it is passed to ``open()`` and
            its ``.stem`` is stored in ``infons['filename']``.

    Returns:
        list of ``bioc.BioCSentence``, one per line, in file order.
    """
    with open(pathname) as fp:
        text = fp.read()

    # Raw string: '\S' in a normal literal is an invalid escape sequence.
    token_pattern = re.compile(r'\S+')

    sentences = []
    offset = 0
    for sent in text.split('\n'):
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = pathname.stem
        sentence.offset = offset
        sentence.text = sent
        sentences.append(sentence)

        i = 0
        for m in token_pattern.finditer(sent):
            if i == 0 and m.start() != 0:
                # add fake
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = ''
                ann.add_location(bioc.BioCLocation(offset, 0))
                sentence.add_annotation(ann)
                i += 1
            ann = bioc.BioCAnnotation()
            ann.id = f'a{i}'
            ann.text = m.group()
            ann.add_location(
                bioc.BioCLocation(m.start() + offset, len(m.group())))
            sentence.add_annotation(ann)
            i += 1
        # +1 accounts for the newline removed by split('\n').
        offset += len(sent) + 1
    return sentences
def save_as_collection(list_of_pymedext_documents: List[Document]):
    """Convert pymedext documents into a single BioC collection.

    One ``BioCDocument`` is created per input document. Annotations whose
    ``source`` is ``"BioCPassage"`` become passages; annotations whose
    ``source`` is ``"BioCAnnotation"`` are attached to the most recently
    added passage. The collection source is taken from the first
    ``raw_text`` annotation seen while the source is still empty.

    :param list_of_pymedext_documents: a list of Document
    :returns: a bioc collection object
    """
    collection = bioc.BioCCollection()
    for pymedext_doc in list_of_pymedext_documents:
        bioc_doc = bioc.BioCDocument()
        for annot in pymedext_doc.annotations:
            # print(annot.type)
            print(annot.source)
            if annot.type == "raw_text" and collection.source == '':
                collection.source = annot.source

            if annot.source == "BioCPassage":
                print(annot.ngram)
                print(annot.value)
                passage = bioc.BioCPassage()
                passage.text = annot.ngram
                passage.offset = annot.span[0]
                bioc_doc.add_passage(passage)
                # passageAttributes to add
            elif annot.source == "BioCAnnotation":
                converted = bioc.BioCAnnotation()
                converted.infons = annot.attributes
                converted.id = annot.attributes["id"]
                converted.text = annot.ngram
                converted.add_location(
                    bioc.BioCLocation(annot.span[0],
                                      annot.span[1] - annot.span[0]))
                # Annotations belong to the passage added last.
                bioc_doc.passages[-1].add_annotation(converted)
        collection.add_document(bioc_doc)
    return collection
def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):
    """Tag every passage of a BioC file with ``bioTag`` and write the result.

    The collection is loaded from ``infile``; for each passage the elements
    returned by ``bioTag`` are added as ``Phenotype`` annotations, and the
    augmented collection is dumped pretty-printed to ``outfile``.

    Args:
        infile: path of the input BioC file (read as UTF-8).
        outfile: path of the output BioC file (written as UTF-8).
        biotag_dic: passed through to ``bioTag``.
        nn_model: passed through to ``bioTag``.
        para_set: mapping providing ``'onlyLongest'``, ``'abbrRecog'`` and
            ``'ML_Threshold'``.
    """
    with open(infile, 'r', encoding='utf-8') as fin, \
            open(outfile, 'w', encoding='utf8') as fout:
        collection = bioc.load(fin)
        for document in collection.documents:
            for passage in document.passages:
                mentions = bioTag(passage.text,
                                  biotag_dic,
                                  nn_model,
                                  onlyLongest=para_set['onlyLongest'],
                                  abbrRecog=para_set['abbrRecog'],
                                  Threshold=para_set['ML_Threshold'])
                # Annotation ids restart at 0 for every passage.
                for mention_num, mention in enumerate(mentions):
                    note = bioc.BioCAnnotation()
                    note.id = str(mention_num)
                    note.infons['identifier'] = mention[2]
                    note.infons['type'] = "Phenotype"
                    note.infons['score'] = mention[3]
                    begin = int(mention[0])
                    stop = int(mention[1])
                    # Offset and length are stored as strings here.
                    note.locations.append(
                        bioc.BioCLocation(offset=str(begin),
                                          length=str(stop - begin)))
                    note.text = passage.text[begin:stop]
                    passage.annotations.append(note)
        bioc.dump(collection, fout, pretty_print=True)
def convertKindredCorpusToBioCCollection(corpus):
    """Convert a ``kindred.Corpus`` into a ``bioc.BioCCollection``.

    Every kindred document becomes a BioC document with a single passage
    at offset 0 holding the full text. Entities become annotations (one
    location per ``(start, end)`` pair in ``e.position``) and relations
    become BioC relations whose nodes reference the annotation ids.

    Args:
        corpus: the ``kindred.Corpus`` to convert.

    Returns:
        The populated ``bioc.BioCCollection``.

    Raises:
        AssertionError: on unexpected object types or when two entities of
            a document resolve to the same annotation id.
    """
    assert isinstance(corpus, kindred.Corpus)
    collection = bioc.BioCCollection()

    for kdoc in corpus.documents:
        assert isinstance(kdoc, kindred.Document)

        bioc_doc = bioc.BioCDocument()
        collection.add_document(bioc_doc)
        if 'id' in kdoc.metadata:
            bioc_doc.id = kdoc.metadata['id']
        bioc_doc.infons = kdoc.metadata

        passage = bioc.BioCPassage()
        passage.text = kdoc.text
        passage.offset = 0
        bioc_doc.add_passage(passage)

        used_ids = set()
        bioc_id_by_entity = {}
        for e in kdoc.entities:
            assert isinstance(e, kindred.Entity)

            annotation = bioc.BioCAnnotation()
            annotation.text = e.text
            annotation.infons = {'type': e.entityType}
            annotation.infons.update(e.metadata)
            # Prefer the id from the source data; fall back to kindred's own.
            annotation.id = (str(e.entityID)
                             if e.sourceEntityID is None
                             else e.sourceEntityID)

            assert not annotation.id in used_ids, "Multiple entities with the same ID (%s) found" % annotation.id
            used_ids.add(annotation.id)
            bioc_id_by_entity[e.entityID] = annotation.id

            for begin, stop in e.position:
                annotation.locations.append(
                    bioc.BioCLocation(offset=begin, length=(stop - begin)))
            passage.annotations.append(annotation)

        for r in kdoc.relations:
            assert isinstance(r, kindred.Relation)

            bioc_relation = bioc.BioCRelation()
            bioc_relation.infons = {'type': r.relationType}

            members = r.entities
            roles = r.argNames
            if roles is None:
                # Default role names: arg0, arg1, ...
                roles = ["arg%d" % i for i, _ in enumerate(members)]

            for role, entity in zip(roles, members):
                bioc_relation.nodes.append(
                    bioc.BioCNode(role=role,
                                  refid=bioc_id_by_entity[entity.entityID]))
            passage.relations.append(bioc_relation)

    return collection
def dump_results(doc_name, entities, opt):
    """Write the entities of one document to ``<opt.predict>/<doc_name>.bioc.xml``.

    A collection with one document and one passage (offset 0) is built.
    Each entity becomes an annotation numbered from 1, located at its first
    span, with its first normalisation id/name stored under ``'UMLS code'``
    and ``'UMLS term'`` (``'N/A'`` when ``entity.norm_ids`` is empty).

    Args:
        doc_name: document id, also used as the output file stem.
        entities: iterable of objects exposing ``type``, ``spans``,
            ``name``, ``norm_ids`` and ``norm_names``.
        opt: options object; only ``opt.predict`` (output directory) is read.
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = doc_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    for number, entity in enumerate(entities, start=1):
        annotation = bioc.BioCAnnotation()
        passage.add_annotation(annotation)
        annotation.id = str(number)
        annotation.infons['type'] = entity.type

        # Only the first span of the entity is exported.
        begin = entity.spans[0][0]
        annotation.add_location(
            bioc.BioCLocation(begin, entity.spans[0][1] - begin))
        annotation.text = entity.name

        normalised = len(entity.norm_ids) > 0
        annotation.infons['UMLS code'] = (
            entity.norm_ids[0] if normalised else 'N/A')
        annotation.infons['UMLS term'] = (
            entity.norm_names[0] if normalised else 'N/A')

    target = os.path.join(opt.predict, doc_name + ".bioc.xml")
    with codecs.open(target, 'w', 'UTF-8') as fp:
        bioc.dump(collection, fp)
def pubtator2bioc_ann(ann: PubTatorAnn) -> bioc.BioCAnnotation:
    """Convert a PubTator annotation into a BioC annotation.

    The result gets the id ``T<start>``, the infons ``type`` and
    ``concept_id`` (taken from ``ann.type`` and ``ann.id``), one location
    covering ``[ann.start, ann.end)`` and the text of ``ann``.
    """
    converted = bioc.BioCAnnotation()
    converted.id = f'T{ann.start}'
    converted.infons['type'] = ann.type
    converted.infons['concept_id'] = ann.id
    span = bioc.BioCLocation(ann.start, ann.end - ann.start)
    converted.add_location(span)
    converted.text = ann.text
    return converted
def run_metamap_col(collection, mm, cuis=None):
    """
    Get CUIs from metamap.

    All sentences of the collection are sent to MetaMap in one call, keyed
    by ``"<document id with '.' replaced by '-'>-<sentence offset>"``.
    Every returned concept whose ``pos_info`` starts with ``start/length``
    is added as an annotation to the passage owning the sentence. Failures
    are logged and never propagated (best effort); interpreter-level
    signals such as ``KeyboardInterrupt`` are not swallowed.

    Args:
        collection(BioCCollection):
        mm(MetaMap): MetaMap instance
        cuis: optional container of CUIs; when given, concepts whose CUI is
            missing or not in this container are skipped

    Returns:
        BioCCollection
    """
    try:
        annIndex = itertools.count()
        sentence_map = collections.OrderedDict()
        for document in collection.documents:
            for passage in document.passages:
                for sentence in passage.sentences:
                    key = '{}-{}'.format(document.id.replace('.', '-'),
                                         sentence.offset)
                    sentence_map[key] = (passage, sentence)

        ids = list(sentence_map)
        sents = [remove_newline(sentence_map[k][1].text) for k in ids]

        concepts, error = mm.extract_concepts(sents, ids)
        if error is None:
            for concept in concepts:
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None:
                        # if no CUI is returned for this concept - skip it
                        concept_cui = getattr(concept, 'cui', None)
                        if concept_cui not in cuis:
                            continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage = sentence_map[concept_index][0]
                        sentence = sentence_map[concept_index][1]
                        # pos_info start is 1-based; convert to 0-based.
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        # Drop the first and last character of semtypes.
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                except Exception:
                    # One bad concept must not abort the remaining ones.
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", collection.source)
    return collection
def convert_dg(dependency_graph, text: str, offset: int, ann_index: int = 0, rel_index: int = 0) \
        -> Tuple[List[bioc.BioCAnnotation], List[bioc.BioCRelation]]:
    """
    Convert dependency graph to annotations and relations

    Each node whose form can be found in ``text`` (searching left to right,
    retrying once with ``adapt_value(node.form)``) becomes an annotation
    ``T<n>`` tagged with the node's POS; nodes that cannot be found are
    logged at debug level and skipped. Nodes with ``head == 0`` get
    ``infons['ROOT'] = True``. Every other node yields a relation ``R<n>``
    between dependant and governor, provided both were located.

    Args:
        dependency_graph: iterable of nodes exposing ``index``, ``form``,
            ``pos``, ``head``, ``deprel`` and ``extra``; it is iterated twice
        text: text in which the node forms are located
        offset: value added to every annotation location
        ann_index: number used in the id of the first annotation
        rel_index: number used in the id of the first relation

    Returns:
        Tuple ``(annotations, relations)``
    """
    annotations = []
    relations = []
    annotation_id_map = {}
    # Node index -> annotation object. The root lookup must not index
    # ``annotations`` with the global id, which is only valid when
    # ``ann_index`` starts at 0.
    annotation_by_node = {}
    start = 0
    for node in dependency_graph:
        if node.index in annotation_id_map:
            continue
        node_form = node.form
        index = text.find(node_form, start)
        if index == -1:
            node_form = adapt_value(node.form)
            index = text.find(node_form, start)
            if index == -1:
                logging.debug(
                    'Cannot convert parse tree to dependency graph at %d\n%d\n%s',
                    start, offset, str(dependency_graph))
                continue

        ann = bioc.BioCAnnotation()
        ann.id = 'T{}'.format(ann_index)
        ann.text = node_form
        ann.infons['tag'] = node.pos

        start = index
        ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
        annotations.append(ann)
        annotation_id_map[node.index] = ann_index
        annotation_by_node[node.index] = ann
        ann_index += 1
        # Continue searching after this token so repeated forms map in order.
        start += len(node_form)

    for node in dependency_graph:
        if node.head == 0:
            # The root may have been skipped above if it was not found.
            root_ann = annotation_by_node.get(node.index)
            if root_ann is not None:
                root_ann.infons['ROOT'] = True
            continue
        relation = bioc.BioCRelation()
        relation.id = 'R{}'.format(rel_index)
        relation.infons['dependency'] = node.deprel
        if node.extra:
            relation.infons['extra'] = node.extra
        if node.index in annotation_id_map and node.head in annotation_id_map:
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.index]),
                              'dependant'))
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.head]),
                              'governor'))
            relations.append(relation)
            rel_index += 1

    return annotations, relations
def run_metamap(document, mm, cuis=None):
    """
    Get CUIs from metamap.

    All sentences of the document are sent to MetaMap in one call, keyed by
    their offset. Every returned concept whose ``pos_info`` starts with
    ``start/length`` is added as an annotation to the passage owning the
    sentence. Failures are logged and never propagated (best effort);
    interpreter-level signals such as ``KeyboardInterrupt`` are not
    swallowed.

    Args:
        document(BioCDocument):
        mm(MetaMap): MetaMap instance
        cuis: optional container of CUIs; when given, concepts whose CUI is
            not in this container are skipped

    Returns:
        BioCDocument
    """
    try:
        annIndex = itertools.count()
        sentence_map = collections.OrderedDict()
        for passage in document.passages:
            for sentence in passage.sentences:
                sentence_map[str(sentence.offset)] = (passage, sentence)

        ids = list(sentence_map)
        sents = [remove_newline(sentence_map[k][1].text) for k in ids]

        concepts, error = mm.extract_concepts(sents, ids)
        print('Done')
        if error is None:
            for concept in concepts:
                # print(concept)
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None and concept.cui not in cuis:
                        continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage = sentence_map[concept_index][0]
                        sentence = sentence_map[concept_index][1]
                        # pos_info start is 1-based; convert to 0-based.
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        # Drop the first and last character of semtypes.
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                except Exception:
                    # One bad concept must not abort the remaining ones.
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", document.id)
    return document
def test_neg_regex():
    """Check ``is_neg_regex`` and the detector on a simple negated finding.

    The report text must be recognised by ``is_neg_regex``, and after the
    detector has run the ``pneumothorax`` annotation must carry
    ``negation == 'True'``.
    """
    report = 'findings: no pneumothorax.'
    assert is_neg_regex(report)

    doc = text_to_bioc([report], type='d/p/s')
    mention = bioc.BioCAnnotation()
    mention.text = 'pneumothorax'
    mention.add_location(bioc.BioCLocation(13, 12))
    doc.passages[0].add_annotation(mention)

    detector(doc)

    negation_flag = doc.passages[0].annotations[0].infons['negation']
    assert negation_flag == 'True'
def test_location():
    """Check equality, containment and hashing of ``bioc.BioCLocation``."""
    outer = bioc.BioCLocation(1, 10)

    # Comparison with a foreign type and basic span arithmetic.
    assert outer != 'foo'
    assert outer.end == 11
    assert outer.contains(9)
    assert not outer.contains(11)

    # A location with the same offset/length compares equal.
    twin = bioc.BioCLocation(1, 10)
    assert outer == twin

    # A strictly smaller span is contained, but not the other way round.
    inner = bioc.BioCLocation(2, 9)
    assert outer != inner
    assert inner in outer
    assert outer not in inner

    # Locations are usable as set members.
    bucket = {outer, inner}
    assert outer in bucket
    assert inner in bucket

    with pytest.raises(TypeError):
        assert 'foo' in outer
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    """Turn per-token label predictions into entities and dump them as BioC.

    A collection with one document and one passage (offset 0) is built.
    A label starting with ``S`` or ``B`` opens a new entity whose type is
    ``label[2:]``; a label starting with ``M`` or ``E`` extends the most
    recent entity when ``checkWrongState(labelSequence)`` is truthy. The gap
    between the entity end and the new token is filled with spaces. The
    result is written to ``file_name + ".bioc.xml"``.

    Args:
        doc_token: table of tokens with the columns ``sent_idx``, ``start``,
            ``end`` and ``text``; it is filtered with a boolean mask and
            read with ``.iloc``.
        predict_results: one entry per sentence; ``entry[0]`` is the label
            sequence of that sentence.
        file_name: document id and output path without the extension.

    Raises:
        AssertionError: when a sentence has a different number of tokens
            than predicted labels.
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    entity_id = 1
    for idx, sentence_result in enumerate(predict_results):
        labels = sentence_result[0]
        sent_length = len(labels)
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]
        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)

        labelSequence = []
        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = labels[idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1
            elif label[0] == 'M' or label[0] == 'E':
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]
                    location = anno_entity.locations[0]
                    # Pad with spaces so the text length matches the span;
                    # a non-positive gap adds nothing.
                    whitespacetoAdd = token['start'] - location.end
                    anno_entity.text += " " * whitespacetoAdd
                    anno_entity.text += token['text']
                    location.length = token['end'] - location.offset

    # The context manager closes the file even when the dump fails.
    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
def strip_annotation_markers(
        text: str, annotations_map: Dict[str, str]
) -> "Tuple[str, List[bioc.BioCAnnotation]]":
    """
    Given a set of annotations, remove any which are found in the current text and return
    the new string as well as the positions of the annotations in the transformed string

    Only the first occurrence of each marker is removed. Markers are
    removed in the order they appear in ``text`` so that the running
    offset of already-removed characters stays valid regardless of the
    iteration order of ``annotations_map``.

    Args:
        text: the text containing citation markers
        annotations_map: maps each marker to its citation text

    Returns:
        Tuple ``(transformed_text, annotations)``; each annotation has the
        marker as id, ``infons['citation_text']``, ``infons['type'] ==
        'citation'`` and a zero-length location in the transformed text.
        Annotations are ordered by position in ``text``.
    """
    matched_annotations: List[Tuple[int, int, str]] = []

    for ann_marker in annotations_map:
        escaped = re.escape(ann_marker)
        # (pattern, number of trailing matched characters to keep)
        patterns = [
            # citation in brackets
            (r'[^\S\t]?[\(\[\{]' + escaped + r'[\)\]\}]', 0),
            # citation at end of sentence, remove extra whitespace
            (r'[^\S\t]' + escaped + r'\.', 1),
            # citation surrounded by whitespace
            (r'[^\S\t]' + escaped + r'[^\S\t]', 1),
            # citation by itself
            (escaped, 0),
        ]

        for pattern, end_offset in patterns:
            match = re.search(pattern, text)
            if match:
                matched_annotations.append(
                    (match.start(), match.end() - end_offset, ann_marker))
                break

    # The removal loop subtracts everything removed so far, which is only
    # correct when spans are handled from left to right.
    matched_annotations.sort(key=lambda item: item[0])

    transformed_annotations: List[bioc.BioCAnnotation] = []
    transformed_text = text
    offset = 0

    for start, end, marker in matched_annotations:
        ann = bioc.BioCAnnotation()
        ann.id = marker
        ann.infons['citation_text'] = annotations_map[marker]
        ann.infons['type'] = 'citation'

        transformed_text = (transformed_text[:start - offset] +
                            transformed_text[end - offset:])
        # since the token place-holder is removed, must be start - 1 (and previous offset) for the new position
        ann.add_location(bioc.BioCLocation(start - offset - 1, 0))
        offset += end - start
        transformed_annotations.append(ann)

    return transformed_text, transformed_annotations
def add_match(self, impression, sentence, ann_index, phrase, observation,
              start, end):
    """Append a phrase-match annotation to ``impression.annotations`` in place.

    The annotation covers ``sentence.text[start:end]``, is located at
    ``sentence.offset + start`` and records the matched phrase and
    observation in its infons (``CUI`` and ``semtype`` are set to ``None``).
    """
    span_length = end - start

    annotation = bioc.BioCAnnotation()
    annotation.id = ann_index
    annotation.infons.update({
        'CUI': None,
        'semtype': None,
        'term': phrase,
        OBSERVATION: observation,
        'annotator': 'Phrase',
    })
    location = bioc.BioCLocation(sentence.offset + start, span_length)
    annotation.add_location(location)
    annotation.text = sentence.text[start:start + span_length]

    impression.annotations.append(annotation)
def tokenize_text(text, id):
    """Split ``text`` into ``BioCSentence`` objects with one annotation per token piece.

    Sentences come from ``nlp(text).sents``. Every token is passed through
    ``split_punct`` and each resulting ``(text, start, end)`` piece is added
    to the sentence as an annotation with id ``a0``, ``a1``, ... (numbering
    restarts for every sentence).

    Args:
        text: the text to tokenize.
        id: value stored in ``infons['filename']`` of every sentence.

    Returns:
        list of ``bioc.BioCSentence`` in document order.
    """
    result = []
    for span in nlp(text).sents:
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = id
        sentence.offset = span.start_char
        sentence.text = text[span.start_char:span.end_char]
        result.append(sentence)

        # Lazily flatten token -> pieces so numbering spans the sentence.
        pieces = (piece
                  for token in span
                  for piece in split_punct(token.text, token.idx))
        for number, (piece_text, begin, stop) in enumerate(pieces):
            ann = bioc.BioCAnnotation()
            ann.id = f'a{number}'
            ann.text = piece_text
            ann.add_location(bioc.BioCLocation(begin, stop - begin))
            sentence.add_annotation(ann)
    return result
def test_clean_sentences():
    """Check that ``CleanUp`` drops sentences and optionally sorts annotations.

    Ten single-character annotations are added at offsets 10 down to 1.
    After the first clean-up the passage has no sentences and the ten
    annotations keep their insertion order; with ``sort_anns=True`` they
    are ordered by ascending offset.
    """
    cleanup = CleanUp()
    doc = text_to_bioc(['No pneumothorax.', 'No pneumothorax.'], type='d/p/s')

    first_passage = doc.passages[0]
    for position in reversed(range(1, 11)):
        marker = bioc.BioCAnnotation()
        marker.add_location(bioc.BioCLocation(position, 1))
        first_passage.add_annotation(marker)

    assert len(doc.passages[0].sentences) == 2

    doc = cleanup(doc)
    assert len(doc.passages[0].sentences) == 0
    assert len(doc.passages[0].annotations) == 10
    for rank in range(10):
        annotation = doc.passages[0].annotations[rank]
        assert annotation.total_span.offset == 10 - rank

    doc = cleanup(doc, sort_anns=True)
    for rank in range(10):
        annotation = doc.passages[0].annotations[rank]
        assert annotation.total_span.offset == rank + 1
def create_bioc_document_from_document_json(document):
    """Build a ``BioCDocument`` with one passage from a document JSON dict.

    Only denotations whose ``userId`` is 0 become annotations, and only
    relations whose subject and object both belong to ``userId`` 0 become
    BioC relations.

    Args:
        document: dict with the keys ``sourceid``, ``text``,
            ``denotations`` (each with ``id``, ``userId``, ``span`` and
            ``obj``) and ``relations`` (each with ``id``, ``subj``, ``obj``
            and ``pred``).

    Returns:
        The populated ``bioc.BioCDocument``.
    """
    full_text = document['text']

    b_document = bioc.BioCDocument()
    b_document.id = document['sourceid']

    passage = bioc.BioCPassage()
    passage.text = full_text
    passage.offset = 0

    # Remember the owner of every denotation for the relation filter below.
    owner_by_id = {}
    for denotation in document['denotations']:
        owner_by_id[denotation['id']] = denotation['userId']
        if denotation['userId'] != 0:
            continue

        annotation = bioc.BioCAnnotation()
        annotation.id = denotation['id']

        span = denotation['span']
        location = bioc.BioCLocation(0, 0)
        location.offset = span['begin']
        location.length = span['end'] - span['begin']
        annotation.locations.append(location)

        annotation.text = full_text[span['begin']:span['end']]
        annotation.infons = denotation['obj']
        passage.add_annotation(annotation)

    for relation in document['relations']:
        subj_is_user_zero = owner_by_id[relation['subj']] == 0
        obj_is_user_zero = owner_by_id[relation['obj']] == 0
        if not subj_is_user_zero or not obj_is_user_zero:
            continue

        b_relation = bioc.BioCRelation()
        b_relation.id = relation['id']

        start_node = bioc.BioCNode('', '')
        end_node = bioc.BioCNode('', '')
        start_node.refid = relation['subj']
        end_node.refid = relation['obj']
        b_relation.add_node(start_node)
        b_relation.add_node(end_node)

        b_relation.infons = relation['pred']
        passage.add_relation(b_relation)

    b_document.add_passage(passage)
    return b_document
def __call__(self, document, *args, **kwargs):
    """Annotate ``document`` with regex matches of the mention phrases.

    For every sentence, each phrase of ``self.observation2mention_phrases``
    is compiled with ``self.compile_pattern`` and searched in the sentence
    text. Matches rejected by ``self.overlaps_with_unmention`` are skipped;
    the others are appended to the passage annotations with ids numbered
    from 0 across the whole document.

    Returns:
        The same ``document`` object, modified in place.
    """
    ids = itertools.count()
    for passage in document.passages:
        for sentence in passage.sentences:
            for observation, phrases in self.observation2mention_phrases.items():
                for phrase in phrases:
                    regex = self.compile_pattern(phrase)
                    for hit in regex.finditer(sentence.text):
                        begin, stop = hit.span(0)
                        if not self.overlaps_with_unmention(
                                sentence, observation, begin, stop):
                            mention = bioc.BioCAnnotation()
                            mention.id = str(next(ids))
                            mention.infons['term'] = phrase
                            mention.infons["observation"] = observation
                            mention.infons['annotator'] = 'RegEx'
                            mention.infons['vocab'] = self.vocab_name
                            mention.add_location(
                                bioc.BioCLocation(sentence.offset + begin,
                                                  stop - begin))
                            mention.text = sentence.text[begin:stop]
                            passage.annotations.append(mention)
    return document
elif start2 > end1: pass else: overlapping = True break if not overlapping: nonoverlapping.append ((start1,end1)) for start,end in nonoverlapping: for annotationType,conceptids in candidates[(start,end)].items(): conceptid = conceptids = ";".join(sorted(list(set(conceptids)))) a = bioc.BioCAnnotation() a.text = passage.text[start:end] a.infons = {'type':annotationType, 'conceptid': conceptid} a.id = 'T%d' % currentID currentID += 1 if end <= start: continue biocLoc = bioc.BioCLocation(offset=passage.offset+start, length=(end-start)) a.locations.append(biocLoc) passage.annotations.append(a) writer.write_document(doc) print ('Done!')
def evaluate_via_bioc(test_docs,
                      crf,
                      extractor,
                      prediction_dir,
                      made_base_dir=None):
    """Write CRF predictions as BioC files and optionally score them.

    ``prediction_dir`` is created if missing and emptied first. For every
    test document the CRF labels each sentence, consecutive ``I-`` labels
    are merged into the preceding span, and one ``<filename>.bioc.xml``
    file is written. When ``made_base_dir`` is given, ``get_f_scores`` is
    called with the reference files under ``<made_base_dir>/annotations``
    and the texts under ``<made_base_dir>/corpus``.

    Args:
        test_docs: documents exposing ``filename``, ``text`` and
            ``tokenized_doc.sentences`` (sequences of ``(start, end)``
            token offsets).
        crf: model whose ``predict`` takes a list of feature sequences.
        extractor: object whose ``sent2features`` turns tokens into features.
        prediction_dir: output directory for the prediction files.
        made_base_dir: base directory of the reference data, or ``None`` to
            skip the evaluation.
    """
    print('Total documents for evaluation : {}'.format(len(test_docs)))

    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)

    stale_files = glob.glob('{0}/*'.format(prediction_dir))
    removed = 0
    for stale in stale_files:
        os.remove(stale)
        removed += 1
    print('Existing files removed : {}'.format(removed))

    written = 0
    reference_filenames = []

    for test_doc in test_docs:
        #print('Working on document : {}'.format(test_doc.filename))
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)

        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            # gather tokens in a sentence
            tokens = [test_doc.text[pair[0]:pair[1]] for pair in sentence]
            if len(tokens) == 0:
                continue

            features = extractor.sent2features(tokens)
            labels = crf.predict([features])[0]

            if len(labels) != len(sentence):
                print('Sentence Features Length : {}'.format(len(features)))
                print('Sentence Pred Length : {}'.format(len(labels)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            position = 0
            while position < len(labels):
                label = labels[position]
                if label != 'O':
                    base_label = label.replace('B-', '').replace('I-', '')
                    continuation = 'I-' + base_label
                    start_offset = sentence[position][0]
                    end_offset = sentence[position][1]

                    # now let's look to the right as long as we see tokens
                    # which are part of this same label
                    while (position + 1 < len(labels)
                           and labels[position + 1] == continuation):
                        # advance the token
                        position += 1
                        # update the end of this span
                        end_offset = sentence[position][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()
                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace(
                        '\n', '\\n').replace('\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)
                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                position += 1

        xml_name = '{}.bioc.xml'.format(test_doc.filename)
        prediction_filename = os.path.join(prediction_dir, xml_name)

        if made_base_dir is not None:
            reference_filenames.append(
                os.path.join(os.path.join(made_base_dir, 'annotations'),
                             '{}.bioc.xml'.format(test_doc.filename)))

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            written += 1

    print('Total prediction documents written : {}'.format(written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
def _load_predict_checkpoint(module, opt, filename):
    """Load the state dict stored at ``opt.output/filename`` into ``module``.

    With ``opt.test_in_cpu`` set, tensors are mapped to the CPU; otherwise
    tensors saved on ``cuda:<opt.old_gpu>`` are remapped to
    ``cuda:<opt.gpu>``.
    """
    if opt.test_in_cpu:
        map_location = 'cpu'
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        map_location = {cuda_src: cuda_dst}
    state = torch.load(os.path.join(opt.output, filename),
                       map_location=map_location)
    module.load_state_dict(state)


def predict(opt, data):
    """Run NER and relation extraction on every file of ``opt.input``.

    The four trained modules (NER model and word sequence, relation model
    and word sequence) are restored from ``opt.output``. Each regular,
    non-hidden file in ``opt.input`` is processed, and its entities and
    relations are written to ``<opt.predict>/<file name>.bioc.xml`` as a
    collection with one document and one passage (offset 0).

    Args:
        opt: options object; reads ``input``, ``output``, ``predict``,
            ``test_in_cpu``, ``old_gpu`` and ``gpu``.
        data: project data/configuration object passed to the models and
            to the instance readers.
    """
    seq_model = SeqModel(data)
    _load_predict_checkpoint(seq_model, opt, 'ner_model.pkl')

    seq_wordseq = WordSequence(data, False, True, True, True)
    _load_predict_checkpoint(seq_wordseq, opt, 'ner_wordseq.pkl')

    classify_model = ClassifyModel(data)
    _load_predict_checkpoint(classify_model, opt, 're_model.pkl')

    classify_wordseq = WordSequence(data, True, False, True, False)
    _load_predict_checkpoint(classify_wordseq, opt, 're_wordseq.pkl')

    # Regular files only; names starting with '.' are ignored.
    input_files = [
        f for f in listdir(opt.input)
        if isfile(join(opt.input, f)) and f[0] != '.'
    ]

    for doc_name in input_files:
        start = time.time()
        fileName = join(opt.input, doc_name)

        doc_token = processOneFile(fileName)
        doc = generateDataForOneFile(doc_token)

        _, raw_Ids = read_instance(doc, data.word_alphabet,
                                   data.char_alphabet,
                                   data.feature_alphabets,
                                   data.label_alphabet,
                                   data.number_normalized,
                                   data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model,
                                          raw_Ids)

        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results,
                                                   doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" %
                     (doc_name, end - start))

    logging.info("test finished")
pass else: overlapping = True break if not overlapping: nonoverlapping.append((start1, end1)) for start, end in nonoverlapping: for annotationType, conceptids in candidates[( start, end)].items(): conceptid = conceptids = ";".join( sorted(list(set(conceptids)))) a = bioc.BioCAnnotation() a.text = passage.text[start:end] a.infons = { 'type': annotationType, 'conceptid': conceptid } a.id = 'T%d' % currentID currentID += 1 a.locations.append( bioc.BioCLocation(offset=start, length=(end - start))) passage.annotations.append(a) writer.writedocument(doc) print('Done!')
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    """Write gold entities and stored relation predictions as BioC files.

    The relation vocabulary is read from ``opt.pretrain`` and the
    prediction results from ``opt.output`` (``opt`` is a module-level
    object). For each document, one ``<doc_name>.bioc.xml`` file is
    written to ``result_dumpdir`` with every entity of the document and
    every predicted relation that is neither ``'<unk>'`` nor rejected by
    ``my_utils.relationConstraint1``.

    Args:
        test_token: not used by this function.
        test_entity: per-document entity tables with the columns ``id``,
            ``type``, ``start``, ``end`` and ``text``.
        test_relation: only its length is used (number of documents).
        test_name: per-document names, used as ids and file stems.
        result_dumpdir: directory receiving the BioC files.
    """
    # NOTE(review): pickle files are assumed to be trusted project
    # artefacts; never point these paths at untrusted data.
    logging.info("loading ... vocab")
    with open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb') as fp:
        relation_vocab = pickle.load(fp)

    logging.info("loading ... result")
    with open(os.path.join(opt.output, 'results.pkl'), "rb") as fp:
        results = pickle.load(fp)

    for i in tqdm(range(len(test_relation))):

        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:
            if doc_name != result['doc_name']:
                continue

            former = doc_entity[(
                doc_entity['id'] == result['former_id'])].iloc[0]
            latter = doc_entity[(
                doc_entity['id'] == result['latter_id'])].iloc[0]

            relation_type = relation_vocab.lookup_id2str(result['type'])
            if relation_type == '<unk>':
                continue
            # Explicit comparison kept on purpose: only a result equal to
            # False rejects the relation.
            if my_utils.relationConstraint1(relation_type, former['type'],
                                            latter['type']) == False:
                continue

            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = str(relation_id)
            relation_id += 1
            bioc_relation.infons['type'] = relation_type

            node1 = bioc.BioCNode(former['id'], 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(latter['id'], 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
def location(this, json_loc):
    """Build a ``BioCLocation`` from a JSON location mapping.

    ``json_loc['offset']`` and ``json_loc['length']`` are converted with
    ``str()`` before being stored on the location.
    """
    loc = bioc.BioCLocation()
    for field in ('offset', 'length'):
        setattr(loc, field, str(json_loc[field]))
    return loc
def _restore_test_state(module, directory, filename):
    """Load the state dict stored at ``directory/filename`` into ``module``."""
    module.load_state_dict(torch.load(os.path.join(directory, filename)))
    return module


def _restore_test_hidden_layers(data, opt, directory, first_input_size):
    """Rebuild ``opt.hidden_num`` hidden layers from ``directory/hidden_<i>.pkl``.

    ``first_input_size`` is a zero-argument callable giving the input size
    of layer 0; it is only evaluated when at least one layer exists. All
    further layers map ``data.HP_hidden_dim`` to ``data.HP_hidden_dim``.
    """
    layers = []
    for i in range(opt.hidden_num):
        input_size = first_input_size() if i == 0 else data.HP_hidden_dim
        output_size = data.HP_hidden_dim
        layer = HiddenLayer(data, input_size, output_size)
        _restore_test_state(layer, directory, 'hidden_{}.pkl'.format(i))
        layers.append(layer)
    return layers


def test(data, opt, predict_dir):
    """Run NER (or use gold entities) and relation extraction on the test set.

    The preprocessed test data is loaded from ``data.test_dir``, then
    ``data`` is reloaded from ``opt.data_file`` and the trained modules are
    restored from ``opt.ner_dir`` and ``opt.re_dir``. For each test
    document, one ``<doc_name>.bioc.xml`` file holding its entities and
    predicted relations is written to ``predict_dir``.

    Args:
        data: project data/configuration object; it is modified in place
            (``load``, ``MAX_SENTENCE_LENGTH``, ``fix_alphabet`` and, when
            NER is run, ``raw_texts``/``raw_Ids``).
        opt: options object; reads ``data_file``, ``ner_dir``, ``re_dir``,
            ``hidden_num`` and ``use_gold_ner``.
        predict_dir: directory receiving the BioC files.
    """
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file
    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()
    data.fix_alphabet()

    seq_model = SeqModel(data)
    _restore_test_state(seq_model, opt.ner_dir, 'model.pkl')
    ner_hiddenlist = _restore_test_hidden_layers(
        data, opt, opt.ner_dir,
        lambda: (data.word_emb_dim + data.HP_char_hidden_dim
                 + data.feature_emb_dims[data.feature_name2id['[Cap]']]
                 + data.feature_emb_dims[data.feature_name2id['[POS]']]))
    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    _restore_test_state(ner_wordrep, opt.ner_dir, 'wordrep.pkl')

    classify_model = ClassifyModel(data)
    _restore_test_state(classify_model, opt.re_dir, 'model.pkl')
    re_hiddenlist = _restore_test_hidden_layers(
        data, opt, opt.re_dir,
        lambda: (data.word_emb_dim
                 + data.feature_emb_dims[data.feature_name2id['[POS]']]
                 + 2 * data.re_feature_emb_dims[
                     data.re_feature_name2id['[POSITION]']]))
    re_wordrep = WordRep(data, True, False, True, False)
    _restore_test_state(re_wordrep, opt.re_dir, 'wordrep.pkl')

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            # Use the annotated entities instead of running the NER model.
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:
            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(
                doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)