import logging
import os
import pickle
import time
from os import listdir
from os.path import isfile, join
from typing import List, Tuple

import bioc
import kindred
import torch
from tqdm import tqdm

# SeqModel, WordSequence, ClassifyModel, WordRep, HiddenLayer, Entity, ner,
# relation_extraction, preprocess, my_utils, opt, adapt_value, processOneFile,
# generateDataForOneFile, read_instance, evaluateWhenTest, ner_evaluateWhenTest,
# re_evaluateWhenTest and PubTatorRel come from the surrounding project and are
# assumed to be importable here.


def convertKindredCorpusToBioCCollection(corpus):
    assert isinstance(corpus, kindred.Corpus)
    collection = bioc.BioCCollection()
    for kdoc in corpus.documents:
        assert isinstance(kdoc, kindred.Document)

        biocDoc = bioc.BioCDocument()
        collection.add_document(biocDoc)

        if 'id' in kdoc.metadata:
            biocDoc.id = kdoc.metadata['id']
        biocDoc.infons = kdoc.metadata

        # The whole document text goes into a single passage at offset 0.
        passage = bioc.BioCPassage()
        passage.text = kdoc.text
        passage.offset = 0
        biocDoc.add_passage(passage)

        seenEntityIDs = set()
        kindredID2BiocID = {}
        for e in kdoc.entities:
            assert isinstance(e, kindred.Entity)
            a = bioc.BioCAnnotation()
            a.text = e.text
            a.infons = {'type': e.entityType}
            a.infons.update(e.metadata)

            # Prefer the source ID when present, otherwise fall back to
            # kindred's internal entity ID.
            if e.sourceEntityID is None:
                a.id = str(e.entityID)
            else:
                a.id = e.sourceEntityID

            assert a.id not in seenEntityIDs, "Multiple entities with the same ID (%s) found" % a.id
            seenEntityIDs.add(a.id)
            kindredID2BiocID[e.entityID] = a.id

            for start, end in e.position:
                l = bioc.BioCLocation(offset=start, length=(end - start))
                a.locations.append(l)

            passage.annotations.append(a)

        for r in kdoc.relations:
            assert isinstance(r, kindred.Relation)
            biocR = bioc.BioCRelation()
            biocR.infons = {'type': r.relationType}

            entitiesInRelation = r.entities
            argNames = r.argNames
            if argNames is None:
                argNames = ["arg%d" % i for i, _ in enumerate(entitiesInRelation)]

            for argName, entity in zip(argNames, entitiesInRelation):
                node = bioc.BioCNode(role=argName, refid=kindredID2BiocID[entity.entityID])
                biocR.nodes.append(node)

            passage.relations.append(biocR)

    return collection
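# Usage sketch for the converter above. This is not taken from the source:
# the corpus construction assumes kindred's public constructors
# (kindred.Entity, kindred.Document, kindred.Relation, Corpus.addDocument),
# and the output is written with bioc.dump, as the functions below do.
def _example_kindred_to_bioc():
    text = 'Erlotinib is used to treat EGFR-mutant lung cancer.'
    e1 = kindred.Entity('drug', 'Erlotinib', [(0, 9)])
    e2 = kindred.Entity('disease', 'lung cancer', [(39, 50)])
    doc = kindred.Document(text, entities=[e1, e2],
                           relations=[kindred.Relation('treats', [e1, e2])])
    corpus = kindred.Corpus()
    corpus.addDocument(doc)
    collection = convertKindredCorpusToBioCCollection(corpus)
    with open('corpus.bioc.xml', 'w') as fp:
        bioc.dump(collection, fp)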
def convert_dg(dependency_graph, text: str, offset: int,
               ann_index: int = 0, rel_index: int = 0) \
        -> Tuple[List[bioc.BioCAnnotation], List[bioc.BioCRelation]]:
    """Convert a dependency graph to BioC annotations and relations."""
    annotations = []
    relations = []
    annotation_id_map = {}
    start = 0
    # First pass: one annotation per token, located by searching the text
    # left to right from the end of the previous match.
    for node in dependency_graph:
        if node.index in annotation_id_map:
            continue
        node_form = node.form
        index = text.find(node_form, start)
        if index == -1:
            # Retry with a normalized form (adapt_value is a project helper).
            node_form = adapt_value(node.form)
            index = text.find(node_form, start)
            if index == -1:
                logging.debug('Cannot convert parse tree to dependency graph at %d\n%d\n%s',
                              start, offset, str(dependency_graph))
                continue
        ann = bioc.BioCAnnotation()
        ann.id = 'T{}'.format(ann_index)
        ann.text = node_form
        ann.infons['tag'] = node.pos

        start = index
        ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
        annotations.append(ann)
        annotation_id_map[node.index] = ann_index
        ann_index += 1
        start += len(node_form)

    # Second pass: one relation per dependency arc; head == 0 marks the root.
    for node in dependency_graph:
        if node.head == 0:
            ann = annotations[annotation_id_map[node.index]]
            ann.infons['ROOT'] = True
            continue
        relation = bioc.BioCRelation()
        relation.id = 'R{}'.format(rel_index)
        relation.infons['dependency'] = node.deprel
        if node.extra:
            relation.infons['extra'] = node.extra
        if node.index in annotation_id_map and node.head in annotation_id_map:
            relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.index]), 'dependant'))
            relation.add_node(bioc.BioCNode('T{}'.format(annotation_id_map[node.head]), 'governor'))
            relations.append(relation)
            rel_index += 1
    return annotations, relations
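# Toy invocation of convert_dg with a hypothetical stand-in node type; the
# real caller presumably supplies CoNLL-style nodes (1-based index, head == 0
# marks the root), which is all the function reads from each node.
from collections import namedtuple

DGNode = namedtuple('DGNode', 'index form pos head deprel extra')

def _example_convert_dg():
    text = 'Dogs bark'
    graph = [DGNode(1, 'Dogs', 'NNS', 2, 'nsubj', None),
             DGNode(2, 'bark', 'VBP', 0, 'root', None)]
    annotations, relations = convert_dg(graph, text, offset=0)
    # annotations: T0 ('Dogs'), T1 ('bark', infons['ROOT'] = True)
    # relations:   R0, dependency 'nsubj', dependant T0, governor T1
    return annotations, relations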
def create_bioc_document_from_document_json(document):
    b_document = bioc.BioCDocument()
    b_document.id = document['sourceid']

    passage = bioc.BioCPassage()
    passage.text = document['text']
    passage.offset = 0

    # Remember each denotation's annotator so relations can be filtered to the
    # current user (userId == 0) as well.
    annotation_user_map = {}
    for denotation in document['denotations']:
        annotation_user_map[denotation['id']] = denotation['userId']
        if denotation['userId'] != 0:
            continue
        begin = denotation['span']['begin']
        end = denotation['span']['end']
        annotation = bioc.BioCAnnotation()
        annotation.id = denotation['id']
        annotation.locations.append(bioc.BioCLocation(begin, end - begin))
        annotation.text = document['text'][begin:end]
        annotation.infons = denotation['obj']
        passage.add_annotation(annotation)

    for relation in document['relations']:
        subj_from_current_user = annotation_user_map[relation['subj']] == 0
        obj_from_current_user = annotation_user_map[relation['obj']] == 0
        if not (subj_from_current_user and obj_from_current_user):
            continue
        b_relation = bioc.BioCRelation()
        b_relation.id = relation['id']
        b_relation.add_node(bioc.BioCNode(relation['subj'], ''))
        b_relation.add_node(bioc.BioCNode(relation['obj'], ''))
        b_relation.infons = relation['pred']
        passage.add_relation(b_relation)

    b_document.add_passage(passage)
    return b_document
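# A minimal PubAnnotation-style payload (hypothetical values) in the shape
# create_bioc_document_from_document_json expects; only denotations and
# relations whose annotator has userId == 0 are kept.
_example_document = {
    'sourceid': '12345',
    'text': 'EGFR mutations drive lung cancer.',
    'denotations': [
        {'id': 'T1', 'userId': 0, 'span': {'begin': 0, 'end': 4}, 'obj': 'Gene'},
        {'id': 'T2', 'userId': 0, 'span': {'begin': 21, 'end': 32}, 'obj': 'Disease'},
    ],
    'relations': [
        {'id': 'R1', 'subj': 'T1', 'obj': 'T2', 'pred': 'causes'},
    ],
}
# create_bioc_document_from_document_json(_example_document) yields a
# BioCDocument with one passage, two annotations, and one relation.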
def predict(opt, data):
    def _load(module, filename):
        # Load saved weights, remapping either to CPU or from the GPU the
        # model was trained on (opt.old_gpu) to the current one (opt.gpu).
        path = os.path.join(opt.output, filename)
        if opt.test_in_cpu:
            module.load_state_dict(torch.load(path, map_location='cpu'))
        else:
            cuda_src = 'cuda:{}'.format(opt.old_gpu)
            cuda_dst = 'cuda:{}'.format(opt.gpu)
            module.load_state_dict(torch.load(path, map_location={cuda_src: cuda_dst}))

    seq_model = SeqModel(data)
    _load(seq_model, 'ner_model.pkl')

    seq_wordseq = WordSequence(data, False, True, True, True)
    _load(seq_wordseq, 'ner_wordseq.pkl')

    classify_model = ClassifyModel(data)
    _load(classify_model, 're_model.pkl')

    classify_wordseq = WordSequence(data, True, False, True, False)
    _load(classify_wordseq, 're_wordseq.pkl')

    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input, f)) and f[0] != '.']

    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):
        start = time.time()
        fileName = join(opt.input, input_files[idx])
        doc_name = input_files[idx]
        doc_token = processOneFile(fileName)
        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                           data.feature_alphabets, data.label_alphabet,
                                           data.number_normalized, data.MAX_SENTENCE_LENGTH)

        # NER: decode sequences, then translate tag sequences into entities.
        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)
        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        # RE: classify candidate entity pairs within the document.
        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)
        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end - start))

    logging.info("test finished")
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    logging.info("loading ... vocab")
    relation_vocab = pickle.load(open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb'))
    logging.info("loading ... result")
    results = pickle.load(open(os.path.join(opt.output, 'results.pkl'), 'rb'))

    for i in tqdm(range(len(test_relation))):
        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(entity['start'],
                                                     entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:
            if doc_name == result['doc_name']:
                former = doc_entity[(doc_entity['id'] == result['former_id'])].iloc[0]
                latter = doc_entity[(doc_entity['id'] == result['latter_id'])].iloc[0]
                relation_type = relation_vocab.lookup_id2str(result['type'])
                # Drop unknown predictions and pairs that violate the
                # type constraints on relation arguments.
                if relation_type == '<unk>':
                    continue
                elif not my_utils.relationConstraint1(relation_type, former['type'], latter['type']):
                    continue
                else:
                    bioc_relation = bioc.BioCRelation()
                    passage.add_relation(bioc_relation)
                    bioc_relation.id = str(relation_id)
                    relation_id += 1
                    bioc_relation.infons['type'] = relation_type
                    node1 = bioc.BioCNode(former['id'], 'annotation 1')
                    bioc_relation.add_node(node1)
                    node2 = bioc.BioCNode(latter['id'], 'annotation 2')
                    bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)
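# Each record in results.pkl is used as a dict; the keys read above imply at
# least this shape (values hypothetical):
#   {'doc_name': 'doc1', 'former_id': 'T1', 'latter_id': 'T2', 'type': 7}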
def relation(this, json_rel):
    # Build a BioCRelation from its BioC-JSON representation.
    rel = bioc.BioCRelation()
    rel.id = json_rel['id']
    rel.infons = json_rel['infons']
    rel.nodes = [this.node(n) for n in json_rel['nodes']]
    return rel
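# Shape of the BioC-JSON relation object this helper consumes (values are
# hypothetical; `this.node` converts each node dict to a bioc.BioCNode):
#   {'id': 'R1',
#    'infons': {'type': 'treats'},
#    'nodes': [{'refid': 'T1', 'role': 'subject'},
#              {'refid': 'T2', 'role': 'object'}]}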
def test(data, opt, predict_dir):
    # Evaluate on test data and output results in BioC format, one doc one file.
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(data.test_dir)

    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()
    data.fix_alphabet()

    seq_model = SeqModel(data)
    seq_model.load_state_dict(torch.load(os.path.join(opt.ner_dir, 'model.pkl')))

    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        # The first hidden layer consumes the concatenated word, char and
        # feature embeddings; deeper layers are hidden-to-hidden.
        if i == 0:
            input_size = data.word_emb_dim + data.HP_char_hidden_dim + \
                         data.feature_emb_dims[data.feature_name2id['[Cap]']] + \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
        else:
            input_size = data.HP_hidden_dim
        output_size = data.HP_hidden_dim
        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(torch.load(os.path.join(opt.re_dir, 'model.pkl')))

    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']] + \
                         2 * data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
        else:
            input_size = data.HP_hidden_dim
        output_size = data.HP_hidden_dim
        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            # Use gold-standard entities instead of NER predictions.
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'], e['text'],
                              e['sent_idx'], e['tf_start'], e['tf_end'])
                entities.append(entity)
        else:
            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)
            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet, data.feature_alphabets,
                data.label_alphabet, data.number_normalized, data.MAX_SENTENCE_LENGTH)
            decode_results = ner_evaluateWhenTest(data, ner_wordrep, ner_hiddenlist, seq_model)
            entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)
        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)
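# The per-document XML files written by test and predict can be read back with
# the same package used to write them:
def _example_read_back(path):
    with open(path) as fp:
        return bioc.load(fp)  # returns a bioc.BioCCollection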
def pubtator2bioc_rel(rel: PubTatorRel) -> bioc.BioCRelation:
    # Map a PubTator chemical-disease relation onto a BioCRelation whose two
    # nodes reference the chemical and disease annotation IDs.
    biocrel = bioc.BioCRelation()
    biocrel.infons['type'] = rel.type
    biocrel.add_node(bioc.BioCNode(rel.id1, 'Chemical'))
    biocrel.add_node(bioc.BioCNode(rel.id2, 'Disease'))
    return biocrel
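# Hypothetical usage: PubTatorRel's constructor is not shown here; any object
# with .type, .id1 and .id2 works, e.g. a namedtuple stand-in:
#   PubTatorRelStub = namedtuple('PubTatorRelStub', 'type id1 id2')
#   pubtator2bioc_rel(PubTatorRelStub('CID', 'D008318', 'D011507'))
# returns a BioCRelation with infons {'type': 'CID'} and one node each for the
# chemical and disease annotation IDs (MeSH-style, as in CDR-style data).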