Ejemplo n.º 1
0
def convert_dg(dependency_graph, text: str, offset: int, ann_index: int = 0, rel_index: int = 0) \
        -> Tuple[List[bioc.BioCAnnotation], List[bioc.BioCRelation]]:
    """
    Convert a dependency graph to BioC annotations and relations.

    Every node is located in ``text`` by searching for its form, starting
    right after the previously matched token. When the raw form is not found,
    the form returned by ``adapt_value`` is tried; if that fails as well the
    node is logged at debug level and gets no annotation.

    Args:
        dependency_graph: iterable of nodes exposing ``index``, ``form``,
            ``pos``, ``head``, ``deprel`` and ``extra``.
        text: text in which the node forms are searched.
        offset: value added to each match position to build the location.
        ann_index: number used in the id of the first annotation (``T<n>``).
        rel_index: number used in the id of the first relation (``R<n>``).

    Returns:
        A tuple ``(annotations, relations)``. A node whose ``head`` is 0
        produces no relation; its annotation (when it has one) is flagged
        with ``infons['ROOT'] = True``. Relations are only emitted when both
        the node and its head have an annotation.
    """
    annotations = []
    relations = []
    # Keyed by node index. Holding the annotation object (not its number)
    # keeps the ROOT lookup below correct when ann_index does not start at 0.
    annotation_by_node = {}
    cursor = 0
    for node in dependency_graph:
        if node.index in annotation_by_node:
            continue
        node_form = node.form
        index = text.find(node_form, cursor)
        if index == -1:
            node_form = adapt_value(node.form)
            index = text.find(node_form, cursor)
            if index == -1:
                logging.debug(
                    'Cannot convert parse tree to dependency graph at %d\n%d\n%s',
                    cursor, offset, str(dependency_graph))
                continue

        ann = bioc.BioCAnnotation()
        ann.id = 'T{}'.format(ann_index)
        ann.text = node_form
        ann.infons['tag'] = node.pos
        ann.add_location(bioc.BioCLocation(index + offset, len(node_form)))
        annotations.append(ann)
        annotation_by_node[node.index] = ann
        ann_index += 1
        # The next search resumes right after this token.
        cursor = index + len(node_form)

    for node in dependency_graph:
        if node.head == 0:
            # The root token may have been skipped above when its form could
            # not be found in the text; there is nothing to flag in that case.
            root_ann = annotation_by_node.get(node.index)
            if root_ann is not None:
                root_ann.infons['ROOT'] = True
            continue

        dependant = annotation_by_node.get(node.index)
        governor = annotation_by_node.get(node.head)
        if dependant is None or governor is None:
            # One end of the edge has no annotation: no relation, and
            # rel_index is not consumed.
            continue

        relation = bioc.BioCRelation()
        relation.id = 'R{}'.format(rel_index)
        relation.infons['dependency'] = node.deprel
        if node.extra:
            relation.infons['extra'] = node.extra
        relation.add_node(bioc.BioCNode(dependant.id, 'dependant'))
        relation.add_node(bioc.BioCNode(governor.id, 'governor'))
        relations.append(relation)
        rel_index += 1

    return annotations, relations
Ejemplo n.º 2
0
def convertKindredCorpusToBioCCollection(corpus):
    """Build a ``bioc.BioCCollection`` from a ``kindred.Corpus``.

    Each kindred document becomes one BioC document holding a single passage
    at offset 0 with the document text. Entities become annotations (one
    location per position span) and relations become BioC relations whose
    nodes reference the annotation ids. When a relation has no argument
    names, ``arg0``, ``arg1``, ... are used.

    The document metadata is used as the BioC document infons, and its
    ``id`` entry (when present) as the document id. An entity's annotation id
    is its ``sourceEntityID`` or, when that is None, the string form of its
    ``entityID``. Duplicate annotation ids within a document trigger an
    assertion error.
    """
    assert isinstance(corpus, kindred.Corpus)

    bioc_collection = bioc.BioCCollection()
    for kindred_doc in corpus.documents:
        assert isinstance(kindred_doc, kindred.Document)

        bioc_doc = bioc.BioCDocument()
        bioc_collection.add_document(bioc_doc)

        metadata = kindred_doc.metadata
        if 'id' in metadata:
            bioc_doc.id = metadata['id']
        bioc_doc.infons = metadata

        passage = bioc.BioCPassage()
        passage.text = kindred_doc.text
        passage.offset = 0
        bioc_doc.add_passage(passage)

        used_ids = set()
        # kindred entityID -> id of the BioC annotation created for it
        bioc_id_for = {}
        for entity in kindred_doc.entities:
            assert isinstance(entity, kindred.Entity)

            annotation = bioc.BioCAnnotation()
            annotation.text = entity.text
            annotation.infons = {'type': entity.entityType}
            annotation.infons.update(entity.metadata)

            source_id = entity.sourceEntityID
            annotation.id = str(entity.entityID) if source_id is None else source_id

            assert annotation.id not in used_ids, "Multiple entities with the same ID (%s) found" % annotation.id
            used_ids.add(annotation.id)
            bioc_id_for[entity.entityID] = annotation.id

            for begin, end in entity.position:
                span = bioc.BioCLocation(offset=begin, length=(end - begin))
                annotation.locations.append(span)

            passage.annotations.append(annotation)

        for relation in kindred_doc.relations:
            assert isinstance(relation, kindred.Relation)

            bioc_relation = bioc.BioCRelation()
            bioc_relation.infons = {'type': relation.relationType}

            members = relation.entities
            roles = relation.argNames
            if roles is None:
                roles = ["arg%d" % position for position, _ in enumerate(members)]

            for role, member in zip(roles, members):
                bioc_relation.nodes.append(
                    bioc.BioCNode(role=role, refid=bioc_id_for[member.entityID]))

            passage.relations.append(bioc_relation)

    return bioc_collection
Ejemplo n.º 3
0
def create_bioc_document_from_document_json(document):
    """Create a ``bioc.BioCDocument`` from a document dictionary.

    ``document`` must provide ``sourceid``, ``text``, ``denotations`` and
    ``relations``. The result has one passage at offset 0 holding the text.

    Only denotations whose ``userId`` is 0 become annotations. A relation is
    kept only when both its ``subj`` and ``obj`` denotations have ``userId``
    0. The denotation ``obj`` and the relation ``pred`` values are assigned
    as the infons of the annotation and relation respectively.
    """
    b_document = bioc.BioCDocument()
    b_document.id = document['sourceid']

    full_text = document['text']
    passage = bioc.BioCPassage()
    passage.text = full_text
    passage.offset = 0

    # denotation id -> userId, recorded for every denotation (kept or not)
    owner_of = {}
    for denotation in document['denotations']:
        denotation_id = denotation['id']
        user_id = denotation['userId']
        owner_of[denotation_id] = user_id
        if user_id != 0:
            continue

        begin = denotation['span']['begin']
        end = denotation['span']['end']

        annotation = bioc.BioCAnnotation()
        annotation.id = denotation_id
        annotation.locations.append(bioc.BioCLocation(begin, end - begin))
        annotation.text = full_text[begin:end]
        annotation.infons = denotation['obj']
        passage.add_annotation(annotation)

    for relation in document['relations']:
        # Both owners are looked up before deciding, so an unknown id on
        # either side raises KeyError regardless of the other side.
        subject_owner = owner_of[relation['subj']]
        object_owner = owner_of[relation['obj']]
        if subject_owner != 0 or object_owner != 0:
            continue

        b_relation = bioc.BioCRelation()
        b_relation.id = relation['id']
        # Nodes carry an empty role.
        b_relation.add_node(bioc.BioCNode(relation['subj'], ''))
        b_relation.add_node(bioc.BioCNode(relation['obj'], ''))
        b_relation.infons = relation['pred']
        passage.add_relation(b_relation)

    b_document.add_passage(passage)
    return b_document
Ejemplo n.º 4
0
def _load_predict_checkpoint(module, opt, file_name):
    """Load ``file_name`` from ``opt.output`` into ``module``.

    Tensors are mapped to the CPU when ``opt.test_in_cpu`` is set; otherwise
    they are remapped from ``cuda:<opt.old_gpu>`` to ``cuda:<opt.gpu>``.
    """
    if opt.test_in_cpu:
        map_location = 'cpu'
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        map_location = {cuda_src: cuda_dst}
    state = torch.load(os.path.join(opt.output, file_name), map_location=map_location)
    module.load_state_dict(state)


def predict(opt, data):
    """Run NER and relation extraction on every file in ``opt.input``.

    Four checkpoints are loaded from ``opt.output`` (``ner_model.pkl``,
    ``ner_wordseq.pkl``, ``re_model.pkl``, ``re_wordseq.pkl``). For each
    regular, non-hidden file of ``opt.input`` the predicted entities and
    relations are written as a BioC collection to
    ``<opt.predict>/<file name>.bioc.xml``.

    Args:
        opt: options object; reads ``test_in_cpu``, ``old_gpu``, ``gpu``,
            ``output``, ``input`` and ``predict``.
        data: project data object passed to the models and readers.
    """
    seq_model = SeqModel(data)
    _load_predict_checkpoint(seq_model, opt, 'ner_model.pkl')

    seq_wordseq = WordSequence(data, False, True, True, True)
    _load_predict_checkpoint(seq_wordseq, opt, 'ner_wordseq.pkl')

    classify_model = ClassifyModel(data)
    _load_predict_checkpoint(classify_model, opt, 're_model.pkl')

    classify_wordseq = WordSequence(data, True, False, True, False)
    _load_predict_checkpoint(classify_wordseq, opt, 're_wordseq.pkl')

    # Regular files only; names starting with '.' are skipped.
    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input, f)) and f[0] != '.']

    for doc_name in input_files:
        start = time.time()
        fileName = join(opt.input, doc_name)

        doc_token = processOneFile(fileName)
        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                           data.feature_alphabets, data.label_alphabet,
                                           data.number_normalized,
                                           data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)
        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        # One collection / document / passage per input file.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity.add_location(bioc.BioCLocation(entity.start, entity.getlength()))
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            bioc_relation.add_node(bioc.BioCNode(relation.node1.id, 'argument 1'))
            bioc_relation.add_node(bioc.BioCNode(relation.node2.id, 'argument 2'))

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs", doc_name, end - start)

    logging.info("test finished")
Ejemplo n.º 5
0
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    """Write previously pickled relation results as one BioC file per document.

    The relation vocabulary is read from ``<opt.pretrain>/relation_vocab.pkl``
    and the results from ``<opt.output>/results.pkl`` (``opt`` is a
    module-level name). For each document, every entity row becomes an
    annotation, and every result of that document becomes a relation unless
    its type decodes to ``'<unk>'`` or ``my_utils.relationConstraint1``
    rejects it. Relation ids are ``'1'``, ``'2'``, ... per document.

    Args:
        test_token: unused here.
        test_entity: per-document entity tables (iterated with ``iterrows``);
            columns ``id``, ``type``, ``start``, ``end`` and ``text`` are read.
        test_relation: only its length is used (number of documents).
        test_name: per-document names, used as document id and file stem.
        result_dumpdir: directory receiving ``<doc name>.bioc.xml``.
    """
    # NOTE(review): pickle executes arbitrary code on load; these files are
    # expected to be produced by this project, never by an untrusted source.
    logging.info("loading ... vocab")
    with open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb') as fp:
        relation_vocab = pickle.load(fp)

    logging.info("loading ... result")
    with open(os.path.join(opt.output, 'results.pkl'), "rb") as fp:
        results = pickle.load(fp)

    # Group once by document (original order kept inside each group) instead
    # of rescanning the whole result list for every document.
    results_by_doc = {}
    for result in results:
        results_by_doc.setdefault(result['doc_name'], []).append(result)

    for i in tqdm(range(len(test_relation))):

        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results_by_doc.get(doc_name, ()):
            # First entity row matching each id; IndexError if none matches.
            former = doc_entity[(
                doc_entity['id'] == result['former_id'])].iloc[0]
            latter = doc_entity[(
                doc_entity['id'] == result['latter_id'])].iloc[0]

            relation_type = relation_vocab.lookup_id2str(result['type'])
            if relation_type == '<unk>':
                continue
            # NOTE(review): compared explicitly with False, so a falsy
            # non-False result such as None does not reject the relation;
            # confirm the helper always returns a bool before simplifying.
            if my_utils.relationConstraint1(relation_type,
                                            former['type'],
                                            latter['type']) == False:
                continue

            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = str(relation_id)
            relation_id += 1
            bioc_relation.infons['type'] = relation_type

            node1 = bioc.BioCNode(former['id'], 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(latter['id'], 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Ejemplo n.º 6
0
 def node(this, json_node):
     """Build a ``bioc.BioCNode`` from its JSON dictionary form.

     ``json_node`` must provide the ``refid`` and ``role`` keys; their values
     are copied, in that order, onto a newly created node, which is returned.
     """
     decoded = bioc.BioCNode()
     for field in ('refid', 'role'):
         setattr(decoded, field, json_node[field])
     return decoded
Ejemplo n.º 7
0
def _ner_first_layer_input_size(data):
    """Return the input width of the first NER hidden layer."""
    return (data.word_emb_dim + data.HP_char_hidden_dim
            + data.feature_emb_dims[data.feature_name2id['[Cap]']]
            + data.feature_emb_dims[data.feature_name2id['[POS]']])


def _re_first_layer_input_size(data):
    """Return the input width of the first relation-extraction hidden layer."""
    return (data.word_emb_dim
            + data.feature_emb_dims[data.feature_name2id['[POS]']]
            + 2 * data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']])


def _load_hidden_layers(data, directory, count, first_input_size):
    """Build ``count`` hidden layers and load ``hidden_<i>.pkl`` from ``directory``.

    ``first_input_size`` is called with ``data`` only for layer 0, so nothing
    is computed when ``count`` is 0. Later layers take ``data.HP_hidden_dim``
    as input width, and every layer outputs ``data.HP_hidden_dim``.
    """
    layers = []
    for i in range(count):
        input_size = first_input_size(data) if i == 0 else data.HP_hidden_dim
        layer = HiddenLayer(data, input_size, data.HP_hidden_dim)
        layer.load_state_dict(
            torch.load(os.path.join(directory, 'hidden_{}.pkl'.format(i))))
        layers.append(layer)
    return layers


def test(data, opt, predict_dir):
    """Evaluate on the test data and write one BioC file per document.

    The preprocessed test set is loaded from ``data.test_dir``, the NER
    components from ``opt.ner_dir`` and the relation-extraction components
    from ``opt.re_dir``. Entities are the gold ones when ``opt.use_gold_ner``
    is set and are predicted by the NER model otherwise. Each document is
    written to ``<predict_dir>/<doc name>.bioc.xml``.

    Args:
        data: project data object; it is (re)loaded from ``opt.data_file``
            and its ``MAX_SENTENCE_LENGTH`` is set to -1.
        opt: options object; reads ``data_file``, ``ner_dir``, ``re_dir``,
            ``hidden_num`` and ``use_gold_ner``.
        predict_dir: output directory for the BioC files.
    """
    test_token, test_entity, _, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file

    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()

    data.fix_alphabet()
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    ner_hiddenlist = _load_hidden_layers(
        data, opt.ner_dir, opt.hidden_num, _ner_first_layer_input_size)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    re_hiddenlist = _load_hidden_layers(
        data, opt.re_dir, opt.hidden_num, _re_first_layer_input_size)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i, doc_name in enumerate(tqdm(test_name)):
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:
            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)

        # One collection / document / passage per test document.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity.add_location(
                bioc.BioCLocation(entity.start, entity.getlength()))
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            bioc_relation.add_node(
                bioc.BioCNode(relation.node1.id, 'annotation 1'))
            bioc_relation.add_node(
                bioc.BioCNode(relation.node2.id, 'annotation 2'))

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Ejemplo n.º 8
0
def pubtator2bioc_rel(rel: PubTatorRel) -> bioc.BioCRelation:
    biocrel = bioc.BioCRelation()
    biocrel.infons['type'] = rel.type
    biocrel.add_node(bioc.BioCNode(rel.id1, 'Chemical'))
    biocrel.add_node(bioc.BioCNode(rel.id2, 'Disease'))
    return biocrel