Example #1
def read_text(pathname):
    with open(pathname) as fp:
        text = fp.read()
    sentences = []
    offset = 0
    for sent in text.split('\n'):
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = pathname.stem
        sentence.offset = offset
        sentence.text = sent
        sentences.append(sentence)
        i = 0
        for m in re.finditer(r'\S+', sent):
            if i == 0 and m.start() != 0:
                # add a zero-length placeholder annotation for the leading whitespace
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = ''
                ann.add_location(bioc.BioCLocation(offset, 0))
                sentence.add_annotation(ann)
                i += 1
            ann = bioc.BioCAnnotation()
            ann.id = f'a{i}'
            ann.text = m.group()
            ann.add_location(bioc.BioCLocation(m.start() + offset, len(m.group())))
            sentence.add_annotation(ann)
            i += 1
        offset += len(sent) + 1
    return sentences
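
A quick usage sketch (assuming this function lives in a module that imports bioc and re, and that pathname is a pathlib.Path, which the .stem access implies):

from pathlib import Path

p = Path('example.txt')
p.write_text('  leading spaces\nsecond line')
for s in read_text(p):
    print(s.offset, repr(s.text), len(s.annotations))
# the first sentence starts with whitespace, so it gets the zero-length
# placeholder annotation in addition to its two token annotations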
Example #2
def test_extend():
    text = 'findings: no pneumothorax.'
    d = text_to_bioc([text], type='d/p/s')
    a = bioc.BioCAnnotation()
    a.text = 'pneumothorax'
    a.add_location(bioc.BioCLocation(13, 12))
    d.passages[0].add_annotation(a)
    detector(d)

    # fake annotation nested inside the negated span
    a = bioc.BioCAnnotation()
    a.text = 'eumothor'
    a.add_location(bioc.BioCLocation(15, 8))
    d.passages[0].add_annotation(a)

    a = bioc.BioCAnnotation()
    a.text = 'foo'
    a.add_location(bioc.BioCLocation(27, 3))
    d.passages[0].add_annotation(a)

    _extend(d, 'negation')

    assert d.passages[0].annotations[1].infons['negation'] == 'True'
    assert 'negation' not in d.passages[0].annotations[2].infons

    d.passages[0].annotations[0].infons['CUI'] = 'xxx'
    d.passages[0].annotations[2].infons['CUI'] = 'xxx'
    _extend(d, 'negation')
    assert 'negation' not in d.passages[0].annotations[2].infons
Example #3
    def save_as_collection(list_of_pymedext_documents: List[Document]):
        """save a list of pymedext document as a bioc collection .
        It will return a bioc collection object.

        :param list_of_pymedext_documents: a list of Document
        :returns:  a bioc collection object
        """
        this_bioc_collection = bioc.BioCCollection()
        for this_pymedext_doc in list_of_pymedext_documents:
            this_bioc_doc = bioc.BioCDocument()
            for annot in this_pymedext_doc.annotations:
                # print(annot.type)
                print(annot.source)
                if annot.type == "raw_text":
                    if this_bioc_collection.source == '':
                        this_bioc_collection.source = annot.source
                if annot.source == "BioCPassage":
                    print(annot.ngram)
                    print(annot.value)
                    this_passage = bioc.BioCPassage()
                    this_passage.text = annot.ngram
                    this_passage.offset = annot.span[0]
                    this_bioc_doc.add_passage(this_passage)
                    # passageAttributes to add
                elif annot.source == "BioCAnnotation":
                    this_annotation = bioc.BioCAnnotation()
                    this_annotation.infons = annot.attributes
                    this_annotation.id = annot.attributes["id"]
                    this_annotation.text = annot.ngram
                    thisLocation = bioc.BioCLocation(
                        annot.span[0], annot.span[1] - annot.span[0])
                    this_annotation.add_location(thisLocation)
                    this_bioc_doc.passages[-1].add_annotation(this_annotation)
            this_bioc_collection.add_document(this_bioc_doc)
        return this_bioc_collection
Example #4
def convertKindredCorpusToBioCCollection(corpus):
	assert isinstance(corpus,kindred.Corpus)
	collection = bioc.BioCCollection()
	for kdoc in corpus.documents:
		assert isinstance(kdoc,kindred.Document)

		biocDoc = bioc.BioCDocument()
		collection.add_document(biocDoc)

		if 'id' in kdoc.metadata:
			biocDoc.id = kdoc.metadata['id']
		biocDoc.infons = kdoc.metadata

		passage = bioc.BioCPassage()
		passage.text = kdoc.text
		passage.offset = 0
		biocDoc.add_passage(passage)

		seenEntityIDs = set()
		kindredID2BiocID = {}
		for e in kdoc.entities:
			assert isinstance(e,kindred.Entity)

			a = bioc.BioCAnnotation()
			a.text = e.text
			a.infons = {'type':e.entityType}
			a.infons.update(e.metadata)

			if e.sourceEntityID is None:
				a.id = str(e.entityID)
			else:
				a.id = e.sourceEntityID

			assert a.id not in seenEntityIDs, "Multiple entities with the same ID (%s) found" % a.id
			seenEntityIDs.add(a.id)
			kindredID2BiocID[e.entityID] = a.id

			for start,end in e.position:
				l = bioc.BioCLocation(offset=start, length=(end-start))
				a.locations.append(l)

			passage.annotations.append(a)

		for r in kdoc.relations:
			assert isinstance(r,kindred.Relation)
			biocR = bioc.BioCRelation()
			biocR.infons = {'type':r.relationType}
			
			entitiesInRelation = r.entities
			argNames = r.argNames
			if argNames is None:
				argNames = [ "arg%d" % i for i,_ in enumerate(entitiesInRelation) ]

			for argName,entity in zip(argNames,entitiesInRelation):
				node = bioc.BioCNode(role=argName, refid=kindredID2BiocID[entity.entityID])
				biocR.nodes.append(node)

			passage.relations.append(biocR)

	return collection
Example #5
def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):

    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf-8') as fout:
            collection = bioc.load(fin)
            for document in collection.documents:
                for passage in document.passages:
                    tag_result = bioTag(passage.text,
                                        biotag_dic,
                                        nn_model,
                                        onlyLongest=para_set['onlyLongest'],
                                        abbrRecog=para_set['abbrRecog'],
                                        Threshold=para_set['ML_Threshold'])
                    mention_num = 0
                    for ele in tag_result:
                        bioc_note = bioc.BioCAnnotation()
                        bioc_note.id = str(mention_num)
                        mention_num += 1
                        bioc_note.infons['identifier'] = ele[2]
                        bioc_note.infons['type'] = "Phenotype"
                        bioc_note.infons['score'] = ele[3]
                        start = int(ele[0])
                        last = int(ele[1])
                        loc = bioc.BioCLocation(offset=start,
                                                length=last - start)
                        bioc_note.locations.append(loc)
                        bioc_note.text = passage.text[start:last]
                        passage.annotations.append(bioc_note)
            bioc.dump(collection, fout, pretty_print=True)
Example #6
File: test.py  Project: pj0616/norm-1-30
def dump_results(doc_name, entities, opt):
    entity_id = 1
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = doc_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    for entity in entities:
        anno_entity = bioc.BioCAnnotation()
        passage.add_annotation(anno_entity)
        anno_entity.id = str(entity_id)
        entity_id += 1
        anno_entity.infons['type'] = entity.type
        anno_entity_location = bioc.BioCLocation(
            entity.spans[0][0], entity.spans[0][1] - entity.spans[0][0])
        anno_entity.add_location(anno_entity_location)
        anno_entity.text = entity.name
        if len(entity.norm_ids) > 0:
            anno_entity.infons['UMLS code'] = entity.norm_ids[0]
            anno_entity.infons['UMLS term'] = entity.norm_names[0]
        else:
            anno_entity.infons['UMLS code'] = 'N/A'
            anno_entity.infons['UMLS term'] = 'N/A'

    with codecs.open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w',
                     'UTF-8') as fp:
        bioc.dump(collection, fp)
Example #7
 def annotation(this, json_note):
     note = bioc.BioCAnnotation()
     note.id = json_note['id']
     note.infons = json_note['infons']
     note.text = json_note['text']
     note.locations = [this.location(l)
                       for l in json_note['locations']] 
     return note
Example #8
def pubtator2bioc_ann(ann: PubTatorAnn) -> bioc.BioCAnnotation:
    biocann = bioc.BioCAnnotation()
    biocann.id = 'T{}'.format(ann.start)
    biocann.infons['type'] = ann.type
    biocann.infons['concept_id'] = ann.id
    biocann.add_location(bioc.BioCLocation(ann.start, ann.end - ann.start))
    biocann.text = ann.text
    return biocann
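
A minimal usage sketch, assuming a PubTatorAnn with start, end, type, id, and text fields (the real class is not shown in this snippet):

from typing import NamedTuple

class PubTatorAnn(NamedTuple):  # hypothetical stand-in for the real type
    start: int
    end: int
    type: str
    id: str
    text: str

ann = pubtator2bioc_ann(PubTatorAnn(13, 25, 'Disease', 'D000000', 'pneumothorax'))
assert ann.total_span.offset == 13 and ann.total_span.length == 12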
Example #9
def run_metamap_col(collection, mm, cuis=None):
    """
    Get CUIs from metamap.

    Args:
        collection(BioCCollection):
        mm(MetaMap): MetaMap instance

    Returns:
        BioCCollection
    """
    try:
        annIndex = itertools.count()
        sentence_map = collections.OrderedDict()
        for document in collection.documents:
            for passage in document.passages:
                for sentence in passage.sentences:
                    key = '{}-{}'.format(document.id.replace('.', '-'),
                                         sentence.offset)
                    sentence_map[key] = (passage, sentence)

        sents = []
        ids = []
        for k in sentence_map:
            ids.append(k)
            sents.append(remove_newline(sentence_map[k][1].text))

        concepts, error = mm.extract_concepts(sents, ids)
        if error is None:
            for concept in concepts:
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None:
                        # if no CUI is returned for this concept - skip it
                        concept_cui = getattr(concept, 'cui', None)
                        if concept_cui not in cuis:
                            continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage = sentence_map[concept_index][0]
                        sentence = sentence_map[concept_index][1]
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                except Exception:
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", collection.source)
    return collection
Example #10
def convert_dg(dependency_graph, text: str, offset: int, ann_index: int = 0, rel_index: int = 0) \
        -> Tuple[List[bioc.BioCAnnotation], List[bioc.BioCRelation]]:
    """
    Convert dependency graph to annotations and relations
    """
    annotations = []
    relations = []
    annotation_id_map = {}
    start = 0
    for node in dependency_graph:
        if node.index in annotation_id_map:
            continue
        node_form = node.form
        index = text.find(node_form, start)
        if index == -1:
            node_form = adapt_value(node.form)
            index = text.find(node_form, start)
            if index == -1:
                logging.debug(
                    'Cannot convert parse tree to dependency graph at %d\n%d\n%s',
                    start, offset, str(dependency_graph))
                continue

        ann = bioc.BioCAnnotation()
        ann.id = 'T{}'.format(ann_index)
        ann.text = node_form
        ann.infons['tag'] = node.pos

        start = index

        ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
        annotations.append(ann)
        annotation_id_map[node.index] = ann_index
        ann_index += 1
        start += len(node_form)

    for node in dependency_graph:
        if node.head == 0:
            ann = annotations[annotation_id_map[node.index]]
            ann.infons['ROOT'] = True
            continue
        relation = bioc.BioCRelation()
        relation.id = 'R{}'.format(rel_index)
        relation.infons['dependency'] = node.deprel
        if node.extra:
            relation.infons['extra'] = node.extra
        if node.index in annotation_id_map and node.head in annotation_id_map:
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.index]),
                              'dependant'))
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.head]),
                              'governor'))
            relations.append(relation)
            rel_index += 1

    return annotations, relations
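
A sketch of how this might be invoked, assuming CoNLL-style nodes with index, form, pos, head, deprel, and extra attributes (whatever the upstream parser actually produces):

from dataclasses import dataclass

@dataclass
class Node:  # hypothetical stand-in for a dependency-parser node
    index: int
    form: str
    pos: str
    head: int
    deprel: str
    extra: str = ''

graph = [Node(1, 'No', 'DT', 2, 'neg'), Node(2, 'pneumothorax', 'NN', 0, 'root')]
anns, rels = convert_dg(graph, 'No pneumothorax', offset=0)
# yields two token annotations and one 'neg' relation; the root token
# gets infons['ROOT'] = True instead of a relation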
Example #11
def test_neg_regex():
    text = 'findings: no pneumothorax.'
    assert is_neg_regex(text)

    d = text_to_bioc([text], type='d/p/s')
    a = bioc.BioCAnnotation()
    a.text = 'pneumothorax'
    a.add_location(bioc.BioCLocation(13, 12))
    d.passages[0].add_annotation(a)
    detector(d)
    assert d.passages[0].annotations[0].infons['negation'] == 'True'
Example #12
def run_metamap(document, mm, cuis=None):
    """
    Get CUIs from metamap.

    Args:
        document(BioCDocument):
        mm(MetaMap): MetaMap instance

    Returns:
        BioCDocument
    """
    try:
        annIndex = itertools.count()
        sentence_map = collections.OrderedDict()
        for passage in document.passages:
            for sentence in passage.sentences:
                sentence_map[str(sentence.offset)] = (passage, sentence)

        sents = []
        ids = []
        for k in sentence_map:
            ids.append(k)
            sents.append(remove_newline(sentence_map[k][1].text))

        concepts, error = mm.extract_concepts(sents, ids)
        print('Done')
        if error is None:
            for concept in concepts:
                # print(concept)
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None and concept.cui not in cuis:
                        continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage = sentence_map[concept_index][0]
                        sentence = sentence_map[concept_index][1]
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                except Exception:
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", document.id)
    return document
Example #13
File: utils.py  Project: jakelever/biotext
def strip_annotation_markers(
        text: str,
        annotations_map: Dict[str, str]) -> Tuple[str, List[bioc.BioCAnnotation]]:
    """
    Given a set of annotations, remove any which are found in the current text and return
    the new string as well as the positions of the annotations in the transformed string
    """
    matched_annotations: List[Tuple[int, int, str]] = []
    for ann_marker in annotations_map:
        patterns = [
            (r'[^\S\t]?[\(\[\{]' + re.escape(ann_marker) + r'[\)\]\}]',
             0),  # citation in brackets
            (
                r'[^\S\t]' + re.escape(ann_marker) + r'\.',
                1,
            ),  # citation at end of sentence, remove extra whitespace
            (
                r'[^\S\t]' + re.escape(ann_marker) + r'[^\S\t]',
                1,
            ),  # citation surrounded by whitespace
            (re.escape(ann_marker), 0),  # citation by itself
        ]
        for pattern, end_offset in patterns:
            match = re.search(pattern, text)
            if match:
                matched_annotations.append(
                    (match.start(), match.end() - end_offset, ann_marker))
                break

    transformed_annotations: List[bioc.BioCAnnotation] = []
    transformed_text = text
    offset = 0

    for start, end, marker in matched_annotations:
        ann = bioc.BioCAnnotation()
        ann.id = marker
        ann.infons['citation_text'] = annotations_map[marker]
        ann.infons['type'] = 'citation'
        transformed_text = (transformed_text[:start - offset] +
                            transformed_text[end - offset:])

        # since the token place-holder is removed, must be start - 1 (and previous offset) for the new position
        ann.add_location(bioc.BioCLocation(start - offset - 1, 0))

        offset += end - start
        transformed_annotations.append(ann)
    return transformed_text, transformed_annotations
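
A behavioral sketch with a made-up marker key (CITE1 here is purely illustrative):

text = 'Pneumothorax was reported (CITE1).'
new_text, anns = strip_annotation_markers(text, {'CITE1': 'Smith et al. 2020'})
assert new_text == 'Pneumothorax was reported.'
assert anns[0].infons['citation_text'] == 'Smith et al. 2020'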
Example #14
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0
    entity_id = 1

    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]

        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)
        labelSequence = []

        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1

            elif label[0] == 'M' or label[0] == 'E':
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]

                    whitespace_to_add = token['start'] - anno_entity.locations[0].end
                    anno_entity.text += " " * whitespace_to_add + token['text']
                    anno_entity.locations[0].length = (
                        token['end'] - anno_entity.locations[0].offset)

    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
Example #15
    def add_match(self, impression, sentence, ann_index, phrase, observation,
                  start, end):
        """Add the match data and metadata to the impression object
        in place."""
        annotation = bioc.BioCAnnotation()
        annotation.id = ann_index
        annotation.infons['CUI'] = None
        annotation.infons['semtype'] = None
        annotation.infons['term'] = phrase
        annotation.infons[OBSERVATION] = observation
        annotation.infons['annotator'] = 'Phrase'
        length = end - start
        annotation.add_location(
            bioc.BioCLocation(sentence.offset + start, length))
        annotation.text = sentence.text[start:start + length]

        impression.annotations.append(annotation)
Example #16
def tokenize_text(text, id):
    sentences = []
    doc = nlp(text)
    for sent in doc.sents:
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = id
        sentence.offset = sent.start_char
        sentence.text = text[sent.start_char:sent.end_char]
        sentences.append(sentence)
        i = 0
        for token in sent:
            for t, start, end in split_punct(token.text, token.idx):
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = t
                ann.add_location(bioc.BioCLocation(start, end - start))
                sentence.add_annotation(ann)
                i += 1
    return sentences
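
This snippet relies on a spaCy pipeline bound to nlp and on a split_punct helper, neither of which is shown. One plausible sketch of the helper, assuming it yields (text, start, end) triples that separate punctuation from word characters:

import re

def split_punct(text, idx):
    # Hypothetical helper: emit word runs and individual punctuation
    # marks inside a token, offset by the token's document position idx.
    for m in re.finditer(r'\w+|[^\w\s]', text):
        yield m.group(), idx + m.start(), idx + m.end()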
Example #17
def test_clean_sentences():
    cleanup = CleanUp()

    doc = text_to_bioc(['No pneumothorax.', 'No pneumothorax.'], type='d/p/s')
    p = doc.passages[0]
    for i in range(10, 0, -1):
        ann = bioc.BioCAnnotation()
        ann.add_location(bioc.BioCLocation(i, 1))
        p.add_annotation(ann)

    assert len(doc.passages[0].sentences) == 2
    doc = cleanup.__call__(doc)
    assert len(doc.passages[0].sentences) == 0
    assert len(doc.passages[0].annotations) == 10
    for i in range(10):
        assert doc.passages[0].annotations[i].total_span.offset == 10 - i

    doc = cleanup.__call__(doc, sort_anns=True)
    for i in range(10):
        assert doc.passages[0].annotations[i].total_span.offset == i + 1
Example #18
def create_bioc_document_from_document_json(document):
    b_document = bioc.BioCDocument()
    b_document.id = document['sourceid']
    passage = bioc.BioCPassage()
    passage.text = document['text']
    passage.offset = 0
    annotation_user_map = {}
    for denotation in document['denotations']:
        annotation_user_map[denotation['id']] = denotation['userId']
        if denotation['userId'] != 0:
            continue
        annotation = bioc.BioCAnnotation()
        annotation.id = denotation['id']
        begin = denotation['span']['begin']
        end = denotation['span']['end']
        annotation.add_location(bioc.BioCLocation(begin, end - begin))
        annotation.text = document['text'][begin:end]
        annotation.infons = denotation['obj']
        passage.add_annotation(annotation)
    for relation in document['relations']:
        subj_from_current_user = annotation_user_map[relation['subj']] == 0
        obj_from_current_user = annotation_user_map[relation['obj']] == 0
        if not (subj_from_current_user and obj_from_current_user):
            continue
        b_relation = bioc.BioCRelation()
        b_relation.id = relation['id']
        start_node = bioc.BioCNode('', '')
        end_node = bioc.BioCNode('', '')
        start_node.refid = relation['subj']
        end_node.refid = relation['obj']
        b_relation.add_node(start_node)
        b_relation.add_node(end_node)
        b_relation.infons = relation['pred']
        passage.add_relation(b_relation)
    b_document.add_passage(passage)
    return b_document
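
The input shape this function expects, inferred from the field accesses above (a PubAnnotation-style dict; the values are illustrative):

document = {
    'sourceid': 'doc-1',
    'text': 'No pneumothorax.',
    'denotations': [
        {'id': 'T1', 'userId': 0,
         'span': {'begin': 3, 'end': 15}, 'obj': 'Disease'},
    ],
    'relations': [],  # each: {'id': ..., 'subj': ..., 'obj': ..., 'pred': ...}
}
b_doc = create_bioc_document_from_document_json(document)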
Example #19
 def __call__(self, document, *args, **kwargs):
     annotation_index = itertools.count()
     for passage in document.passages:
         for sentence in passage.sentences:
             obs_phrases = self.observation2mention_phrases.items()
             for observation, phrases in obs_phrases:
                 for phrase in phrases:
                     pattern = self.compile_pattern(phrase)
                     for match in pattern.finditer(sentence.text):
                         start, end = match.span(0)
                         if self.overlaps_with_unmention(sentence, observation, start, end):
                             continue
                         annotation = bioc.BioCAnnotation()
                         annotation.id = str(next(annotation_index))
                         annotation.infons['term'] = phrase
                         annotation.infons["observation"] = observation
                         annotation.infons['annotator'] = 'RegEx'
                         annotation.infons['vocab'] = self.vocab_name
                         annotation.add_location(bioc.BioCLocation(sentence.offset + start,
                                                                   end - start))
                         annotation.text = sentence.text[start:end]
                         passage.annotations.append(annotation)
     return document
Example #20
							if start1 > end2:
								pass
							elif start2 > end1:
								pass
							else:
								overlapping = True
								break

						if not overlapping:
							nonoverlapping.append ((start1,end1))

					for start,end in nonoverlapping:
						for annotationType,conceptids in candidates[(start,end)].items():
							conceptid = ";".join(sorted(set(conceptids)))

							a = bioc.BioCAnnotation()
							a.text = passage.text[start:end]
							a.infons = {'type':annotationType, 'conceptid': conceptid}
							a.id = 'T%d' % currentID
							currentID += 1

							if end <= start:
								continue

							biocLoc = bioc.BioCLocation(offset=passage.offset+start, length=(end-start))
							a.locations.append(biocLoc)
							passage.annotations.append(a)

			writer.write_document(doc)

	print('Done!')
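
The fragment above begins mid-loop, so its setup is missing; the overlap test it applies is the standard interval check, shown standalone:

def overlaps(span1, span2):
    # Spans overlap unless one starts strictly after the other ends
    # (touching endpoints count as overlap, matching the code above).
    start1, end1 = span1
    start2, end2 = span2
    return not (start1 > end2 or start2 > end1)

assert overlaps((0, 5), (3, 8))
assert not overlaps((0, 2), (5, 8))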
Example #21
def predict(opt, data):

    seq_model = SeqModel(data)
    if opt.test_in_cpu:
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location={cuda_src:cuda_dst}))


    seq_wordseq = WordSequence(data, False, True, True, True)
    if opt.test_in_cpu:
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    classify_model = ClassifyModel(data)
    if opt.test_in_cpu:
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location={cuda_src:cuda_dst}))

    classify_wordseq = WordSequence(data, True, False, True, False)
    if opt.test_in_cpu:
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input,f)) and f[0]!='.']


    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):

        start = time.time()
        fileName = join(opt.input,input_files[idx])
        doc_name = input_files[idx]

        doc_token = processOneFile(fileName)

        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                                                   data.feature_alphabets, data.label_alphabet,
                                                                   data.number_normalized,
                                                                   data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)


        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text


        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(classify_wordseq, classify_model, test_X, data, test_other, data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)


        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end-start))



    logging.info("test finished")
Example #22
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    logging.info("loading ... vocab")
    relation_vocab = pickle.load(
        open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb'))

    logging.info("loading ... result")
    results = pickle.load(open(os.path.join(opt.output, 'results.pkl'), "rb"))

    for i in tqdm(range(len(test_relation))):

        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:

            if doc_name == result['doc_name']:

                former = doc_entity[(
                    doc_entity['id'] == result['former_id'])].iloc[0]
                latter = doc_entity[(
                    doc_entity['id'] == result['latter_id'])].iloc[0]

                relation_type = relation_vocab.lookup_id2str(result['type'])
                if relation_type == '<unk>':
                    continue
                elif not my_utils.relationConstraint1(
                        relation_type, former['type'], latter['type']):
                    continue
                else:
                    bioc_relation = bioc.BioCRelation()
                    passage.add_relation(bioc_relation)
                    bioc_relation.id = str(relation_id)
                    relation_id += 1
                    bioc_relation.infons['type'] = relation_type

                    node1 = bioc.BioCNode(former['id'], 'annotation 1')
                    bioc_relation.add_node(node1)
                    node2 = bioc.BioCNode(latter['id'], 'annotation 2')
                    bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Example #23
def evaluate_via_bioc(test_docs,
                      crf,
                      extractor,
                      prediction_dir,
                      made_base_dir=None):
    print('Total documents for evaluation : {}'.format(len(test_docs)))

    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)

    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1

    print('Existing files removed : {}'.format(existing_files_removed))

    prediction_documents_written = 0
    reference_filenames = []
    for test_doc in test_docs:
        #print('Working on document : {}'.format(test_doc.filename))

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)

        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            sentence_tokens = []
            # gather tokens in a sentence
            for token_offset_pair in sentence:
                token = test_doc.text[
                    token_offset_pair[0]:token_offset_pair[1]]
                sentence_tokens.append(token)
            if len(sentence_tokens) == 0:
                continue

            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]

            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(
                    len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and sentence_pred[
                            token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()

                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = (annotation.text
                                       .replace('\n', '\\n')
                                       .replace('\r', '\\r'))
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)

                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                token_idx += 1

        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))

        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            prediction_documents_written += 1

    print('Total prediction documents written : {}'.format(
        prediction_documents_written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
Example #24
def test(data, opt, predict_dir):
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file

    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()

    data.fix_alphabet()
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim+data.HP_char_hidden_dim+data.feature_emb_dims[data.feature_name2id['[Cap]']]+ \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']]+\
                         2*data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:

            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Example #25
    def add_dependency(self, obj):
        # create bioc sentence
        sentence = bioc.BioCSentence()
        sentence.offset = 0
        sentence.text = obj['text']
        annotation = bioc.BioCAnnotation()
        annotation.infons['parse tree'] = obj['parse tree']
        sentence.add_annotation(annotation)

        self.p2d.convert_s(sentence)

        m = {}
        for i, tok in enumerate(obj['toks']):
            tok['id'] = i
            # find bioc annotation
            found = False
            for ann in sentence.annotations:
                loc = ann.total_span
                if utils.intersect((tok['start'], tok['end']),
                                   (loc.offset, loc.offset + loc.length)):
                    if ann.id in m:
                        logging.debug('Duplicated id mapping: %s', ann.id)
                    m[ann.id] = i
                    if 'ROOT' in ann.infons:
                        tok['ROOT'] = True
                    found = True
                    break
            if not found:
                logging.debug('Cannot find %s in \n%s', tok, obj['id'])

        for rel in sentence.relations:
            node0 = rel.nodes[0]
            node1 = rel.nodes[1]
            if node0.refid in m and node1.refid in m:
                if node0.role == 'governor':
                    gov = m[node0.refid]
                    dep = m[node1.refid]
                else:
                    gov = m[node1.refid]
                    dep = m[node0.refid]
                if gov == dep:
                    logging.debug('Discard self loop')
                    continue
                tok = obj['toks'][dep]
                if 'governor' in tok:
                    if tok['governor'] == gov:
                        pass  # same head seen again; nothing to do
                    elif 'extra' in rel.infons:
                        pass  # marked as an extra dependency; keep the first head
                    else:
                        logging.debug('%s: Two heads: %s', obj['id'], str(rel))
                else:
                    tok['governor'] = gov
                    tok['dependency'] = rel.infons['dependency']
            else:
                ann0 = None
                ann1 = None
                for annotation in sentence.annotations:
                    if annotation.id == node0.refid:
                        ann0 = annotation
                    if annotation.id == node1.refid:
                        ann1 = annotation
                logging.debug('Cannot find %s or %s in sentence: %s', node0,
                              node1, obj['id'])
                logging.debug('%s', ann0)
                logging.debug('%s', ann1)