Ejemplos de BioCLocation en Python, ejemplos de bioc.BioCLocation en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: test_negdetect.py Proyecto: yfpeng/negbio2

def test_extend():
    """Exercise ``_extend`` with the 'negation' key on a one-sentence report.

    The passage holds a 'pneumothorax' annotation (added before the detector
    runs) plus two hand-made annotations added afterwards: 'eumothor'
    (offset 15, length 8) and 'foo' (offset 27, length 3).
    """
    report = text_to_bioc(['findings: no pneumothorax.'], type='d/p/s')

    mention = bioc.BioCAnnotation()
    mention.text = 'pneumothorax'
    mention.add_location(bioc.BioCLocation(13, 12))
    report.passages[0].add_annotation(mention)
    detector.__call__(report)

    # Fake annotations, appended only after the detector has run.
    for fake_text, fake_offset, fake_length in (('eumothor', 15, 8),
                                                ('foo', 27, 3)):
        fake = bioc.BioCAnnotation()
        fake.text = fake_text
        fake.add_location(bioc.BioCLocation(fake_offset, fake_length))
        report.passages[0].add_annotation(fake)

    _extend(report, 'negation')

    assert report.passages[0].annotations[1].infons['negation'] == 'True'
    assert 'negation' not in report.passages[0].annotations[2].infons

    # Give the first and the last annotation the same CUI and extend again.
    for position in (0, 2):
        report.passages[0].annotations[position].infons['CUI'] = 'xxx'
    _extend(report, 'negation')
    assert 'negation' not in report.passages[0].annotations[2].infons

Ejemplo n.º 2

0

Mostrar archivo

Archivo: create_i2b2_test_gs.py Proyecto: zxlzr/BLUE_Benchmark

def read_text(pathname):
    """Read a text file and turn each line into a tokenized BioC sentence.

    The file content is split on newline characters. Every piece becomes a
    ``bioc.BioCSentence`` whose ``offset`` is the running character position
    of that line (previous line lengths plus one per newline) and whose
    ``infons['filename']`` is the stem of ``pathname``. Each run of
    non-whitespace characters in the line becomes an annotation with id
    ``a0``, ``a1``, ... located at its absolute offset. If the line starts
    with whitespace, an empty zero-length annotation is added first at the
    line start, so the first real token gets id ``a1``.

    :param pathname: path of the file to read; ``str`` and ``pathlib.Path``
        are both accepted
    :returns: list of ``bioc.BioCSentence`` in file order
    """
    from pathlib import Path

    # Path() makes .stem available for plain string paths as well.
    stem = Path(pathname).stem
    # Raw string: '\S' in a normal literal is an invalid escape sequence.
    token_pattern = re.compile(r'\S+')

    with open(pathname) as fp:
        text = fp.read()

    sentences = []
    offset = 0
    for sent in text.split('\n'):
        sentence = bioc.BioCSentence()
        sentence.infons['filename'] = stem
        sentence.offset = offset
        sentence.text = sent
        sentences.append(sentence)

        i = 0
        for m in token_pattern.finditer(sent):
            if i == 0 and m.start() != 0:
                # The line begins with whitespace: insert an empty
                # placeholder annotation at the start of the line.
                ann = bioc.BioCAnnotation()
                ann.id = f'a{i}'
                ann.text = ''
                ann.add_location(bioc.BioCLocation(offset, 0))
                sentence.add_annotation(ann)
                i += 1
            ann = bioc.BioCAnnotation()
            ann.id = f'a{i}'
            ann.text = m.group()
            ann.add_location(
                bioc.BioCLocation(m.start() + offset, len(m.group())))
            sentence.add_annotation(ann)
            i += 1

        # +1 accounts for the newline removed by split().
        offset += len(sent) + 1
    return sentences

Ejemplo n.º 3

0

Mostrar archivo

    def save_as_collection(list_of_pymedext_documents: List[Document]):
        """Convert pymedext documents into a single BioC collection.

        One ``bioc.BioCDocument`` is created per input document. Annotations
        whose ``source`` is "BioCPassage" become passages; annotations whose
        ``source`` is "BioCAnnotation" are attached to the most recently
        added passage of the current document. The collection ``source`` is
        copied from a "raw_text" annotation while it is still empty.

        :param list_of_pymedext_documents: a list of Document
        :returns:  a bioc collection object
        """
        collection = bioc.BioCCollection()
        for pymedext_doc in list_of_pymedext_documents:
            bioc_doc = bioc.BioCDocument()
            for annot in pymedext_doc.annotations:
                print(annot.source)
                if annot.type == "raw_text" and collection.source == '':
                    collection.source = annot.source

                if annot.source == "BioCPassage":
                    print(annot.ngram)
                    print(annot.value)
                    passage = bioc.BioCPassage()
                    passage.text = annot.ngram
                    passage.offset = annot.span[0]
                    bioc_doc.add_passage(passage)
                    continue

                if annot.source != "BioCAnnotation":
                    continue

                # Annotation: location is (start, length) derived from span.
                bioc_ann = bioc.BioCAnnotation()
                bioc_ann.infons = annot.attributes
                bioc_ann.id = annot.attributes["id"]
                bioc_ann.text = annot.ngram
                span_length = annot.span[1] - annot.span[0]
                bioc_ann.add_location(
                    bioc.BioCLocation(annot.span[0], span_length))
                bioc_doc.passages[-1].add_annotation(bioc_ann)
            collection.add_document(bioc_doc)
        return collection

Ejemplo n.º 4

0

Mostrar archivo

def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):
    """Tag every passage of a BioC file with ``bioTag`` and write the result.

    The collection is loaded from ``infile``; for each passage the tagger
    output is turned into "Phenotype" annotations whose ids restart at "0"
    for every passage. The annotated collection is dumped to ``outfile``.

    :param infile: path of the BioC input file (read as UTF-8)
    :param outfile: path of the BioC output file (written as UTF-8)
    :param biotag_dic: passed through to ``bioTag``
    :param nn_model: passed through to ``bioTag``
    :param para_set: mapping providing 'onlyLongest', 'abbrRecog' and
        'ML_Threshold'
    """
    with open(infile, 'r', encoding='utf-8') as fin, \
            open(outfile, 'w', encoding='utf8') as fout:
        collection = bioc.load(fin)
        for document in collection.documents:
            for passage in document.passages:
                hits = bioTag(passage.text,
                              biotag_dic,
                              nn_model,
                              onlyLongest=para_set['onlyLongest'],
                              abbrRecog=para_set['abbrRecog'],
                              Threshold=para_set['ML_Threshold'])
                for mention_num, hit in enumerate(hits):
                    # hit is indexed as (start, end, identifier, score).
                    note = bioc.BioCAnnotation()
                    note.id = str(mention_num)
                    note.infons['identifier'] = hit[2]
                    note.infons['type'] = "Phenotype"
                    note.infons['score'] = hit[3]
                    begin, finish = int(hit[0]), int(hit[1])
                    # Offset and length are handed over as strings.
                    note.locations.append(
                        bioc.BioCLocation(offset=str(begin),
                                          length=str(finish - begin)))
                    note.text = passage.text[begin:finish]
                    passage.annotations.append(note)
        bioc.dump(collection, fout, pretty_print=True)

Ejemplo n.º 5

0

Mostrar archivo

def convertKindredCorpusToBioCCollection(corpus):
	"""Convert a kindred.Corpus into a bioc.BioCCollection.

	Each kindred document becomes one BioC document with a single passage
	(offset 0) holding the full text. Entities become annotations with one
	location per (start,end) pair in their position; relations become BioC
	relations whose nodes refer to the annotation IDs.

	:param corpus: the kindred.Corpus to convert
	:returns: the new bioc.BioCCollection
	"""
	assert isinstance(corpus,kindred.Corpus)
	biocCollection = bioc.BioCCollection()
	for kindredDoc in corpus.documents:
		assert isinstance(kindredDoc,kindred.Document)

		outDoc = bioc.BioCDocument()
		biocCollection.add_document(outDoc)

		if 'id' in kindredDoc.metadata:
			outDoc.id = kindredDoc.metadata['id']
		outDoc.infons = kindredDoc.metadata

		wholeText = bioc.BioCPassage()
		wholeText.text = kindredDoc.text
		wholeText.offset = 0
		outDoc.add_passage(wholeText)

		usedIDs = set()
		entityIDToBiocID = {}
		for entity in kindredDoc.entities:
			assert isinstance(entity,kindred.Entity)

			ann = bioc.BioCAnnotation()
			ann.text = entity.text
			ann.infons = {'type':entity.entityType}
			ann.infons.update(entity.metadata)

			# Prefer the source ID when there is one, else the kindred ID as a string
			ann.id = str(entity.entityID) if entity.sourceEntityID is None else entity.sourceEntityID

			assert ann.id not in usedIDs, "Multiple entities with the same ID (%s) found" % ann.id
			usedIDs.add(ann.id)
			entityIDToBiocID[entity.entityID] = ann.id

			ann.locations.extend(
				bioc.BioCLocation(offset=start, length=(end-start))
				for start,end in entity.position
			)

			wholeText.annotations.append(ann)

		for relation in kindredDoc.relations:
			assert isinstance(relation,kindred.Relation)
			biocRel = bioc.BioCRelation()
			biocRel.infons = {'type':relation.relationType}

			members = relation.entities
			roles = relation.argNames
			if roles is None:
				# Unnamed arguments get positional role names: arg0, arg1, ...
				roles = [ "arg%d" % i for i,_ in enumerate(members) ]

			for role,member in zip(roles,members):
				biocRel.nodes.append(bioc.BioCNode(role=role, refid=entityIDToBiocID[member.entityID]))

			wholeText.relations.append(biocRel)

	return biocCollection

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test.py Proyecto: pj0616/norm-1-30

def dump_results(doc_name, entities, opt):
    """Write ``entities`` as annotations of a one-passage BioC XML file.

    The file is created at ``<opt.predict>/<doc_name>.bioc.xml``. Annotation
    ids count up from "1"; only the first span of each entity is used for the
    location. The first normalization id/name is stored under 'UMLS code' /
    'UMLS term', or 'N/A' for both when the entity has no normalization ids.

    :param doc_name: document id, also used to build the output file name
    :param entities: iterable of entities providing ``type``, ``spans``,
        ``name``, ``norm_ids`` and ``norm_names``
    :param opt: options object; ``opt.predict`` is the output directory
    """
    out_collection = bioc.BioCCollection()
    out_doc = bioc.BioCDocument()
    out_collection.add_document(out_doc)
    out_doc.id = doc_name
    body = bioc.BioCPassage()
    out_doc.add_passage(body)
    body.offset = 0

    for number, entity in enumerate(entities, start=1):
        ann = bioc.BioCAnnotation()
        body.add_annotation(ann)
        ann.id = str(number)
        ann.infons['type'] = entity.type

        begin = entity.spans[0][0]
        finish = entity.spans[0][1]
        ann.add_location(bioc.BioCLocation(begin, finish - begin))
        ann.text = entity.name

        normalized = len(entity.norm_ids) > 0
        ann.infons['UMLS code'] = entity.norm_ids[0] if normalized else 'N/A'
        ann.infons['UMLS term'] = entity.norm_names[0] if normalized else 'N/A'

    target = os.path.join(opt.predict, doc_name + ".bioc.xml")
    with codecs.open(target, 'w', 'UTF-8') as fp:
        bioc.dump(out_collection, fp)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: pubtator2bioc.py Proyecto: bionlplab/bioc

def pubtator2bioc_ann(ann: PubTatorAnn) -> bioc.BioCAnnotation:
    """Build a BioC annotation from one PubTator annotation.

    The id is 'T' followed by the start offset; the PubTator type and id are
    stored under the 'type' and 'concept_id' infons.
    """
    span = bioc.BioCLocation(ann.start, ann.end - ann.start)

    converted = bioc.BioCAnnotation()
    converted.id = f'T{ann.start}'
    converted.infons['type'] = ann.type
    converted.infons['concept_id'] = ann.id
    converted.add_location(span)
    converted.text = ann.text
    return converted

Ejemplo n.º 8

0

Mostrar archivo

Archivo: dner_mm.py Proyecto: zhanglang1860/NegBio

def run_metamap_col(collection, mm, cuis=None):
    """
    Get CUIs from metamap.

    Every sentence of every passage is sent to MetaMap under the id
    ``<document id with '.' replaced by '-'>-<sentence offset>``. For each
    returned concept whose ``pos_info`` starts with ``<start>/<length>``, an
    annotation is appended to the passage that owns the sentence. Exceptions
    raised while processing are logged instead of propagated;
    KeyboardInterrupt and SystemExit are not intercepted.

    Args:
        collection(BioCCollection):
        mm(MetaMap): MetaMap instance
        cuis: optional container of CUIs; when given, concepts whose CUI is
            missing or not contained in it are skipped

    Returns:
        BioCCollection: the same object that was passed in
    """
    try:
        annIndex = itertools.count()

        # MetaMap sentence id -> (passage, sentence), in document order.
        sentence_map = collections.OrderedDict()
        for document in collection.documents:
            doc_key = document.id.replace('.', '-')
            for passage in document.passages:
                for sentence in passage.sentences:
                    key = '{}-{}'.format(doc_key, sentence.offset)
                    sentence_map[key] = (passage, sentence)

        ids = list(sentence_map)
        sents = [remove_newline(sentence.text)
                 for _, sentence in sentence_map.values()]

        concepts, error = mm.extract_concepts(sents, ids)
        if error is None:
            for concept in concepts:
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None:
                        # if no CUI is returned for this concept - skip it
                        concept_cui = getattr(concept, 'cui', None)
                        if concept_cui not in cuis:
                            continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage, sentence = sentence_map[concept_index]
                        # pos_info start is shifted down by one to index
                        # into the sentence text.
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        # Drop the first and last character of semtypes.
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                except Exception:
                    # Best effort per concept: log and move on to the next.
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", collection.source)
    return collection

Ejemplo n.º 9

0

Mostrar archivo

def convert_dg(dependency_graph, text: str, offset: int, ann_index: int = 0, rel_index: int = 0) \
        -> Tuple[List[bioc.BioCAnnotation], List[bioc.BioCRelation]]:
    """
    Convert dependency graph to annotations and relations

    Each node is aligned to ``text`` by searching for its form (or, failing
    that, ``adapt_value`` of its form) from the end of the previous match.
    Aligned nodes become annotations with ids ``T<ann_index>``,
    ``T<ann_index + 1>``, ...; nodes that cannot be found are skipped with a
    debug log message. Nodes whose head is 0 get ``infons['ROOT'] = True``;
    every other node yields a relation ``R<n>`` (numbered from
    ``rel_index``) linking it as 'dependant' to its head as 'governor',
    provided both ends were aligned.

    Args:
        dependency_graph: iterable of nodes providing ``index``, ``form``,
            ``pos``, ``head``, ``deprel`` and ``extra``
        text: the text the graph was parsed from
        offset: added to every position found in ``text``
        ann_index: number used for the first annotation id
        rel_index: number used for the first relation id

    Returns:
        tuple of (annotations, relations)
    """
    annotations = []
    relations = []
    # node.index -> numeric part of the annotation id
    annotation_id_map = {}
    # node.index -> annotation object. The id number cannot be used as a
    # position in ``annotations`` because numbering starts at ``ann_index``.
    node_annotations = {}
    start = 0
    for node in dependency_graph:
        if node.index in annotation_id_map:
            continue
        node_form = node.form
        index = text.find(node_form, start)
        if index == -1:
            node_form = adapt_value(node.form)
            index = text.find(node_form, start)
            if index == -1:
                logging.debug(
                    'Cannot convert parse tree to dependency graph at %d\n%d\n%s',
                    start, offset, str(dependency_graph))
                continue

        ann = bioc.BioCAnnotation()
        ann.id = 'T{}'.format(ann_index)
        ann.text = node_form
        ann.infons['tag'] = node.pos

        start = index

        ann.add_location(bioc.BioCLocation(start + offset, len(node_form)))
        annotations.append(ann)
        annotation_id_map[node.index] = ann_index
        node_annotations[node.index] = ann
        ann_index += 1
        # Continue searching after this token.
        start += len(node_form)

    for node in dependency_graph:
        if node.head == 0:
            # A root that could not be aligned has no annotation to flag.
            root_ann = node_annotations.get(node.index)
            if root_ann is not None:
                root_ann.infons['ROOT'] = True
            continue
        if node.index in annotation_id_map and node.head in annotation_id_map:
            relation = bioc.BioCRelation()
            relation.id = 'R{}'.format(rel_index)
            relation.infons['dependency'] = node.deprel
            if node.extra:
                relation.infons['extra'] = node.extra
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.index]),
                              'dependant'))
            relation.add_node(
                bioc.BioCNode('T{}'.format(annotation_id_map[node.head]),
                              'governor'))
            relations.append(relation)
            rel_index += 1

    return annotations, relations

Ejemplo n.º 10

0

Mostrar archivo

def run_metamap(document, mm, cuis=None):
    """
    Get CUIs from metamap.

    Every sentence of every passage is sent to MetaMap under the string form
    of its offset. For each returned concept whose ``pos_info`` starts with
    ``<start>/<length>``, an annotation is appended to the passage that owns
    the sentence. Exceptions raised while processing are logged instead of
    propagated; KeyboardInterrupt and SystemExit are not intercepted.

    Args:
        document(BioCDocument):
        mm(MetaMap): MetaMap instance
        cuis: optional container of CUIs; when given, concepts whose CUI is
            not contained in it are skipped

    Returns:
        BioCDocument: the same object that was passed in
    """
    try:
        annIndex = itertools.count()

        # MetaMap sentence id -> (passage, sentence), in document order.
        sentence_map = collections.OrderedDict()
        for passage in document.passages:
            for sentence in passage.sentences:
                sentence_map[str(sentence.offset)] = (passage, sentence)

        ids = list(sentence_map)
        sents = [remove_newline(sentence.text)
                 for _, sentence in sentence_map.values()]

        concepts, error = mm.extract_concepts(sents, ids)
        print('Done')
        if error is None:
            for concept in concepts:
                concept_index = adapt_concept_index(concept.index)
                try:
                    if cuis is not None and concept.cui not in cuis:
                        continue
                    m = re.match(r'(\d+)/(\d+)', concept.pos_info)
                    if m:
                        passage, sentence = sentence_map[concept_index]
                        # pos_info start is shifted down by one to index
                        # into the sentence text.
                        start = int(m.group(1)) - 1
                        length = int(m.group(2))
                        ann = bioc.BioCAnnotation()
                        ann.id = str(next(annIndex))
                        ann.infons['CUI'] = concept.cui
                        # Drop the first and last character of semtypes.
                        ann.infons['semtype'] = concept.semtypes[1:-1]
                        ann.infons['term'] = concept.preferred_name
                        ann.infons['annotator'] = 'MetaMap'
                        ann.add_location(
                            bioc.BioCLocation(sentence.offset + start, length))
                        ann.text = sentence.text[start:start + length]
                        passage.annotations.append(ann)
                except Exception:
                    # Best effort per concept: log and move on to the next.
                    logging.exception('')
    except Exception:
        logging.exception("Cannot process %s", document.id)
    return document

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test_negdetect.py Proyecto: yfpeng/negbio2

def test_neg_regex():
    """Check ``is_neg_regex`` and the detector on 'findings: no pneumothorax.'."""
    sentence = 'findings: no pneumothorax.'
    assert is_neg_regex(sentence)

    finding = bioc.BioCAnnotation()
    finding.text = 'pneumothorax'
    finding.add_location(bioc.BioCLocation(13, 12))

    report = text_to_bioc([sentence], type='d/p/s')
    report.passages[0].add_annotation(finding)
    detector.__call__(report)

    flagged = report.passages[0].annotations[0]
    assert flagged.infons['negation'] == 'True'

Ejemplo n.º 12

0

Mostrar archivo

def test_location():
    """Check equality, ``end``, containment and hashing of ``BioCLocation``."""
    outer = bioc.BioCLocation(1, 10)

    # Basic properties of a single location.
    assert outer != 'foo'
    assert outer.end == 11
    assert outer.contains(9)
    assert not outer.contains(11)

    # Same offset and length compare equal.
    twin = bioc.BioCLocation(1, 10)
    assert outer == twin

    # A location nested in another is "in" it, but not the other way round.
    inner = bioc.BioCLocation(2, 9)
    assert outer != inner
    assert inner in outer
    assert outer not in inner

    # Locations can be stored in a set.
    members = {outer, inner}
    assert outer in members
    assert inner in members

    # Membership tests with a non-location operand raise.
    with pytest.raises(TypeError):
        assert 'foo' in outer

Ejemplo n.º 13

0

Mostrar archivo

def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
    """Turn per-token label predictions into a BioC XML file.

    A single document with one passage (offset 0) is written to
    ``<file_name>.bioc.xml``. A label starting with 'S' or 'B' opens a new
    annotation whose type is ``label[2:]``; a label starting with 'M' or 'E'
    extends the most recent annotation when ``checkWrongState`` accepts the
    labels seen so far in the sentence.

    :param doc_token: token table filtered by its 'sent_idx' column and read
        row by row; rows provide 'start', 'end' and 'text'
        (presumably a pandas DataFrame - TODO confirm)
    :param predict_results: per sentence, ``predict_results[idx][0]`` is the
        sequence of predicted labels
    :param file_name: document id and prefix of the output file name
    :raises AssertionError: if a sentence's token count differs from its
        number of predicted labels
    """
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0
    entity_id = 1

    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]

        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)
        labelSequence = []

        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)

            if label[0] in ('S', 'B'):
                # Start of a new entity.
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1

            elif label[0] in ('M', 'E'):
                if checkWrongState(labelSequence):
                    # Continuation: grow the latest annotation to cover this
                    # token, padding the gap between tokens with spaces.
                    anno_entity = passage.annotations[-1]
                    location = anno_entity.locations[0]

                    whitespacetoAdd = token['start'] - location.end
                    anno_entity.text += " " * whitespacetoAdd
                    anno_entity.text += token['text']
                    location.length = token['end'] - location.offset

    # The context manager closes the file even if dumping fails.
    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)

Ejemplo n.º 14

0

Mostrar archivo

Archivo: utils.py Proyecto: jakelever/biotext

def strip_annotation_markers(
        text: str,
        annotations_map: Dict[str,
                              str]) -> Tuple[str, List[bioc.BioCAnnotation]]:
    """
    Given a set of annotations, remove any which are found in the current text and return
    the new string as well as the positions of the annotations in the transformed string

    Only the first occurrence of each marker is handled. For every marker the
    patterns are tried in order (bracketed, before a period, surrounded by
    whitespace, bare) and the first one that matches decides which span is
    removed. The returned annotations are zero-length, typed 'citation', and
    ordered by their position in ``text``.

    Args:
        text: the text containing annotation markers
        annotations_map: marker -> citation text

    Returns:
        tuple of (text with the matched markers removed, annotations)
    """
    matched_annotations: List[Tuple[int, int, str]] = []
    for ann_marker in annotations_map:
        # (pattern, number of trailing matched characters to keep)
        patterns = [
            (r'[^\S\t]?[\(\[\{]' + re.escape(ann_marker) + r'[\)\]\}]',
             0),  # citation in brackets
            (
                r'[^\S\t]' + re.escape(ann_marker) + r'\.',
                1,
            ),  # citation at end of sentence, remove extra whitespace
            (
                r'[^\S\t]' + re.escape(ann_marker) + r'[^\S\t]',
                1,
            ),  # citation surrounded by whitespace
            (re.escape(ann_marker), 0),  # citation by itself
        ]
        for pattern, end_offset in patterns:
            match = re.search(pattern, text)
            if match:
                matched_annotations.append(
                    (match.start(), match.end() - end_offset, ann_marker))
                break

    # The removal loop below shifts positions by the total length removed so
    # far, which is only valid when spans are processed from left to right.
    matched_annotations.sort()

    transformed_annotations: List[bioc.BioCAnnotation] = []
    transformed_text = text
    offset = 0

    for start, end, marker in matched_annotations:
        ann = bioc.BioCAnnotation()
        ann.id = marker
        ann.infons['citation_text'] = annotations_map[marker]
        ann.infons['type'] = 'citation'
        transformed_text = (transformed_text[:start - offset] +
                            transformed_text[end - offset:])

        # since the token place-holder is removed, must be start - 1 (and previous offset) for the new position
        ann.add_location(bioc.BioCLocation(start - offset - 1, 0))

        offset += end - start
        transformed_annotations.append(ann)
    return transformed_text, transformed_annotations

Ejemplo n.º 15

0

Mostrar archivo

    def add_match(self, impression, sentence, ann_index, phrase, observation,
                  start, end):
        """Append a phrase-match annotation to ``impression.annotations``.

        ``start`` and ``end`` are positions inside ``sentence.text``; the
        stored location is shifted by ``sentence.offset``. Nothing is
        returned; ``impression`` is modified in place.
        """
        span_length = end - start

        match_ann = bioc.BioCAnnotation()
        match_ann.id = ann_index
        # Phrase matches carry no CUI or semantic type.
        match_ann.infons['CUI'] = None
        match_ann.infons['semtype'] = None
        match_ann.infons['term'] = phrase
        match_ann.infons[OBSERVATION] = observation
        match_ann.infons['annotator'] = 'Phrase'
        match_ann.add_location(
            bioc.BioCLocation(sentence.offset + start, span_length))
        match_ann.text = sentence.text[start:start + span_length]

        impression.annotations.append(match_ann)

Ejemplo n.º 16

0

Mostrar archivo

Archivo: preprocessing.py Proyecto: zxlzr/BLUE_Benchmark

def tokenize_text(text, id):
    """Split ``text`` into BioC sentences with one annotation per token piece.

    Sentences come from ``nlp(text).sents``. Each token is passed through
    ``split_punct`` and every resulting ``(text, start, end)`` piece becomes
    an annotation with id ``a0``, ``a1``, ..., numbered per sentence.

    :param text: the text to tokenize
    :param id: stored in ``infons['filename']`` of every sentence
    :returns: list of ``bioc.BioCSentence``
    """
    result = []
    for span in nlp(text).sents:
        bioc_sent = bioc.BioCSentence()
        bioc_sent.infons['filename'] = id
        bioc_sent.offset = span.start_char
        bioc_sent.text = text[span.start_char:span.end_char]
        result.append(bioc_sent)

        # Lazily flatten the pieces of all tokens in this sentence.
        pieces = (piece
                  for token in span
                  for piece in split_punct(token.text, token.idx))
        for number, (piece_text, begin, finish) in enumerate(pieces):
            piece_ann = bioc.BioCAnnotation()
            piece_ann.id = f'a{number}'
            piece_ann.text = piece_text
            piece_ann.add_location(bioc.BioCLocation(begin, finish - begin))
            bioc_sent.add_annotation(piece_ann)
    return result

Ejemplo n.º 17

0

Mostrar archivo

def test_clean_sentences():
    """Check that ``CleanUp`` drops sentences and optionally sorts annotations.

    Ten one-character annotations are added to the first passage at offsets
    10 down to 1.
    """
    cleaner = CleanUp()

    document = text_to_bioc(['No pneumothorax.', 'No pneumothorax.'], type='d/p/s')
    first_passage = document.passages[0]
    for start in reversed(range(1, 11)):
        marker = bioc.BioCAnnotation()
        marker.add_location(bioc.BioCLocation(start, 1))
        first_passage.add_annotation(marker)

    assert len(document.passages[0].sentences) == 2

    # Default call: sentences are removed, annotation order is unchanged.
    document = cleaner.__call__(document)
    assert len(document.passages[0].sentences) == 0
    assert len(document.passages[0].annotations) == 10
    for position, expected in enumerate(range(10, 0, -1)):
        assert document.passages[0].annotations[position].total_span.offset == expected

    # With sort_anns=True the annotations come back in ascending offset order.
    document = cleaner.__call__(document, sort_anns=True)
    for position, expected in enumerate(range(1, 11)):
        assert document.passages[0].annotations[position].total_span.offset == expected

Ejemplo n.º 18

0

Mostrar archivo

def create_bioc_document_from_document_json(document):
    """Build a one-passage BioC document from a document JSON mapping.

    Only denotations whose 'userId' is 0 become annotations, and only
    relations whose subject and object both belong to user 0 are kept.

    :param document: mapping with 'sourceid', 'text', 'denotations' and
        'relations'
    :returns: the new ``bioc.BioCDocument``
    """
    full_text = document['text']

    result = bioc.BioCDocument()
    result.id = document['sourceid']
    body = bioc.BioCPassage()
    body.text = full_text
    body.offset = 0

    # denotation id -> user id, recorded for every denotation (kept or not)
    owner_of = {}
    for denotation in document['denotations']:
        owner_of[denotation['id']] = denotation['userId']
        if denotation['userId'] != 0:
            continue
        begin = denotation['span']['begin']
        finish = denotation['span']['end']

        ann = bioc.BioCAnnotation()
        ann.id = denotation['id']
        where = bioc.BioCLocation(0, 0)
        where.offset = begin
        where.length = finish - begin
        ann.locations.append(where)
        ann.text = full_text[begin:finish]
        ann.infons = denotation['obj']
        body.add_annotation(ann)

    for relation in document['relations']:
        subject_is_mine = owner_of[relation['subj']] == 0
        object_is_mine = owner_of[relation['obj']] == 0
        if not subject_is_mine or not object_is_mine:
            continue
        link = bioc.BioCRelation()
        link.id = relation['id']
        # Nodes are created blank; only the refid is filled in.
        source_node = bioc.BioCNode('', '')
        target_node = bioc.BioCNode('', '')
        source_node.refid = relation['subj']
        target_node.refid = relation['obj']
        link.add_node(source_node)
        link.add_node(target_node)
        link.infons = relation['pred']
        body.add_relation(link)

    result.add_passage(body)
    return result

Ejemplo n.º 19

0

Mostrar archivo

Archivo: dner_regex.py Proyecto: yfpeng/negbio2

 def __call__(self, document, *args, **kwargs):
     """Annotate regex matches of the observation phrases in ``document``.

     For every sentence, each phrase of each observation is compiled with
     ``self.compile_pattern`` and searched in the sentence text. Matches for
     which ``self.overlaps_with_unmention`` is true are skipped; the others
     are appended to the passage annotations with ids counting up from "0".
     Returns the same ``document``.
     """
     id_counter = itertools.count()
     for passage in document.passages:
         for sentence in passage.sentences:
             for observation, phrases in self.observation2mention_phrases.items():
                 for phrase in phrases:
                     regex = self.compile_pattern(phrase)
                     for hit in regex.finditer(sentence.text):
                         begin, finish = hit.span(0)
                         if self.overlaps_with_unmention(sentence, observation, begin, finish):
                             continue
                         found = bioc.BioCAnnotation()
                         found.id = str(next(id_counter))
                         found.infons['term'] = phrase
                         found.infons["observation"] = observation
                         found.infons['annotator'] = 'RegEx'
                         found.infons['vocab'] = self.vocab_name
                         # Match positions are relative to the sentence.
                         span = bioc.BioCLocation(sentence.offset + begin,
                                                  finish - begin)
                         found.add_location(span)
                         found.text = sentence.text[begin:finish]
                         passage.annotations.append(found)
     return document

Ejemplo n.º 20

0

Mostrar archivo

							elif start2 > end1:
								pass
							else:
								overlapping = True
								break

						if not overlapping:
							nonoverlapping.append ((start1,end1))

					for start,end in nonoverlapping:
						for annotationType,conceptids in candidates[(start,end)].items():
							conceptid = conceptids = ";".join(sorted(list(set(conceptids))))

							a = bioc.BioCAnnotation()
							a.text = passage.text[start:end]
							a.infons = {'type':annotationType, 'conceptid': conceptid}
							a.id = 'T%d' % currentID
							currentID += 1

							if end <= start:
								continue

							biocLoc = bioc.BioCLocation(offset=passage.offset+start, length=(end-start))
							a.locations.append(biocLoc)
							passage.annotations.append(a)

			writer.write_document(doc)

	print ('Done!')

Ejemplo n.º 21

0

Mostrar archivo

def evaluate_via_bioc(test_docs,
                      crf,
                      extractor,
                      prediction_dir,
                      made_base_dir=None):
    """Predict labels for each test document and write them as BioC XML files.

    Everything matched by ``<prediction_dir>/*`` is removed first (the
    directory is created when missing). For every document, the CRF labels
    each sentence, runs of 'B-x'/'I-x' labels are merged into one annotation,
    and the result is written to ``<prediction_dir>/<filename>.bioc.xml``.
    When ``made_base_dir`` is given, ``get_f_scores`` is called afterwards
    with the reference files under ``<made_base_dir>/annotations``.

    Args:
        test_docs: documents providing ``filename``, ``text`` and
            ``tokenized_doc.sentences`` (each sentence a sequence of
            (start, end) offset pairs into ``text``)
        crf: model whose ``predict`` takes a list of feature sequences
        extractor: object whose ``sent2features`` turns a token list into
            features
        prediction_dir: directory receiving the prediction files
        made_base_dir: optional base directory holding the 'annotations' and
            'corpus' sub-directories; enables the evaluation step
    """
    print('Total documents for evaluation : {}'.format(len(test_docs)))

    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)

    # Start from a clean prediction directory.
    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1

    print('Existing files removed : {}'.format(existing_files_removed))

    prediction_documents_written = 0
    reference_filenames = []
    for test_doc in test_docs:
        #print('Working on document : {}'.format(test_doc.filename))

        # One collection / document / passage (offset 0) per test document.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)

        # Annotation ids are numbered from 1 within each document.
        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            sentence_tokens = []
            # gather tokens in a sentence
            for token_offset_pair in sentence:
                token = test_doc.text[
                    token_offset_pair[0]:token_offset_pair[1]]
                sentence_tokens.append(token)
            if len(sentence_tokens) == 0:
                continue

            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]

            # A length mismatch is only reported; processing continues.
            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(
                    len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    # Strip the 'B-' / 'I-' prefix to get the entity type.
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and sentence_pred[
                            token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()

                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace('\n',
                                                              '\\n').replace(
                                                                  '\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)

                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                token_idx += 1

        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))

        # Remember the matching reference file for the evaluation step.
        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            prediction_documents_written += 1

    print('Total prediction documents written : {}'.format(
        prediction_documents_written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        # NOTE(review): annotation_dir is computed but not used below.
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)

Ejemplo n.º 22

0

Mostrar archivo

def predict(opt, data):
    """Run NER and relation extraction on every file in ``opt.input``.

    Four saved state dicts are restored from ``opt.output`` (NER model, NER
    word-sequence encoder, relation classifier, relation word-sequence
    encoder).  Each non-hidden regular file in ``opt.input`` is then
    tokenised, decoded into entities and relations, and written to
    ``opt.predict`` as ``<file name>.bioc.xml``.

    Args:
        opt: options object; this function reads ``test_in_cpu``,
            ``old_gpu``, ``gpu``, ``output``, ``input`` and ``predict``.
        data: project data object holding the alphabets and settings that
            are passed on to the models and instance readers.
    """

    def _restore(module, weights_name):
        # Load ``weights_name`` from opt.output into ``module``.  Tensors are
        # remapped to the CPU, or from device ``opt.old_gpu`` to ``opt.gpu``.
        if opt.test_in_cpu:
            target = 'cpu'
        else:
            source_device = 'cuda:{}'.format(opt.old_gpu)
            target_device = 'cuda:{}'.format(opt.gpu)
            target = {source_device: target_device}
        weights_path = os.path.join(opt.output, weights_name)
        module.load_state_dict(torch.load(weights_path, map_location=target))
        return module

    seq_model = _restore(SeqModel(data), 'ner_model.pkl')
    seq_wordseq = _restore(
        WordSequence(data, False, True, True, True), 'ner_wordseq.pkl')
    classify_model = _restore(ClassifyModel(data), 're_model.pkl')
    classify_wordseq = _restore(
        WordSequence(data, True, False, True, False), 're_wordseq.pkl')

    # Only regular files whose name does not start with a dot are processed.
    doc_names = [
        name for name in listdir(opt.input)
        if isfile(join(opt.input, name)) and name[0] != '.'
    ]

    for doc_name in doc_names:
        started_at = time.time()

        # --- named entity recognition -----------------------------------
        doc_token = processOneFile(join(opt.input, doc_name))
        doc = generateDataForOneFile(doc_token)
        _raw_texts, raw_ids = read_instance(
            doc,
            data.word_alphabet,
            data.char_alphabet,
            data.feature_alphabets,
            data.label_alphabet,
            data.number_normalized,
            data.MAX_SENTENCE_LENGTH,
        )
        decode_results = evaluateWhenTest(
            data, seq_wordseq, seq_model, raw_ids)
        entities = ner.translateNCRFPPintoEntities(
            doc_token, decode_results, doc_name)

        # --- BioC skeleton: one collection / document / passage per file --
        passage = bioc.BioCPassage()
        passage.offset = 0
        document = bioc.BioCDocument()
        document.id = doc_name
        document.add_passage(passage)
        collection = bioc.BioCCollection()
        collection.add_document(document)

        for entity in entities:
            annotation = bioc.BioCAnnotation()
            annotation.id = entity.id
            annotation.infons['type'] = entity.type
            annotation.add_location(
                bioc.BioCLocation(entity.start, entity.getlength()))
            annotation.text = entity.text
            passage.add_annotation(annotation)

        # --- relation extraction over the predicted entities --------------
        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)
        relation_alphabet = data.re_feature_alphabets[
            data.re_feature_name2id['[RELATION]']]
        relations = relation_extraction.evaluateWhenTest(
            classify_wordseq, classify_model, test_X, data, test_other,
            relation_alphabet)

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type
            bioc_relation.add_node(
                bioc.BioCNode(relation.node1.id, 'argument 1'))
            bioc_relation.add_node(
                bioc.BioCNode(relation.node2.id, 'argument 2'))
            passage.add_relation(bioc_relation)

        output_path = os.path.join(opt.predict, doc_name + ".bioc.xml")
        with open(output_path, 'w') as fp:
            bioc.dump(collection, fp)

        elapsed = time.time() - started_at
        logging.info("process %s complete with %.2fs", doc_name, elapsed)

    logging.info("test finished")

Ejemplo n.º 23

0

Mostrar archivo

Archivo: align.py Proyecto: flywind2/pgxmine

                                pass
                            else:
                                overlapping = True
                                break

                        if not overlapping:
                            nonoverlapping.append((start1, end1))

                    for start, end in nonoverlapping:
                        for annotationType, conceptids in candidates[(
                                start, end)].items():
                            conceptid = conceptids = ";".join(
                                sorted(list(set(conceptids))))

                            a = bioc.BioCAnnotation()
                            a.text = passage.text[start:end]
                            a.infons = {
                                'type': annotationType,
                                'conceptid': conceptid
                            }
                            a.id = 'T%d' % currentID
                            currentID += 1
                            a.locations.append(
                                bioc.BioCLocation(offset=start,
                                                  length=(end - start)))
                            passage.annotations.append(a)

            writer.writedocument(doc)

    print('Done!')

Ejemplo n.º 24

0

Mostrar archivo

Archivo: trainandtest.py Proyecto: foxlf823/ADExtractor

def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
    """Dump saved relation predictions as one BioC XML file per document.

    The relation vocabulary is read from ``opt.pretrain`` and the pickled
    prediction list from ``opt.output`` (``opt`` is a module-level object).
    For every document the given entities become BioC annotations, and each
    prediction whose ``doc_name`` matches becomes a BioC relation, unless its
    type decodes to ``'<unk>'`` or ``my_utils.relationConstraint1`` rejects it.

    Args:
        test_token: unused here; kept for interface compatibility.
        test_entity: per-document entity tables, indexed like ``test_name``.
            Each is iterated with ``iterrows()`` and must provide the
            ``id``, ``type``, ``start``, ``end`` and ``text`` columns.
        test_relation: only its length is used, as the number of documents.
        test_name: per-document names, used as the document id and as the
            stem of the output file name.
        result_dumpdir: directory receiving ``<doc_name>.bioc.xml``.

    Raises:
        IndexError: if a prediction refers to an entity id that is not
            present in the document's entity table (from ``.iloc[0]``).
    """
    # NOTE(security): pickle.load can execute arbitrary code, so these two
    # files must come from a trusted source.
    logging.info("loading ... vocab")
    # Context managers close the file handles deterministically; the previous
    # bare open(...) calls leaked them.
    with open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb') as fp:
        relation_vocab = pickle.load(fp)

    logging.info("loading ... result")
    with open(os.path.join(opt.output, 'results.pkl'), "rb") as fp:
        results = pickle.load(fp)

    # The document count is taken from test_relation, as before; test_entity
    # and test_name are indexed in parallel.
    for i in tqdm(range(len(test_relation))):

        doc_entity = test_entity[i]
        doc_name = test_name[i]

        # One collection / document / passage per output file.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            # BioC locations are (offset, length), not (start, end).
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        # Relation ids restart at 1 for every document.
        relation_id = 1
        for result in results:
            if doc_name != result['doc_name']:
                continue

            former = doc_entity[(
                doc_entity['id'] == result['former_id'])].iloc[0]
            latter = doc_entity[(
                doc_entity['id'] == result['latter_id'])].iloc[0]

            relation_type = relation_vocab.lookup_id2str(result['type'])
            if relation_type == '<unk>':
                continue
            # ``== False`` is kept on purpose: ``not x`` would also reject a
            # None return value, which the original comparison lets through.
            if my_utils.relationConstraint1(relation_type,
                                            former['type'],
                                            latter['type']) == False:  # noqa: E712
                continue

            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = str(relation_id)
            relation_id += 1
            bioc_relation.infons['type'] = relation_type

            node1 = bioc.BioCNode(former['id'], 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(latter['id'], 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)

Ejemplo n.º 25

0

Mostrar archivo

 def location(this, json_loc):
     """Build a ``bioc.BioCLocation`` from a JSON location mapping.

     ``json_loc`` must provide ``'offset'`` and ``'length'``; both values are
     stored on the location as strings.
     """
     bioc_location = bioc.BioCLocation()
     for field in ('offset', 'length'):
         setattr(bioc_location, field, str(json_loc[field]))
     return bioc_location

Ejemplo n.º 26

0

Mostrar archivo

Archivo: shared_soft.py Proyecto: foxlf823/e2e_ner_re

def test(data, opt, predict_dir):
    """Run NER and relation extraction on the test set, one BioC file per doc.

    Entities come either from the gold annotations (``opt.use_gold_ner``) or
    from the restored NER model; relations are then predicted over those
    entities and everything is written to ``predict_dir`` as
    ``<doc_name>.bioc.xml``.

    Args:
        data: project data object; it is reloaded from ``opt.data_file`` and
            its ``MAX_SENTENCE_LENGTH`` is overwritten here.
        opt: options object; this function reads ``data_file``, ``ner_dir``,
            ``re_dir``, ``hidden_num`` and ``use_gold_ner``.
        predict_dir: output directory for the BioC XML files.
    """
    # test_relation is loaded but not used in the code shown below.
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one doc one file

    data.load(opt.data_file)
    # presumably -1 disables the sentence-length limit — TODO confirm against
    # ner.read_instanceFromBuffer
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()

    data.fix_alphabet()
    # --- restore the NER stack from opt.ner_dir -------------------------------
    # NOTE(review): torch.load is called without map_location, so tensors are
    # expected to load onto the device they were saved from — confirm this is
    # intended for the test machine.
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            # First layer input: word embedding + char hidden state + the
            # '[Cap]' and '[POS]' feature embeddings.
            input_size = data.word_emb_dim+data.HP_char_hidden_dim+data.feature_emb_dims[data.feature_name2id['[Cap]']]+ \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            # Later layers map hidden size to hidden size.
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    # --- restore the relation-extraction stack from opt.re_dir ----------------
    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            # First layer input: word embedding + '[POS]' feature embedding
            # + two '[POSITION]' embeddings.
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']]+\
                         2*data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    # --- per-document prediction ----------------------------------------------
    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            # Skip the NER model and wrap the gold entity rows in Entity objects.
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:

            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            # The instances are stored on `data`, which is what
            # ner_evaluateWhenTest receives below.
            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)

        # One collection / document / passage per output file.
        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            # BioCLocation takes (offset, length).
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        # Relation instances are built over the entities chosen above
        # (gold or predicted).
        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            # Each relation links two annotations by id.
            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)