Example #1
def dump_results(doc_name, entities, opt):
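    # Wrap the predicted entities of one document in a single-passage BioC
    # collection (storing the first UMLS normalization, or 'N/A', as infons)
    # and serialize it to <doc_name>.bioc.xml under opt.predict.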
    entity_id = 1
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = doc_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0

    for entity in entities:
        anno_entity = bioc.BioCAnnotation()
        passage.add_annotation(anno_entity)
        anno_entity.id = str(entity_id)
        entity_id += 1
        anno_entity.infons['type'] = entity.type
        anno_entity_location = bioc.BioCLocation(
            entity.spans[0][0], entity.spans[0][1] - entity.spans[0][0])
        anno_entity.add_location(anno_entity_location)
        anno_entity.text = entity.name
        if len(entity.norm_ids) > 0:
            anno_entity.infons['UMLS code'] = entity.norm_ids[0]
            anno_entity.infons['UMLS term'] = entity.norm_names[0]
        else:
            anno_entity.infons['UMLS code'] = 'N/A'
            anno_entity.infons['UMLS term'] = 'N/A'

    with codecs.open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w',
                     'UTF-8') as fp:
        bioc.dump(collection, fp)
Example #2
def text2document(id, text, split_document=True):
    """

    Args:
        id (str): BioCDocument id
        text (str): text
        split_document (bool): if True, split the text into passages according to the section titles.

    Returns:
        BioCDocument
    """
    document = bioc.BioCDocument()
    document.id = id
    text = printable(text).replace('\r\n', '\n')

    if split_document:
        last_section = None
        for start, end, section in split_passage(text):
            passage = bioc.BioCPassage()
            passage.offset = start
            passage.text = text[start:end]
            if section is None:
                section = last_section
            passage.infons['title'] = section
            document.add_passage(passage)
            last_section = section
    else:
        passage = bioc.BioCPassage()
        passage.offset = 0
        passage.text = text
        document.add_passage(passage)

    return document
Example #3
def get_one_document(text):
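    # Minimal BioC wrapper: one document holding a single passage of raw text.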
    d = bioc.BioCDocument()
    p = bioc.BioCPassage()
    p.text = text
    p.offset = 0
    d.add_passage(p)
    return d
Example #4
def convertKindredCorpusToBioCCollection(corpus):
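	# Mirror a kindred corpus as a BioC collection: each document gets a
	# single passage holding its full text, entities become BioCAnnotations
	# (with one BioCLocation per span), and relations become BioCRelations
	# whose nodes reference the mapped annotation IDs.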
	assert isinstance(corpus,kindred.Corpus)
	collection = bioc.BioCCollection()
	for kdoc in corpus.documents:
		assert isinstance(kdoc,kindred.Document)

		biocDoc = bioc.BioCDocument()
		collection.add_document(biocDoc)

		if 'id' in kdoc.metadata:
			biocDoc.id = kdoc.metadata['id']
		biocDoc.infons = kdoc.metadata

		passage = bioc.BioCPassage()
		passage.text = kdoc.text
		passage.offset = 0
		biocDoc.add_passage(passage)

		seenEntityIDs = set()
		kindredID2BiocID = {}
		for e in kdoc.entities:
			assert isinstance(e,kindred.Entity)

			a = bioc.BioCAnnotation()
			a.text = e.text
			a.infons = {'type':e.entityType}
			a.infons.update(e.metadata)

			if e.sourceEntityID is None:
				a.id = str(e.entityID)
			else:
				a.id = e.sourceEntityID

			assert not a.id in seenEntityIDs, "Multiple entities with the same ID (%s) found" % a.id
			seenEntityIDs.add(a.id)
			kindredID2BiocID[e.entityID] = a.id

			for start,end in e.position:
				l = bioc.BioCLocation(offset=start, length=(end-start))
				a.locations.append(l)

			passage.annotations.append(a)

		for r in kdoc.relations:
			assert isinstance(r,kindred.Relation)
			biocR = bioc.BioCRelation()
			biocR.infons = {'type':r.relationType}
			
			entitiesInRelation = r.entities
			argNames = r.argNames
			if argNames is None:
				argNames = [ "arg%d" % i for i,_ in enumerate(entitiesInRelation) ]

			for argName,entity in zip(argNames,entitiesInRelation):
				node = bioc.BioCNode(role=argName, refid=kindredID2BiocID[entity.entityID])
				biocR.nodes.append(node)

			passage.relations.append(biocR)

	return collection
Example #5
    def save_as_collection(list_of_pymedext_documents: List[Document]):
        """save a list of pymedext document as a bioc collection .
        It will return a bioc collection object.

        :param list_of_pymedext_documents: a list of Document
        :returns:  a bioc collection object
        """
        this_bioc_collection = bioc.BioCCollection()
        for this_pymedext_doc in list_of_pymedext_documents:
            this_bioc_doc = bioc.BioCDocument()
            for annot in this_pymedext_doc.annotations:
                if annot.type == "raw_text":
                    if this_bioc_collection.source == '':
                        this_bioc_collection.source = annot.source
                if annot.source == "BioCPassage":
                    this_passage = bioc.BioCPassage()
                    this_passage.text = annot.ngram
                    this_passage.offset = annot.span[0]
                    this_bioc_doc.add_passage(this_passage)
                    # passageAttributes to add
                elif annot.source == "BioCAnnotation":
                    this_annotation = bioc.BioCAnnotation()
                    this_annotation.infons = annot.attributes
                    this_annotation.id = annot.attributes["id"]
                    this_annotation.text = annot.ngram
                    thisLocation = bioc.BioCLocation(
                        annot.span[0], annot.span[1] - annot.span[0])
                    this_annotation.add_location(thisLocation)
                    this_bioc_doc.passages[-1].add_annotation(this_annotation)
            this_bioc_collection.add_document(this_bioc_doc)
        return this_bioc_collection
Example #6
def writeMarcXMLRecordToBiocFile(record,biocWriter):
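	# Skip records whose MARC 008/35-37 language code is not 'eng', then
	# write a BioC document with one passage per text source: the title and,
	# when present, the 520$a abstract.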
	metadata = record['008'].value()
	language = metadata[35:38]
	if language != 'eng':
		return

	recordid = record['001'].value()

	title = record.title()
	textSources = [title]

	abstract = None
	if '520' in record and 'a' in record['520']:
		abstract = record['520']['a']
		textSources.append(abstract)

	biocDoc = bioc.BioCDocument()
	biocDoc.id = recordid

	offset = 0
	for textSource in textSources:
		if isinstance(textSource,six.string_types):
			textSource = trimSentenceLengths(textSource)
			passage = bioc.BioCPassage()
			passage.text = textSource
			passage.offset = offset
			offset += len(textSource)
			biocDoc.add_passage(passage)

	biocWriter.writedocument(biocDoc)
Example #7
def pubmedxml2bioc(source):
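	# Convert each MEDLINE record into a BioC document: bibliographic fields
	# are stored as infons, and title/abstract chunks become passages with a
	# running character offset.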
	for pmDoc in processMedlineFile(source):
		biocDoc = bioc.BioCDocument()
		biocDoc.id = pmDoc["pmid"]
		biocDoc.infons['title'] = " ".join(pmDoc["title"])
		biocDoc.infons['pmid'] = pmDoc["pmid"]
		biocDoc.infons['year'] = pmDoc["pubYear"]
		biocDoc.infons['month'] = pmDoc["pubMonth"]
		biocDoc.infons['day'] = pmDoc["pubDay"]
		biocDoc.infons['journal'] = pmDoc["journal"]
		biocDoc.infons['journalISO'] = pmDoc["journalISO"]
		biocDoc.infons['authors'] = ", ".join(pmDoc["authors"])
		biocDoc.infons['chemicals'] = pmDoc['chemicals']
		biocDoc.infons['meshHeadings'] = pmDoc['meshHeadings']

		offset = 0
		for section in ["title","abstract"]:
			for textSource in pmDoc[section]:
				textSource = trimSentenceLengths(textSource)
				passage = bioc.BioCPassage()
				passage.infons['section'] = section
				passage.text = textSource
				passage.offset = offset
				offset += len(textSource)
				biocDoc.add_passage(passage)

		yield biocDoc
Example #8
def split_document(document, pattern=None):
    """
    Split one report into sections, determined by matches of the section-title pattern.

    Args:
        document(BioCDocument): one document that contains one passage.
        pattern: the regular expression patterns for section titles.

    Returns:
        BioCDocument: a new BioCDocument instance
    """
    if pattern is None:
        pattern = SECTION_TITLES

    new_document = bioc.BioCDocument()
    new_document.id = document.id
    new_document.infons = document.infons

    text = document.passages[0].text
    offset = document.passages[0].offset

    def create_passage(start, end, title=None):
        passage = bioc.BioCPassage()
        passage.offset = start + offset
        passage.text = text[start:end]
        if title is not None:
            passage.infons['title'] = (title[:-1].strip()
                                       if title[-1] == ':' else title.strip())
            passage.infons['type'] = 'title_1'
        strip(passage)
        return passage

    start = 0
    for matcher in pattern.finditer(text):
        logging.debug('Match: %s', matcher.group())
        # add last
        end = matcher.start()
        if end != start:
            passage = create_passage(start, end)
            if not is_empty(passage):
                new_document.add_passage(passage)

        start = end

        # add title
        end = matcher.end()
        passage = create_passage(start, end, text[start:end])
        if not is_empty(passage):
            new_document.add_passage(passage)

        start = end

    # add last piece
    end = len(text)
    if start < end:
        passage = create_passage(start, end)
        if not is_empty(passage):
            new_document.add_passage(passage)
    return new_document
Example #9
    def document(self, json_doc):
        doc = bioc.BioCDocument()
        doc.id = json_doc['id']
        doc.infons = json_doc['infons']
        doc.passages = [self.passage(p)
                        for p in json_doc['passages']]
        doc.relations = [self.relation(r)
                         for r in json_doc['relations']]
        return doc
Example #10
def pubtator2bioc(pubdoc: PubTator):
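    # Map a PubTator document onto a BioC document: title and abstract are
    # joined as the document text, annotations are converted one by one, and
    # relations are numbered R0, R1, ...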
    doc = bioc.BioCDocument()
    doc.id = pubdoc.pmid
    doc.text = '%s\n%s' % (pubdoc.title, pubdoc.abstract)
    for ann in pubdoc.annotations:
        biocann = pubtator2bioc_ann(ann)
        doc.add_annotation(biocann)
    for i, rel in enumerate(pubdoc.relations):
        biocrel = pubtator2bioc_rel(rel)
        biocrel.id = 'R%s' % i
        doc.add_relation(biocrel)
    return doc
Example #11
def text_to_bioc(list_of_text, type, **kwargs):
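    # Build a BioC object whose nesting matches `type`: 'p/s' yields a passage
    # of sentences, 'd/p' a document of passages, and 'd/p/s', 'c/d/p/s',
    # 'c/d/p' wrap those recursively. Offsets advance by len(text) + 1 to
    # account for a separator character between consecutive texts.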
    if type == 'p/s':
        offset = 0
        passage = bioc.BioCPassage()
        passage.offset = offset
        for s in list_of_text:
            sentence = bioc.BioCSentence()
            sentence.offset = offset
            sentence.text = s
            offset += len(s) + 1
            passage.add_sentence(sentence)
        return passage
    elif type == 'd/p/s':
        document = bioc.BioCDocument()
        passage = text_to_bioc(list_of_text, 'p/s')
        document.add_passage(passage)
        return document
    elif type == 'c/d/p/s':
        c = bioc.BioCCollection()
        d = text_to_bioc(list_of_text, 'd/p/s')
        c.add_document(d)
        return c
    elif type == 'd/p':
        document = bioc.BioCDocument()
        offset = 0
        for s in list_of_text:
            passage = bioc.BioCPassage()
            passage.offset = offset
            offset += len(s) + 1
            passage.text = s
            document.add_passage(passage)
        return document
    elif type == 'c/d/p':
        c = bioc.BioCCollection()
        d = text_to_bioc(list_of_text, 'd/p')
        c.add_document(d)
        return c
    else:
        raise KeyError(type)
Example #12
def translateNCRFPPintoBioc(doc_token, predict_results, file_name):
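    # Decode NCRF++ label sequences into BioC annotations: S-/B- labels open
    # a new entity, while M-/E- labels (when checkWrongState accepts the
    # label sequence) extend the text and length of the previous annotation.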
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    collection.add_document(document)
    document.id = file_name
    passage = bioc.BioCPassage()
    document.add_passage(passage)
    passage.offset = 0
    entity_id = 1

    sent_num = len(predict_results)
    for idx in range(sent_num):
        sent_length = len(predict_results[idx][0])
        sent_token = doc_token[(doc_token['sent_idx'] == idx)]

        assert sent_token.shape[0] == sent_length, "file {}, sent {}".format(
            file_name, idx)
        labelSequence = []

        for idy in range(sent_length):
            token = sent_token.iloc[idy]
            label = predict_results[idx][0][idy]
            labelSequence.append(label)

            if label[0] == 'S' or label[0] == 'B':
                anno_entity = bioc.BioCAnnotation()
                passage.add_annotation(anno_entity)
                anno_entity.id = str(entity_id)
                anno_entity.infons['type'] = label[2:]
                anno_entity_location = bioc.BioCLocation(
                    token['start'], token['end'] - token['start'])
                anno_entity.add_location(anno_entity_location)
                anno_entity.text = token['text']
                entity_id += 1

            elif label[0] == 'M' or label[0] == 'E':
                if checkWrongState(labelSequence):
                    anno_entity = passage.annotations[-1]

                    whitespacetoAdd = token['start'] - anno_entity.locations[0].end
                    anno_entity.text += " " * whitespacetoAdd + token['text']
                    anno_entity.locations[0].length = token['end'] - anno_entity.locations[0].offset

    with open(file_name + ".bioc.xml", 'w') as bioc_file:
        bioc.dump(collection, bioc_file)
Example #13
    def to_bioc_document(self) -> bioc.BioCDocument:
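        # Package this figure as a BioC document: the caption and body text
        # become typed passages, and url/id/files are kept as document infons.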
        doc = bioc.BioCDocument()
        doc.infons['url'] = self.url
        doc.infons['figure id'] = self.id
        doc.infons['files'] = json.dumps(self.files)

        self.caption.infons['type'] = 'caption'
        doc.add_passage(self.caption)

        for p in self.text:
            p.infons['type'] = 'text'
            doc.add_passage(p)

        return doc
Example #14
def save_predictions(ids, relevant, confidence, output):
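    # Emit a dated BioC collection in which every document carries a yes/no
    # relevance infon and a confidence score formatted to two decimals.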
    collection = bioc.BioCCollection()
    collection.source = 'PubMed'
    now = datetime.datetime.now()
    collection.date = '{}{:02d}{:02d}'.format(now.year, now.month, now.day)
    collection.key = 'collection.key'
    for i, id in enumerate(ids):
        document = bioc.BioCDocument()
        document.id = id
        document.infons['relevant'] = 'no' if relevant[i] == 0 else 'yes'
        if relevant[i] == 1:
            document.infons['confidence'] = '{:.2f}'.format(confidence[i][0])
        else:
            document.infons['confidence'] = '{:.2f}'.format(
                1 - confidence[i][0])
        collection.add_document(document)

    with open(output, 'w') as fp:
        bioc.dump(collection, fp, pretty_print=True)
Example #15
    def test_convert_to_vec(self, bioc_doc_input):
        # Arrange
        expected_vec = []
        sut = BiocSentences()
        doc = bioc.BioCDocument()
        # construct a bioc doc object given the input array
        for p in bioc_doc_input:
            bioc_p = bioc.BioCPassage()
            doc.add_passage(bioc_p)
            for s in p:
                bioc_sent = bioc.BioCSentence()
                bioc_sent.text = s
                bioc_p.add_sentence(bioc_sent)
                expected_vec.append(s)

        # Act
        actual = sut.convert_to_vec(doc)

        # Assert
        self.assertEqual(expected_vec, actual)
Example #16
def pmcxml2bioc(pmcxmlFilename, biocFilename):
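    # Stream every document of a PMC XML file into a BioC XML file, keeping a
    # running character offset across passages and tagging each passage with
    # its section and (when detected) subsection.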
    try:
        with bioc.BioCXMLDocumentWriter(biocFilename) as writer:
            for pmcDoc in processPMCFile(pmcxmlFilename):
                biocDoc = bioc.BioCDocument()
                biocDoc.id = pmcDoc["pmid"]
                biocDoc.infons['title'] = " ".join(
                    pmcDoc["textSources"]["title"])
                biocDoc.infons['pmid'] = pmcDoc["pmid"]
                biocDoc.infons['pmcid'] = pmcDoc["pmcid"]
                biocDoc.infons['doi'] = pmcDoc["doi"]
                biocDoc.infons['year'] = pmcDoc["pubYear"]
                biocDoc.infons['month'] = pmcDoc["pubMonth"]
                biocDoc.infons['day'] = pmcDoc["pubDay"]
                biocDoc.infons['journal'] = pmcDoc["journal"]
                biocDoc.infons['journalISO'] = pmcDoc["journalISO"]

                offset = 0
                for groupName, textSourceGroup in pmcDoc["textSources"].items():
                    subsection = None
                    for textSource in textSourceGroup:
                        textSource = trimSentenceLengths(textSource)
                        passage = bioc.BioCPassage()

                        subsectionCheck = textSource.lower().strip('01234567890. ')
                        if subsectionCheck in allowedSubsections:
                            subsection = subsectionCheck

                        passage.infons['section'] = groupName
                        passage.infons['subsection'] = subsection
                        passage.text = textSource
                        passage.offset = offset
                        offset += len(textSource)
                        biocDoc.add_passage(passage)

                writer.write_document(biocDoc)
    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" %
                           pmcxmlFilename)
Example #17
def text2document(id, text):
    """
    Convert text to a BioCDocument instance

    Args:
        id (str): BioCDocument id
        text (str): text

    Returns:
        BioCDocument: a BioCDocument instance
    """
    document = bioc.BioCDocument()
    document.id = id
    text = printable(text).replace('\r\n', '\n')

    passage = bioc.BioCPassage()
    passage.offset = 0
    passage.text = text
    document.add_passage(passage)

    return document
Example #18
def pubmedxml2bioc(
    source: Union[str, TextIO],
    tag_handlers: Dict[str, TagHandlerFunction] = {},
    trim_sentences=True,
) -> Iterable[bioc.BioCDocument]:
    """
    Args:
        source: path to the MEDLINE xml file
    """
    for pm_doc in process_medline_file(source, tag_handlers=tag_handlers):
        bioc_doc = bioc.BioCDocument()
        bioc_doc.id = pm_doc["pmid"]
        bioc_doc.infons["title"] = " ".join(pm_doc["title"])
        bioc_doc.infons["pmid"] = pm_doc["pmid"]
        bioc_doc.infons["pmcid"] = pm_doc["pmcid"]
        bioc_doc.infons["doi"] = pm_doc["doi"]
        bioc_doc.infons["year"] = pm_doc["pubYear"]
        bioc_doc.infons["month"] = pm_doc["pubMonth"]
        bioc_doc.infons["day"] = pm_doc["pubDay"]
        bioc_doc.infons["journal"] = pm_doc["journal"]
        bioc_doc.infons["journalISO"] = pm_doc["journalISO"]
        bioc_doc.infons["authors"] = ", ".join(pm_doc["authors"])
        bioc_doc.infons["chemicals"] = pm_doc["chemicals"]
        bioc_doc.infons["meshHeadings"] = pm_doc["meshHeadings"]
        bioc_doc.infons["supplementaryMesh"] = pm_doc["supplementaryMesh"]
        bioc_doc.infons["publicationTypes"] = pm_doc["publicationTypes"]

        offset = 0
        for section in ["title", "abstract"]:
            for text_source in pm_doc[section]:
                if trim_sentences:
                    text_source = trim_sentence_lengths(text_source)
                passage = bioc.BioCPassage()
                passage.infons["section"] = section
                passage.text = text_source
                passage.offset = offset
                offset += len(text_source)
                bioc_doc.add_passage(passage)

        yield bioc_doc
Example #19
def create_bioc_document_from_document_json(document):
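    # Convert an annotation-tool JSON export (text, denotations, relations)
    # into a BioC document, keeping only denotations from user id 0 and
    # relations whose subject and object both come from that user.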
    b_document = bioc.BioCDocument()
    b_document.id = document['sourceid']
    passage = bioc.BioCPassage()
    passage.text = document['text']
    passage.offset = 0
    annotation_user_map = {}
    for denotation in document['denotations']:
        annotation_user_map[denotation['id']] = denotation['userId']
        if denotation['userId'] != 0:
            continue
        annotation = bioc.BioCAnnotation()
        annotation.id = denotation['id']
        span = denotation['span']
        location = bioc.BioCLocation(span['begin'], span['end'] - span['begin'])
        annotation.locations.append(location)
        annotation.text = document['text'][span['begin']:span['end']]
        annotation.infons = denotation['obj']
        passage.add_annotation(annotation)
    for relation in document['relations']:
        subj_from_current_user = annotation_user_map[relation['subj']] == 0
        obj_from_current_user = annotation_user_map[relation['obj']] == 0
        if not (subj_from_current_user and obj_from_current_user):
            continue
        b_relation = bioc.BioCRelation()
        b_relation.id = relation['id']
        start_node = bioc.BioCNode('', '')
        end_node = bioc.BioCNode('', '')
        start_node.refid = relation['subj']
        end_node.refid = relation['obj']
        b_relation.add_node(start_node)
        b_relation.add_node(end_node)
        b_relation.infons = relation['pred']
        passage.add_relation(b_relation)
    b_document.add_passage(passage)
    return b_document
Example #20
def uimaxmi2bioc(xmiFilename, biocFilename):
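	# Pull the document title and body text (sofaString) out of a UIMA XMI
	# file and write them as a single-passage BioC document.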
	tree = etree.parse(xmiFilename)
	root = tree.getroot()

	metadataNode = root.find('{http:///de/tudarmstadt/ukp/dkpro/core/api/metadata/type.ecore}DocumentMetaData')
	documentTitle = metadataNode.attrib['documentTitle']

	contentNode = root.find('{http:///uima/cas.ecore}Sofa')
	content = contentNode.attrib['sofaString']

	with bioc.iterwrite(biocFilename) as writer:
		biocDoc = bioc.BioCDocument()
		biocDoc.id = None
		biocDoc.infons['title'] = documentTitle

		passage = bioc.BioCPassage()
		passage.infons['section'] = 'article'
		passage.text = content
		passage.offset = 0
		biocDoc.add_passage(passage)

		writer.writedocument(biocDoc)
Example #21
    def to_bioc_xml(self, outdir):
        outpath = os.path.join(outdir, self.file_name + '.bioc.xml')
        writer = bioc.BioCWriter()
        writer.collection = bioc.BioCCollection()
        collection = writer.collection
        document = bioc.BioCDocument()
        document.id = self.file_name

        passage = bioc.BioCPassage()
        passage.offset = '0'
        document.add_passage(passage)
        collection.add_document(document)

        # Add annotations that already have bioc annotations
        for anno in self.get_annotations():
            passage.add_annotation(anno.bioc_anno)

        for relat in self.get_relations():
            # Create new BioCRelation
            relation = bioc.bioc_relation.BioCRelation()
            relation.id = relat.id
            relation.put_infon('type', relat.type)

            # Reference the nodes that contain the annotations
            node1 = bioc.bioc_node.BioCNode()
            node1.role = 'annotation 1'
            node1.refid = relat.annotation_1.id
            relation.add_node(node1)

            node2 = bioc.bioc_node.BioCNode()
            node2.role = 'annotation 2'
            node2.refid = relat.annotation_2.id
            relation.add_node(node2)

            passage.add_relation(relation)

        writer.write(outpath)
Example #22
def evaluate_via_bioc(test_docs,
                      crf,
                      extractor,
                      prediction_dir,
                      made_base_dir=None):
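    # Run the CRF over every tokenized test document, merge consecutive B-/I-
    # predictions into entity spans, write one BioC XML prediction file per
    # document, and optionally score against the MADE reference annotations.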
    print('Total documents for evaluation : {}'.format(len(test_docs)))

    if not os.path.exists(prediction_dir):
        os.makedirs(prediction_dir)

    existing_files = glob.glob('{0}/*'.format(prediction_dir))
    existing_files_removed = 0
    for f in existing_files:
        os.remove(f)
        existing_files_removed += 1

    print('Existing files removed : {}'.format(existing_files_removed))

    prediction_documents_written = 0
    reference_filenames = []
    for test_doc in test_docs:

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        document.id = test_doc.filename
        collection.add_document(document)
        passage = bioc.BioCPassage()
        passage.offset = 0
        document.add_passage(passage)

        next_annotation_id = 1

        # now an annotation can be written for each label prediction
        for sentence in test_doc.tokenized_doc.sentences:
            sentence_tokens = []
            # gather tokens in a sentence
            for token_offset_pair in sentence:
                token = test_doc.text[
                    token_offset_pair[0]:token_offset_pair[1]]
                sentence_tokens.append(token)
            if len(sentence_tokens) == 0:
                continue

            sentence_features = extractor.sent2features(sentence_tokens)
            sentence_pred = crf.predict([sentence_features])[0]

            if len(sentence_pred) != len(sentence):
                print('Sentence Features Length : {}'.format(
                    len(sentence_features)))
                print('Sentence Pred Length : {}'.format(len(sentence_pred)))
                print('Sentence Length : {}'.format(len(sentence)))

            # walk manually through the predictions and add spans as appropriate
            token_idx = 0
            while token_idx < len(sentence_pred):
                token_pred = sentence_pred[token_idx]
                if token_pred != 'O':
                    base_label = token_pred.replace('B-', '').replace('I-', '')
                    start_offset = sentence[token_idx][0]
                    end_offset = sentence[token_idx][1]
                    # now let's look to the right as long as we see tokens which are part of this same label
                    while token_idx + 1 < len(sentence_pred) and sentence_pred[
                            token_idx + 1] == ('I-' + base_label):
                        # advance the token
                        token_idx += 1
                        # update the end of this span
                        end_offset = sentence[token_idx][1]

                    # finally we have an annotation that we can add
                    annotation = bioc.BioCAnnotation()

                    annotation.infons['type'] = base_label
                    annotation.text = test_doc.text[start_offset:end_offset]
                    # current reference replaces newlines with literal '\n'
                    annotation.text = annotation.text.replace('\n', '\\n').replace('\r', '\\r')
                    annotation.id = str(next_annotation_id)
                    location = bioc.BioCLocation(start_offset,
                                                 end_offset - start_offset)

                    next_annotation_id += 1
                    annotation.add_location(location)
                    passage.add_annotation(annotation)

                # advance the token no matter what happened above
                token_idx += 1

        prediction_filename = os.path.join(
            prediction_dir, '{}.bioc.xml'.format(test_doc.filename))

        if made_base_dir is not None:
            reference_filename = os.path.join(
                os.path.join(made_base_dir, 'annotations'),
                '{}.bioc.xml'.format(test_doc.filename))
            reference_filenames.append(reference_filename)

        with open(prediction_filename, 'w') as fp:
            bioc.dump(collection, fp)
            prediction_documents_written += 1

    print('Total prediction documents written : {}'.format(
        prediction_documents_written))

    # finally we can invoke some evaluation (if enabled)
    if made_base_dir is not None:
        annotation_dir = os.path.join(made_base_dir, 'annotations')
        text_dir = os.path.join(made_base_dir, 'corpus')
        # first param can be an actual directory (string) or a list of filepaths
        get_f_scores(reference_filenames, prediction_dir, text_dir)
Example #23
def test(data, opt, predict_dir):
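    # End-to-end test-set evaluation: load the pretrained NER and RE models
    # from opt.ner_dir / opt.re_dir, predict entities (or reuse gold ones
    # when opt.use_gold_ner is set), then classify relations between them.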
    test_token, test_entity, test_relation, test_name = preprocess.loadPreprocessData(
        data.test_dir)

    # evaluate on test data and output results in bioc format, one file per document

    data.load(opt.data_file)
    data.MAX_SENTENCE_LENGTH = -1
    data.show_data_summary()

    data.fix_alphabet()
    seq_model = SeqModel(data)
    seq_model.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'model.pkl')))
    ner_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim+data.HP_char_hidden_dim+data.feature_emb_dims[data.feature_name2id['[Cap]']]+ \
                         data.feature_emb_dims[data.feature_name2id['[POS]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.ner_dir, 'hidden_{}.pkl'.format(i))))
        ner_hiddenlist.append(temp)

    ner_wordrep = WordRep(data, False, True, True, data.use_char)
    ner_wordrep.load_state_dict(
        torch.load(os.path.join(opt.ner_dir, 'wordrep.pkl')))

    classify_model = ClassifyModel(data)
    classify_model.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'model.pkl')))
    re_hiddenlist = []
    for i in range(opt.hidden_num):
        if i == 0:
            input_size = data.word_emb_dim + data.feature_emb_dims[data.feature_name2id['[POS]']]+\
                         2*data.re_feature_emb_dims[data.re_feature_name2id['[POSITION]']]
            output_size = data.HP_hidden_dim
        else:
            input_size = data.HP_hidden_dim
            output_size = data.HP_hidden_dim

        temp = HiddenLayer(data, input_size, output_size)
        temp.load_state_dict(
            torch.load(os.path.join(opt.re_dir, 'hidden_{}.pkl'.format(i))))
        re_hiddenlist.append(temp)

    re_wordrep = WordRep(data, True, False, True, False)
    re_wordrep.load_state_dict(
        torch.load(os.path.join(opt.re_dir, 'wordrep.pkl')))

    for i in tqdm(range(len(test_name))):
        doc_name = test_name[i]
        doc_token = test_token[i]
        doc_entity = test_entity[i]

        if opt.use_gold_ner:
            entities = []
            for _, e in doc_entity.iterrows():
                entity = Entity()
                entity.create(e['id'], e['type'], e['start'], e['end'],
                              e['text'], e['sent_idx'], e['tf_start'],
                              e['tf_end'])
                entities.append(entity)
        else:

            ncrf_data = ner.generateDataForOneDoc(doc_token, doc_entity)

            data.raw_texts, data.raw_Ids = ner.read_instanceFromBuffer(
                ncrf_data, data.word_alphabet, data.char_alphabet,
                data.feature_alphabets, data.label_alphabet,
                data.number_normalized, data.MAX_SENTENCE_LENGTH)

            decode_results = ner_evaluateWhenTest(data, ner_wordrep,
                                                  ner_hiddenlist, seq_model)

            entities = ner.translateNCRFPPintoEntities(doc_token,
                                                       decode_results,
                                                       doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start,
                                                     entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(
            doc_token, entities, doc_name, data)

        relations = re_evaluateWhenTest(
            re_wordrep, re_hiddenlist, classify_model, test_X, data,
            test_other,
            data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'annotation 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'annotation 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(predict_dir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)
Example #24
def predict(opt, data):
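    # Inference pipeline: load the NER and RE weights (on CPU or remapped to
    # another GPU), then predict entities and relations for every input file
    # and write the result as a BioC XML file.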

    seq_model = SeqModel(data)
    if opt.test_in_cpu:
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_model.load_state_dict(torch.load(os.path.join(opt.output, 'ner_model.pkl'), map_location={cuda_src:cuda_dst}))

    seq_wordseq = WordSequence(data, False, True, True, True)
    if opt.test_in_cpu:
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        seq_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 'ner_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    classify_model = ClassifyModel(data)
    if opt.test_in_cpu:
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_model.load_state_dict(torch.load(os.path.join(opt.output, 're_model.pkl'), map_location={cuda_src:cuda_dst}))

    classify_wordseq = WordSequence(data, True, False, True, False)
    if opt.test_in_cpu:
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location='cpu'))
    else:
        cuda_src = 'cuda:{}'.format(opt.old_gpu)
        cuda_dst = 'cuda:{}'.format(opt.gpu)
        classify_wordseq.load_state_dict(torch.load(os.path.join(opt.output, 're_wordseq.pkl'), map_location={cuda_src:cuda_dst}))

    input_files = [f for f in listdir(opt.input) if isfile(join(opt.input,f)) and f[0]!='.']

    # for idx in tqdm(range(len(input_files))):
    for idx in range(len(input_files)):

        start = time.time()
        fileName = join(opt.input,input_files[idx])
        doc_name = input_files[idx]

        doc_token = processOneFile(fileName)

        doc = generateDataForOneFile(doc_token)

        raw_texts, raw_Ids = read_instance(doc, data.word_alphabet, data.char_alphabet,
                                                                   data.feature_alphabets, data.label_alphabet,
                                                                   data.number_normalized,
                                                                   data.MAX_SENTENCE_LENGTH)

        decode_results = evaluateWhenTest(data, seq_wordseq, seq_model, raw_Ids)

        entities = ner.translateNCRFPPintoEntities(doc_token, decode_results, doc_name)

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for entity in entities:
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity.id
            anno_entity.infons['type'] = entity.type
            anno_entity_location = bioc.BioCLocation(entity.start, entity.getlength())
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity.text

        test_X, test_other = relation_extraction.getRelationInstanceForOneDoc(doc_token, entities, doc_name, data)

        relations = relation_extraction.evaluateWhenTest(classify_wordseq, classify_model, test_X, data, test_other, data.re_feature_alphabets[data.re_feature_name2id['[RELATION]']])

        for relation in relations:
            bioc_relation = bioc.BioCRelation()
            passage.add_relation(bioc_relation)
            bioc_relation.id = relation.id
            bioc_relation.infons['type'] = relation.type

            node1 = bioc.BioCNode(relation.node1.id, 'argument 1')
            bioc_relation.add_node(node1)
            node2 = bioc.BioCNode(relation.node2.id, 'argument 2')
            bioc_relation.add_node(node2)

        with open(os.path.join(opt.predict, doc_name + ".bioc.xml"), 'w') as fp:
            bioc.dump(collection, fp)

        end = time.time()
        logging.info("process %s complete with %.2fs" % (input_files[idx], end-start))



    logging.info("test finished")
Example #25
def pmcxml2bioc(
    source: Union[str, TextIO],
    tag_handlers: Dict[str, TagHandlerFunction] = {},
    trim_sentences: bool = False,
    all_xml_path_infon: bool = False,
    mark_citations: bool = True,
) -> Iterator[bioc.BioCDocument]:
    """
    Convert a PMC XML file into its Bioc equivalent

    Args:
        source: The text or file handle containing the PMC XML
        tag_handlers: custom overrides for handling specific XML tags.
        trim_sentences: Trim text content to a maximum sentence length.
        all_xml_path_infon: Add an xml_path infon element to every passage to describe where in the XML hierarchy this text is from (always added to table/figure elements even without this flag).
        mark_citations: Add 0-length bioc annotations for in-text citations.

    Raises:
        RuntimeError: On any parsing errors

    Returns:
        An iterator over the newly generated Bioc documents
    """
    try:
        for pmc_doc in process_pmc_file(source, tag_handlers=tag_handlers):
            bioc_doc = bioc.BioCDocument()
            bioc_doc.id = pmc_doc["pmid"]
            bioc_doc.infons["title"] = " ".join(
                [p.text for p in pmc_doc["textSources"]["title"]])
            bioc_doc.infons["pmid"] = pmc_doc["pmid"]
            bioc_doc.infons["pmcid"] = pmc_doc["pmcid"]
            bioc_doc.infons["doi"] = pmc_doc["doi"]
            bioc_doc.infons["year"] = pmc_doc["pubYear"]
            bioc_doc.infons["month"] = pmc_doc["pubMonth"]
            bioc_doc.infons["day"] = pmc_doc["pubDay"]
            bioc_doc.infons["journal"] = pmc_doc["journal"]
            bioc_doc.infons["journalISO"] = pmc_doc["journalISO"]

            offset = 0
            for group_name, text_source_group in pmc_doc["textSources"].items():
                subsection = None
                for chunk in text_source_group:
                    text_source, annotations = strip_annotation_markers(
                        chunk.text, pmc_doc['annotations'])

                    if trim_sentences:
                        text_source = trim_sentence_lengths(text_source)

                    passage = bioc.BioCPassage()

                    subsection_check = text_source.lower().strip("01234567890. ")
                    if subsection_check in allowed_subsections:
                        subsection = subsection_check

                    passage.infons["section"] = group_name
                    passage.infons["subsection"] = subsection

                    if chunk.xml_path:
                        if all_xml_path_infon or set(
                                chunk.xml_path.split('/')) & {
                                    'thead',
                                    'tbody',
                                    'fig',
                                }:
                            passage.infons["xml_path"] = chunk.xml_path

                    passage.text = text_source
                    passage.offset = offset

                    if not trim_sentences and mark_citations:
                        for annotation in annotations:
                            for location in annotation.locations:
                                location.offset += offset
                            passage.add_annotation(annotation)

                    offset += len(text_source)
                    bioc_doc.add_passage(passage)

            yield bioc_doc

    except etree.ParseError:
        raise RuntimeError("Parsing error in PMC xml file: %s" % source)
Example #26
def test2(test_token, test_entity, test_relation, test_name, result_dumpdir):
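    # Rebuild BioC XML from pickled relation-classification results: copy the
    # gold entities of each document, then add every predicted relation that
    # passes the vocabulary lookup and the type-constraint check.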
    logging.info("loading ... vocab")
    relation_vocab = pickle.load(
        open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), 'rb'))

    logging.info("loading ... result")
    results = pickle.load(open(os.path.join(opt.output, 'results.pkl'), "rb"))

    for i in tqdm(range(len(test_relation))):

        doc_entity = test_entity[i]
        doc_name = test_name[i]

        collection = bioc.BioCCollection()
        document = bioc.BioCDocument()
        collection.add_document(document)
        document.id = doc_name
        passage = bioc.BioCPassage()
        document.add_passage(passage)
        passage.offset = 0

        for _, entity in doc_entity.iterrows():
            anno_entity = bioc.BioCAnnotation()
            passage.add_annotation(anno_entity)
            anno_entity.id = entity['id']
            anno_entity.infons['type'] = entity['type']
            anno_entity_location = bioc.BioCLocation(
                entity['start'], entity['end'] - entity['start'])
            anno_entity.add_location(anno_entity_location)
            anno_entity.text = entity['text']

        relation_id = 1
        for result in results:

            if doc_name == result['doc_name']:

                former = doc_entity[(
                    doc_entity['id'] == result['former_id'])].iloc[0]
                latter = doc_entity[(
                    doc_entity['id'] == result['latter_id'])].iloc[0]

                relation_type = relation_vocab.lookup_id2str(result['type'])
                if relation_type == '<unk>':
                    continue
                elif not my_utils.relationConstraint1(relation_type,
                                                      former['type'],
                                                      latter['type']):
                    continue
                else:
                    bioc_relation = bioc.BioCRelation()
                    passage.add_relation(bioc_relation)
                    bioc_relation.id = str(relation_id)
                    relation_id += 1
                    bioc_relation.infons['type'] = relation_type

                    node1 = bioc.BioCNode(former['id'], 'annotation 1')
                    bioc_relation.add_node(node1)
                    node2 = bioc.BioCNode(latter['id'], 'annotation 2')
                    bioc_relation.add_node(node2)

        with open(os.path.join(result_dumpdir, doc_name + ".bioc.xml"),
                  'w') as fp:
            bioc.dump(collection, fp)