コード例 #1
0
    def get_bioc_relations(self, docid, relations):
        """Wrap gene relations in a fresh BioCDocument.

        Every entry of ``relations`` is turned into one BioCRelation that
        serializes along these lines::

            <relation id="5618#7534">
                <infon key="Gene1">5618</infon>
                <infon key="Gene2">7534</infon>
                <infon key="relation">PPIm</infon>
            </relation>

        Args:
            docid: Value stored as the id of the returned document.
            relations: Iterable whose entries are themselves iterables of
                gene identifiers. An entry with exactly two members yields
                Gene1/Gene2 from those members; any other size reuses the
                first member for both genes.

        Returns:
            A BioCDocument whose ``relations`` list contains one
            BioCRelation per entry, in iteration order.

        Raises:
            IndexError: If an entry of ``relations`` is empty.
        """
        document = BioCDocument()
        document.id = docid
        document.relations = []

        for members in relations:
            relation = BioCRelation()
            genes = list(members)

            first_gene = genes[0]
            # Only a genuine pair contributes a distinct second gene;
            # otherwise the relation points back at the first gene.
            second_gene = genes[1] if len(genes) == 2 else first_gene

            relation.id = "{}#{}".format(first_gene, second_gene)
            relation.infons = {
                "relation": "PPIm",
                "Gene1": first_gene,
                "Gene2": second_gene,
            }
            document.relations.append(relation)

        return document
コード例 #2
0
def brat2bioc_doc(bratdoc: BratDocument) -> BioCDocument:
    """Translate a brat document into a BioC document.

    Entities are added as annotations; relations, events and equivalence
    relations are all added as BioC relations. Attributes and notes are not
    added as separate objects: their content is written into the infons of
    the object returned by ``get(refid)`` on the new document.

    Note that each equivalence relation in ``bratdoc`` is modified in place:
    its position in ``bratdoc.equiv_relations`` is appended to its id before
    conversion.

    Args:
        bratdoc: The brat document to convert.

    Returns:
        The newly created BioC document.
    """
    converted = BioCDocument()
    converted.id = bratdoc.id
    converted.text = bratdoc.text

    # Entities -> annotations.
    for entity in bratdoc.entities:
        converted.add_annotation(brat2bioc_entity(entity))

    # Relations -> relations.
    for relation in bratdoc.relations:
        converted.add_relation(brat2bioc_relation(relation))

    # Events -> relations.
    for event in bratdoc.events:
        converted.add_relation(brat2bioc_event(event))

    # Equivalence relations -> relations; the index is appended to the id.
    for position, equiv in enumerate(bratdoc.equiv_relations):
        equiv.id = f'{equiv.id!s}{position}'
        converted.add_relation(brat2bioc_equiv(equiv))

    # Attributes are folded into the infons of the object they refer to.
    for attribute in bratdoc.attributes:
        target = converted.get(attribute.refid)
        target.infons['note_id'] = attribute.id
        target.infons['attributes'] = ' '.join(sorted(attribute.attributes))

    # Notes are folded into the infons of the object they refer to.
    for note in bratdoc.notes:
        target = converted.get(note.refid)
        target.infons['note_id'] = note.id
        target.infons['type'] = note.type
        target.infons['note'] = note.text

    return converted
コード例 #3
0
 def __parse_document(self, tree):
     """Build a BioCDocument from a parsed element.

     The document id comes from the text of the ``id`` child and the infons
     from ``__parse_infons``. Direct ``passage``, ``annotation`` and
     ``relation`` children are then parsed and attached, in that order.

     Args:
         tree: Element providing ``findtext`` and ``findall``.

     Returns:
         The populated BioCDocument.
     """
     doc = BioCDocument()
     doc.id = tree.findtext('id')
     doc.infons = self.__parse_infons(tree)

     # (child tag, parser for that child, method that attaches the result)
     handlers = (
         ('passage', self.__parse_passage, doc.add_passage),
         ('annotation', self.__parse_annotation, doc.add_annotation),
         ('relation', self.__parse_relation, doc.add_relation),
     )
     for tag, parse, attach in handlers:
         for node in tree.findall(tag):
             attach(parse(node))

     return doc
コード例 #4
0
def convert_pubtator(input_path, output_path):
    """Convert a PubTator annotation listing to BioC XML.

    The collection header is written first, then every article is turned
    into a BioC document and written on its own as soon as it is read, and
    finally the closing collection tag is written.

    Keyword Arguments:
    input_path -- the path of the PubTator annotation file
    output_path -- the path to write the BioC XML file to
    """

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Documents are written one at a time because serializing the whole
        # collection through the writer hangs. Serialize the empty collection
        # and split off its last line (the closing tag) to get head and tail.
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title = article["title"]
            abstract = article["abstract"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = title

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # PubTator offsets are document-wide: the title, one separator
            # character, then the abstract. The abstract therefore starts at
            # len(title) + 1 (previously the abstract text itself was stored
            # here by mistake).
            abstract_passage.offset = str(len(title) + 1)
            abstract_passage.text = abstract

            # Annotation ids run continuously across title and abstract.
            id_index = 0
            tagged_passages = (
                (title_passage, article["title_annot"]),
                (abstract_passage, article["abstract_annot"]),
            )
            for passage, tags in tagged_passages:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index)
                    )
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway parent so that only the
            # <document> element itself is serialized.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
コード例 #5
0
def convert_pubtator(input_file, output_file=None):
    """Convert a PubTator annotation listing to BioC XML.

    The collection header is written first, then every article is turned
    into a BioC document and written on its own as soon as it is read, and
    finally the closing collection tag is written.

    Keyword Arguments:
    input_file -- the path of the PubTator annotation file
    output_file -- the path to write the converted XML to
                   (defaults to "bioc-converted-docs.xml")
    """
    if output_file is None:
        output_file = "bioc-converted-docs.xml"

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    with open(output_file, 'wb') as g:

        # Documents are written one at a time because serializing the whole
        # collection through the writer hangs. Serialize the empty collection
        # and cut off the closing tag to obtain the head of the file.
        xml_header = writer.tostring('UTF-8')
        # The file is opened in binary mode, so the tail must be bytes too;
        # a str here raises TypeError on write under Python 3.
        xml_tail = b'</collection>\n'
        xml_head = xml_header[:-len(xml_tail)]
        g.write(xml_head)

        article_generator = bioconcepts2pubtator_offsets(input_file)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["Document ID"]

            title = article["Title"]
            abstract = article["Abstract"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = title

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # PubTator offsets are document-wide: the title, one separator
            # character, then the abstract. The abstract therefore starts at
            # len(title) + 1 (previously the abstract text itself was stored
            # here by mistake).
            abstract_passage.offset = str(len(title) + 1)
            abstract_passage.text = abstract

            # Annotation ids run continuously across title and abstract.
            id_index = 0
            tagged_passages = (
                (title_passage, article["Title_Annot"]),
                (abstract_passage, article["Abstract_Annot"]),
            )
            for passage, tags in tagged_passages:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index)
                    )
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway parent so that only the
            # <document> element itself is serialized.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            g.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        g.write(xml_tail)