def write_text_bioc(self, output_path):
    """Write the stored abstract passages to a BioC file.

    Every value of ``self.abstract_dict`` becomes the text of one passage of
    a single document, which is wrapped in a new collection and handed to a
    ``BioCWriter`` created for ``output_path``. No annotations are written.

    Args:
        output_path: Destination passed to ``BioCWriter``.
    """
    bioc_writer = BioCWriter(output_path)
    bioc_collection = BioCCollection()
    # TODO: add an option for writing text only or annotations as well.
    # To keep the document as it is: collection.add_document(self.document)
    bioc_document = BioCDocument()
    for passage_text in self.abstract_dict.values():
        bioc_passage = BioCPassage()
        bioc_passage.text = passage_text
        bioc_document.add_passage(bioc_passage)
    bioc_collection.add_document(bioc_document)

    # A single pre-formatted argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('BioC output path %s' % (output_path,))
    bioc_writer.collection = bioc_collection
    bioc_writer.write()
Ejemplo n.º 2
0
    def test_should_get_gene_names_normalised(self, list_dict, expected_genes):
        """Check the gene names returned for a document with one passage.

        Each entry of ``list_dict`` is used as the infons of one annotation on
        that passage; the result must equal ``set(expected_genes)``.
        """
        # Arrange
        sut = BiocAnnotationGenes()
        document = BioCDocument()
        passage = BioCPassage()
        document.add_passage(passage)

        for infons in list_dict:
            gene_annotation = BioCAnnotation()
            gene_annotation.infons = infons
            passage.add_annotation(gene_annotation)

        # Act
        actual = sut.get_gene_names_normalised(document)

        # Assert
        self.assertEqual(set(expected_genes), actual)
Ejemplo n.º 3
0
    def test_should_get_gene_names_to_normalised_dict(
            self, list_gene_dict_in_passage, expected_dict):
        """Check the name-to-normalised mapping built from a multi-passage document.

        Every inner list of ``list_gene_dict_in_passage`` becomes one passage;
        each dict in it supplies both the text (its ``"text"`` entry) and the
        infons of one annotation. The result must equal ``expected_dict``.
        """
        # Arrange
        sut = BiocAnnotationGenes()
        document = BioCDocument()
        for passage_genes in list_gene_dict_in_passage:
            passage = BioCPassage()
            document.add_passage(passage)

            for gene_infons in passage_genes:
                gene_annotation = BioCAnnotation()
                gene_annotation.text = gene_infons["text"]
                gene_annotation.infons = gene_infons
                passage.add_annotation(gene_annotation)

        # Act
        actual = sut.get_gene_names_to_normalised_dict(document)

        # Assert
        self.assertEqual(expected_dict, actual)
Ejemplo n.º 4
0
 def __parse_passage(self, tree):
     """Build a ``BioCPassage`` from a passage XML element.

     Reads the ``offset`` child as an integer, the infons via
     ``__parse_infons``, the optional ``text`` child, and then every
     ``sentence``, ``annotation`` and ``relation`` child in that order.

     Args:
         tree: The passage element.

     Returns:
         The populated ``BioCPassage``.
     """
     result = BioCPassage()
     result.offset = int(tree.findtext('offset'))
     result.infons = self.__parse_infons(tree)

     # findtext returns None only when there is no <text> child at all.
     passage_text = tree.findtext('text')
     if passage_text is not None:
         result.text = passage_text

     for sentence_elem in tree.findall('sentence'):
         result.add_sentence(self.__parse_sentence(sentence_elem))
     for annotation_elem in tree.findall('annotation'):
         result.add_annotation(self.__parse_annotation(annotation_elem))
     for relation_elem in tree.findall('relation'):
         result.add_relation(self.__parse_relation(relation_elem))
     return result
Ejemplo n.º 5
0
 def __read(self):
     """Consume parser events until the end of the next ``<document>``.

     Events are pulled through ``self.__has_next()`` / ``self.__next_event()``
     (defined outside this block) and dispatched on ``self.__state``:

         0 - outside a collection, 1 - inside ``<collection>``,
         2 - inside ``<document>``, 3 - inside ``<passage>``,
         4 - inside ``<sentence>``.

     The method returns as soon as a document's end tag is seen, leaving the
     finished document in ``self.__document``. When the collection's end tag
     is seen, ``self.__document`` is set to ``None``. If the events run out,
     the loop simply ends.
     """
     while self.__has_next():
         event, elem = self.__next_event()
         # State 0: wait for the opening <collection> tag.
         if self.__state == 0:
             if event == 'start':
                 if elem.tag == 'collection':
                     self.__state = 1
                     self.__collection = BioCCollection()
                     # collection information
         # State 1: inside <collection>, between documents.
         elif self.__state == 1:
             if event == 'start':
                 if elem.tag == 'document':
                     self.__document = BioCDocument()
                     self.__state = 2
             elif event == 'end':
                 # Values are read on 'end' events, once elem.text is complete.
                 if elem.tag == 'source':
                     self.__collection.source = elem.text
                 elif elem.tag == 'date':
                     self.__collection.date = elem.text
                 elif elem.tag == 'key':
                     self.__collection.key = elem.text
                 elif elem.tag == 'infon':
                     self.__collection.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'collection':
                     # End of the collection: reset the per-document state.
                     self.__state = 0
                     self.__document = None
                     self.__passage = None
                     self.__sentence = None
         # State 2: inside <document>, outside any passage.
         elif self.__state == 2:
             if event == 'start':
                 if elem.tag == 'passage':
                     self.__passage = BioCPassage()
                     self.__state = 3
                 # Annotations and relations are read by dedicated helpers
                 # (defined outside this block).
                 elif elem.tag == 'annotation':
                     self.__document.add_annotation(self.__read_annotation(elem))
                 elif elem.tag == 'relation':
                     self.__document.add_relation(self.__read_relation(elem))
             elif event == 'end':
                 if elem.tag == 'id':
                     self.__document.id = elem.text
                 elif elem.tag == 'infon':
                     self.__document.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'document':
                     # Document complete: stop here so the caller can take
                     # self.__document before parsing continues.
                     self.__state = 1
                     return
         # State 3: inside <passage>, outside any sentence.
         elif self.__state == 3:
             if event == 'start':
                 if elem.tag == 'sentence':
                     self.__sentence = BioCSentence()
                     self.__state = 4
                 elif elem.tag == 'annotation':
                     self.__passage.add_annotation(self.__read_annotation(elem))
                 elif elem.tag == 'relation':
                     self.__passage.add_relation(self.__read_relation(elem))
             elif event == 'end':
                 if elem.tag == 'offset':
                     self.__passage.offset = int(elem.text)
                 elif elem.tag == 'text':
                     self.__passage.text = elem.text
                 elif elem.tag == 'infon':
                     self.__passage.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'passage':
                     # Passage complete: attach it to the current document.
                     self.__state = 2
                     if self.__passage is not None:
                         self.__document.add_passage(self.__passage)
         # State 4: inside <sentence>.
         elif self.__state == 4:
             if event == 'start':
                 if elem.tag == 'annotation':
                     self.__sentence.add_annotation(self.__read_annotation(elem))
                 elif elem.tag == 'relation':
                     self.__sentence.add_relation(self.__read_relation(elem))
             elif event == 'end':
                 if elem.tag == 'offset':
                     self.__sentence.offset = int(elem.text)
                 elif elem.tag == 'text':
                     self.__sentence.text = elem.text
                 elif elem.tag == 'infon':
                     self.__sentence.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'sentence':
                     # Sentence complete: attach it to the current passage.
                     self.__state = 3
                     if self.__sentence is not None:
                         self.__passage.add_sentence(self.__sentence)
Ejemplo n.º 6
0
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.
    """

    def __init__(self, source: Union[str, BinaryIO]):
        # if not isinstance(file, str):
        #     file = str(file)
        self.file = source
        self.__context = iter(etree.iterparse(self.file, events=('start', 'end')))
        self.__state = 0
        self.__event = None
        self.__elem = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document
        """
        if self.__document is None:
            raise StopIteration
        else:
            document = self.__document
            self.__read()
            return document

    def __read(self):
        while self.__has_next():
            event, elem = self.__next_event()
            if self.__state == 0:
                if event == 'start':
                    if elem.tag == 'collection':
                        self.__state = 1
                        self.__collection = BioCCollection()
                        # collection information
            elif self.__state == 1:
                if event == 'start':
                    if elem.tag == 'document':
                        self.__document = BioCDocument()
                        self.__state = 2
                elif event == 'end':
                    if elem.tag == 'source':
                        self.__collection.source = elem.text
                    elif elem.tag == 'date':
                        self.__collection.date = elem.text
                    elif elem.tag == 'key':
                        self.__collection.key = elem.text
                    elif elem.tag == 'infon':
                        self.__collection.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'collection':
                        self.__state = 0
                        self.__document = None
                        self.__passage = None
                        self.__sentence = None
            elif self.__state == 2:
                if event == 'start':
                    if elem.tag == 'passage':
                        self.__passage = BioCPassage()
                        self.__state = 3
                    elif elem.tag == 'annotation':
                        self.__document.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__document.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'id':
                        self.__document.id = elem.text
                    elif elem.tag == 'infon':
                        self.__document.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'document':
                        self.__state = 1
                        return
            elif self.__state == 3:
                if event == 'start':
                    if elem.tag == 'sentence':
                        self.__sentence = BioCSentence()
                        self.__state = 4
                    elif elem.tag == 'annotation':
                        self.__passage.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__passage.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__passage.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__passage.text = elem.text
                    elif elem.tag == 'infon':
                        self.__passage.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'passage':
                        self.__state = 2
                        if self.__passage is not None:
                            self.__document.add_passage(self.__passage)
            elif self.__state == 4:
                if event == 'start':
                    if elem.tag == 'annotation':
                        self.__sentence.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__sentence.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__sentence.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__sentence.text = elem.text
                    elif elem.tag == 'infon':
                        self.__sentence.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'sentence':
                        self.__state = 3
                        if self.__sentence is not None:
                            self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'text':
                    ann.text = elem.text
                elif elem.tag == 'infon':
                    ann.infons[elem.get('key')] = elem.text
                elif elem.tag == 'location':
                    ann.add_location(BioCLocation(int(elem.get('offset')), int(elem.get('length'))))
                elif elem.tag == 'annotation':
                    return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'infon':
                    rel.infons[elem.get('key')] = elem.text
                elif elem.tag == 'node':
                    rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
                if elem.tag == 'relation':
                    return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        try:
            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        return self.__event, self.__elem

    def get_collection_info(self) -> BioCCollection:
        """
        Reads the collection information: encoding, version, DTD, source, date, key, infons, etc.

        Returns:
            the BioC collection that contains only information
        """
        return self.__collection
Ejemplo n.º 7
0
def convert_pubtator(input_path, output_path):
    """Convert pubtators annotation list to BioC XML

    The collection header is written first, then one ``<document>`` per
    article as it is read, then the closing tag, so the whole collection is
    never built in memory.

    Keyword Arguments:
    input_path -- the path of pubtators annotation file
    output_path -- the path to output the BioC XML file
    """

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file: every line of the empty collection
        # except the last one, which is held back as the closing tag.
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is a position, not the passage text. PubTator counts
            # offsets over the title followed by one separator character, so
            # the abstract starts at len(title) + 1.
            # NOTE(review): confirm against the offsets produced for the tags.
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # Annotation ids run consecutively across title and abstract.
            id_index = 0
            for passage, tags in ((title_passage, article["title_annot"]),
                                  (abstract_passage, article["abstract_annot"])):
                for tag in tags:
                    passage.annotations.append(bioconcepts2pubtator_annotations(tag, id_index))
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway parent and serialise only
            # the document element itself.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
Ejemplo n.º 8
0
def convert_pubtator(input_file, output_file=None):
    """Convert pubtators annotation list to BioC XML

    The collection header is written first, then one ``<document>`` per
    article as it is read, then the closing tag, so the whole collection is
    never built in memory.

    Keyword Arguments:
    input_file -- the path of pubtators annotation file
    output_file -- the path to output the converted text
                   (defaults to "bioc-converted-docs.xml")
    """
    if output_file is None:
        output_file = "bioc-converted-docs.xml"

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    with open(output_file, 'wb') as g:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file: the serialised empty collection
        # minus its closing tag. The tail is a bytes literal because the
        # file is opened in binary mode.
        xml_header = writer.tostring('UTF-8')
        xml_tail = b'</collection>\n'
        xml_head = xml_header[:-len(xml_tail)]
        g.write(xml_head)

        article_generator = bioconcepts2pubtator_offsets(input_file)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["Document ID"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["Title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is a position, not the passage text. PubTator counts
            # offsets over the title followed by one separator character, so
            # the abstract starts at len(title) + 1.
            # NOTE(review): confirm against the offsets produced for the tags.
            abstract_passage.offset = str(len(article["Title"]) + 1)
            abstract_passage.text = article["Abstract"]

            # Annotation ids run consecutively across title and abstract.
            id_index = 0
            for passage, tags in ((title_passage, article["Title_Annot"]),
                                  (abstract_passage, article["Abstract_Annot"])):
                for tag in tags:
                    passage.annotations.append(bioconcepts2pubtator_annotations(tag, id_index))
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Build the document under a throwaway parent and serialise only
            # the document element itself.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            g.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        g.write(xml_tail)
Ejemplo n.º 9
0
 def __read(self):
     """Consume parser events until the end of the next ``<document>``.

     Events are pulled through ``self.__has_next()`` / ``self.__next_event()``
     (defined outside this block) and dispatched on ``self.__state``:

         0 - outside a collection, 1 - inside ``<collection>``,
         2 - inside ``<document>``, 3 - inside ``<passage>``,
         4 - inside ``<sentence>``.

     The method returns as soon as a document's end tag is seen, leaving the
     finished document in ``self.__document``; before returning it clears
     already-processed parts of the XML tree. When the collection's end tag
     is seen, ``self.__document`` is set to ``None``. If the events run out,
     the loop simply ends.
     """
     while self.__has_next():
         event, elem = self.__next_event()
         # State 0: wait for the opening <collection> tag.
         if self.__state == 0:
             if event == 'start':
                 if elem.tag == 'collection':
                     self.__state = 1
                     self.__collection = BioCCollection()
                     # collection information
         # State 1: inside <collection>, between documents.
         elif self.__state == 1:
             if event == 'start':
                 if elem.tag == 'document':
                     self.__document = BioCDocument()
                     self.__state = 2
             elif event == 'end':
                 # Values are read on 'end' events, once elem.text is complete.
                 if elem.tag == 'source':
                     self.__collection.source = elem.text
                 elif elem.tag == 'date':
                     self.__collection.date = elem.text
                 elif elem.tag == 'key':
                     self.__collection.key = elem.text
                 elif elem.tag == 'infon':
                     self.__collection.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'collection':
                     # End of the collection: reset the per-document state.
                     self.__state = 0
                     self.__document = None
                     self.__passage = None
                     self.__sentence = None
         # State 2: inside <document>, outside any passage.
         elif self.__state == 2:
             if event == 'start':
                 if elem.tag == 'passage':
                     self.__passage = BioCPassage()
                     self.__state = 3
                 # Annotations and relations are read by dedicated helpers
                 # (defined outside this block).
                 elif elem.tag == 'annotation':
                     self.__document.add_annotation(self.__read_annotation(elem))
                 elif elem.tag == 'relation':
                     self.__document.add_relation(self.__read_relation(elem))
             elif event == 'end':
                 if elem.tag == 'id':
                     self.__document.id = elem.text
                 elif elem.tag == 'infon':
                     self.__document.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'document':
                     # Remove the previous element and its ancestors' earlier siblings.
                     # Particularly useful for working with large xml files
                     # - Based on fast_iter modification of lxml context
                     # Ref: https://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
                     # NOTE(review): self.__prev_elem is assigned outside this
                     # block; presumably the element of the preceding event - confirm.
                     if self.__prev_elem is not None:
                         self.__prev_elem.clear()
                         for ancestor in self.__prev_elem.xpath('ancestor-or-self::*'):
                             # Drop the parent's first child until this node
                             # has no preceding sibling left.
                             while ancestor.getprevious() is not None and ancestor.getparent() is not None:
                                 del ancestor.getparent()[0]
                     # Document complete: stop here so the caller can take
                     # self.__document before parsing continues.
                     self.__state = 1
                     return
         # State 3: inside <passage>, outside any sentence.
         elif self.__state == 3:
             if event == 'start':
                 if elem.tag == 'sentence':
                     self.__sentence = BioCSentence()
                     self.__state = 4
                 elif elem.tag == 'annotation':
                     self.__passage.add_annotation(self.__read_annotation(elem))
                 elif elem.tag == 'relation':
                     self.__passage.add_relation(self.__read_relation(elem))
             elif event == 'end':
                 if elem.tag == 'offset':
                     self.__passage.offset = int(elem.text)
                 elif elem.tag == 'text':
                     self.__passage.text = elem.text
                 elif elem.tag == 'infon':
                     self.__passage.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'passage':
                     # Passage complete: attach it to the current document.
                     self.__state = 2
                     if self.__passage is not None:
                         self.__document.add_passage(self.__passage)
         # State 4: inside <sentence>.
         elif self.__state == 4:
             if event == 'start':
                 if elem.tag == 'annotation':
                     self.__sentence.add_annotation(self.__read_annotation(elem))
                 elif elem.tag == 'relation':
                     self.__sentence.add_relation(self.__read_relation(elem))
             elif event == 'end':
                 if elem.tag == 'offset':
                     self.__sentence.offset = int(elem.text)
                 elif elem.tag == 'text':
                     self.__sentence.text = elem.text
                 elif elem.tag == 'infon':
                     self.__sentence.infons[elem.get('key')] = elem.text
                 elif elem.tag == 'sentence':
                     # Sentence complete: attach it to the current passage.
                     self.__state = 3
                     if self.__sentence is not None:
                         self.__passage.add_sentence(self.__sentence)
Ejemplo n.º 10
0
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.
    """

    def __init__(self, source: Union[str, BinaryIO]):
        # if not isinstance(file, str):
        #     file = str(file)
        self.file = source
        self.__context = iter(etree.iterparse(self.file, events=('start', 'end')))
        self.__state = 0
        self.__event = None
        self.__elem = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document
        """
        if self.__document is None:
            raise StopIteration
        else:
            document = self.__document
            self.__read()
            return document

    def __read(self):
        while self.__has_next():
            event, elem = self.__next_event()
            if self.__state == 0:
                if event == 'start':
                    if elem.tag == 'collection':
                        self.__state = 1
                        self.__collection = BioCCollection()
                        # collection information
            elif self.__state == 1:
                if event == 'start':
                    if elem.tag == 'document':
                        self.__document = BioCDocument()
                        self.__state = 2
                elif event == 'end':
                    if elem.tag == 'source':
                        self.__collection.source = elem.text
                    elif elem.tag == 'date':
                        self.__collection.date = elem.text
                    elif elem.tag == 'key':
                        self.__collection.key = elem.text
                    elif elem.tag == 'infon':
                        self.__collection.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'collection':
                        self.__state = 0
                        self.__document = None
                        self.__passage = None
                        self.__sentence = None
            elif self.__state == 2:
                if event == 'start':
                    if elem.tag == 'passage':
                        self.__passage = BioCPassage()
                        self.__state = 3
                    elif elem.tag == 'annotation':
                        self.__document.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__document.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'id':
                        self.__document.id = elem.text
                    elif elem.tag == 'infon':
                        self.__document.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'document':
                        # Remove previous element and it's ancestors
                        # Particularly useful for working with large xml files
                        # - Based on fast_iter modification of lxml context
                        # Ref: https://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
                        if self.__prev_elem is not None:
                            self.__prev_elem.clear()
                            for ancestor in self.__prev_elem.xpath('ancestor-or-self::*'):
                                while ancestor.getprevious() is not None and ancestor.getparent() is not None:
                                    del ancestor.getparent()[0]
                        self.__state = 1
                        return
            elif self.__state == 3:
                if event == 'start':
                    if elem.tag == 'sentence':
                        self.__sentence = BioCSentence()
                        self.__state = 4
                    elif elem.tag == 'annotation':
                        self.__passage.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__passage.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__passage.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__passage.text = elem.text
                    elif elem.tag == 'infon':
                        self.__passage.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'passage':
                        self.__state = 2
                        if self.__passage is not None:
                            self.__document.add_passage(self.__passage)
            elif self.__state == 4:
                if event == 'start':
                    if elem.tag == 'annotation':
                        self.__sentence.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__sentence.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__sentence.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__sentence.text = elem.text
                    elif elem.tag == 'infon':
                        self.__sentence.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'sentence':
                        self.__state = 3
                        if self.__sentence is not None:
                            self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'text':
                    ann.text = elem.text
                elif elem.tag == 'infon':
                    ann.infons[elem.get('key')] = elem.text
                elif elem.tag == 'location':
                    ann.add_location(BioCLocation(int(elem.get('offset')), int(elem.get('length'))))
                elif elem.tag == 'annotation':
                    return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'infon':
                    rel.infons[elem.get('key')] = elem.text
                elif elem.tag == 'node':
                    rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
                if elem.tag == 'relation':
                    return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        try:
            # Track reference to previous element in xml tree - useful for clearing xml element after processing
            if self.__elem is not None:
                self.__prev_elem = self.__elem

            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        return self.__event, self.__elem

    def get_collection_info(self) -> BioCCollection:
        """
        Reads the collection information: encoding, version, DTD, source, date, key, infons, etc.

        Returns:
            the BioC collection that contains only information
        """
        return self.__collection