Esempio n. 1
0
def brat2bioc_doc(bratdoc: BratDocument) -> BioCDocument:
    """
    Convert a brat document into a BioC document.

    The id and text are copied over, then entities are added as BioC
    annotations while relations, events and equivalence relations are added
    as BioC relations. Brat attributes and notes are not converted into
    objects of their own; they are stored in the ``infons`` of the object
    they refer to.

    Args:
        bratdoc: the brat document to convert; it is not modified

    Returns:
        the newly built BioC document
    """
    import copy

    biocdoc = BioCDocument()
    biocdoc.id = bratdoc.id
    biocdoc.text = bratdoc.text
    # entity
    for bratentity in bratdoc.entities:
        biocdoc.add_annotation(brat2bioc_entity(bratentity))
    # relation
    for bratrelation in bratdoc.relations:
        biocdoc.add_relation(brat2bioc_relation(bratrelation))
    # event
    for bratevent in bratdoc.events:
        biocdoc.add_relation(brat2bioc_event(bratevent))
    # equiv
    for i, brat_equiv in enumerate(bratdoc.equiv_relations):
        # The position index is appended to the id (presumably because brat
        # equivalence relations do not carry unique ids -- confirm). This is
        # done on a shallow copy so the caller's objects keep their ids and
        # converting the same document twice gives the same result.
        indexed_equiv = copy.copy(brat_equiv)
        indexed_equiv.id = '%s%s' % (brat_equiv.id, i)
        biocdoc.add_relation(brat2bioc_equiv(indexed_equiv))
    # attribute
    for bratatt in bratdoc.attributes:
        # NOTE(review): there is no handling for a refid that biocdoc.get()
        # cannot resolve.
        ann = biocdoc.get(bratatt.refid)
        # NOTE(review): the attribute id is stored under 'note_id', the same
        # key the note loop below writes, so a note on the same object
        # overwrites it -- confirm this key is intended.
        ann.infons['note_id'] = bratatt.id
        ann.infons['attributes'] = ' '.join(sorted(bratatt.attributes))
    # note
    for bratnote in bratdoc.notes:
        ann = biocdoc.get(bratnote.refid)
        ann.infons['note_id'] = bratnote.id
        ann.infons['type'] = bratnote.type
        ann.infons['note'] = bratnote.text
    return biocdoc
Esempio n. 2
0
 def __parse_document(self, tree):
     """
     Build a BioCDocument from a parsed ``document`` element.

     Args:
         tree: the XML element whose ``id``, ``passage``, ``annotation`` and
             ``relation`` children are read

     Returns:
         the populated BioCDocument
     """
     doc = BioCDocument()
     doc.id = tree.findtext('id')
     doc.infons = self.__parse_infons(tree)
     # (child tag, parser for that child, method that attaches the result);
     # the tuple order keeps passages first, then annotations, then relations.
     handlers = (
         ('passage', self.__parse_passage, doc.add_passage),
         ('annotation', self.__parse_annotation, doc.add_annotation),
         ('relation', self.__parse_relation, doc.add_relation),
     )
     for tag, parse, attach in handlers:
         for node in tree.findall(tag):
             attach(parse(node))
     return doc
Esempio n. 3
0
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.

    The input is consumed incrementally with ``etree.iterparse``. The
    constructor already reads up to the end of the first ``<document>``, so
    the collection-level fields seen before that point are available from
    ``get_collection_info()`` right after construction. Input that contains
    no ``<collection>`` element yields no documents.
    """

    def __init__(self, source: Union[str, BinaryIO]):
        """
        Args:
            source: the input handed to ``etree.iterparse``
        """
        self.file = source
        self.__context = iter(etree.iterparse(self.file, events=('start', 'end')))
        # Parser state: 0 = outside <collection>, 1 = inside <collection>,
        # 2 = inside <document>, 3 = inside <passage>, 4 = inside <sentence>.
        self.__state = 0
        self.__event = None
        self.__elem = None
        # Create everything the state machine and the public methods read
        # before the first __read(); otherwise input without a <collection>
        # or <document> element fails later with AttributeError.
        self.__collection = None
        self.__document = None
        self.__passage = None
        self.__sentence = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document

        Raises:
            StopIteration: when no document is left
        """
        if self.__document is None:
            raise StopIteration
        # Hand out the document that is already complete and read ahead to
        # the next one (or to the end of the collection).
        document = self.__document
        self.__read()
        return document

    def __read(self):
        """
        Advances the parser until one document is complete or the input ends.

        On return ``self.__document`` is either the document just completed
        or ``None`` once the closing ``</collection>`` has been seen.
        """
        while self.__has_next():
            event, elem = self.__next_event()
            if self.__state == 0:
                if event == 'start':
                    if elem.tag == 'collection':
                        self.__state = 1
                        self.__collection = BioCCollection()
                        # collection information
            elif self.__state == 1:
                if event == 'start':
                    if elem.tag == 'document':
                        self.__document = BioCDocument()
                        self.__state = 2
                elif event == 'end':
                    # Text is read on 'end' events only, once the element is complete.
                    if elem.tag == 'source':
                        self.__collection.source = elem.text
                    elif elem.tag == 'date':
                        self.__collection.date = elem.text
                    elif elem.tag == 'key':
                        self.__collection.key = elem.text
                    elif elem.tag == 'infon':
                        self.__collection.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'collection':
                        # End of the collection: __next__ stops on the None document.
                        self.__state = 0
                        self.__document = None
                        self.__passage = None
                        self.__sentence = None
            elif self.__state == 2:
                if event == 'start':
                    if elem.tag == 'passage':
                        self.__passage = BioCPassage()
                        self.__state = 3
                    elif elem.tag == 'annotation':
                        self.__document.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__document.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'id':
                        self.__document.id = elem.text
                    elif elem.tag == 'infon':
                        self.__document.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'document':
                        # Everything needed has been copied into
                        # self.__document, so release the parsed subtree;
                        # otherwise iterparse keeps the whole file in memory.
                        elem.clear()
                        self.__state = 1
                        return
            elif self.__state == 3:
                if event == 'start':
                    if elem.tag == 'sentence':
                        self.__sentence = BioCSentence()
                        self.__state = 4
                    elif elem.tag == 'annotation':
                        self.__passage.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__passage.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__passage.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__passage.text = elem.text
                    elif elem.tag == 'infon':
                        self.__passage.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'passage':
                        self.__state = 2
                        if self.__passage is not None:
                            self.__document.add_passage(self.__passage)
            elif self.__state == 4:
                if event == 'start':
                    if elem.tag == 'annotation':
                        self.__sentence.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__sentence.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__sentence.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__sentence.text = elem.text
                    elif elem.tag == 'infon':
                        self.__sentence.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'sentence':
                        self.__state = 3
                        if self.__sentence is not None:
                            self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        """
        Consumes events up to the closing ``</annotation>`` tag.

        Args:
            start_elem: element of the annotation's 'start' event; only its
                ``id`` attribute is read

        Returns:
            the BioCAnnotation built from the text, infon and location children
        """
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'text':
                    ann.text = elem.text
                elif elem.tag == 'infon':
                    ann.infons[elem.get('key')] = elem.text
                elif elem.tag == 'location':
                    ann.add_location(BioCLocation(int(elem.get('offset')), int(elem.get('length'))))
                elif elem.tag == 'annotation':
                    return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        """
        Consumes events up to the closing ``</relation>`` tag.

        Args:
            start_elem: element of the relation's 'start' event; only its
                ``id`` attribute is read

        Returns:
            the BioCRelation built from the infon and node children
        """
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'infon':
                    rel.infons[elem.get('key')] = elem.text
                elif elem.tag == 'node':
                    rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
                elif elem.tag == 'relation':
                    return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        """
        Fetches the next parser event into ``self.__event`` / ``self.__elem``.

        Returns:
            True if an event was fetched, False when the input is exhausted
        """
        try:
            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        """Returns the (event, element) pair fetched by the last ``__has_next()``."""
        return self.__event, self.__elem

    def get_collection_info(self) -> BioCCollection:
        """
        Reads the collection information: encoding, version, DTD, source, date, key, infons, etc.

        Returns:
            the BioC collection that contains only information, or ``None``
            when no ``<collection>`` element has been seen
        """
        return self.__collection
Esempio n. 4
0
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.

    The input is consumed incrementally with ``etree.iterparse``, and parsed
    XML elements are released each time a document is completed so that large
    files do not accumulate in memory. The constructor already reads up to
    the end of the first ``<document>``, so the collection-level fields seen
    before that point are available from ``get_collection_info()`` right
    after construction. Input that contains no ``<collection>`` element
    yields no documents.
    """

    def __init__(self, source: Union[str, BinaryIO]):
        """
        Args:
            source: the input handed to ``etree.iterparse``
        """
        self.file = source
        self.__context = iter(etree.iterparse(self.file, events=('start', 'end')))
        # Parser state: 0 = outside <collection>, 1 = inside <collection>,
        # 2 = inside <document>, 3 = inside <passage>, 4 = inside <sentence>.
        self.__state = 0
        self.__event = None
        self.__elem = None
        # Create everything the state machine and the public methods read
        # before the first __read(); otherwise input without a <collection>
        # or <document> element fails later with AttributeError.
        self.__prev_elem = None
        self.__collection = None
        self.__document = None
        self.__passage = None
        self.__sentence = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document

        Raises:
            StopIteration: when no document is left
        """
        if self.__document is None:
            raise StopIteration
        # Hand out the document that is already complete and read ahead to
        # the next one (or to the end of the collection).
        document = self.__document
        self.__read()
        return document

    def __read(self):
        """
        Advances the parser until one document is complete or the input ends.

        On return ``self.__document`` is either the document just completed
        or ``None`` once the closing ``</collection>`` has been seen.
        """
        while self.__has_next():
            event, elem = self.__next_event()
            if self.__state == 0:
                if event == 'start':
                    if elem.tag == 'collection':
                        self.__state = 1
                        self.__collection = BioCCollection()
                        # collection information
            elif self.__state == 1:
                if event == 'start':
                    if elem.tag == 'document':
                        self.__document = BioCDocument()
                        self.__state = 2
                elif event == 'end':
                    # Text is read on 'end' events only, once the element is complete.
                    if elem.tag == 'source':
                        self.__collection.source = elem.text
                    elif elem.tag == 'date':
                        self.__collection.date = elem.text
                    elif elem.tag == 'key':
                        self.__collection.key = elem.text
                    elif elem.tag == 'infon':
                        self.__collection.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'collection':
                        # End of the collection: __next__ stops on the None document.
                        self.__state = 0
                        self.__document = None
                        self.__passage = None
                        self.__sentence = None
            elif self.__state == 2:
                if event == 'start':
                    if elem.tag == 'passage':
                        self.__passage = BioCPassage()
                        self.__state = 3
                    elif elem.tag == 'annotation':
                        self.__document.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__document.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'id':
                        self.__document.id = elem.text
                    elif elem.tag == 'infon':
                        self.__document.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'document':
                        # Remove the previous element and the earlier siblings
                        # of it and of its ancestors.
                        # Particularly useful for working with large xml files
                        # - Based on fast_iter modification of lxml context
                        # Ref: https://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
                        # NOTE: xpath()/getprevious()/getparent() are lxml
                        # element methods, so this needs lxml's etree.
                        if self.__prev_elem is not None:
                            self.__prev_elem.clear()
                            for ancestor in self.__prev_elem.xpath('ancestor-or-self::*'):
                                # Drop the parent's first child until nothing
                                # precedes this ancestor any more.
                                while ancestor.getprevious() is not None and ancestor.getparent() is not None:
                                    del ancestor.getparent()[0]
                        self.__state = 1
                        return
            elif self.__state == 3:
                if event == 'start':
                    if elem.tag == 'sentence':
                        self.__sentence = BioCSentence()
                        self.__state = 4
                    elif elem.tag == 'annotation':
                        self.__passage.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__passage.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__passage.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__passage.text = elem.text
                    elif elem.tag == 'infon':
                        self.__passage.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'passage':
                        self.__state = 2
                        if self.__passage is not None:
                            self.__document.add_passage(self.__passage)
            elif self.__state == 4:
                if event == 'start':
                    if elem.tag == 'annotation':
                        self.__sentence.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__sentence.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__sentence.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__sentence.text = elem.text
                    elif elem.tag == 'infon':
                        self.__sentence.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'sentence':
                        self.__state = 3
                        if self.__sentence is not None:
                            self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        """
        Consumes events up to the closing ``</annotation>`` tag.

        Args:
            start_elem: element of the annotation's 'start' event; only its
                ``id`` attribute is read

        Returns:
            the BioCAnnotation built from the text, infon and location children
        """
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'text':
                    ann.text = elem.text
                elif elem.tag == 'infon':
                    ann.infons[elem.get('key')] = elem.text
                elif elem.tag == 'location':
                    ann.add_location(BioCLocation(int(elem.get('offset')), int(elem.get('length'))))
                elif elem.tag == 'annotation':
                    return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        """
        Consumes events up to the closing ``</relation>`` tag.

        Args:
            start_elem: element of the relation's 'start' event; only its
                ``id`` attribute is read

        Returns:
            the BioCRelation built from the infon and node children
        """
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'infon':
                    rel.infons[elem.get('key')] = elem.text
                elif elem.tag == 'node':
                    rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
                elif elem.tag == 'relation':
                    return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        """
        Fetches the next parser event into ``self.__event`` / ``self.__elem``.

        The element of the event being replaced is remembered in
        ``self.__prev_elem`` so it can be cleared once a document is done.

        Returns:
            True if an event was fetched, False when the input is exhausted
        """
        try:
            # Track reference to previous element in xml tree - useful for clearing xml element after processing
            if self.__elem is not None:
                self.__prev_elem = self.__elem

            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        """Returns the (event, element) pair fetched by the last ``__has_next()``."""
        return self.__event, self.__elem

    def get_collection_info(self) -> BioCCollection:
        """
        Reads the collection information: encoding, version, DTD, source, date, key, infons, etc.

        Returns:
            the BioC collection that contains only information, or ``None``
            when no ``<collection>`` element has been seen
        """
        return self.__collection