def write_text_bioc(self, output_path):
    """
    Write the passages held in ``self.abstract_dict`` as a BioC collection.

    A single BioC document is created; every value of ``self.abstract_dict``
    becomes the text of one passage in that document (the keys themselves
    are not written). The document is wrapped in a new collection and handed
    to a ``BioCWriter`` constructed with ``output_path``.

    Args:
        output_path: destination passed to ``BioCWriter``.
    """
    bioc_writer = BioCWriter(output_path)
    bioc_collection = BioCCollection()
    # Insert option for either writing text only or annotations?
    # To keep the document as it is, add self.document to the collection
    # instead of building a text-only document below.
    bioc_document = BioCDocument()
    for passage_text in self.abstract_dict.values():
        bioc_passage = BioCPassage()
        bioc_passage.text = passage_text
        bioc_document.add_passage(bioc_passage)
    bioc_collection.add_document(bioc_document)
    # Single-argument call form: prints the same line whether ``print`` is
    # the Python 2 statement or the Python 3 function.
    print('BioC output path %s' % (output_path,))
    bioc_writer.collection = bioc_collection
    bioc_writer.write()
def test_should_get_gene_names_normalised(self, list_dict, expected_genes):
    """
    Check the set returned by ``get_gene_names_normalised``.

    A document with one passage is built; each entry of ``list_dict`` is
    used as the infons of one annotation on that passage. The result must
    equal ``expected_genes`` compared as a set.
    """
    # Arrange
    sut = BiocAnnotationGenes()
    document = BioCDocument()
    passage = BioCPassage()
    document.add_passage(passage)
    for infons in list_dict:
        gene_annotation = BioCAnnotation()
        gene_annotation.infons = infons
        passage.add_annotation(gene_annotation)

    # Act
    actual = sut.get_gene_names_normalised(document)

    # Assert
    expected = set(expected_genes)
    self.assertEqual(expected, actual)
def test_should_get_gene_names_to_normalised_dict(
        self, list_gene_dict_in_passage, expected_dict):
    """
    Check the mapping returned by ``get_gene_names_to_normalised_dict``.

    Each inner list of ``list_gene_dict_in_passage`` becomes one passage.
    Each dict in it becomes one annotation whose text is the dict's
    ``"text"`` entry and whose infons are the dict itself.
    """
    # Arrange
    sut = BiocAnnotationGenes()
    document = BioCDocument()
    for passage_genes in list_gene_dict_in_passage:
        passage = BioCPassage()
        document.add_passage(passage)
        for gene_info in passage_genes:
            gene_annotation = BioCAnnotation()
            gene_annotation.text = gene_info["text"]
            gene_annotation.infons = gene_info
            passage.add_annotation(gene_annotation)

    # Act
    actual = sut.get_gene_names_to_normalised_dict(document)

    # Assert
    self.assertEqual(expected_dict, actual)
def __parse_passage(self, tree):
    """
    Build a ``BioCPassage`` from a passage XML element.

    Reads the ``offset`` child as an integer and the infons via
    ``__parse_infons``. The passage text is set only when a ``text`` child
    exists. ``sentence``, ``annotation`` and ``relation`` children are then
    parsed and attached, in that order.

    Args:
        tree: the passage element.

    Returns:
        The populated ``BioCPassage``.
    """
    parsed = BioCPassage()
    parsed.offset = int(tree.findtext('offset'))
    parsed.infons = self.__parse_infons(tree)

    has_text = tree.find('text') is not None
    if has_text:
        parsed.text = tree.findtext('text')

    for sentence_node in tree.findall('sentence'):
        sentence = self.__parse_sentence(sentence_node)
        parsed.add_sentence(sentence)

    for annotation_node in tree.findall('annotation'):
        annotation = self.__parse_annotation(annotation_node)
        parsed.add_annotation(annotation)

    for relation_node in tree.findall('relation'):
        relation = self.__parse_relation(relation_node)
        parsed.add_relation(relation)

    return parsed
def __read(self):
    """
    Consume parser events until the current document has been closed.

    The method is a small state machine driven by ``self.__state``:

    * 0 -- outside a ``collection`` element
    * 1 -- inside ``collection``
    * 2 -- inside ``document``
    * 3 -- inside ``passage``
    * 4 -- inside ``sentence``

    It returns as soon as a ``document`` end event is seen, leaving the
    finished document in ``self.__document``. When the ``collection`` end
    event is seen, the document, passage and sentence slots are reset to
    ``None``. The loop also ends when no more events are available.
    """
    while self.__has_next():
        event, elem = self.__next_event()
        tag = elem.tag
        opening = event == 'start'
        closing = event == 'end'
        state = self.__state

        if state == 0:
            if opening and tag == 'collection':
                self.__state = 1
                self.__collection = BioCCollection()

        elif state == 1:
            # collection information
            if opening and tag == 'document':
                self.__document = BioCDocument()
                self.__state = 2
            elif closing and tag == 'source':
                self.__collection.source = elem.text
            elif closing and tag == 'date':
                self.__collection.date = elem.text
            elif closing and tag == 'key':
                self.__collection.key = elem.text
            elif closing and tag == 'infon':
                self.__collection.infons[elem.get('key')] = elem.text
            elif closing and tag == 'collection':
                self.__state = 0
                self.__document = None
                self.__passage = None
                self.__sentence = None

        elif state == 2:
            if opening and tag == 'passage':
                self.__passage = BioCPassage()
                self.__state = 3
            elif opening and tag == 'annotation':
                annotation = self.__read_annotation(elem)
                self.__document.add_annotation(annotation)
            elif opening and tag == 'relation':
                relation = self.__read_relation(elem)
                self.__document.add_relation(relation)
            elif closing and tag == 'id':
                self.__document.id = elem.text
            elif closing and tag == 'infon':
                self.__document.infons[elem.get('key')] = elem.text
            elif closing and tag == 'document':
                # One whole document is available: hand control back.
                self.__state = 1
                return

        elif state == 3:
            if opening and tag == 'sentence':
                self.__sentence = BioCSentence()
                self.__state = 4
            elif opening and tag == 'annotation':
                annotation = self.__read_annotation(elem)
                self.__passage.add_annotation(annotation)
            elif opening and tag == 'relation':
                relation = self.__read_relation(elem)
                self.__passage.add_relation(relation)
            elif closing and tag == 'offset':
                self.__passage.offset = int(elem.text)
            elif closing and tag == 'text':
                self.__passage.text = elem.text
            elif closing and tag == 'infon':
                self.__passage.infons[elem.get('key')] = elem.text
            elif closing and tag == 'passage':
                self.__state = 2
                if self.__passage is not None:
                    self.__document.add_passage(self.__passage)

        elif state == 4:
            if opening and tag == 'annotation':
                annotation = self.__read_annotation(elem)
                self.__sentence.add_annotation(annotation)
            elif opening and tag == 'relation':
                relation = self.__read_relation(elem)
                self.__sentence.add_relation(relation)
            elif closing and tag == 'offset':
                self.__sentence.offset = int(elem.text)
            elif closing and tag == 'text':
                self.__sentence.text = elem.text
            elif closing and tag == 'infon':
                self.__sentence.infons[elem.get('key')] = elem.text
            elif closing and tag == 'sentence':
                self.__state = 3
                if self.__sentence is not None:
                    self.__passage.add_sentence(self.__sentence)
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.

    The source is parsed incrementally with ``etree.iterparse`` using
    ``start`` and ``end`` events. The constructor reads ahead to the end of
    the first document; each ``next()`` call returns the document already
    read and then reads ahead to the following one.

    Internal parser states (``self.__state``):

    * 0 -- outside a ``collection`` element
    * 1 -- inside ``collection``
    * 2 -- inside ``document``
    * 3 -- inside ``passage``
    * 4 -- inside ``sentence``
    """

    def __init__(self, source: Union[str, BinaryIO]):
        """
        Args:
            source: a file name or a binary file object containing BioC XML.
        """
        self.file = source
        self.__context = iter(etree.iterparse(self.file, events=('start', 'end')))
        self.__state = 0
        self.__event = None
        self.__elem = None
        # Define every slot before the first read. Otherwise input without a
        # <collection> root (or without any <document>) leaves them unset
        # and __next__/get_collection_info fail with an AttributeError.
        self.__collection = None
        self.__document = None
        self.__passage = None
        self.__sentence = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document

        Raises:
            StopIteration: when no further document is available.
        """
        if self.__document is None:
            raise StopIteration
        else:
            document = self.__document
            self.__read()
            return document

    def __read(self):
        """
        Consume events until the current document is closed.

        Returns on a ``document`` end event, leaving the finished document
        in ``self.__document``. On the ``collection`` end event the document
        slot is reset to ``None``, which ends the iteration.
        """
        while self.__has_next():
            event, elem = self.__next_event()
            if self.__state == 0:
                if event == 'start':
                    if elem.tag == 'collection':
                        self.__state = 1
                        self.__collection = BioCCollection()
            # collection information
            elif self.__state == 1:
                if event == 'start':
                    if elem.tag == 'document':
                        self.__document = BioCDocument()
                        self.__state = 2
                elif event == 'end':
                    if elem.tag == 'source':
                        self.__collection.source = elem.text
                    elif elem.tag == 'date':
                        self.__collection.date = elem.text
                    elif elem.tag == 'key':
                        self.__collection.key = elem.text
                    elif elem.tag == 'infon':
                        self.__collection.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'collection':
                        self.__state = 0
                        self.__document = None
                        self.__passage = None
                        self.__sentence = None
            elif self.__state == 2:
                if event == 'start':
                    if elem.tag == 'passage':
                        self.__passage = BioCPassage()
                        self.__state = 3
                    elif elem.tag == 'annotation':
                        self.__document.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__document.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'id':
                        self.__document.id = elem.text
                    elif elem.tag == 'infon':
                        self.__document.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'document':
                        # A whole document is ready for __next__.
                        self.__state = 1
                        return
            elif self.__state == 3:
                if event == 'start':
                    if elem.tag == 'sentence':
                        self.__sentence = BioCSentence()
                        self.__state = 4
                    elif elem.tag == 'annotation':
                        self.__passage.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__passage.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__passage.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__passage.text = elem.text
                    elif elem.tag == 'infon':
                        self.__passage.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'passage':
                        self.__state = 2
                        if self.__passage is not None:
                            self.__document.add_passage(self.__passage)
            elif self.__state == 4:
                if event == 'start':
                    if elem.tag == 'annotation':
                        self.__sentence.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__sentence.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__sentence.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__sentence.text = elem.text
                    elif elem.tag == 'infon':
                        self.__sentence.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'sentence':
                        self.__state = 3
                        if self.__sentence is not None:
                            self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        """
        Read events up to the closing ``annotation`` tag.

        Args:
            start_elem: the element of the ``annotation`` start event; its
                ``id`` attribute becomes the annotation id.

        Returns:
            The populated ``BioCAnnotation``.
        """
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'text':
                    ann.text = elem.text
                elif elem.tag == 'infon':
                    ann.infons[elem.get('key')] = elem.text
                elif elem.tag == 'location':
                    ann.add_location(BioCLocation(int(elem.get('offset')),
                                                  int(elem.get('length'))))
                elif elem.tag == 'annotation':
                    return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        """
        Read events up to the closing ``relation`` tag.

        Args:
            start_elem: the element of the ``relation`` start event; its
                ``id`` attribute becomes the relation id.

        Returns:
            The populated ``BioCRelation``.
        """
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'infon':
                    rel.infons[elem.get('key')] = elem.text
                elif elem.tag == 'node':
                    rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
                if elem.tag == 'relation':
                    return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        """Fetch the next event; return False once the stream is exhausted."""
        try:
            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        """Return the event/element pair fetched by the last ``__has_next``."""
        return self.__event, self.__elem

    def get_collection_info(self) -> Union['BioCCollection', None]:
        """
        Reads the collection information: encoding, version, DTD, source,
        date, key, infons, etc.

        Returns:
            the BioC collection that contains only information, or ``None``
            when no ``collection`` element has been seen.
        """
        return self.__collection
def convert_pubtator(input_path, output_path):
    """Convert a PubTator annotation listing to BioC XML.

    The collection header is serialised once and every line of it except
    the last is written first. Each article is then serialised and written
    on its own, and the last header line (the closing tag) is written at
    the end, so the whole collection is never held in memory.

    Keyword Arguments:
    input_path -- the path of the PubTator annotation file
    output_path -- the path of the BioC XML file to write
    """
    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset must be a position, not the abstract text itself.
            # NOTE(review): assumes PubTator offsets count the title plus
            # one separator character before the abstract -- confirm
            # against the annotation offsets.
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # Annotation ids run consecutively over title and abstract.
            id_index = 0
            for tag in article["title_annot"]:
                title_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            for tag in article["abstract_annot"]:
                abstract_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Serialise this one document under a throwaway parent element.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
def convert_pubtator(input_file, output_file=None):
    """Convert a PubTator annotation listing to BioC XML.

    The collection header is serialised once and written without its
    closing tag. Each article is then serialised and written on its own,
    and the closing tag is written at the end.

    Keyword Arguments:
    input_file -- the path of the PubTator annotation file
    output_file -- the path of the BioC XML file to write
                   (default "bioc-converted-docs.xml")
    """
    if output_file is None:
        output_file = "bioc-converted-docs.xml"

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    with open(output_file, 'wb') as g:

        # Have to manually do this because hangs otherwise
        # Write the head of the xml file
        xml_header = writer.tostring('UTF-8')
        # Bytes literal: the file is opened in binary mode, so writing a
        # text string would raise TypeError on Python 3.
        xml_tail = b'</collection>\n'
        xml_head = xml_header[:-len(xml_tail)]
        g.write(xml_head)

        article_generator = bioconcepts2pubtator_offsets(input_file)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["Document ID"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["Title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset must be a position, not the abstract text itself.
            # NOTE(review): assumes PubTator offsets count the title plus
            # one separator character before the abstract -- confirm
            # against the annotation offsets.
            abstract_passage.offset = str(len(article["Title"]) + 1)
            abstract_passage.text = article["Abstract"]

            # Annotation ids run consecutively over title and abstract.
            id_index = 0
            for tag in article["Title_Annot"]:
                title_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            for tag in article["Abstract_Annot"]:
                abstract_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Serialise this one document under a throwaway parent element.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            g.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        g.write(xml_tail)
def __read(self):
    """
    Consume parser events until the current document has been closed.

    The method is a small state machine driven by ``self.__state``:

    * 0 -- outside a ``collection`` element
    * 1 -- inside ``collection``
    * 2 -- inside ``document``
    * 3 -- inside ``passage``
    * 4 -- inside ``sentence``

    It returns as soon as a ``document`` end event is seen, leaving the
    finished document in ``self.__document``. Before returning it clears
    the previously seen element and deletes the preceding siblings of that
    element and of its ancestors. When the ``collection`` end event is
    seen, the document, passage and sentence slots are reset to ``None``.
    """
    while self.__has_next():
        event, elem = self.__next_event()
        tag = elem.tag
        opening = event == 'start'
        closing = event == 'end'
        state = self.__state

        if state == 0:
            if opening and tag == 'collection':
                self.__state = 1
                self.__collection = BioCCollection()

        elif state == 1:
            # collection information
            if opening and tag == 'document':
                self.__document = BioCDocument()
                self.__state = 2
            elif closing and tag == 'source':
                self.__collection.source = elem.text
            elif closing and tag == 'date':
                self.__collection.date = elem.text
            elif closing and tag == 'key':
                self.__collection.key = elem.text
            elif closing and tag == 'infon':
                self.__collection.infons[elem.get('key')] = elem.text
            elif closing and tag == 'collection':
                self.__state = 0
                self.__document = None
                self.__passage = None
                self.__sentence = None

        elif state == 2:
            if opening and tag == 'passage':
                self.__passage = BioCPassage()
                self.__state = 3
            elif opening and tag == 'annotation':
                annotation = self.__read_annotation(elem)
                self.__document.add_annotation(annotation)
            elif opening and tag == 'relation':
                relation = self.__read_relation(elem)
                self.__document.add_relation(relation)
            elif closing and tag == 'id':
                self.__document.id = elem.text
            elif closing and tag == 'infon':
                self.__document.infons[elem.get('key')] = elem.text
            elif closing and tag == 'document':
                # Remove the previous element and its ancestors' earlier
                # siblings. Particularly useful for large xml files.
                # - Based on fast_iter modification of lxml context
                # Ref: https://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
                stale = self.__prev_elem
                if stale is not None:
                    stale.clear()
                    for ancestor in stale.xpath('ancestor-or-self::*'):
                        while (ancestor.getprevious() is not None
                               and ancestor.getparent() is not None):
                            del ancestor.getparent()[0]
                self.__state = 1
                return

        elif state == 3:
            if opening and tag == 'sentence':
                self.__sentence = BioCSentence()
                self.__state = 4
            elif opening and tag == 'annotation':
                annotation = self.__read_annotation(elem)
                self.__passage.add_annotation(annotation)
            elif opening and tag == 'relation':
                relation = self.__read_relation(elem)
                self.__passage.add_relation(relation)
            elif closing and tag == 'offset':
                self.__passage.offset = int(elem.text)
            elif closing and tag == 'text':
                self.__passage.text = elem.text
            elif closing and tag == 'infon':
                self.__passage.infons[elem.get('key')] = elem.text
            elif closing and tag == 'passage':
                self.__state = 2
                if self.__passage is not None:
                    self.__document.add_passage(self.__passage)

        elif state == 4:
            if opening and tag == 'annotation':
                annotation = self.__read_annotation(elem)
                self.__sentence.add_annotation(annotation)
            elif opening and tag == 'relation':
                relation = self.__read_relation(elem)
                self.__sentence.add_relation(relation)
            elif closing and tag == 'offset':
                self.__sentence.offset = int(elem.text)
            elif closing and tag == 'text':
                self.__sentence.text = elem.text
            elif closing and tag == 'infon':
                self.__sentence.infons[elem.get('key')] = elem.text
            elif closing and tag == 'sentence':
                self.__state = 3
                if self.__sentence is not None:
                    self.__passage.add_sentence(self.__sentence)
class BioCXMLDocumentReader:
    """
    Reader for the BioC XML format, one document per iteration.

    The source is parsed incrementally with ``etree.iterparse`` using
    ``start`` and ``end`` events. The constructor reads ahead to the end of
    the first document; each ``next()`` call returns the document already
    read and then reads ahead to the following one. After each document,
    already-processed parts of the XML tree are dropped.

    Internal parser states (``self.__state``):

    * 0 -- outside a ``collection`` element
    * 1 -- inside ``collection``
    * 2 -- inside ``document``
    * 3 -- inside ``passage``
    * 4 -- inside ``sentence``
    """

    def __init__(self, source: Union[str, BinaryIO]):
        """
        Args:
            source: a file name or a binary file object containing BioC XML.
        """
        self.file = source
        self.__context = iter(etree.iterparse(self.file, events=('start', 'end')))
        self.__state = 0
        self.__event = None
        self.__elem = None
        # Define every slot before the first read. Otherwise input without a
        # <collection> root (or without any <document>) leaves them unset
        # and __next__/get_collection_info fail with an AttributeError.
        self.__prev_elem = None
        self.__collection = None
        self.__document = None
        self.__passage = None
        self.__sentence = None
        self.__read()

    def __iter__(self):
        return self

    def __next__(self):
        """
        Reads one BioC document from the XML file.

        Returns:
            BioCDocument: the BioC document

        Raises:
            StopIteration: when no further document is available.
        """
        if self.__document is None:
            raise StopIteration
        else:
            document = self.__document
            self.__read()
            return document

    def __read(self):
        """
        Consume events until the current document is closed.

        Returns on a ``document`` end event, leaving the finished document
        in ``self.__document``. On the ``collection`` end event the document
        slot is reset to ``None``, which ends the iteration.
        """
        while self.__has_next():
            event, elem = self.__next_event()
            if self.__state == 0:
                if event == 'start':
                    if elem.tag == 'collection':
                        self.__state = 1
                        self.__collection = BioCCollection()
            # collection information
            elif self.__state == 1:
                if event == 'start':
                    if elem.tag == 'document':
                        self.__document = BioCDocument()
                        self.__state = 2
                elif event == 'end':
                    if elem.tag == 'source':
                        self.__collection.source = elem.text
                    elif elem.tag == 'date':
                        self.__collection.date = elem.text
                    elif elem.tag == 'key':
                        self.__collection.key = elem.text
                    elif elem.tag == 'infon':
                        self.__collection.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'collection':
                        self.__state = 0
                        self.__document = None
                        self.__passage = None
                        self.__sentence = None
            elif self.__state == 2:
                if event == 'start':
                    if elem.tag == 'passage':
                        self.__passage = BioCPassage()
                        self.__state = 3
                    elif elem.tag == 'annotation':
                        self.__document.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__document.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'id':
                        self.__document.id = elem.text
                    elif elem.tag == 'infon':
                        self.__document.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'document':
                        # Remove the previous element and its ancestors'
                        # earlier siblings. Particularly useful for working
                        # with large xml files.
                        # - Based on fast_iter modification of lxml context
                        # Ref: https://codereview.stackexchange.com/questions/2449/parsing-huge-xml-file-with-lxml-etree-iterparse-in-python
                        if self.__prev_elem is not None:
                            self.__prev_elem.clear()
                            for ancestor in self.__prev_elem.xpath('ancestor-or-self::*'):
                                while ancestor.getprevious() is not None \
                                        and ancestor.getparent() is not None:
                                    del ancestor.getparent()[0]
                        self.__state = 1
                        return
            elif self.__state == 3:
                if event == 'start':
                    if elem.tag == 'sentence':
                        self.__sentence = BioCSentence()
                        self.__state = 4
                    elif elem.tag == 'annotation':
                        self.__passage.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__passage.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__passage.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__passage.text = elem.text
                    elif elem.tag == 'infon':
                        self.__passage.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'passage':
                        self.__state = 2
                        if self.__passage is not None:
                            self.__document.add_passage(self.__passage)
            elif self.__state == 4:
                if event == 'start':
                    if elem.tag == 'annotation':
                        self.__sentence.add_annotation(self.__read_annotation(elem))
                    elif elem.tag == 'relation':
                        self.__sentence.add_relation(self.__read_relation(elem))
                elif event == 'end':
                    if elem.tag == 'offset':
                        self.__sentence.offset = int(elem.text)
                    elif elem.tag == 'text':
                        self.__sentence.text = elem.text
                    elif elem.tag == 'infon':
                        self.__sentence.infons[elem.get('key')] = elem.text
                    elif elem.tag == 'sentence':
                        self.__state = 3
                        if self.__sentence is not None:
                            self.__passage.add_sentence(self.__sentence)

    def __read_annotation(self, start_elem):
        """
        Read events up to the closing ``annotation`` tag.

        Args:
            start_elem: the element of the ``annotation`` start event; its
                ``id`` attribute becomes the annotation id.

        Returns:
            The populated ``BioCAnnotation``.
        """
        ann = BioCAnnotation()
        ann.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'text':
                    ann.text = elem.text
                elif elem.tag == 'infon':
                    ann.infons[elem.get('key')] = elem.text
                elif elem.tag == 'location':
                    ann.add_location(BioCLocation(int(elem.get('offset')),
                                                  int(elem.get('length'))))
                elif elem.tag == 'annotation':
                    return ann
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __read_relation(self, start_elem):
        """
        Read events up to the closing ``relation`` tag.

        Args:
            start_elem: the element of the ``relation`` start event; its
                ``id`` attribute becomes the relation id.

        Returns:
            The populated ``BioCRelation``.
        """
        rel = BioCRelation()
        rel.id = start_elem.get('id')
        while self.__has_next():
            event, elem = self.__next_event()
            if event == 'start':
                pass
            elif event == 'end':
                if elem.tag == 'infon':
                    rel.infons[elem.get('key')] = elem.text
                elif elem.tag == 'node':
                    rel.add_node(BioCNode(elem.get('refid'), elem.get('role')))
                if elem.tag == 'relation':
                    return rel
        raise RuntimeError("should not reach here")  # pragma: no cover

    def __has_next(self):
        """Fetch the next event; return False once the stream is exhausted."""
        try:
            # Track reference to previous element in xml tree - useful for
            # clearing xml element after processing
            if self.__elem is not None:
                self.__prev_elem = self.__elem
            self.__event, self.__elem = next(self.__context)
            return True
        except StopIteration:
            self.__event = None
            self.__elem = None
            return False

    def __next_event(self):
        """Return the event/element pair fetched by the last ``__has_next``."""
        return self.__event, self.__elem

    def get_collection_info(self) -> Union['BioCCollection', None]:
        """
        Reads the collection information: encoding, version, DTD, source,
        date, key, infons, etc.

        Returns:
            the BioC collection that contains only information, or ``None``
            when no ``collection`` element has been seen.
        """
        return self.__collection