def get_bioc_relations(self, docid, relations):
    """Wrap gene-pair relations in a new BioCDocument.

    Shape of each relation, as sketched by the original comment:

        <relation id="5618#7534">
            <infon key="Gene1">5618</infon>
            <infon key="Gene2">7534</infon>
            <infon key="relation">PPIm</infon>
        </relation>

    Arguments:
    docid -- value stored as the document id
    relations -- iterable of iterables of gene identifiers; each entry
                 needs at least one element

    Returns the BioCDocument whose ``relations`` list holds one
    BioCRelation per entry of ``relations``.
    """
    document = BioCDocument()
    document.id = docid
    document.relations = []

    for pair in relations:
        relation = BioCRelation()
        genes = list(pair)

        first_gene = genes[0]
        # Only an entry with exactly two elements supplies a distinct second
        # gene; any other length pairs the first gene with itself.
        second_gene = genes[1] if len(genes) == 2 else first_gene

        relation.id = "{}#{}".format(first_gene, second_gene)
        relation.infons = {
            "relation": "PPIm",
            "Gene1": first_gene,
            "Gene2": second_gene,
        }
        document.relations.append(relation)

    return document
def brat2bioc_doc(bratdoc: BratDocument) -> BioCDocument:
    """Convert a brat document into a BioC document.

    The id and text are copied over. Entities become annotations, while
    relations, events and equivalence relations all become BioC relations.
    Attributes and notes are then merged into the infons of the item their
    ``refid`` resolves to through ``BioCDocument.get``.

    Note that each equivalence relation's ``id`` is rewritten in place on
    the input, with its position in ``bratdoc.equiv_relations`` appended.
    """
    converted = BioCDocument()
    converted.id = bratdoc.id
    converted.text = bratdoc.text

    # Entities -> annotations.
    for entity in bratdoc.entities:
        converted.add_annotation(brat2bioc_entity(entity))

    # Relations and events -> relations.
    for relation in bratdoc.relations:
        converted.add_relation(brat2bioc_relation(relation))
    for event in bratdoc.events:
        converted.add_relation(brat2bioc_event(event))

    # Equivalence relations -> relations, with the position suffixed to the id.
    for position, equiv in enumerate(bratdoc.equiv_relations):
        equiv.id = '%s%s' % (equiv.id, position)
        converted.add_relation(brat2bioc_equiv(equiv))

    # Attributes are folded into the infons of the item they refer to.
    # The attribute id is stored under 'note_id', the same key notes use below.
    for attribute in bratdoc.attributes:
        target_infons = converted.get(attribute.refid).infons
        target_infons['note_id'] = attribute.id
        target_infons['attributes'] = ' '.join(sorted(attribute.attributes))

    # Notes are folded into the infons of the item they refer to.
    for note in bratdoc.notes:
        target_infons = converted.get(note.refid).infons
        target_infons['note_id'] = note.id
        target_infons['type'] = note.type
        target_infons['note'] = note.text

    return converted
def __parse_document(self, tree):
    """Build a BioCDocument from a ``document`` XML element.

    The id comes from the ``id`` child text and the infons from
    ``__parse_infons``. Matching ``passage``, ``annotation`` and
    ``relation`` children are then parsed and attached, in that order.
    """
    document = BioCDocument()
    document.id = tree.findtext('id')
    document.infons = self.__parse_infons(tree)

    # (child tag, parser for that child, method that attaches the result)
    handlers = (
        ('passage', self.__parse_passage, document.add_passage),
        ('annotation', self.__parse_annotation, document.add_annotation),
        ('relation', self.__parse_relation, document.add_relation),
    )
    for tag, parse, attach in handlers:
        for element in tree.findall(tag):
            attach(parse(element))

    return document
def convert_pubtator(input_path, output_path):
    """Convert pubtators annotation list to BioC XML

    The collection header is written first, then every article is
    serialized and written on its own, and finally the closing collection
    tag is appended.

    Keyword Arguments:
    input_path -- the path of pubtators annotation file
    output_path -- the path to output the BioC XML file
    """
    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:
        # Have to manually do this because hangs otherwise
        # Write the head of the xml file: every line of the empty collection
        # except the last one, which is kept back and written at the end.
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)

        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is a position, not the text: the abstract starts one
            # separator character after the title (PubTator offset convention).
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # Annotation ids keep counting from the title into the abstract.
            id_index = 0
            passages_and_tags = (
                (title_passage, article["title_annot"]),
                (abstract_passage, article["abstract_annot"]),
            )
            for passage, tags in passages_and_tags:
                for tag in tags:
                    passage.annotations.append(
                        bioconcepts2pubtator_annotations(tag, id_index))
                    id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Serialize the document under a throwaway parent element and
            # write out only that parent's first child.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
def convert_pubtator(input_file, output_file=None):
    """Convert pubtators annotation list to BioC XML

    The collection header is written first, then every article is
    serialized and written on its own, and finally the closing collection
    tag is appended.

    Keyword Arguments:
    input_file -- the path of pubtators annotation file
    output_file -- the path to output the converted text
                   (defaults to "bioc-converted-docs.xml")
    """
    if output_file is None:
        output_file = "bioc-converted-docs.xml"

    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    with open(output_file, 'wb') as g:
        # Have to manually do this because hangs otherwise
        # Write the head of the xml file: the serialized empty collection
        # minus its closing tag, which is written after the last article.
        xml_header = writer.tostring('UTF-8')
        # The file is opened in binary mode, so the tail must be bytes too;
        # writing a text string here fails on Python 3.
        xml_tail = b'</collection>\n'
        xml_head = xml_header[:-len(xml_tail)]
        g.write(xml_head)

        article_generator = bioconcepts2pubtator_offsets(input_file)

        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["Document ID"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["Title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # The offset is a position, not the text: the abstract starts one
            # separator character after the title (PubTator offset convention).
            abstract_passage.offset = str(len(article["Title"]) + 1)
            abstract_passage.text = article["Abstract"]

            # Annotation ids keep counting from the title into the abstract.
            id_index = 0
            for tag in article["Title_Annot"]:
                title_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            for tag in article["Abstract_Annot"]:
                abstract_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # Serialize the document under a throwaway parent element and
            # write out only that parent's first child.
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            g.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        g.write(xml_tail)