def extract_annotations(xml_path, tsv_path):
    """Extract the annotations from a PubTator XML formatted file.

    Outputs a TSV file with the following header terms:
        pubmed_id -- the corresponding pubmed id
        type -- the type of term (i.e. Chemical, Disease, Gene etc.)
        identifier -- the appropriate MESH or NCBI ID if known
        offset -- the character position where the term starts
        end -- the character position where the term ends

    Keyword arguments:
    xml_path -- the path to the xml data file
    tsv_path -- the path to output the formatted data
    """
    xml_opener = utilities.get_opener(xml_path)
    csv_opener = utilities.get_opener(tsv_path)

    with xml_opener(xml_path, "rb") as xml_file, \
            csv_opener(tsv_path, "wt") as tsv_file:
        fieldnames = ['pubmed_id', 'type', 'identifier', 'offset', 'end']
        writer = csv.DictWriter(tsv_file, fieldnames=fieldnames,
                                delimiter='\t')
        writer.writeheader()

        # recover=True lets the parser skip malformed markup instead of
        # aborting the whole extraction
        tag_generator = ET.iterparse(xml_file, tag="document",
                                     recover=True, encoding="utf-8")

        try:
            for event, document in tqdm.tqdm(tag_generator):
                pubmed_id = document[0].text

                # cycle through all the annotation tags contained within
                # the document tag
                for annotation in document.iter('annotation'):
                    # not all annotations will contain an ID
                    if len(annotation) <= 3:
                        continue

                    # BUGFIX: reset per annotation so a type/id from a
                    # previous annotation can never leak into this row
                    # (the originals persisted across loop iterations)
                    ant_type = None
                    ant_id = None

                    for infon in annotation.iter('infon'):
                        if infon.attrib["key"] == "type":
                            ant_type = infon.text
                        elif infon.text:
                            # strip the ontology prefix, keeping the bare id
                            ant_id = re.sub("(MESH:|CVCL:)", "",
                                            str(infon.text))

                    # skip annotations that never yielded both fields
                    if ant_type is None or ant_id is None:
                        continue

                    # exactly one location tag is expected per annotation
                    location, = annotation.iter('location')
                    offset = int(location.attrib['offset'])
                    end = offset + int(location.attrib['length'])
                    writer.writerow({
                        'pubmed_id': pubmed_id,
                        'type': ant_type,
                        'identifier': ant_id,
                        'offset': offset,
                        'end': end,
                    })

                # prevent memory overload
                document.clear()
        except Exception as e:
            # best-effort: report the failure and the offending document id
            # rather than aborting a long-running extraction
            print(e)
            print(document[0].text)
def read_bioconcepts2pubtator_offsets(path):
    """Bioconcepts to pubtator.

    Yields an article that is a dictionary described in the article
    generator function (pubtator_stanza_to_article).

    Keywords:
    path - the path to the bioconcepts2putator_offset file (obtained
        from pubtator's ftp site:
        ftp://ftp.ncbi.nlm.nih.gov/pub/lu/PubTator/)
    """
    opener = utilities.get_opener(path)
    # BUGFIX: `with` guarantees the handle is closed even when the
    # consumer abandons the generator early or parsing raises; the
    # original only closed the file after a fully exhausted loop
    with opener(path, "rt") as f:
        lines = (line.rstrip() for line in f)
        # Group articles based on empty lines as separators. key=bool
        # makes each truthy group one complete article stanza.
        for is_stanza, group in groupby(lines, key=bool):
            if is_stanza:
                yield pubtator_stanza_to_article(list(group))
def convert_pubtator(input_path, output_path):
    """Convert pubtator's annotation list to BioC XML.

    Keyword Arguments:
    input_path -- the path of pubtator's annotation file
    output_path -- the path to output the BioC XML file
    """
    # Set up BioCWriter to write specifically Pubtator
    # Can change to incorporate other sources besides pubtator
    writer = BioCWriter()
    writer.collection = BioCCollection()
    collection = writer.collection
    collection.date = time.strftime("%Y/%m/%d")
    collection.source = "Pubtator"
    collection.key = "Pubtator.key"

    opener = utilities.get_opener(output_path)
    with opener(output_path, 'wb') as xml_file:

        # Have to manually do this because it hangs otherwise
        # Write the head of the xml file
        xml_shell = writer.tostring('UTF-8')
        *xml_head, xml_tail = xml_shell.rstrip().split(b'\n')
        for line in xml_head:
            xml_file.write(line + b'\n')

        article_generator = read_bioconcepts2pubtator_offsets(input_path)
        # Write each article in BioC format
        for article in tqdm.tqdm(article_generator):
            document = BioCDocument()
            document.id = article["pubmed_id"]

            title_passage = BioCPassage()
            title_passage.put_infon('type', 'title')
            title_passage.offset = '0'
            title_passage.text = article["title"]

            abstract_passage = BioCPassage()
            abstract_passage.put_infon('type', 'abstract')
            # BUGFIX: offset must be the character position where the
            # abstract starts (title length + 1 for the separator), not
            # the abstract text itself as the original assigned
            abstract_passage.offset = str(len(article["title"]) + 1)
            abstract_passage.text = article["abstract"]

            # annotation ids are numbered sequentially across both passages
            id_index = 0
            for tag in article["title_annot"]:
                title_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            for tag in article["abstract_annot"]:
                abstract_passage.annotations.append(
                    bioconcepts2pubtator_annotations(tag, id_index))
                id_index += 1

            document.add_passage(title_passage)
            document.add_passage(abstract_passage)

            # render each document under a throwaway <collection> shell,
            # then emit only the document element itself
            step_parent = E('collection')
            writer._build_documents([document], step_parent)
            xml_file.write(tostring(step_parent[0], pretty_print=True))
            step_parent.clear()

        # Write the closing tag of the xml document
        xml_file.write(xml_tail + b'\n')
def filter_tags(infile, outfile):
    """Filter pubtator tags to consist of only hetnet tags.

    Chemicals are mapped to DrugBank compound ids, diseases to Disease
    Ontology (doid) codes, and genes are kept only when present in the
    hetnet gene table.

    Keyword arguments:
    infile -- the name of the file to read
    outfile -- the name of the output file
    """
    print_header = True
    hetnet_chemical_df = load_chemical_df()
    hetnet_disease_df = load_disease_df()
    hetnet_gene_df = load_gene_df()

    csv_opener = utilities.get_opener(outfile)
    with csv_opener(outfile, "wt") as tsv_file:
        for extracted_tag_df in tqdm.tqdm(get_tag_chunks(infile)):
            # Convert chemical IDs to DrugBank compound ids
            chemical_merged_df = pd.merge(
                extracted_tag_df[extracted_tag_df["type"] == "Chemical"],
                hetnet_chemical_df[["drugbank_id", "identifier"]],
                left_on="identifier", right_on="identifier")
            chemical_merged_df = chemical_merged_df.drop_duplicates()
            chemical_merged_df["type"] = "Compound"
            chemical_merged_df = chemical_merged_df[[
                "pubmed_id", "type", "offset", "end", "drugbank_id"
            ]].rename(columns={"drugbank_id": "identifier"})

            # Convert Disease IDs to Disease Ontology codes
            disease_merged_df = pd.merge(
                extracted_tag_df[extracted_tag_df["type"] == "Disease"],
                hetnet_disease_df[["doid_code", "resource_id"]],
                left_on="identifier", right_on="resource_id")
            disease_merged_df = disease_merged_df.drop_duplicates()
            disease_merged_df = disease_merged_df[[
                "pubmed_id", "type", "offset", "end", "doid_code"
            ]].rename(columns={"doid_code": "identifier"})

            # Verify Gene IDs are human genes
            gene_df = extracted_tag_df[extracted_tag_df["type"] == "Gene"]
            gene_final_df = gene_df[gene_df["identifier"].isin(
                hetnet_gene_df["GeneID"])]

            # BUGFIX: DataFrame.append is deprecated and removed in
            # pandas 2.0 — build the combined frame with one pd.concat
            final_df = pd.concat(
                [gene_final_df, chemical_merged_df, disease_merged_df])

            # header is emitted only for the first chunk; subsequent
            # chunks are appended without it
            output_df = final_df[[
                "pubmed_id", "type", "identifier", "offset", "end"
            ]].sort_values(["pubmed_id", "offset"])
            output_df.to_csv(tsv_file, sep="\t", index=False,
                             header=print_header)
            print_header = False