def make_pharmgkb_edge(self, fh: TextIO, line_data: dict) -> None:
    """Write out one drug-gene edge built from a single parsed record

    :param fh: file handle to write out edge
    :param line_data: dict for one record; the keys read here are
        'Entity1_type', 'Entity1_id', 'Entity2_type', 'Entity2_id'
        and 'Evidence'
    :raises PharmGKBInvalidEdge: if the two entity types of the record are
        not exactly the types listed in self.edge_of_interest
    :return: None
    """
    wanted_types = set(self.edge_of_interest)
    record_types = {line_data['Entity1_type'], line_data['Entity2_type']}
    if wanted_types != record_types:
        raise PharmGKBInvalidEdge(
            "Trying to make edge that's not an edge of interest")

    # The gene may sit in either entity slot; the other slot is the drug.
    if line_data['Entity1_type'] == 'Gene':
        gene_key, drug_key = 'Entity1_id', 'Entity2_id'
    else:
        gene_key, drug_key = 'Entity2_id', 'Entity1_id'
    raw_gene_id = line_data[gene_key]
    raw_drug_id = line_data[drug_key]

    uniprot_gene_id = self.get_uniprot_id(this_id=raw_gene_id)
    evidence = line_data['Evidence']
    drug_curie = self.make_preferred_drug_id(raw_drug_id, self.drug_id_map)

    edge_row = [
        drug_curie,
        self.drug_gene_edge_label,
        uniprot_gene_id,
        self.drug_gene_edge_relation,
        self.source_name,
        'biolink:Association',
        evidence,
    ]
    write_node_edge_item(fh=fh, header=self.edge_header, data=edge_row)
def make_pharmgkb_chemical_node(self,
                                fh: TextIO,
                                chem_id: str,
                                name: str,
                                biolink_type: str) -> None:
    """Write out node for chemical

    :param fh: file handle to write out chemical
    :param chem_id: chemical id; it is passed through
        self.make_preferred_drug_id() together with self.drug_id_map
        before being written
    :param name: chemical name
    :param biolink_type: biolink type for Chemical
    :return: None
    """
    node_row = [
        self.make_preferred_drug_id(chem_id, self.drug_id_map),
        name,
        biolink_type,
        self.source_name,
    ]
    write_node_edge_item(fh=fh, header=self.node_header, data=node_row)
def make_pharmgkb_gene_node(self,
                            fh: TextIO,
                            this_id: str,
                            name: str,
                            biolink_type: str) -> None:
    """Write out node for gene

    :param fh: file handle to write out gene
    :param this_id: pharmgkb gene id; it is converted with
        self.get_uniprot_id() before being written
    :param name: gene name
    :param biolink_type: biolink type for Gene
        (from make_gene_id_mapping_file())
    :return: None
    """
    node_row = [
        self.get_uniprot_id(this_id=this_id),
        name,
        biolink_type,
        self.source_name,
    ]
    write_node_edge_item(fh=fh, header=self.node_header, data=node_row)
def run(self, data_file: Optional[str] = None):
    """Method to run transform to ingest data from IntAct for viral/human PPIs

    Unzips the archive into a temporary directory, parses every ``*.xml``
    file found in the first extracted entry with
    self.parse_xml_to_nodes_edges() and writes the resulting nodes and
    edges to nodes.tsv / edges.tsv in self.output_dir.

    :param data_file: path to the zip archive to ingest; when not given,
        intact_coronavirus.zip in self.input_base_dir is used
    :return: None
    """
    zip_file = data_file or os.path.join(self.input_base_dir,
                                         'intact_coronavirus.zip')

    # for tsv output:
    output_node_file = os.path.join(self.output_dir, 'nodes.tsv')
    output_edge_file = os.path.join(self.output_dir, 'edges.tsv')

    # make directory in data/transformed
    os.makedirs(self.output_dir, exist_ok=True)

    # TemporaryDirectory removes the extracted XML once we are done, even if
    # parsing fails part-way through.
    with open(output_node_file, 'w') as node, \
            open(output_edge_file, 'w') as edge, \
            tempfile.TemporaryDirectory() as xml_tempdir:

        # write headers
        node.write('\t'.join(self.node_header) + '\n')
        edge.write('\t'.join(self.edge_header) + '\n')

        unzip_to_tempdir(zip_file, xml_tempdir)

        # NOTE(review): only the first entry of the extracted archive is
        # used, and it is treated as a directory - confirm the zip layout.
        extracted_base_dir_list = os.listdir(xml_tempdir)
        file_path = os.path.join(xml_tempdir, extracted_base_dir_list[0])

        for xml_name in os.listdir(file_path):
            if not fnmatch.fnmatch(xml_name, '*.xml'):
                logging.warning("Skipping non-xml file %s", xml_name)
                # actually skip it; previously the file was still parsed
                continue

            nodes_edges = self.parse_xml_to_nodes_edges(
                os.path.join(file_path, xml_name))

            # write out nodes
            for this_node in nodes_edges['nodes']:
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=this_node)
            # write out edges
            for this_edge in nodes_edges['edges']:
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=this_edge)
def run(self, data_file: Optional[str] = None):
    """Method is called and performs needed transformations to process the
    zhou host protein data, additional information on this data can be found
    in the comment at the top of this script.

    Reads the tables on pages 5-7 of the input PDF and, for every row,
    writes a host gene node, a coronavirus node and a
    host gene -> coronavirus edge to nodes.tsv / edges.tsv in
    self.output_dir.

    :param data_file: path of the PDF to parse; when not given,
        41421_2020_153_MOESM1_ESM.pdf in self.input_base_dir is used
    :raises Exception: if a row names a coronavirus that has no entry in
        the taxon lookup table below
    :return: None
    """
    input_file = data_file or os.path.join(self.input_base_dir,
                                           '41421_2020_153_MOESM1_ESM.pdf')

    pubmed_curie_prefix = 'PMID:'
    gene_curie_prefix = 'NCBIGene:'
    publication_node_type = 'biolink:Publication'
    gene_node_type = 'biolink:Gene'
    virus_node_type = 'biolink:OrganismalEntity'
    # list of RO interactions:
    # https://raw.githubusercontent.com/oborel/obo-relations/master/subsets/ro-interaction.owl
    host_gene_vgene_edge_label = 'biolink:interacts_with'
    host_gene_vgene_relation = 'RO:0002437'
    ncbitaxon_curie_prefix = 'NCBITaxon:'

    # coronavirus name as it appears in the table -> NCBI taxon id
    corona_info = {
        'IBV': {'taxon_id': 11120},
        'MHV': {'taxon_id': 502104},
        'HCoV-NL63': {'taxon_id': 277944},
        'HCoV-229E': {'taxon_id': 11137},
        'SARS': {'taxon_id': 227859},
        'MERS': {'taxon_id': 1335626},
    }

    # for tsv output:
    output_node_file = os.path.join(self.output_dir, 'nodes.tsv')
    output_edge_file = os.path.join(self.output_dir, 'edges.tsv')

    # make directory in data/transformed
    os.makedirs(self.output_dir, exist_ok=True)

    fig_3_table_unformatted = io.read_pdf(input_file,
                                          output_format='json',
                                          pages=[5, 6, 7],
                                          multiple_tables=True)
    fig_3_table = multi_page_table_to_list(fig_3_table_unformatted)

    with open(output_node_file, 'w') as node, \
            open(output_edge_file, 'w') as edge:

        # write headers
        node.write('\t'.join(self.node_header) + '\n')
        edge.write('\t'.join(self.edge_header) + '\n')

        for row in fig_3_table:
            virus_name = row['Coronavirus']
            if virus_name not in corona_info:
                # format the name into the message (it used to be passed
                # as a second, unformatted exception argument)
                raise Exception(
                    "Can't find info for coronavirus {}".format(virus_name))
            this_corona_info = corona_info[virus_name]
            corona_curie = \
                ncbitaxon_curie_prefix + str(this_corona_info['taxon_id'])
            host_gene_curie = gene_curie_prefix + row['Host Gene ID']

            # WRITE NODES
            # host gene
            write_node_edge_item(fh=node,
                                 header=self.node_header,
                                 data=[host_gene_curie,
                                       row['Host Protein'],
                                       gene_node_type,
                                       self.source_name])

            # virus
            write_node_edge_item(fh=node,
                                 header=self.node_header,
                                 data=[corona_curie,
                                       virus_name,
                                       virus_node_type,
                                       self.source_name])

            # WRITE EDGES
            write_node_edge_item(fh=edge,
                                 header=self.edge_header,
                                 data=[host_gene_curie,
                                       host_gene_vgene_edge_label,
                                       corona_curie,
                                       host_gene_vgene_relation,
                                       self.source_name,
                                       'biolink:Association',
                                       pubmed_curie_prefix + row['PubMed ID']])

    return None
def run(self, data_file: Optional[str] = None) -> None:
    """Method is called and performs needed transformations to process
    protein-protein interactions from the STRING DB data.

    For every interaction line this writes, the first time each is seen, a
    gene node plus a gene -> protein edge and a protein node, and then one
    protein -> protein edge for the line itself. Protein ids are written
    with the same ``ENSEMBL:`` prefix in nodes and in edges so that edges
    resolve to the emitted nodes.

    Args:
        data_file: data file to parse; defaults to
            9606.protein.links.full.v11.0.txt.gz in self.input_base_dir

    Returns:
        None.

    """
    if not data_file:
        data_file = os.path.join(self.input_base_dir,
                                 "9606.protein.links.full.v11.0.txt.gz")
    os.makedirs(self.output_dir, exist_ok=True)
    protein_node_type = "biolink:Protein"
    edge_label = "biolink:interacts_with"
    self.node_header = compress_json.local_load("node_header.json")
    edge_core_header = compress_json.local_load("edge_core_header.json")
    edge_additional_headers = compress_json.local_load(
        "edge_additional_headers.json")

    self.edge_header = edge_core_header + edge_additional_headers
    relation = 'RO:0002434'
    seen_proteins: Set = set()
    seen_genes: Set = set()

    # Required to align the node edge header of the gene
    # with the default header
    extra_header = [""] * (len(edge_additional_headers) + 1)

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(data_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())
        for line in interactions:
            items_dict = parse_stringdb_interactions(line, header_items)
            protein_curies = []
            for protein_name in ('protein1', 'protein2'):
                protein = get_item_by_priority(items_dict, [protein_name])
                # drop the leading '<prefix>.' part of the STRING identifier
                protein = '.'.join(protein.split('.')[1:])
                protein_curie = f"ENSEMBL:{protein}"
                protein_curies.append(protein_curie)

                if protein in self.protein_gene_map:
                    gene = self.protein_gene_map[protein]
                    # NOTE(review): node and has_gene_product edge are only
                    # written the first time a gene is seen - confirm that
                    # one edge per gene is intended.
                    if gene not in seen_genes:
                        seen_genes.add(gene)
                        ensemble_gene = f"ENSEMBL:{gene}"
                        ncbi_gene_id = self.ensembl2ncbi_map[gene]
                        gene_informations = self.gene_info_map[ncbi_gene_id]
                        write_node_edge_item(
                            fh=node,
                            header=self.node_header,
                            data=[
                                ensemble_gene,
                                gene_informations['symbol'],
                                'biolink:Gene',
                                gene_informations['description'],
                                f"NCBIGene:{ncbi_gene_id}"
                            ])
                        write_node_edge_item(
                            fh=edge,
                            header=self.edge_header,
                            data=[
                                ensemble_gene,
                                "biolink:has_gene_product",
                                # same id as the protein node written below
                                protein_curie,
                                "RO:0002205",
                                "NCBI",
                            ] + extra_header)

                # write node data
                if protein not in seen_proteins:
                    seen_proteins.add(protein)
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[protein_curie, "", protein_node_type, "", ""])

            # write edge data
            write_node_edge_item(
                fh=edge,
                header=self.edge_header,
                data=[
                    protein_curies[0],
                    edge_label,
                    protein_curies[1],
                    relation,
                    "STRING",
                    items_dict['combined_score']
                ] + [
                    items_dict.get(header, "")
                    for header in edge_additional_headers
                ])
def run(self, data_file: Optional[str] = None) -> None:
    """Method is called and performs needed transformations to process
    protein-protein interactions from the STRING DB data.

    For every interaction line this writes, the first time each is seen, a
    gene node plus a gene -> protein edge and a protein node (carrying a
    UniProtKB xref when the id mapping has one), and then one
    protein -> protein edge for the line itself.

    Args:
        data_file: data file to parse; defaults to
            9606.protein.links.full.v11.0.txt.gz in self.input_base_dir

    Returns:
        None.

    """
    if not data_file:
        data_file = os.path.join(self.input_base_dir,
                                 "9606.protein.links.full.v11.0.txt.gz")
    os.makedirs(self.output_dir, exist_ok=True)

    protein_node_type = "biolink:Protein"
    edge_label = "biolink:interacts_with"

    self.node_header = compress_json.local_load("node_header.json")
    edge_core_header = compress_json.local_load("edge_core_header.json")
    edge_additional_headers = compress_json.local_load(
        "edge_additional_headers.json")
    self.edge_header = edge_core_header + edge_additional_headers

    relation = 'RO:0002434'
    seen_proteins: Set = set()
    seen_genes: Set = set()

    # Padding so that a gene -> protein edge row lines up with the
    # full edge header
    self.extra_header = [""] * (len(edge_additional_headers) + 1)

    # make string ENSP to Uniprot id mapping dict
    string_to_uniprot_id_map = uniprot_make_name_to_id_mapping(
        os.path.join(self.input_base_dir, UNIPROT_ID_MAPPING))

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(data_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())
        for line in interactions:
            items_dict = parse_stringdb_interactions(line, header_items)
            proteins = []

            for protein_name in ('protein1', 'protein2'):
                nat_string_id = get_item_by_priority(
                    items_dict, [protein_name])
                # everything after the first '.'-separated field
                protein = '.'.join(nat_string_id.split('.')[1:])
                proteins.append(protein)

                gene_is_new = False
                if protein in self.protein_gene_map:
                    gene = self.protein_gene_map[protein]
                    gene_is_new = gene not in seen_genes

                if gene_is_new:
                    seen_genes.add(gene)
                    ensemble_gene = f"ENSEMBL:{gene}"
                    gene_informations = self.gene_info_map[
                        self.ensembl2ncbi_map[gene]]

                    gene_node_row = [
                        ensemble_gene,
                        gene_informations['symbol'],
                        'biolink:Gene',
                        gene_informations['description'],
                        f"NCBIGene:{self.ensembl2ncbi_map[gene]}",
                        self.source_name
                    ]
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=gene_node_row)

                    gene_edge_row = [
                        ensemble_gene,
                        "biolink:has_gene_product",
                        f"ENSEMBL:{protein}",
                        "RO:0002205",
                        "NCBI",
                        ""
                    ] + self.extra_header
                    write_node_edge_item(fh=edge,
                                         header=self.edge_header,
                                         data=gene_edge_row)

                # write node data
                if protein in seen_proteins:
                    continue
                seen_proteins.add(protein)

                # if we have an equivalent Uniprot ID for this Ensembl
                # protein ID, record it as the xref of the protein node
                if protein in string_to_uniprot_id_map:
                    uniprot_curie = collapse_uniprot_curie(
                        f"UniProtKB:{string_to_uniprot_id_map[protein]}")
                else:
                    uniprot_curie = ''

                protein_node_row = [
                    f"ENSEMBL:{protein}",
                    "",
                    protein_node_type,
                    "",
                    uniprot_curie,  # xref
                    self.source_name
                ]
                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=protein_node_row)

            # write edge data
            interaction_row = [
                f"ENSEMBL:{proteins[0]}",
                edge_label,
                f"ENSEMBL:{proteins[1]}",
                relation,
                "STRING",
                "biolink:Association",
                items_dict['combined_score']
            ]
            for header in edge_additional_headers:
                interaction_row.append(items_dict.get(header, ""))
            write_node_edge_item(fh=edge,
                                 header=self.edge_header,
                                 data=interaction_row)
def run(self, data_file: Optional[str] = None) -> None:
    """Method is called and performs needed transformations to process
    protein-protein interactions from the STRING DB data.

    For every interaction line this writes, the first time each is seen, a
    gene node plus a gene -> protein edge for both interactors, then a
    protein node for both interactors, and finally one protein -> protein
    edge for the line itself. Protein ids are written with the same
    ``ENSEMBL:`` prefix in nodes and in edges so that edges resolve to the
    emitted nodes.

    Args:
        data_file: data file to parse; defaults to
            9606.protein.links.full.v11.0.txt.gz in self.input_base_dir

    Returns:
        None.

    """
    if not data_file:
        data_file = os.path.join(self.input_base_dir,
                                 "9606.protein.links.full.v11.0.txt.gz")
    os.makedirs(self.output_dir, exist_ok=True)
    protein_node_type = "biolink:Protein"
    edge_label = "biolink:interacts_with"
    self.node_header = ['id', 'name', 'category', 'description', 'alias']
    edge_core_header = [
        'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'combined_score'
    ]
    edge_additional_headers = [
        'neighborhood', 'neighborhood_transferred', 'fusion', 'cooccurence',
        'homology', 'coexpression', 'coexpression_transferred',
        'experiments', 'experiments_transferred', 'database',
        'database_transferred', 'textmining', 'textmining_transferred'
    ]
    self.edge_header = edge_core_header + edge_additional_headers
    relation = 'RO:0002434'

    # Gene and protein ids already written as nodes. A set keeps the
    # membership test constant-time; the input has one line per interaction.
    seen: set = set()
    # blank combined_score plus blank additional columns for gene edges
    gene_edge_padding = [""] * (1 + len(edge_additional_headers))

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(data_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())
        for line in interactions:
            items_dict = parse_stringdb_interactions(line, header_items)

            proteins = []
            for column in ('protein1', 'protein2'):
                raw_id = get_item_by_priority(items_dict, [column])
                # drop the leading '<prefix>.' part of the STRING identifier
                proteins.append('.'.join(raw_id.split('.')[1:]))

            # gene nodes and gene -> protein edges, for both interactors
            for protein in proteins:
                gene = self.protein_gene_map.get(protein)
                if not gene or gene in seen:
                    continue
                ncbi_gene_id = self.ensembl2ncbi_map[gene]
                gene_info = self.gene_info_map[ncbi_gene_id]
                write_node_edge_item(
                    fh=node,
                    header=self.node_header,
                    data=[
                        f"ENSEMBL:{gene}",
                        gene_info['symbol'],
                        'biolink:Gene',
                        gene_info['description'],
                        f"NCBIGene:{ncbi_gene_id}"
                    ])
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        f"ENSEMBL:{gene}",
                        "biolink:has_gene_product",
                        # same id as the protein node written below
                        f"ENSEMBL:{protein}",
                        "RO:0002205",
                        "NCBI"
                    ] + gene_edge_padding)
                seen.add(gene)

            # write node data
            for protein in proteins:
                if protein in seen:
                    continue
                write_node_edge_item(
                    fh=node,
                    header=self.node_header,
                    data=[
                        f"ENSEMBL:{protein}", "", protein_node_type, "", ""
                    ])
                # marking it right away also stops a self-interaction row
                # from writing the same protein node twice
                seen.add(protein)

            # write edge data
            edge_data = [
                f"ENSEMBL:{proteins[0]}",
                edge_label,
                f"ENSEMBL:{proteins[1]}",
                relation,
                "STRING",
                items_dict['combined_score']
            ]
            edge_data.extend(
                items_dict.get(x, "") for x in edge_additional_headers)
            write_node_edge_item(fh=edge,
                                 header=self.edge_header,
                                 data=edge_data)
def run(self,
        data_file: Optional[str] = None,
        species: str = "Homo sapiens") -> None:
    """Method is called and performs needed transformations to process the
    Drug Central data, additional information on this data can be found in
    the comment at the top of this script

    For every interaction line of the requested species that has an
    ACCESSION entry, writes a drug node, one gene node per accession
    (flagged with whether that accession is present in the Tclin / Tchem
    tables from tcrd.zip) and one drug -> gene edge per accession.

    :param data_file: name of the gzipped interactions file inside
        self.input_base_dir; defaults to drug.target.interaction.tsv.gz
    :param species: only lines whose ORGANISM value equals this are ingested
    :return: None
    """
    if data_file is None:
        data_file = "drug.target.interaction.tsv.gz"
    interactions_file = os.path.join(self.input_base_dir, data_file)
    tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
    os.makedirs(self.output_dir, exist_ok=True)
    drug_node_type = "biolink:Drug"
    gene_curie_prefix = "UniProtKB:"
    drug_curie_prefix = "DrugCentral:"
    gene_node_type = "biolink:Gene"
    drug_gene_edge_label = "biolink:interacts_with"
    drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'comment'
    ]

    # unzip tcrd.zip and load the tclin and tchem tables; the extracted
    # files are only needed until they have been read, so the temporary
    # directory is removed right afterwards
    with tempfile.TemporaryDirectory() as tempdir:
        (tclin_file, tchem_file) = unzip_and_get_tclin_tchem(
            tclin_chem_zip_file, tempdir)
        tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
        tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(interactions_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())

        for line in interactions:
            items_dict = parse_drug_central_line(line, header_items)

            if 'ORGANISM' not in items_dict \
                    or items_dict['ORGANISM'] != species:
                continue

            # get gene ID
            try:
                gene_id_string = get_item_by_priority(
                    items_dict, ['ACCESSION'])
            except ItemInDictNotFound:
                # lines with no ACCESSION entry only contain drug info, no
                # target info - not ingesting these
                continue
            accessions = gene_id_string.split('|')

            # get drug ID
            drug_id = drug_curie_prefix + get_item_by_priority(
                items_dict, ['STRUCT_ID'])

            # WRITE NODES
            # drug - id, name, category, then the two Tclin/Tchem flag
            # columns, which do not apply to drugs
            write_node_edge_item(fh=node,
                                 header=self.node_header,
                                 data=[
                                     drug_id,
                                     items_dict['DRUG_NAME'],
                                     drug_node_type,
                                     str(False),
                                     str(False)
                                 ])

            for accession in accessions:
                gene_id = gene_curie_prefix + accession
                # look up this accession itself; using the first accession
                # of the row for every gene mislabels the other genes
                is_tclin = accession in tclin_dict
                is_tchem = accession in tchem_dict

                write_node_edge_item(fh=node,
                                     header=self.node_header,
                                     data=[
                                         gene_id,
                                         items_dict['GENE'],
                                         gene_node_type,
                                         str(is_tclin),
                                         str(is_tchem)
                                     ])

                # WRITE EDGES
                # ['subject', 'edge_label', 'object', 'relation',
                #  'provided_by', 'comment']
                write_node_edge_item(fh=edge,
                                     header=self.edge_header,
                                     data=[
                                         drug_id,
                                         drug_gene_edge_label,
                                         gene_id,
                                         drug_gene_edge_relation,
                                         self.source_name,
                                         items_dict['ACT_COMMENT']
                                     ])

    return None
def run(self,
        data_file: Optional[str] = None,
        species: str = "Homo sapiens") -> None:
    """Method is called and performs needed transformations to process the
    Drug Central data, additional information on this data can be found in
    the comment at the top of this script

    For every interaction line of the requested species, writes the drug
    node and each target protein node the first time they are seen, and one
    drug -> protein edge per target protein of the line.

    :param data_file: name of the gzipped interactions file inside
        self.input_base_dir; defaults to drug.target.interaction.tsv.gz
    :param species: only lines whose ORGANISM value equals this are ingested
    :return: None
    """
    if data_file is None:
        data_file = "drug.target.interaction.tsv.gz"
    interactions_file = os.path.join(self.input_base_dir, data_file)
    os.makedirs(self.output_dir, exist_ok=True)
    drug_node_type = "biolink:Drug"
    uniprot_curie_prefix = "UniProtKB:"
    drug_curie_prefix = "DrugCentral:"
    protein_node_type = "biolink:Protein"
    drug_protein_edge_label = "biolink:molecularly_interacts_with"
    drug_protein_edge_relation = "RO:0002436"  # molecularly interacts with
    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'comment', 'type'
    ]

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(interactions_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())

        # ids of the nodes already written, so each node is emitted once
        seen_proteins: set = set()
        seen_drugs: set = set()

        for line in interactions:
            items_dict = parse_drug_central_line(line, header_items)

            if 'ORGANISM' not in items_dict \
                    or items_dict['ORGANISM'] != species:
                continue

            # get protein ID
            try:
                protein_dict = items_dict_to_protein_data_dict(items_dict)
            except ItemInDictNotFound:
                # lines with no ACCESSION entry only contain drug info, no
                # target info - not ingesting these
                continue
            except ValueError:
                logging.error("Value error while parsing line")
                continue

            # get drug ID
            drug_id = drug_curie_prefix + get_item_by_priority(
                items_dict, ['STRUCT_ID'])

            # Write drug node
            if drug_id not in seen_drugs:
                write_node_edge_item(
                    fh=node,
                    header=self.node_header,
                    data=[
                        drug_id,
                        items_dict['DRUG_NAME'],
                        drug_node_type,
                        '',  # TDL (not applicable for drugs)
                        self.source_name
                    ])
                seen_drugs.add(drug_id)

            for uniprot_id, name, tdl in protein_dict.values():
                protein_id = uniprot_curie_prefix + uniprot_id

                if protein_id not in seen_proteins:
                    write_node_edge_item(fh=node,
                                         header=self.node_header,
                                         data=[
                                             protein_id,
                                             name,
                                             protein_node_type,
                                             tdl,
                                             self.source_name
                                         ])
                    seen_proteins.add(protein_id)

                # WRITE EDGES
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        drug_id,
                        drug_protein_edge_label,
                        protein_id,
                        drug_protein_edge_relation,
                        self.source_name,
                        items_dict['ACT_COMMENT'],
                        'biolink:Association'
                    ])

    return None
def run(self) -> None:
    """Method is called and performs needed transformations to process the
    Drug Central data, additional information on this data can be found in
    the comment at the top of this script

    For every interaction line that has an ACCESSION entry, writes a drug
    node, a gene node and a drug -> gene edge. Lines without an ACCESSION
    entry are logged and skipped.

    :return: None
    """
    interactions_file = os.path.join(self.input_base_dir,
                                     "drug.target.interaction.tsv.gz")
    os.makedirs(self.output_dir, exist_ok=True)

    drug_node_type = "biolink:Drug"
    gene_node_type = "biolink:Gene"
    drug_gene_edge_label = "biolink:interacts_with"
    drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with
    # the drug id is taken from the first of these fields that is available
    drug_id_fields = ['ACT_SOURCE_URL', 'MOA_SOURCE_URL', 'DRUG_NAME']

    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'comment'
    ]

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(interactions_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())

        for line in interactions:
            items_dict = parse_drug_central_line(line, header_items)

            # get gene ID
            try:
                gene_id = get_item_by_priority(items_dict, ['ACCESSION'])
            except ItemInDictNotFound:
                # lines with no ACCESSION entry only contain drug info, no
                # target info - not ingesting these
                logging.info(
                    "No gene information for this line:\n{}\nskipping".
                    format(line))
                continue

            # get drug ID
            drug_id = get_item_by_priority(items_dict, drug_id_fields)

            # WRITE NODES
            # each node row is ['id', 'name', 'category']
            drug_row = [drug_id, items_dict['DRUG_NAME'], drug_node_type]
            write_node_edge_item(fh=node,
                                 header=self.node_header,
                                 data=drug_row)

            gene_row = [gene_id, items_dict['GENE'], gene_node_type]
            write_node_edge_item(fh=node,
                                 header=self.node_header,
                                 data=gene_row)

            # WRITE EDGES
            # ['subject', 'edge_label', 'object', 'relation', 'comment']
            edge_row = [
                drug_id,
                drug_gene_edge_label,
                gene_id,
                drug_gene_edge_relation,
                items_dict['ACT_COMMENT']
            ]
            write_node_edge_item(fh=edge,
                                 header=self.edge_header,
                                 data=edge_row)

    return None