def run(self, data_file: Optional[str] = None):
    """Transform the PharmGKB relationship dump into node and edge TSV files.

    Unzips ``relationships.zip``, ``pharmgkb_genes.zip`` and
    ``pharmgkb_drugs.zip`` (all looked up in ``self.input_base_dir``) into
    temporary directories, builds ``self.gene_id_map`` and
    ``self.drug_id_map`` from the mapping files, then walks
    ``relationships.tsv``. For every row whose pair of entity types equals
    ``self.edge_of_interest`` a node is emitted for each entity and one edge
    is emitted for the row.

    The temporary directories are removed when the method returns or raises.

    Args:
        data_file: Accepted for interface compatibility; not used here.

    Raises:
        PharmGKBFileError: An expected TSV is missing after unzipping.
        PharmKGBInvalidNodeType: A selected row contains an entity whose
            type is neither ``'Gene'`` nor ``'Chemical'``.
    """
    rel_zip_file_name = os.path.join(self.input_base_dir,
                                     "relationships.zip")
    relationship_file_name = "relationships.tsv"
    gene_mapping_zip_file = os.path.join(self.input_base_dir,
                                         "pharmgkb_genes.zip")
    gene_mapping_file_name = "genes.tsv"
    drug_mapping_zip_file = os.path.join(self.input_base_dir,
                                         "pharmgkb_drugs.zip")
    drug_mapping_file_name = "drugs.tsv"

    #
    # file stuff
    #
    # TemporaryDirectory guarantees the scratch directories are deleted on
    # exit, including when one of the checks below raises.
    with tempfile.TemporaryDirectory() as relationship_tempdir, \
            tempfile.TemporaryDirectory() as gene_id_tempdir, \
            tempfile.TemporaryDirectory() as drug_id_tempdir:

        # get relationship file (what we are ingesting here)
        relationship_file_path = os.path.join(relationship_tempdir,
                                              relationship_file_name)
        unzip_to_tempdir(rel_zip_file_name, relationship_tempdir)
        if not os.path.exists(relationship_file_path):
            raise PharmGKBFileError(
                "Can't find relationship file needed for ingest")

        # get mapping file for gene ids
        gene_mapping_file_path = os.path.join(gene_id_tempdir,
                                              gene_mapping_file_name)
        unzip_to_tempdir(gene_mapping_zip_file, gene_id_tempdir)
        if not os.path.exists(gene_mapping_file_path):
            raise PharmGKBFileError(
                "Can't find gene map file needed for ingest")
        self.gene_id_map = self.make_id_mapping_file(gene_mapping_file_path)

        # get mapping file for drug ids
        drug_mapping_file_path = os.path.join(drug_id_tempdir,
                                              drug_mapping_file_name)
        unzip_to_tempdir(drug_mapping_zip_file, drug_id_tempdir)
        if not os.path.exists(drug_mapping_file_path):
            raise PharmGKBFileError(
                "Can't find drug map file needed for ingest")
        self.drug_id_map = self.make_id_mapping_file(drug_mapping_file_path)

        #
        # read in and transform relationship.tsv
        #
        with open(relationship_file_path) as relationships, \
                open(self.output_node_file, 'w') as node, \
                open(self.output_edge_file, 'w') as edge:
            # write headers (change default node/edge headers if necessary)
            node.write("\t".join(self.node_header) + "\n")
            edge.write("\t".join(self.edge_header) + "\n")

            rel_header = parse_header(relationships.readline())
            # The wanted type pair does not depend on the row; build once.
            wanted_types = set(self.edge_of_interest)

            for line in relationships:
                line_data = self.parse_pharmgkb_line(line, rel_header)
                row_types = {line_data['Entity1_type'],
                             line_data['Entity2_type']}
                if row_types != wanted_types:
                    continue

                #
                # Make nodes for gene and chemical
                #
                entities = (
                    (line_data['Entity1_id'],
                     line_data['Entity1_name'],
                     line_data['Entity1_type']),
                    (line_data['Entity2_id'],
                     line_data['Entity2_name'],
                     line_data['Entity2_type']),
                )
                for entity_id, entity_name, entity_type in entities:
                    if entity_type == 'Gene':
                        self.make_pharmgkb_gene_node(
                            fh=node,
                            this_id=entity_id,
                            name=entity_name,
                            biolink_type=self.gene_node_type)
                    elif entity_type == 'Chemical':
                        self.make_pharmgkb_chemical_node(
                            fh=node,
                            chem_id=entity_id,
                            name=entity_name,
                            biolink_type=self.drug_node_type)
                    else:
                        raise PharmKGBInvalidNodeType(
                            "Node type isn't gene or chemical!")

                #
                # Make edge
                #
                self.make_pharmgkb_edge(fh=edge, line_data=line_data)
def test_parse_drug_central_line(self, key, value):
    """Check that the first data line parses so that *key* maps to *value*.

    Reads the header line and then the following line from ``self.dti_fh``,
    parses that line with ``parse_drug_central_line`` and verifies the
    parsed result contains ``key`` with the expected ``value``.
    """
    header = parse_header(self.dti_fh.readline())
    line = self.dti_fh.readline()
    parsed = parse_drug_central_line(line, header)
    # assertIn names the missing key and the container on failure, whereas
    # assertTrue(key in parsed) only reports "False is not true".
    self.assertIn(key, parsed)
    self.assertEqual(value, parsed[key])
def run(self,
        data_file: Optional[str] = None,
        species: str = "Homo sapiens") -> None:
    """Transform the Drug Central interaction dump into node/edge TSVs.

    Reads the gzipped interactions file from ``self.input_base_dir`` and
    writes drug nodes, protein nodes and drug-protein edges to
    ``self.output_node_file`` and ``self.output_edge_file``. Each distinct
    drug or protein id is written as a node only once; an edge is written
    for every drug/protein pair on every ingested row.

    Rows are skipped when they have no ``ORGANISM`` value equal to
    ``species``, when ``items_dict_to_protein_data_dict`` raises
    ``ItemInDictNotFound`` (no target information), or when it raises
    ``ValueError`` (an error is logged).

    Args:
        data_file: Name of the gzipped interactions file inside
            ``self.input_base_dir``. Defaults to
            ``"drug.target.interaction.tsv.gz"``.
        species: Only rows whose ``ORGANISM`` field equals this string are
            ingested.
    """
    if data_file is None:
        data_file = "drug.target.interaction.tsv.gz"
    interactions_file = os.path.join(self.input_base_dir, data_file)
    os.makedirs(self.output_dir, exist_ok=True)

    drug_node_type = "biolink:Drug"
    uniprot_curie_prefix = "UniProtKB:"
    drug_curie_prefix = "DrugCentral:"
    protein_node_type = "biolink:Protein"
    drug_protein_edge_label = "biolink:molecularly_interacts_with"
    drug_protein_edge_relation = "RO:0002436"  # molecularly interacts with

    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'comment', 'type'
    ]

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(interactions_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())

        # Ids already written as nodes; only membership matters.
        seen_proteins: set = set()
        seen_drugs: set = set()

        for line in interactions:
            items_dict = parse_drug_central_line(line, header_items)

            if items_dict.get('ORGANISM') != species \
                    or 'ORGANISM' not in items_dict:
                continue

            # get protein ID
            try:
                protein_dict = items_dict_to_protein_data_dict(items_dict)
            except ItemInDictNotFound:
                # lines with no ACCESSION entry only contain drug info, no
                # target info - not ingesting these
                continue
            except ValueError:
                logging.error("Value error while parsing line")
                continue

            # get drug ID
            drug_id = drug_curie_prefix + get_item_by_priority(
                items_dict, ['STRUCT_ID'])

            # Write drug node
            if drug_id not in seen_drugs:
                write_node_edge_item(
                    fh=node,
                    header=self.node_header,
                    data=[
                        drug_id,
                        items_dict['DRUG_NAME'],
                        drug_node_type,
                        '',  # TDL (not applicable for drugs)
                        self.source_name
                    ])
                seen_drugs.add(drug_id)

            for uniprot_id, name, tdl in protein_dict.values():
                protein_id = uniprot_curie_prefix + uniprot_id

                if protein_id not in seen_proteins:
                    write_node_edge_item(
                        fh=node,
                        header=self.node_header,
                        data=[
                            protein_id, name, protein_node_type, tdl,
                            self.source_name
                        ])
                    seen_proteins.add(protein_id)

                # WRITE EDGES
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        drug_id, drug_protein_edge_label, protein_id,
                        drug_protein_edge_relation, self.source_name,
                        items_dict['ACT_COMMENT'], 'biolink:Association'
                    ])
def run(self,
        data_file: Optional[str] = None,
        species: str = "Homo sapiens") -> None:
    """Transform the Drug Central interaction dump into node/edge TSVs.

    Reads ``drug.target.interaction.tsv.gz`` from ``self.input_base_dir``,
    loads the two lookup tables extracted from ``tcrd.zip``, and writes a
    drug node, one gene node per ``|``-separated ``ACCESSION`` entry, and
    one drug-gene edge per gene for every ingested row.

    Rows are skipped when they have no ``ORGANISM`` value equal to
    ``species`` or when ``ACCESSION`` cannot be found.

    The directory ``tcrd.zip`` is extracted into is temporary and is removed
    as soon as both lookup tables have been loaded.

    Args:
        data_file: Accepted for interface compatibility; not used here.
        species: Only rows whose ``ORGANISM`` field equals this string are
            ingested.
    """
    interactions_file = os.path.join(self.input_base_dir,
                                     "drug.target.interaction.tsv.gz")
    tclin_chem_zip_file = os.path.join(self.input_base_dir, "tcrd.zip")
    os.makedirs(self.output_dir, exist_ok=True)

    drug_node_type = "biolink:Drug"
    gene_curie_prefix = "UniProtKB:"
    drug_curie_prefix = "DrugCentral:"
    gene_node_type = "biolink:Gene"
    drug_gene_edge_label = "biolink:interacts_with"
    drug_gene_edge_relation = "RO:0002436"  # molecularly interacts with

    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'provided_by',
        'comment'
    ]

    # unzip tcrd.zip and get tchem and tclin filenames; the extracted files
    # are only needed until both dicts are built, so the scratch directory
    # is cleaned up right after.
    with tempfile.TemporaryDirectory() as tempdir:
        (tclin_file, tchem_file) = unzip_and_get_tclin_tchem(
            tclin_chem_zip_file, tempdir)
        tclin_dict: dict = tsv_to_dict(tclin_file, 'uniprot')
        tchem_dict: dict = tsv_to_dict(tchem_file, 'uniprot')

    with open(self.output_node_file, 'w') as node, \
            open(self.output_edge_file, 'w') as edge, \
            gzip.open(interactions_file, 'rt') as interactions:

        node.write("\t".join(self.node_header) + "\n")
        edge.write("\t".join(self.edge_header) + "\n")

        header_items = parse_header(interactions.readline())

        for line in interactions:
            items_dict = parse_drug_central_line(line, header_items)

            if 'ORGANISM' not in items_dict \
                    or items_dict['ORGANISM'] != species:
                continue

            # get gene ID
            try:
                gene_id_string = get_item_by_priority(
                    items_dict, ['ACCESSION'])
                gene_ids = gene_id_string.split('|')
            except ItemInDictNotFound:
                # lines with no ACCESSION entry only contain drug info, no
                # target info - not ingesting these
                continue

            # get drug ID
            drug_id = drug_curie_prefix + get_item_by_priority(
                items_dict, ['STRUCT_ID'])

            # WRITE NODES
            # drug - ['id', 'name', 'category']
            write_node_edge_item(
                fh=node,
                header=self.node_header,
                data=[
                    drug_id, items_dict['DRUG_NAME'], drug_node_type,
                    str(False),
                    str(False)
                ])

            # NOTE(review): both flags are looked up with the FIRST
            # accession of the row and then applied to every gene of that
            # row. This is kept as-is, but it looks like it should be a
            # per-accession lookup - confirm against the TCRD files.
            is_tclin = gene_ids[0] in tclin_dict
            is_tchem = gene_ids[0] in tchem_dict

            for accession in gene_ids:
                gene_id = gene_curie_prefix + accession

                write_node_edge_item(
                    fh=node,
                    header=self.node_header,
                    data=[
                        gene_id, items_dict['GENE'], gene_node_type,
                        str(is_tclin),
                        str(is_tchem)
                    ])

                # WRITE EDGES
                # ['subject', 'edge_label', 'object', 'relation',
                #  'provided_by', 'comment']
                write_node_edge_item(
                    fh=edge,
                    header=self.edge_header,
                    data=[
                        drug_id, drug_gene_edge_label, gene_id,
                        drug_gene_edge_relation, self.source_name,
                        items_dict['ACT_COMMENT']
                    ])
def run(self) -> None:
    """Convert the Drug Central interactions file into node and edge TSVs.

    Reads ``drug.target.interaction.tsv.gz`` from ``self.input_base_dir``
    and, for each row that has an ``ACCESSION`` entry, writes a drug node
    and a gene node to ``self.output_node_file`` and a drug-gene edge to
    ``self.output_edge_file``. Rows without ``ACCESSION`` are logged at
    info level and skipped. ``self.edge_header`` is overwritten with the
    five-column header used here.
    """
    source_path = os.path.join(self.input_base_dir,
                               "drug.target.interaction.tsv.gz")
    os.makedirs(self.output_dir, exist_ok=True)

    drug_category = "biolink:Drug"
    gene_category = "biolink:Gene"
    edge_label = "biolink:interacts_with"
    edge_relation = "RO:0002436"  # molecularly interacts with
    drug_id_fields = ['ACT_SOURCE_URL', 'MOA_SOURCE_URL', 'DRUG_NAME']

    self.edge_header = [
        'subject', 'edge_label', 'object', 'relation', 'comment'
    ]

    with open(self.output_node_file, 'w') as node_fh, \
            open(self.output_edge_file, 'w') as edge_fh, \
            gzip.open(source_path, 'rt') as dti_fh:

        node_fh.write("\t".join(self.node_header) + "\n")
        edge_fh.write("\t".join(self.edge_header) + "\n")

        columns = parse_header(dti_fh.readline())

        for raw_line in dti_fh:
            record = parse_drug_central_line(raw_line, columns)

            # Rows lacking ACCESSION carry drug info only (no target), so
            # they are reported and not ingested.
            try:
                gene_id = get_item_by_priority(record, ['ACCESSION'])
            except ItemInDictNotFound:
                logging.info(
                    "No gene information for this line:\n{}\nskipping".
                    format(raw_line))
                continue

            # The drug id is the first available of the candidate fields.
            drug_id = get_item_by_priority(record, drug_id_fields)

            # Nodes use the columns ['id', 'name', 'category'].
            drug_row = [drug_id, record['DRUG_NAME'], drug_category]
            write_node_edge_item(fh=node_fh,
                                 header=self.node_header,
                                 data=drug_row)

            gene_row = [gene_id, record['GENE'], gene_category]
            write_node_edge_item(fh=node_fh,
                                 header=self.node_header,
                                 data=gene_row)

            # Edges use ['subject', 'edge_label', 'object', 'relation',
            # 'comment'].
            edge_row = [
                drug_id, edge_label, gene_id, edge_relation,
                record['ACT_COMMENT']
            ]
            write_node_edge_item(fh=edge_fh,
                                 header=self.edge_header,
                                 data=edge_row)