def process_variant_to_gene_relationships(self, variant_nodes: list, writer: WriterDelegator):
    all_results = self.genetics_services.get_variant_to_gene(self.crawl_for_service, variant_nodes)
    for source_node_id, results in all_results.items():
        # convert the simple edges and nodes to rags objects and write them to the graph
        for (edge, node) in results:
            gene_node = KNode(node.id, type=node.type, name=node.name, properties=node.properties)
            if self.recreate_sv_node:
                variant_node = KNode(source_node_id, type=node_types.SEQUENCE_VARIANT)
                variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                writer.write_node(variant_node)
            if gene_node.id not in self.written_genes:
                writer.write_node(gene_node)
                self.written_genes.add(gene_node.id)
            predicate = LabeledID(identifier=edge.predicate_id, label=edge.predicate_label)
            gene_edge = KEdge(source_id=source_node_id,
                              target_id=gene_node.id,
                              provided_by=edge.provided_by,
                              ctime=edge.ctime,
                              original_predicate=predicate,
                              # standard_predicate=predicate,
                              input_id=edge.input_id,
                              properties=edge.properties)
            writer.write_edge(gene_edge)
        logger.info(f'added {len(results)} variant relationships for {source_node_id}')
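
# A minimal sketch of the result shape process_variant_to_gene_relationships expects from
# genetics_services.get_variant_to_gene: a dict keyed by source variant id whose values are
# (edge, node) pairs. The SimpleNode/SimpleEdge names and fields below are hypothetical
# stand-ins inferred from the attribute access above, not the actual genetics_services types.
from dataclasses import dataclass, field

@dataclass(frozen=True)
class SimpleNode:
    id: str
    type: str
    name: str
    properties: dict = field(default_factory=dict)

@dataclass(frozen=True)
class SimpleEdge:
    predicate_id: str
    predicate_label: str
    provided_by: str
    ctime: float
    input_id: str
    properties: dict = field(default_factory=dict)

# all_results would then look like:
# {'CAID:CA123': [(SimpleEdge(...), SimpleNode(id='ENSEMBL:ENSG00000139618', ...)), ...]}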
def parse_nodes(self, limit=0):
    """
    Parse nodes.
    :param limit: for testing, reads only the first n nodes from the file
    :return: generator yielding (index, KNode) tuples
    """
    print('parsing nodes...')
    limit_counter = 0
    with open(os.path.join(self.cord_dir, 'nodes.txt')) as nodes_file:
        reader = csv.DictReader(nodes_file, delimiter='\t')
        for raw_node in reader:
            # transform headers to KNode attributes
            labels = raw_node.get('semantic_type')
            labels = labels.replace(']', '').replace('[', '').replace('\\', '').replace("'", '')
            labels = labels.split(',')
            node = KNode({
                'id': raw_node.get('normalized_curie'),
                'type': labels[0],
                'name': raw_node.get('name'),
                'properties': {
                    'input_term': raw_node.get('input_term')
                }
            })
            node.add_export_labels(labels)
            limit_counter += 1
            if limit and limit_counter > limit:
                break
            yield limit_counter - 1, node
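
# The semantic_type column arrives as a stringified list (inferred from the cleanup above),
# e.g. "['chemical_substance', 'named_thing']". A runnable mini-check of that cleanup logic:
raw = "['chemical_substance', 'named_thing']"
labels = raw.replace(']', '').replace('[', '').replace('\\', '').replace("'", '').split(',')
assert labels == ['chemical_substance', ' named_thing']  # note: whitespace survives the split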
def start_build(self) -> list:
    # Entry point
    variant_list = self.get_all_variants_and_synonymns()
    if not variant_list:
        logger.info('No sequence variant nodes found in the graph.')
    variant_subset = []
    with self.writerDelegator as writer:
        # for each variant
        for var in variant_list:
            # check that we have all the data elements we need:
            # element [0] is the ID, element [1] is the synonym list
            if len(var) == 2:
                # create a variant node
                variant_curie = var[0]
                # get the synonym data from the graph DB call
                variant_syn_set = set(var[1])
                variant_node = KNode(variant_curie, type=node_types.SEQUENCE_VARIANT)
                variant_node.add_synonyms(variant_syn_set)
                variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                variant_subset.append(variant_node)
                # process in batches of 1000
                if len(variant_subset) == 1000:
                    self.process_variant_to_gene_relationships(variant_nodes=variant_subset, writer=writer)
                    variant_subset = []
        if variant_subset:
            # process the leftovers from the final partial batch
            self.process_variant_to_gene_relationships(variant_nodes=variant_subset, writer=writer)
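
# get_all_variants_and_synonymns is not shown here; a minimal sketch of what it might return,
# inferred from the usage above (each entry is [variant_id, synonym_list]). The Cypher and the
# 'equivalent_identifiers' property below are hypothetical illustrations, not the actual query.
def get_all_variants_and_synonymns_sketch(session):
    query = (
        'MATCH (v:sequence_variant) '
        'RETURN v.id AS id, v.equivalent_identifiers AS synonyms'
    )
    return [[record['id'], record['synonyms'] or []] for record in session.run(query)]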
def parse_dict_to_knode(nn_dict: dict) -> KNode:
    node = KNode(
        id=nn_dict.get('id', {}).get('identifier', ''),
        name=nn_dict.get('id', {}).get('label', ''),
        type=nn_dict.get('type', ['named_thing'])[0],
    )
    node.add_synonyms(set(map(lambda x: LabeledID(**x), nn_dict.get('equivalent_identifiers', []))))
    node.add_export_labels(nn_dict.get('type', ['named_thing']))
    return node
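
# An example of the input shape parse_dict_to_knode handles, matching the response format of
# the SRI node normalizer used elsewhere in this codebase. The identifiers are illustrative.
example = {
    'id': {'identifier': 'CHEBI:15365', 'label': 'acetylsalicylic acid'},
    'type': ['chemical_substance', 'named_thing'],
    'equivalent_identifiers': [
        {'identifier': 'CHEBI:15365', 'label': 'acetylsalicylic acid'},
        {'identifier': 'DRUGBANK:DB00945', 'label': 'Aspirin'},
    ],
}
node = parse_dict_to_knode(example)
# node.id == 'CHEBI:15365'; node.type == 'chemical_substance';
# node.synonyms holds a LabeledID for each equivalent identifier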
def test_write_node_with_export_labels():
    # assert that a node will be queued under its export types
    node = KNode('CURIE:1', type=node_types.NAMED_THING)
    all_types = [node_types.CHEMICAL_SUBSTANCE, node_types.NAMED_THING]
    node.add_export_labels(all_types)
    bf = BufferedWriter(rosetta_mock)
    bf.write_node(node)
    assert node.id in bf.written_nodes
    key = node.export_labels
    assert key in bf.node_queues
    queue = bf.node_queues[key]
    assert node.id in queue
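
# rosetta_mock is a fixture defined elsewhere; a minimal sketch of one way to stand it up with
# unittest.mock, assuming BufferedWriter only touches rosetta attributes lazily. This is an
# illustration, not the project's actual fixture.
from unittest.mock import MagicMock

rosetta_mock = MagicMock(name='rosetta')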
def get_nodes_from_file(self, file_name, delimiter: str):
    if not file_name:
        return
    with open(file_name) as nodes_file:
        reader = csv.DictReader(nodes_file, delimiter=delimiter)
        for raw_node in reader:
            labels = list(filter(lambda x: x, raw_node['category'].split('|')))
            if not labels:
                labels = ['named_thing']
            node_id = raw_node['id']
            name = raw_node['name']
            node = KNode(node_id, type=labels[0], name=name)
            node.add_export_labels(labels)
            yield node
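
# A usage sketch, assuming a KGX-style tab-delimited file with 'id', 'name' and a
# pipe-delimited 'category' column as read above:
#
#   id	name	category
#   CHEBI:15365	aspirin	chemical_substance|named_thing
#
# for node in self.get_nodes_from_file('nodes.tsv', delimiter='\t'):
#     writer.write_node(node)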
def parse_drug_bank_items(self):
    import requests
    drug_bank_parsed_tsv = 'https://raw.githubusercontent.com/TranslatorIIPrototypes/CovidDrugBank/master/trials.txt'
    items = []
    tsv_file = requests.get(drug_bank_parsed_tsv).text.split('\n')
    reader = csv.DictReader(tsv_file, delimiter="\t")
    for row in reader:
        items.append(row)
    drug_ids = '&'.join([f"curie={item['source']}" for item in items])
    normalize_url = f"https://nodenormalization-sri.renci.org/get_normalized_nodes?{drug_ids}"
    response = requests.get(normalize_url).json()
    nodes = []
    export_labels_fallback = requests.get(
        'https://bl-lookup-sri.renci.org/bl/chemical_substance/ancestors?version=latest').json()
    export_labels_fallback.append('chemical_substance')
    for drug_id in response:
        if response[drug_id] is None:
            node = KNode(drug_id, type='chemical_substance')
            node.add_export_labels(export_labels_fallback)
        else:
            # use the synonymized id so edges are merged
            preferred_curie = response[drug_id]['id']['identifier']
            node = KNode(preferred_curie, type='chemical_substance')
        nodes.append(node)
        self.writer.write_node(node)
    self.writer.flush()
    # manually write in_clinical_trial_for edges
    query = lambda source_id, target_id, count: f"""
        MATCH (a:chemical_substance{{id: '{source_id}'}}), (b:disease{{id: '{target_id}'}})
        MERGE (a)-[e:in_clinical_trial_for{{id: apoc.util.md5([a.id, b.id, 'ROBOKOVID:in_clinical_trial_for']), predicate_id: 'ROBOKOVID:in_clinical_trial_for'}}]->(b)
        SET e.edge_source = "https://www.drugbank.ca/covid-19"
        SET e.relation_label = "in_clinical_trial_for"
        SET e.source_database = "drugbank"
        SET e.predicate_id = "ROBOKOVID:in_clinical_trial_for"
        SET e.relation = "in_clinical_trial_for"
        SET e.count = {count}
    """
    with self.rosetta.type_graph.driver.session() as session:
        for source_node, row in zip(nodes, items):
            q = query(source_node.id, row['object'], row['count'])  # assuming MONDO:0100096 is in there
            session.run(q)
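
# The single GET above puts every curie into one query string, which can exceed URL length
# limits for large trial lists. A minimal batching sketch against the same
# get_normalized_nodes endpoint (the chunk size is an arbitrary assumption):
import requests

def normalize_in_chunks(curies, chunk_size=100):
    merged = {}
    for i in range(0, len(curies), chunk_size):
        chunk = curies[i:i + chunk_size]
        query = '&'.join(f'curie={c}' for c in chunk)
        url = f'https://nodenormalization-sri.renci.org/get_normalized_nodes?{query}'
        merged.update(requests.get(url).json())
    return merged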
def parse_gwas_file(self, gwas_catalog):
    try:
        # get the column headers
        file_headers = gwas_catalog[0].split('\t')
        pub_med_index = file_headers.index('PUBMEDID')
        p_value_index = file_headers.index('P-VALUE')
        snps_index = file_headers.index('SNPS')
        trait_ids_index = file_headers.index('MAPPED_TRAIT_URI')
    except (IndexError, ValueError) as e:
        logger.error(f'GWAS Catalog failed to prepopulate_cache ({e})')
        return []
    corrupted_lines = 0
    missing_variant_ids = 0
    missing_phenotype_ids = 0
    variant_to_pheno_cache = defaultdict(set)
    progress_counter = 0
    total_lines = len(gwas_catalog)
    trait_uri_pattern = re.compile(r'[^,\s]+')
    snp_pattern = re.compile(r'[^,;x*\s]+')
    for line in gwas_catalog[1:]:
        line = line.split('\t')
        try:
            # get the pubmed id
            pubmed_id = line[pub_med_index]
            # get the p-value; clamp zero to the smallest representable float
            p_value = float(line[p_value_index])
            if p_value == 0:
                p_value = sys.float_info.min
            # get all traits (possible phenotypes)
            trait_uris = trait_uri_pattern.findall(line[trait_ids_index])
            # find all sequence variants
            snps = snp_pattern.findall(line[snps_index])
        except (IndexError, ValueError) as e:
            corrupted_lines += 1
            logger.warning(f'GWASCatalog corrupted line: {e}')
            continue
        if not (trait_uris and snps):
            corrupted_lines += 1
            logger.warning(f'GWASCatalog corrupted line: {line}')
            continue
        traits = []
        for trait_uri in trait_uris:
            try:
                trait_id = trait_uri.rsplit('/', 1)[1]
                # ids show up like EFO_123, Orphanet_123, HP_123
                if trait_id.startswith('EFO'):
                    curie_trait_id = f'EFO:{trait_id[4:]}'
                elif trait_id.startswith('Orp'):
                    curie_trait_id = f'ORPHANET:{trait_id[9:]}'
                elif trait_id.startswith('HP'):
                    curie_trait_id = f'HP:{trait_id[3:]}'
                elif trait_id.startswith('NCIT'):
                    curie_trait_id = f'NCIT:{trait_id[5:]}'
                elif trait_id.startswith('MONDO'):
                    curie_trait_id = f'MONDO:{trait_id[6:]}'
                elif trait_id.startswith('GO'):
                    # biological process or activity; 5k+ of these
                    missing_phenotype_ids += 1
                    continue
                else:
                    missing_phenotype_ids += 1
                    logger.warning(f'{trait_uri} not a recognized trait format')
                    continue
                traits.append(curie_trait_id)
            except IndexError:
                logger.warning(f'trait uri index error: ({trait_uri}) not splittable')
        variant_nodes = set()
        for snp in snps:
            if snp.startswith('rs'):
                dbsnp_curie = f'DBSNP:{snp}'
                variant_node = KNode(dbsnp_curie, type=node_types.SEQUENCE_VARIANT)
                # adding an export label ensures the node goes into the proper queue,
                # so we can do batch normalization in the writer
                variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                variant_nodes.add(variant_node)
            else:
                missing_variant_ids += 1
        if traits and variant_nodes:
            props = {'p_value': p_value}
            for variant_node in variant_nodes:
                self.writer.write_node(variant_node)
                for trait_id in traits:
                    variant_to_pheno_edge, phenotype_node = self.create_variant_to_phenotype_components(
                        variant_node, trait_id, None, pubmed_id=pubmed_id, properties=props)
                    self.writer.write_node(phenotype_node)
                    self.writer.write_edge(variant_to_pheno_edge)
        progress_counter += 1
        if progress_counter % 1000 == 0:
            percent_complete = (progress_counter / total_lines) * 100
            logger.info(f'GWASCatalog progress: {int(percent_complete)}%')
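
# A runnable check of the two regexes above on representative GWAS Catalog fields: SNPS
# entries may be separated by commas, semicolons, or haplotype markers ('x', '*'), and
# MAPPED_TRAIT_URI may hold a comma-separated list of URIs. The sample values are illustrative.
import re
snp_pattern = re.compile(r'[^,;x*\s]+')
trait_uri_pattern = re.compile(r'[^,\s]+')
assert snp_pattern.findall('rs6983267; rs1447295 x rs4242382') == ['rs6983267', 'rs1447295', 'rs4242382']
assert trait_uri_pattern.findall(
    'http://www.ebi.ac.uk/efo/EFO_0001663, http://purl.obolibrary.org/obo/MONDO_0008315'
) == ['http://www.ebi.ac.uk/efo/EFO_0001663', 'http://purl.obolibrary.org/obo/MONDO_0008315']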
def create_gtex_graph(self, data_directory: str, file_name: str, namespace: str, is_sqtl: bool = False) -> object:
    # init the return value
    ret_val: object = None
    # init a progress counter
    line_counter = 0
    try:
        # get the full path to the input file
        full_file_path = f'{data_directory}{file_name}'
        logger.info(f'Creating GTEx graph data elements from file: {full_file_path}')
        # walk through the GTEx data file and create/write nodes and edges to the graph
        with WriterDelegator(self.rosetta) as graph_writer:
            # init these outside of the try/except block
            curie_hgvs = None
            curie_uberon = None
            curie_ensembl = None
            # open the file and start reading
            with open(full_file_path, 'r') as inFH:
                # open up a csv reader
                csv_reader = csv.reader(inFH)
                # read the header
                header_line = next(csv_reader)
                # find the relevant column indices
                tissue_name_index = header_line.index('tissue_name')
                tissue_uberon_index = header_line.index('tissue_uberon')
                hgvs_index = header_line.index('HGVS')
                ensembl_id_index = header_line.index('gene_id')
                pval_nominal_index = header_line.index('pval_nominal')
                pval_slope_index = header_line.index('slope')
                try:
                    # for the rest of the lines in the file
                    for line in csv_reader:
                        # increment the counter
                        line_counter += 1
                        # get the data elements
                        tissue_name = line[tissue_name_index]
                        uberon = line[tissue_uberon_index]
                        hgvs = line[hgvs_index]
                        ensembl = line[ensembl_id_index].split(".", 1)[0]
                        pval_nominal = line[pval_nominal_index]
                        slope = line[pval_slope_index]
                        # create curies for the various id values
                        curie_hgvs = f'HGVS:{hgvs}'
                        curie_uberon = f'UBERON:{uberon}'
                        curie_ensembl = f'ENSEMBL:{ensembl}'
                        # create variant, gene and GTEx anatomy nodes with the HGVS, ENSEMBL or UBERON expression as the id and name
                        variant_node = KNode(curie_hgvs, name=hgvs, type=node_types.SEQUENCE_VARIANT)
                        variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                        gene_node = KNode(curie_ensembl, name=ensembl, type=node_types.GENE)
                        gene_node.add_export_labels([node_types.GENE])
                        gtex_node = KNode(curie_uberon, name=tissue_name, type=node_types.ANATOMICAL_ENTITY)
                        if is_sqtl:
                            # sqtl variant to gene always uses the same predicate
                            predicate = self.variant_gene_sqtl_predicate
                        else:
                            # for eqtl, use the polarity of the slope to get the direction of expression:
                            # a positive value increases expression, a negative one decreases it
                            try:
                                if float(slope) > 0.0:
                                    predicate = self.increases_expression_predicate
                                else:
                                    predicate = self.decreases_expression_predicate
                            except ValueError as e:
                                logger.error(f'Error casting slope to a float on line {line_counter} (slope - {slope}) {e}')
                                continue
                        # get an MD5 hash int of the composite hyper edge ID
                        hyper_edge_id = self.gtu.get_hyper_edge_id(uberon, ensembl, hgvs)
                        # set the properties for the edge
                        edge_properties = [ensembl, pval_nominal, slope, namespace]
                        ##########################
                        # the data details are ready; write all nodes and edges to the graph DB
                        ##########################
                        # write out the sequence variant node
                        graph_writer.write_node(variant_node)
                        # write out the gene node
                        if gene_node.id not in self.written_genes:
                            graph_writer.write_node(gene_node)
                            self.written_genes.add(gene_node.id)
                        # write out the anatomical GTEx node
                        if gtex_node.id not in self.written_anatomical_entities:
                            graph_writer.write_node(gtex_node)
                            self.written_anatomical_entities.add(gtex_node.id)
                        # associate the sequence variant node with an edge to the GTEx anatomy node
                        self.gtu.write_new_association(graph_writer, variant_node, gtex_node, self.variant_anatomy_predicate, hyper_edge_id, None, True)
                        # associate the gene node with an edge to the GTEx anatomy node
                        self.gtu.write_new_association(graph_writer, gene_node, gtex_node, self.gene_anatomy_predicate, 0, None, False)
                        # associate the sequence variant node with an edge to the gene node, including the GTEx properties
                        self.gtu.write_new_association(graph_writer, variant_node, gene_node, predicate, hyper_edge_id, edge_properties, True)
                        # output some feedback for the user
                        if (line_counter % 250000) == 0:
                            logger.info(f'Processed {line_counter} variants.')
                        # reset the written-node sets to avoid unbounded memory growth
                        if len(self.written_anatomical_entities) == self.max_nodes:
                            self.written_anatomical_entities = set()
                        if len(self.written_genes) == self.max_nodes:
                            self.written_genes = set()
                except (KeyError, IndexError) as e:
                    logger.error(f'Exception caught trying to process variant: {curie_hgvs}-{curie_uberon}-{curie_ensembl} at data line: {line_counter}. Exception: {e}, Line: {line}')
    except Exception as e:
        logger.error(f'Exception caught: Exception: {e}')
        ret_val = e
    # output some final feedback for the user
    logger.info(f'Building complete. Processed {line_counter} variants.')
    # return to the caller
    return ret_val
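
# get_hyper_edge_id is defined in the GTEx utilities and not shown here; a minimal sketch of
# an MD5-based composite id consistent with the comment above ("MD5 hash int of the composite
# hyper edge ID"). The field order and separator are assumptions.
import hashlib

def hyper_edge_id_sketch(uberon: str, ensembl: str, hgvs: str) -> int:
    composite = f'{uberon}_{ensembl}_{hgvs}'
    return int(hashlib.md5(composite.encode('utf-8')).hexdigest(), 16)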
def sequence_variant_to_sequence_variant(self, variant_node):
    ld_url = '/ld/human/'
    options_url = '?r2=0.8'
    population = '1000GENOMES:phase_3:MXL'
    return_results = []
    # with self.redis.pipeline() as redis_pipe:
    dbsnp_curie_ids = variant_node.get_synonyms_by_prefix('DBSNP')
    for dbsnp_curie in dbsnp_curie_ids:
        variant_id = Text.un_curie(dbsnp_curie)
        query_url = f'{self.url}{ld_url}{variant_id}/{population}{options_url}'
        query_response = requests.get(query_url, headers={"Content-Type": "application/json"})
        if query_response.status_code == 200:
            query_json = query_response.json()
            variant_results = self.parse_ld_variants_from_ensembl(query_json)
            for variant_info in variant_results:
                new_variant_id = variant_info[0]
                r_squared = variant_info[1]
                props = {'r2': r_squared}
                new_variant_curie = f'DBSNP:{new_variant_id}'
                new_variant_node = KNode(new_variant_curie, type=node_types.SEQUENCE_VARIANT)
                new_variant_node.add_export_labels([node_types.SEQUENCE_VARIANT])
                edge = self.create_edge(variant_node,
                                        new_variant_node,
                                        'ensembl.sequence_variant_to_sequence_variant',
                                        dbsnp_curie,
                                        self.var_to_var_predicate,
                                        url=query_url,
                                        properties=props)
                return_results.append((edge, new_variant_node))
                # new_rsid_node = None
                # is_new_dbsnp = False
                # synonyms = self.cache.get(f'synonymize({new_variant_curie})')
                # if synonyms is None:
                #     new_rsid_node = KNode(new_variant_curie, name=f'{new_variant_id}', type=node_types.SEQUENCE_VARIANT)
                #     synonyms = self.clingen.get_synonyms_by_other_ids(new_rsid_node)
                #     redis_pipe.set(f'synonymize({new_variant_curie})', pickle.dumps(synonyms))
                #     is_new_dbsnp = True
                # caid_count = 0
                # caid_node = None
                # for synonym in synonyms:
                #     if Text.get_curie(synonym.identifier) == 'CAID':
                #         caid_count += 1
                #         caid_node = KNode(synonym.identifier, name=f'{synonym.label}', type=node_types.SEQUENCE_VARIANT)
                #         edge = self.create_edge(variant_node, caid_node, 'ensembl.sequence_variant_to_sequence_variant', dbsnp_curie, self.var_to_var_predicate, url=query_url, properties=props)
                #         return_results.append((edge, caid_node))
                #         found_caid = True
                # if caid_count > 2 we can't cache it easily right now, so we skip it and let the synonymizer do it later
                # if caid_count == 1 and is_new_dbsnp:
                #     # assume we didn't cache the CAID yet if the dbsnp is new and do it if needed
                #     if self.cache.get(f'synonymize({caid_node.id})') is None:
                #         redis_pipe.set(f'synonymize({caid_node.id})', pickle.dumps(synonyms))
                # elif caid_count == 0:
                #     if not new_rsid_node:
                #         new_rsid_node = KNode(new_variant_curie, name=f'{new_variant_id}', type=node_types.SEQUENCE_VARIANT)
                #     edge = self.create_edge(variant_node, new_rsid_node, 'ensembl.sequence_variant_to_sequence_variant', dbsnp_curie, self.var_to_var_predicate, url=query_url, properties=props)
                #     return_results.append((edge, new_rsid_node))
        # elif query_response.status_code == 429:
        #     handle the rate limiting by waiting and retrying
        else:
            logger.error(f'Ensembl returned a non-200 response for {variant_node.id}: {query_response.status_code}')
    # redis_pipe.execute()
    return return_results
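
# parse_ld_variants_from_ensembl is not shown here; a minimal sketch inferred from its use
# above, assuming the Ensembl REST LD endpoint returns a JSON array of objects with
# 'variation1', 'variation2', and 'r2' fields. Treat those field names as an assumption to
# verify against the live API.
def parse_ld_variants_from_ensembl_sketch(query_json: list) -> list:
    results = []
    for entry in query_json:
        # each result pairs the queried variant with a linked variant and its r-squared value
        results.append((entry['variation2'], entry['r2']))
    return results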