def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a ``{'nodes': [...], 'edges': [...]}`` dict from an Ensembl gene JSON file.

    The file is read with ``kg2_util.load_json``.  The loaded object must
    have a ``'genebuild'`` string containing at least one ``'/'`` (the text
    after the first slash is used as the update date) and a ``'genes'``
    sequence of per-gene dicts.

    For every gene this function emits:

    * one node, built by this module's ``make_node`` helper from the gene's
      ``'id'``, optional ``'description'``, optional ``'name'`` and the
      de-duplicated ``'primary_id'`` values of its ``'xrefs'`` (excluding
      the gene's own ID);
    * one in-taxon edge from the gene to its NCBI taxon CURIE;
    * one ``owl:sameAs`` edge per entry of the gene's optional ``'HGNC'``
      list.

    A node describing the Ensembl source itself (``'Ensembl Genes'``) is
    placed first in the node list.

    Args:
        input_file_name: path handed to ``kg2_util.load_json``.
        test_mode: when true, stop after the first 10000 genes.

    Returns:
        A dict with the keys ``'nodes'`` and ``'edges'``.

    Raises:
        AssertionError: if a gene's ``'taxon_id'`` is not ``9606``.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    node_list = []
    edge_list = []

    # 'genebuild' is split on '/', and the second field is the update date.
    update_date = ensembl_data['genebuild'].split('/')[1]

    # The source itself is represented as the first node of the graph.
    kb_curie_id = ENSEMBL_KB_CURIE_ID
    node_list.append(
        kg2_util.make_node(kb_curie_id,
                           ENSEMBL_KB_URI,
                           'Ensembl Genes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                           update_date,
                           kb_curie_id))

    for gene_number, gene in enumerate(ensembl_data['genes'], start=1):
        # Test mode caps the run at 10000 genes.
        if test_mode and gene_number > 10000:
            break

        gene_id = gene['id']
        description = gene.get('description')
        symbol = gene.get('name')

        # Synonyms are the distinct xref primary IDs other than the gene ID.
        xrefs = gene.get('xrefs')
        if xrefs is None:
            synonyms = []
        else:
            synonyms = list({xref['primary_id']
                             for xref in xrefs
                             if xref['primary_id'] != gene_id})

        gene_node = make_node(gene_id, description, symbol, update_date,
                              synonyms)
        node_list.append(gene_node)
        gene_curie_id = gene_node['id']

        taxon_id_int = gene.get('taxon_id')
        # NOTE(review): this check is an assert, so it is skipped entirely
        # when Python runs with -O.
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie_id = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(taxon_id_int)
        edge_list.append(
            kg2_util.make_edge_biolink(gene_curie_id,
                                       taxon_curie_id,
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       ENSEMBL_KB_CURIE_ID,
                                       update_date))

        hgnc_list = gene.get('HGNC')
        if hgnc_list is None:
            continue
        for hgnc_curie in hgnc_list:
            edge_list.append(
                kg2_util.make_edge(gene_curie_id,
                                   hgnc_curie,
                                   kg2_util.CURIE_ID_OWL_SAME_AS,
                                   kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                   ENSEMBL_KB_CURIE_ID,
                                   update_date))

    return {'nodes': node_list, 'edges': edge_list}
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert an Ensembl gene JSON file into a nodes/edges dict.

    The input is loaded through ``kg2_util.load_json`` and must provide a
    ``'genebuild'`` string with at least one ``'/'`` (the part after the
    first slash becomes the update date) plus a ``'genes'`` sequence of
    per-gene dicts.

    Each gene contributes one node (via this module's ``make_node`` helper),
    one ``'gene_found_in_organism'`` edge to its ``NCBITaxon:`` CURIE, and
    one ``'xref'`` edge for every entry of its optional ``'HGNC'`` list.

    Args:
        input_file_name: path handed to ``kg2_util.load_json``.
        test_mode: when true, stop after the first 10000 genes.

    Returns:
        A dict with the keys ``'nodes'`` and ``'edges'``.

    Raises:
        AssertionError: if a gene's ``'taxon_id'`` is not ``9606``.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    node_list = []
    edge_list = []

    # 'genebuild' is split on '/', and the second field is the update date.
    update_date = ensembl_data['genebuild'].split('/')[1]

    for gene_number, gene in enumerate(ensembl_data['genes'], start=1):
        # Test mode caps the run at 10000 genes.
        if test_mode and gene_number > 10000:
            break

        gene_id = gene['id']
        description = gene.get('description')
        symbol = gene.get('name')

        # Synonyms are the distinct xref primary IDs other than the gene ID.
        xrefs = gene.get('xrefs')
        if xrefs is None:
            synonyms = []
        else:
            synonyms = list({xref['primary_id']
                             for xref in xrefs
                             if xref['primary_id'] != gene_id})

        gene_node = make_node(gene_id, description, symbol, update_date,
                              synonyms)
        node_list.append(gene_node)
        gene_curie_id = gene_node['id']

        taxon_id_int = gene.get('taxon_id')
        # NOTE(review): this check is an assert, so it is skipped entirely
        # when Python runs with -O.
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie_id = 'NCBITaxon:' + str(taxon_id_int)
        edge_list.append(
            kg2_util.make_edge(gene_curie_id,
                               taxon_curie_id,
                               'gene_found_in_organism',
                               ENSEMBL_KB_IRI,
                               update_date))

        hgnc_list = gene.get('HGNC')
        if hgnc_list is None:
            continue
        for hgnc_curie in hgnc_list:
            edge_list.append(
                kg2_util.make_edge(gene_curie_id,
                                   hgnc_curie,
                                   'xref',
                                   ENSEMBL_KB_IRI,
                                   update_date))

    return {'nodes': node_list, 'edges': edge_list}
import argparse
import kg2_util


def make_arg_parser():
    """Return the command-line parser for this script.

    The parser accepts an optional ``--test`` flag and two positional
    arguments, ``inputFile`` and ``outputFile``.  Both positionals are
    declared with ``nargs=1``, so argparse stores each of them as a
    one-element list rather than a bare string.
    """
    arg_parser = argparse.ArgumentParser(
        description='sample_subgraph.py: sample a smaller subgraph of a KG in JSON format')
    arg_parser.add_argument('--test', dest='test', action="store_true", default=False)
    arg_parser.add_argument('inputFile', type=str, nargs=1)
    arg_parser.add_argument('outputFile', type=str, nargs=1)
    return arg_parser


if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    # nargs=1 yields one-element lists; unwrap them so that the JSON helpers
    # receive the file names as strings instead of lists.
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]

    graph = kg2_util.load_json(input_file_name)

    # Keep every fifth node (indices 0, 5, 10, ...).
    nodes = graph['nodes'][::5]
    nodes_id_set = {node['id'] for node in nodes}

    # Keep only the edges whose two endpoints both survived the sampling.
    edges = [
        edge for edge in graph['edges']
        if edge['subject'] in nodes_id_set and edge['object'] in nodes_id_set
    ]

    kg2_util.save_json({'nodes': nodes, 'edges': edges}, output_file_name)