Esempio n. 1
0
def make_kg2(curies_to_categories: dict,
             uri_to_curie_shortener: callable,
             map_category_label_to_iri: callable,
             owl_urls_and_files: tuple,
             output_file_name: str,
             test_mode: bool = False):
    """Build the KG2 graph from a set of OWL ontology sources and save it as JSON.

    :param curies_to_categories: mapping used to assign a semantic category to
        each node CURIE
    :param uri_to_curie_shortener: callable that shortens a full URI to a CURIE
    :param map_category_label_to_iri: callable mapping a category label to an IRI
    :param owl_urls_and_files: per-ontology info dicts with at least the keys
        'url', 'file', 'title' and 'download'
    :param output_file_name: path of the JSON file to write
    :param test_mode: passed through to kg2_util.save_json
    """
    owl_file_information_dict_list = []

    # for each OWL file (or URL for an OWL file) described in the YAML config file...
    for ont_source_info_dict in owl_urls_and_files:
        if ont_source_info_dict['download']:
            # get the OWL file onto the local file system and get a full path to it
            local_file_name = kg2_util.download_file_if_not_exist_locally(
                ont_source_info_dict['url'], ont_source_info_dict['file'])
        else:
            local_file_name = ont_source_info_dict['file']
            assert os.path.exists(ont_source_info_dict['file'])
        # load the OWL file data into an ontobio.ontol.Ontology data structure
        # plus a metadata information dictionary
        [ont, metadata_dict] = load_owl_file_return_ontology_and_metadata(
            local_file_name, ont_source_info_dict['url'],
            ont_source_info_dict['title'])
        metadata_dict['ontology'] = ont
        owl_file_information_dict_list.append(metadata_dict)

    kg2_util.log_message('Calling make_nodes_dict_from_ontologies_list')

    nodes_dict = make_nodes_dict_from_ontologies_list(
        owl_file_information_dict_list, curies_to_categories,
        uri_to_curie_shortener, map_category_label_to_iri)

    kg2_util.log_message('Calling make_map_of_node_ontology_ids_to_curie_ids')

    map_of_node_ontology_ids_to_curie_ids = make_map_of_node_ontology_ids_to_curie_ids(
        nodes_dict)

    kg2_util.log_message('Calling get_rels_dict')

    # get a dictionary of all relationships including xrefs as relationships
    all_rels_dict = get_rels_dict(nodes_dict, owl_file_information_dict_list,
                                  uri_to_curie_shortener,
                                  map_of_node_ontology_ids_to_curie_ids)

    kg2_dict = dict()
    # plain list() instead of a pass-through comprehension
    kg2_dict['edges'] = list(all_rels_dict.values())
    kg2_util.log_message('Number of edges: ' + str(len(kg2_dict['edges'])))
    kg2_dict['nodes'] = list(nodes_dict.values())
    kg2_util.log_message('Number of nodes: ' + str(len(kg2_dict['nodes'])))
    # free the node index before serialization; the graph itself is large
    del nodes_dict

    # delete xrefs and internal bookkeeping keys from each node record
    for node_dict in kg2_dict['nodes']:
        del node_dict['xrefs']
        del node_dict['ontology node ids']

    kg2_util.log_message('Saving JSON file')
    kg2_util.save_json(kg2_dict, output_file_name, test_mode)

if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    mysql_config_file = args.mysqlConfigFile
    mysql_db_name = args.mysqlDBName
    test_mode = args.test
    # Connect to the local SemMedDB MySQL mirror; credentials come from the
    # standard MySQL option file.
    connection = pymysql.connect(read_default_file=mysql_config_file,
                                 db=mysql_db_name)
    # NOTE(review): removed an unused local (`preds_dict = dict()`) that was
    # never read or written after creation.
    # NOTE(review): column name CURR_TIMESTAMP (not CURRENT_TIMESTAMP) is
    # assumed to be a real column of one of the joined tables — confirm schema.
    sql_statement = (
        "SELECT PMID, SUBJECT_CUI, PREDICATE, OBJECT_CUI, DP, SENTENCE, SUBJECT_SCORE, "
        "OBJECT_SCORE, DATE_FORMAT(CURR_TIMESTAMP, '%Y-%m-%d %H:%i:%S') FROM ((PREDICATION NATURAL JOIN CITATIONS) "
        "NATURAL JOIN SENTENCE) NATURAL JOIN PREDICATION_AUX")
    if test_mode:
        # keep test runs fast by bounding the result set
        sql_statement += " LIMIT 10000"
    # 'data_dictionary' names the SELECT columns, in order, for each row tuple
    results = {
        'data_dictionary': [
            'pmid', 'subject_cui_str', 'predicate', 'object_cui_str',
            'pub_date', 'sentence', 'subject_score', 'object_score',
            'curr_timestamp'
        ]
    }

    with connection.cursor() as cursor:
        cursor.execute(sql_statement)
        results['rows'] = cursor.fetchall()
    connection.close()
    output_file_name = args.outputFile
    kg2_util.save_json(results, output_file_name, test_mode)
        metabolite_count += 1

        # Cap processing at the first 10,000 metabolites; once past the cap,
        # stop iterating entirely.  NOTE(review): fragment — the enclosing
        # loop header is outside this view.
        if metabolite_count <= 10000:
            hmdb_id = metabolite["accession"]
            nodes.append(make_node(metabolite, hmdb_id))
            # each helper yields edges linking this metabolite to diseases,
            # proteins, equivalent identifiers and properties, respectively
            for edge in make_disease_edges(metabolite, hmdb_id):
                edges.append(edge)
            for edge in make_protein_edges(metabolite, hmdb_id):
                edges.append(edge)
            for edge in make_equivalencies(metabolite, hmdb_id):
                edges.append(edge)
            for edge in make_property_edges(metabolite, hmdb_id):
                edges.append(edge)
        else:
            break

    # use the input file's modification time as the knowledge-source date
    file_update_date = convert_date(os.path.getmtime(args.inputFile))
    # add a node representing the HMDB knowledge source itself
    hmdb_kp_node = kg2_util.make_node(HMDB_PROVIDED_BY_CURIE_ID, HMDB_KB_IRI,
                                      "Human Metabolome Database",
                                      kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                      file_update_date,
                                      HMDB_PROVIDED_BY_CURIE_ID)
    nodes.append(hmdb_kp_node)
    print("Saving JSON at", date())
    kg2_util.save_json({
        "nodes": nodes,
        "edges": edges
    }, args.outputFile, args.test)
    print("Finished saving JSON at", date())
    print("Script finished at", date())
    # NOTE(review): fragment — the enclosing function's header is outside
    # this view; these loops extend the accumulated `edges` list with
    # further edge batches pulled from the database connection.
    for edge in get_physical_entity_characteristics(connection, test):
        edges.append(edge)
    for edge in get_members_of_set(connection, test):
        edges.append(edge)
    for edge in get_species(connection, test):
        edges.append(edge)
    return edges


if __name__ == '__main__':
    args = get_args()

    # Connect to the local Reactome MySQL mirror.
    db_connection = pymysql.connect(read_default_file=args.mysqlConfigFile,
                                    db=args.mysqlDBName)

    # Raise per-session limits so the large GROUP_CONCAT / ORDER BY queries
    # issued below do not get truncated or spill.
    for setup_statement in ("SET SESSION group_concat_max_len=35000",
                            "SET SESSION sort_buffer_size=256000000"):
        run_sql(setup_statement, db_connection)

    node_list = get_nodes(db_connection, args.test)
    edge_list = get_edges(db_connection, args.test)

    # Append a node representing the Reactome knowledge source itself.
    node_list.append(kg2_util.make_node(REACTOME_KB_CURIE_ID,
                                        REACTOME_KB_IRI,
                                        'Reactome',
                                        kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                        None,
                                        REACTOME_KB_CURIE_ID))

    kg2_util.save_json({'nodes': node_list, 'edges': edge_list},
                       args.outputFile, args.test)
Esempio n. 5
0
                              update_date,
                              other_synonyms)
        # NOTE(review): fragment — the call whose arguments end above, and
        # the enclosing loop over gene records, start outside this view.
        nodes.append(node_dict)
        ensembl_gene_curie_id = node_dict['id']
        taxon_id_int = gene_dict.get('taxon_id', None)
        # only human genes (NCBITaxon:9606) are expected here
        # NOTE(review): assert is stripped under `python -O`; consider raising
        # an explicit exception for this validation instead.
        assert taxon_id_int == 9606, "unexpected taxon ID"
        edges.append(kg2_util.make_edge(ensembl_gene_curie_id,
                                        'NCBITaxon:' + str(taxon_id_int),
                                        'gene_found_in_organism',
                                        ENSEMBL_KB_IRI,
                                        update_date))
        # cross-reference edges to HGNC, when Ensembl provides them
        hgnc_list = gene_dict.get('HGNC', None)
        if hgnc_list is not None:
            for hgnc_curie in hgnc_list:
                edges.append(kg2_util.make_edge(ensembl_gene_curie_id,
                                                hgnc_curie,
                                                'xref',
                                                ENSEMBL_KB_IRI,
                                                update_date))
    return {'nodes': nodes,
            'edges': edges}


if __name__ == '__main__':
    # Positional args are declared with nargs=1, hence the [0] unwrapping.
    args = get_args()
    ensembl_graph = make_kg2_graph(args.inputFile[0], args.test)
    kg2_util.save_json(ensembl_graph, args.outputFile[0], args.test)
Esempio n. 6
0
import argparse
import kg2_util


def make_arg_parser():
    """Build the command-line parser: an optional --test flag plus positional
    input and output file paths (each captured as a one-element list)."""
    parser = argparse.ArgumentParser(
        description='sample_subgraph.py: sample a smaller subgraph of a KG in JSON format')
    parser.add_argument('--test', dest='test', action="store_true", default=False)
    parser.add_argument('inputFile', type=str, nargs=1)
    parser.add_argument('outputFile', type=str, nargs=1)
    return parser


if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    # Bug fix: the positionals are declared with nargs=1, so argparse stores
    # them as one-element lists; unwrap before use.
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]
    graph = kg2_util.load_json(input_file_name)
    # Sample every 5th node (~20% of nodes), then keep only edges whose
    # endpoints both survived the sampling.
    nodes = [graph['nodes'][i] for i in range(0, len(graph['nodes']), 5)]
    nodes_id_set = {node['id'] for node in nodes}
    edges = [
        edge for edge in graph['edges']
        if edge['subject'] in nodes_id_set and edge['object'] in nodes_id_set
    ]
    # Pass the test-mode flag through, matching every other save_json call.
    kg2_util.save_json({'nodes': nodes, 'edges': edges}, output_file_name,
                       args.test)
Esempio n. 7
0
    # NOTE(review): fragment — argument parsing and the definition of
    # `kg_file_name` precede this view.
    kg_edges_file_names = args.kgFileNewEdges
    test_mode = args.test
    output_file_name = args.outputFile[0]
    kg = json.load(open(kg_file_name, 'r'))
    # edges whose subject or object is missing from the KG node set are
    # collected here rather than merged into the graph
    kg_orphan_edges = {'edges': []}
    for kg_edges_file_name in kg_edges_file_names:
        kg_orphan_edges_new = []
        ctr_edges_added = 0
        kg_edges_new = json.load(open(kg_edges_file_name, 'r'))
        # NOTE(review): this node index is rebuilt on every pass even though
        # kg['nodes'] is not modified in this loop — hoisting it above the
        # loop would avoid repeated O(|nodes|) work; confirm and refactor.
        nodes_dict = {node['id']: node for node in kg['nodes']}
        for rel_dict in kg_edges_new['edges']:
            subject_curie = rel_dict['subject']
            object_curie = rel_dict['object']
            # merge the edge only if both endpoints exist in the KG
            if subject_curie in nodes_dict and object_curie in nodes_dict:
                ctr_edges_added += 1
                kg['edges'].append(rel_dict)
            else:
                kg_orphan_edges_new.append(rel_dict)
        kg_orphan_edges['edges'] += kg_orphan_edges_new
        kg2_util.log_message("number edges added: " + str(ctr_edges_added),
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)
        kg2_util.log_message("number of orphan edges: " +
                             str(len(kg_orphan_edges['edges'])),
                             ontology_name=kg_edges_file_name,
                             output_stream=sys.stderr)
    kg2_util.save_json(kg, output_file_name, test_mode)
    # optionally write the orphan edges to a separate file
    kg_file_orphan_edges = args.kgFileOrphanEdges
    if kg_file_orphan_edges is not None:
        kg2_util.save_json(kg_orphan_edges, kg_file_orphan_edges, test_mode)

if __name__ == '__main__':
    args = get_args()
    test_mode = args.test
    # NOTE(review): removed dead initializations (`edges = []`, `nodes = []`)
    # — both names are unconditionally reassigned inside the `with` block
    # before any use.
    with open(args.inputFile, 'r') as input_file:
        json_data = json.load(input_file)
        # the DrugCentral dump records its own release timestamp; use it as
        # the update date on every node and edge
        update_date = json_data['version'][0]['dtime']
        edges = process_external_ids(json_data['external_ids'], update_date,
                                     test_mode)
        edges += process_omop_relations(json_data['omop_relations'],
                                        update_date, test_mode)
        edges += process_faers_data(json_data['faers_data'], update_date,
                                    test_mode)
        edges += process_atc_codes(json_data['atc_ids'], update_date,
                                   test_mode)
        edges += process_bioactivities(json_data['bioactivities'], update_date,
                                       test_mode)
        edges += process_pharmacologic_actions(
            json_data['pharmacologic_action'], update_date, test_mode)
        nodes = make_nodes(json_data['drugcentral_ids'], update_date)
        # add a node representing the DrugCentral knowledge source itself
        kp_node = kg2_util.make_node(DRUGCENTRAL_SOURCE, BASE_URL_DRUGCENTRAL,
                                     'DrugCentral',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     update_date, DRUGCENTRAL_SOURCE)
        nodes.append(kp_node)
    graph = {'edges': edges, 'nodes': nodes}
    kg2_util.save_json(graph, args.outputFile, test_mode)
Esempio n. 9
0
    # NOTE(review): fragment — the parser construction above is outside this
    # view; the function ends by returning the configured ArgumentParser.
    return arg_parser


if __name__ == "__main__":
    # Keys to keep on each node / edge record; everything else is dropped.
    # (set literals instead of set([...]) wrappers)
    node_set = {"name", "id", "full name", "category label"}
    edge_set = {"simplified relation curie", "subject", "object",
                "simplified edge label", "provided by"}

    args = make_arg_parser().parse_args()
    test_mode = args.test
    reduced = {"nodes": [], "edges": []}
    with open(args.inputFilepath, "r") as fp:
        all_data = json.load(fp)
        # project every record down to the whitelisted keys via
        # dict comprehensions instead of manual key-copy loops
        reduced["nodes"] = [{key: val for key, val in node.items()
                             if key in node_set}
                            for node in all_data["nodes"]]
        reduced["edges"] = [{key: val for key, val in edge.items()
                             if key in edge_set}
                            for edge in all_data["edges"]]

    kg2_util.save_json(reduced, args.outputFilepath, test_mode)
Esempio n. 10
0
                evidence_score, created_date, update_date, pmid, source
            ] = line
            # NOTE(review): fragment — the enclosing function, its loop over
            # input lines, and the unpacking that ends above start outside
            # this view.
            # keep only non-BEFREE rows (BEFREE is DisGeNET's text-mining
            # pipeline; those associations are skipped here)
            if source != 'BEFREE':
                non_befree_count += 1
                # gene ids get the NCBI-gene CURIE prefix, disease ids the
                # UMLS CURIE prefix
                subject_id = format_id(subject_id,
                                       kg2_util.CURIE_PREFIX_NCBI_GENE)
                object_id = format_id(object_id, kg2_util.CURIE_PREFIX_UMLS)
                predicate = kg2_util.EDGE_LABEL_BIOLINK_GENE_ASSOCIATED_WITH_CONDITION
                edge = kg2_util.make_edge_biolink(subject_id, object_id,
                                                  predicate, DISGENET_KB_CURIE,
                                                  update_date)
                # attach the supporting PubMed publication to the edge
                publication = kg2_util.CURIE_PREFIX_PMID + ':' + pmid
                edge['publications'] = [publication]
                edges.append(edge)
    return edges


if __name__ == '__main__':
    args = get_args()
    # Build every gene-disease edge from the DisGeNET input file.
    disgenet_edges = make_edges(args.inputFile, args.test)
    # The only node emitted is the DisGeNET knowledge-source node itself.
    source_node = kg2_util.make_node(DISGENET_KB_CURIE,
                                     DISGENET_BASE_IRI,
                                     "DisGeNET",
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     None,
                                     DISGENET_KB_CURIE)
    kg2_util.save_json({"edges": disgenet_edges, "nodes": [source_node]},
                       args.outputFile, args.test)
Esempio n. 11
0
def make_arg_parser():
    """Return an ArgumentParser with a --test flag and positional inputFile /
    outputFile arguments (each parsed into a one-element list via nargs=1)."""
    ap = argparse.ArgumentParser(
        description='sample_subgraph.py: sample a smaller subgraph of a KG in JSON format')
    ap.add_argument('--test', dest='test', action="store_true", default=False)
    for positional_name in ('inputFile', 'outputFile'):
        ap.add_argument(positional_name, type=str, nargs=1)
    return ap


if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    # Bug fix: the positionals are declared with nargs=1, so argparse stores
    # them as one-element lists; unwrap before use.
    input_file_name = args.inputFile[0]
    output_file_name = args.outputFile[0]
    graph = kg2_util.load_json(input_file_name)
    # Sample every 5th node (~20% of nodes), then keep only edges whose
    # endpoints both survived the sampling.
    nodes = [graph['nodes'][i] for i in range(0, len(graph['nodes']), 5)]
    nodes_id_set = {node['id'] for node in nodes}
    edges = [
        edge for edge in graph['edges']
        if edge['subject'] in nodes_id_set and edge['object'] in nodes_id_set
    ]
    # Preserve the input graph's 'build' metadata, if present.
    build = graph.get('build', None)
    out_graph = {'nodes': nodes, 'edges': edges}
    if build is not None:
        out_graph['build'] = build
    # Pass the test-mode flag through, matching every other save_json call.
    kg2_util.save_json(out_graph, output_file_name, args.test)
Esempio n. 12
0
                    "http://rest.kegg.jp/conv/glycan/chebi",
                    "http://rest.kegg.jp/conv/drug/chebi"]
    get_base_query = "http://rest.kegg.jp/get/"
    for query in list_queries:
        for results in send_query(query).split('\n'):
            if len(results) < 1:
                continue
            results = results.split('\t')
            results_dict[results[0]] = {'name': results[1]}
    for query in conv_queries:
        for results in send_query(query).split('\n'):
            if len(results) < 1:
                continue
            results = results.split('\t')
            results_dict[results[1]]['eq_id'] = results[0]
    kegg_ids = len(results_dict.keys())
    get_count = 0
    for kegg_id in results_dict:
        previous_line_starter = ''
        results = send_query(get_base_query + kegg_id)
        results_dict = process_get_query(results, results_dict, kegg_id)
        get_count += 1
        if get_count % 1000 == 0:
            print("Processed", get_count, "out of", kegg_ids, "at", date())
    return results_dict


if __name__ == '__main__':
    args = get_args()
    # Run the full KEGG query pipeline and save the raw results; the literal
    # True is the third (test-mode) argument expected by save_json.
    query_results = run_queries()
    kg2_util.save_json(query_results, args.outputFile, True)