def make_nodes(drugcentral_ids, update_date):
    """Collapse DrugCentral name rows into one drug node per identifier.

    Args:
        drugcentral_ids: iterable of rows, each indexable by 'id', 'name'
            and 'preferred_name'.
        update_date: update date passed through to every node.

    Returns:
        A list with one node per distinct formatted DrugCentral identifier.
        The row whose 'preferred_name' equals "1" supplies the node name;
        all other rows for that identifier become synonyms.

    Raises:
        KeyError: if an identifier never has a row flagged as preferred,
            because no 'name' entry was recorded for it.
    """
    category_label = kg2_util.BIOLINK_CATEGORY_DRUG
    drugs_by_curie = {}

    # First pass: group every name row under its formatted identifier.
    for row in drugcentral_ids:
        raw_id = row['id']
        label = row['name']
        if len(raw_id) < 1:
            # Rows without an identifier cannot be attached to any drug.
            continue
        curie = format_drugcentral_id(raw_id)
        record = drugs_by_curie.setdefault(curie, {'synonyms': []})
        if row['preferred_name'] == "1":
            record['name'] = label
        else:
            record['synonyms'].append(label)

    # Second pass: turn each grouped record into a node.
    nodes = []
    for curie, record in drugs_by_curie.items():
        synonyms = record['synonyms']
        preferred = record['name']
        iri = BASE_URL_DRUGCENTRAL + curie.split(':')[1]
        node = kg2_util.make_node(curie, iri, preferred, category_label,
                                  update_date, DRUGCENTRAL_SOURCE)
        node['synonym'] = synonyms
        nodes.append(node)
    return nodes
Example #2
0
def make_kg2_graph(drugbank_dict: dict, test_mode: bool):
    """Build the node and edge lists for a parsed DrugBank export.

    Args:
        drugbank_dict: parsed DrugBank data; the drug records are read from
            ``["drugbank"]["drug"]`` and the export date from
            ``["drugbank"]["@exported-on"]``.
        test_mode: when true, stop after the first 10000 drugs.

    Returns:
        A dict with the keys "nodes" and "edges".
    """
    drugs = drugbank_dict["drugbank"]["drug"]
    update_date = drugbank_dict["drugbank"]["@exported-on"]

    # The knowledge source itself is represented as the first node.
    nodes = [
        kg2_util.make_node(DRUGBANK_KB_CURIE_ID, DRUGBANK_KB_IRI, "DrugBank",
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date,
                           DRUGBANK_KB_CURIE_ID)
    ]
    edges = []

    for position, drug in enumerate(drugs, start=1):
        if test_mode and position > 10000:
            break
        drug_node = make_node(drug)
        if drug_node is not None:
            nodes.append(drug_node)
        # make_edges may yield None placeholders; drop them.
        edges.extend(candidate for candidate in make_edges(drug)
                     if candidate is not None)

    return {"nodes": nodes, "edges": edges}
def make_node(metabolite: dict, hmdb_id: str):
    """Build the node for a single HMDB metabolite record.

    Args:
        metabolite: metabolite record; the keys "name", "update_date",
            "creation_date", "description", "synonyms" and
            "general_references" are read.
        hmdb_id: HMDB accession, appended to the base IRI and CURIE prefix.

    Returns:
        The node with "description", "synonym", "creation_date" and
        "publications" filled in.
    """
    iri = HMDB_BASE_IRI + hmdb_id
    name = metabolite["name"]
    category_label = kg2_util.BIOLINK_CATEGORY_METABOLITE
    update_date = metabolite["update_date"]
    creation_date = metabolite["creation_date"]
    description = metabolite["description"]

    # "synonyms" is only usable when it is a dict holding a "synonym" entry,
    # which in turn is either a list of values or one bare value.
    synonym_field = metabolite["synonyms"]
    synonyms = []
    if isinstance(synonym_field, dict) and "synonym" in synonym_field:
        entries = synonym_field["synonym"]
        if isinstance(entries, list):
            synonyms.extend(entries)
        else:
            synonyms.append(entries)

    references = pull_out_references(metabolite["general_references"])
    publications = list(references.keys())

    node = kg2_util.make_node(CURIE_PREFIX_HMDB + ":" + hmdb_id, iri, name,
                              category_label, update_date,
                              HMDB_PROVIDED_BY_CURIE_ID)
    node.update({
        "description": description,
        "synonym": synonyms,
        "creation_date": creation_date,
        "publications": publications,
    })
    return node
def make_node(id: str, iri: str, name: str, category_label: str,
              description: str, synonym: list, publications: list,
              update_date: str):
    """Build a ChEMBL node with description, synonyms and publications.

    Args:
        id: CURIE of the node.
        iri: IRI of the node.
        name: display name of the node.
        category_label: category label passed to kg2_util.make_node.
        description: free-text description stored on the node.
        synonym: list of synonyms stored under the 'synonym' key.
        publications: list of publications stored on the node.
        update_date: update date passed to kg2_util.make_node.

    Returns:
        The node produced by kg2_util.make_node with the extra fields set.
    """
    node_dict = kg2_util.make_node(id, iri, name, category_label, update_date,
                                   CHEMBL_KB_IRI)
    node_dict['description'] = description
    # The parameter is named `synonym`; the previous body read an undefined
    # `synonyms` and raised NameError on every call.
    node_dict['synonym'] = synonym
    node_dict['publications'] = publications
    return node_dict
Example #5
0
def make_node_and_edges(article: dict, mesh_predicate_label: str,
                        mesh_relation_curie: str):
    """Build the publication node and MeSH edges for one PubMed article.

    A node and edges are only produced when the article's PMID CURIE is in
    the module-level ``pmids`` collection; otherwise both lists stay empty.

    Args:
        article: parsed article record; "MedlineCitation" and its "PMID",
            "DateRevised", "Article" and (optionally) "MeshHeadingList"
            entries are read.
        mesh_predicate_label: predicate label used for every MeSH edge.
        mesh_relation_curie: relation CURIE used for every MeSH edge.

    Returns:
        A two-element list: ``{"nodes": [...], "edges": [...]}`` followed by
        the article's update date.
    """
    nodes = []
    edges = []

    article_citation = article["MedlineCitation"]

    pmid = kg2_util.CURIE_PREFIX_PMID + ":" + article_citation["PMID"]["#text"]

    update_date = extract_date(article_citation["DateRevised"])

    if pmid in pmids:
        # These aren't necessary yet, but it might be someday, so I wrote
        # and tested a couple of functions to extract them

        #authors = get_authors(article_citation)

        #journal = get_journal(article_citation)

        name = article_citation["Article"]["ArticleTitle"]
        if isinstance(name, dict):
            try:
                name = name["#text"]
            except KeyError:
                # No direct text: fall back to the "#text" of the nested
                # entries (the last one visited wins).
                temp_name = name
                for key in temp_name:
                    name = temp_name[key]["#text"]

        # The article date is optional; a missing or unparsable date simply
        # leaves the creation date unset. Exception (not a bare except) keeps
        # the best-effort behaviour without swallowing KeyboardInterrupt or
        # SystemExit.
        try:
            created_date = extract_date(
                article_citation["Article"]["ArticleDate"])
        except Exception:
            created_date = None

        iri = PMID_BASE_IRI + article_citation["PMID"]["#text"]

        node = kg2_util.make_node(pmid, iri, name,
                                  BIOLINK_CATEGORY_PUBLICATION, update_date,
                                  PMID_PROVIDED_BY_CURIE_ID)
        node["creation_date"] = created_date
        nodes.append(node)

        # MeSH headings are optional as well; on any failure the edges
        # collected so far are kept and the remainder is skipped.
        try:
            for mesh_topic in (
                    article_citation["MeshHeadingList"]["MeshHeading"]):
                mesh_id = kg2_util.CURIE_PREFIX_MESH + ":" + \
                          mesh_topic["DescriptorName"]["@UI"]
                edge = kg2_util.make_edge(pmid, mesh_id, mesh_relation_curie,
                                          mesh_predicate_label,
                                          PMID_PROVIDED_BY_CURIE_ID,
                                          update_date)
                edges.append(edge)
        except Exception:
            pass

    return [{"nodes": nodes, "edges": edges}, update_date]
Example #6
0
def format_node(drugbank_id: str, description: str, name: str,
                update_date: str, synonyms: list, publications: list,
                category_label: str, creation_date: str):
    """Build a DrugBank node from already-extracted field values.

    Args:
        drugbank_id: DrugBank identifier, appended to the base IRI and to
            the DrugBank CURIE prefix.
        description: description stored on the node.
        name: display name of the node.
        update_date: update date passed to kg2_util.make_node.
        synonyms: list stored under the "synonym" key.
        publications: list stored under the "publications" key.
        category_label: category label passed to kg2_util.make_node.
        creation_date: value stored under the "creation_date" key.

    Returns:
        The populated node.
    """
    iri = DRUGBANK_BASE_IRI + drugbank_id
    curie = kg2_util.CURIE_PREFIX_DRUGBANK + ":" + drugbank_id
    node = kg2_util.make_node(curie, iri, name, category_label, update_date,
                              DRUGBANK_KB_CURIE_ID)
    node.update({
        "synonym": synonyms,
        "creation_date": creation_date,
        "description": description,
        "publications": publications,
    })
    return node
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build the node and edge lists from an Ensembl genes JSON file.

    Args:
        input_file_name: path of the JSON file loaded via kg2_util.load_json.
        test_mode: when true, stop after the first 10000 genes.

    Returns:
        A dict with the keys 'nodes' and 'edges'.

    Raises:
        AssertionError: if a gene's 'taxon_id' is not 9606.
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    # The update date is the second '/'-separated piece of the genebuild
    # string.
    update_date = ensembl_data['genebuild'].split('/')[1]

    # The knowledge source itself is represented as the first node.
    kb_curie = ENSEMBL_KB_CURIE_ID
    nodes = [
        kg2_util.make_node(kb_curie, ENSEMBL_KB_URI, 'Ensembl Genes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date,
                           kb_curie)
    ]
    edges = []

    for gene_number, gene_dict in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break

        ensembl_gene_id = gene_dict['id']
        description = gene_dict.get('description', None)
        gene_symbol = gene_dict.get('name', None)

        # Cross-reference primary IDs (minus the gene's own ID) act as
        # additional synonyms, de-duplicated through a set.
        xrefs = gene_dict.get('xrefs', None)
        if xrefs is None:
            other_synonyms = []
        else:
            primary_ids = [
                xref['primary_id'] for xref in xrefs
                if xref['primary_id'] != ensembl_gene_id
            ]
            other_synonyms = list(set(primary_ids))

        node_dict = make_node(ensembl_gene_id, description, gene_symbol,
                              update_date, other_synonyms)
        nodes.append(node_dict)
        gene_curie = node_dict['id']

        taxon_id_int = gene_dict.get('taxon_id', None)
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie = (kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' +
                       str(taxon_id_int))
        edges.append(
            kg2_util.make_edge_biolink(gene_curie, taxon_curie,
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       ENSEMBL_KB_CURIE_ID, update_date))

        # Every listed HGNC CURIE is linked to the gene with owl:sameAs.
        hgnc_list = gene_dict.get('HGNC', None)
        if hgnc_list is not None:
            edges.extend(
                kg2_util.make_edge(gene_curie, hgnc_curie,
                                   kg2_util.CURIE_ID_OWL_SAME_AS,
                                   kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                   ENSEMBL_KB_CURIE_ID, update_date)
                for hgnc_curie in hgnc_list)

    return {'nodes': nodes, 'edges': edges}
def make_node(ncbi_gene_id: str,
              full_name: str,
              gene_symbol: str,
              update_date: str,
              other_synonyms: list = None):
    """Build a gene node for an NCBI gene identifier.

    Args:
        ncbi_gene_id: NCBI gene identifier, appended to the CURIE prefix and
            to the base IRI.
        full_name: name given to the node.
        gene_symbol: gene symbol, placed first in the synonym list.
        update_date: update date passed to kg2_util.make_node.
        other_synonyms: optional further synonyms; they are de-duplicated
            and sorted behind the gene symbol.

    Returns:
        The node with its 'synonym' list set.
    """
    category_label = kg2_util.BIOLINK_CATEGORY_GENE
    extra_synonyms = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + ncbi_gene_id
    iri = NCBI_BASE_IRI + ncbi_gene_id
    node = kg2_util.make_node(curie, iri, full_name, category_label,
                              update_date, NCBI_KB_CURIE_ID)
    node['synonym'] = [gene_symbol] + sorted(set(extra_synonyms))
    return node
Example #9
0
def make_node(ensembl_gene_id: str,
              description: str,
              gene_symbol: str,
              update_date: str,
              other_synonyms: list = None):
    """Build a 'gene' node for an Ensembl gene identifier.

    Args:
        ensembl_gene_id: Ensembl gene identifier, appended to the CURIE
            prefix and to the base IRI.
        description: passed to kg2_util.make_node in the name position.
        gene_symbol: gene symbol, placed first in the synonym list.
        update_date: update date passed to kg2_util.make_node.
        other_synonyms: optional further synonyms; de-duplicated through a
            set (so their order is not defined) and appended after the
            gene symbol.

    Returns:
        The node with its 'synonym' list set.
    """
    category_label = 'gene'
    extra_synonyms = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_ENSEMBL + ':' + ensembl_gene_id
    iri = ENSEMBL_BASE_IRI + ensembl_gene_id
    node = kg2_util.make_node(curie, iri, description, category_label,
                              update_date, ENSEMBL_KB_IRI)
    node['synonym'] = [gene_symbol] + list(set(extra_synonyms))
    return node
def make_node(ensembl_gene_id: str,
              description: str,
              gene_symbol: str,
              update_date: str,
              other_synonyms: list = None):
    """Build a gene node for an Ensembl gene identifier.

    The description is handed to kg2_util.make_node in the name position,
    after which the node's 'name' is overwritten with the gene symbol.

    Args:
        ensembl_gene_id: Ensembl gene identifier, appended to the CURIE
            prefix and to the base IRI.
        description: value passed to kg2_util.make_node as the name.
        gene_symbol: gene symbol; becomes the node name and the first
            synonym.
        update_date: update date passed to kg2_util.make_node.
        other_synonyms: optional further synonyms; de-duplicated and sorted
            behind the gene symbol.

    Returns:
        The node with 'name' and 'synonym' set.
    """
    category_label = kg2_util.BIOLINK_CATEGORY_GENE
    extra_synonyms = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_ENSEMBL + ':' + ensembl_gene_id
    iri = ENSEMBL_BASE_IRI + ensembl_gene_id
    node = kg2_util.make_node(curie, iri, description, category_label,
                              update_date, ENSEMBL_KB_CURIE_ID)
    node['name'] = gene_symbol
    node['synonym'] = [gene_symbol] + sorted(set(extra_synonyms))
    return node
Example #11
0
def make_node(ncbi_gene_id: str,
              full_name: str,
              gene_symbol: str,
              update_date: str,
              category_label: str,
              other_synonyms: list = None) -> dict:
    """Build a node for an NCBI gene identifier with a caller-chosen category.

    Args:
        ncbi_gene_id: NCBI gene identifier, appended to the CURIE prefix and
            to the base IRI.
        full_name: name passed to kg2_util.make_node; the node's 'name' is
            overwritten afterwards.
        gene_symbol: gene symbol; first synonym and part of the final name.
        update_date: update date passed to kg2_util.make_node.
        category_label: category label passed to kg2_util.make_node.
        other_synonyms: optional further synonyms; de-duplicated and sorted
            behind the gene symbol.

    Returns:
        The node with 'synonym' set and 'name' set to
        "Genetic locus associated with " followed by the gene symbol.
    """
    extra_synonyms = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + ncbi_gene_id
    iri = NCBI_BASE_IRI + ncbi_gene_id
    node = kg2_util.make_node(curie, iri, full_name, category_label,
                              update_date, NCBI_KB_CURIE_ID)
    node['synonym'] = [gene_symbol] + sorted(set(extra_synonyms))
    node['name'] = "Genetic locus associated with " + gene_symbol
    return node
Example #12
0
def make_node(ncbi_gene_id: str,
              full_name: str,
              gene_symbol: str,
              update_date: str,
              other_synonyms: list = None):
    """Build a 'gene' node for an NCBI gene identifier.

    Args:
        ncbi_gene_id: NCBI gene identifier, appended to the CURIE prefix and,
            after a '/', to the base IRI.
        full_name: name given to the node.
        gene_symbol: gene symbol, merged into the synonym set.
        update_date: update date passed to kg2_util.make_node.
        other_synonyms: optional list of further synonyms.

    Returns:
        The node whose 'synonym' list is the de-duplicated union of the gene
        symbol and the other synonyms (order not defined, since it comes
        from a set).
    """
    category_label = 'gene'
    extra_synonyms = [] if other_synonyms is None else other_synonyms
    curie = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + ncbi_gene_id
    iri = NCBI_BASE_IRI + '/' + ncbi_gene_id
    node = kg2_util.make_node(curie, iri, full_name, category_label,
                              update_date, NCBI_BASE_IRI)
    all_synonyms = [gene_symbol] + extra_synonyms
    node['synonym'] = list(set(all_synonyms))
    return node
Example #13
0
def make_kg2_graph(kegg, update_date):
    """Build the node and edge lists for the KEGG compound entries.

    Args:
        kegg: mapping from KEGG identifier to its record. Only identifiers
            starting with KEGG_COMPOUND_PREFIX are processed.
        update_date: update date passed to process_compound and to the
            knowledge-source node.

    Returns:
        A dict with the keys 'nodes' and 'edges'; the knowledge-source node
        is the last entry in 'nodes'.
    """
    nodes = []
    edges = []
    for kegg_id in kegg:
        kegg_dict = kegg[kegg_id]
        if not kegg_id.startswith(KEGG_COMPOUND_PREFIX):
            continue
        compound_node, compound_edges = process_compound(
            kegg_dict, kegg_id, update_date)
        nodes.append(compound_node)
        edges.extend(compound_edges)

    # The knowledge source itself is appended after all compound nodes.
    nodes.append(
        kg2_util.make_node(KEGG_PROVIDED_BY, KEGG_SOURCE_IRI,
                           'Kyoto Encyclopedia of Genes and Genomes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date,
                           KEGG_PROVIDED_BY))
    return {'nodes': nodes, 'edges': edges}
Example #14
0
def make_node(id: str,
              iri: str,
              name: str,
              category_label: str,
              description: str,
              synonym: list,
              publications: list,
              update_date: str,
              canonical_smiles: str = None):
    """Build a ChEMBL node with sorted synonyms and publications.

    Args:
        id: CURIE of the node.
        iri: IRI of the node.
        name: display name of the node.
        category_label: category label passed to kg2_util.make_node.
        description: description stored on the node.
        synonym: synonyms; stored sorted under the 'synonym' key.
        publications: publications; stored sorted.
        update_date: update date passed to kg2_util.make_node.
        canonical_smiles: optional value stored under
            'has_biological_sequence'.

    Returns:
        The populated node.
    """
    node = kg2_util.make_node(id, iri, name, category_label, update_date,
                              CHEMBL_KB_CURIE_ID)
    node.update({
        'description': description,
        'synonym': sorted(synonym),
        'publications': sorted(publications),
        'has_biological_sequence': canonical_smiles,
    })
    return node
Example #15
0
def format_node(node_id,
                name,
                category_label,
                update_date,
                description=None,
                sequence=None,
                synonym=None):
    """Build a KEGG node from already-extracted field values.

    Args:
        node_id: KEGG identifier; appended to KEGG_BASE_IRI and converted to
            a CURIE with format_id.
        name: display name of the node.
        category_label: category label passed to kg2_util.make_node.
        update_date: update date passed to kg2_util.make_node.
        description: optional description stored on the node.
        sequence: optional sequence; stored under 'has_biological_sequence'
            only when it is not None and not empty.
        synonym: optional list of synonyms. When omitted (or None), the node
            gets its own fresh empty list.

    Returns:
        The populated node.
    """
    # A `synonym=[]` default would be created once and then stored on every
    # node built without synonyms, so mutating one node's list would change
    # all of them. Create a new list per call instead.
    if synonym is None:
        synonym = []
    iri = KEGG_BASE_IRI + node_id
    curie_id = format_id(node_id)
    node = kg2_util.make_node(curie_id,
                              iri,
                              name,
                              category_label,
                              update_date,
                              KEGG_PROVIDED_BY)
    node['description'] = description
    if sequence is not None and len(sequence) > 0:
        node['has_biological_sequence'] = sequence
    node['synonym'] = synonym

    return node
Example #16
0
def make_nodes(entries, test_mode):
    """Build miRBase nodes plus per-node cross-reference and species maps.

    Args:
        entries: iterable of parsed records; the keys 'ID' and 'DE' are
            required, while 'CC', 'SQ', 'RX' and 'DR' are optional.
        test_mode: when true, stop after 1000 accepted entries.

    Returns:
        A three-element list: the nodes, a dict mapping node ID to its
        formatted cross-references, and a dict mapping node ID to the
        species ID returned by only_include_certain_species.
    """
    sequence_label = 'Sequence '
    nodes = []
    all_xrefs = dict()
    nodes_to_species = dict()
    entry_count = 0
    for entry in entries:
        # The species is the third ';'-separated field of the ID line.
        species = entry['ID'].split(';')[2].strip()
        species_id = only_include_certain_species(species)
        if not species_id:
            continue
        entry_count += 1
        if test_mode and entry_count > 1000:
            break
        node_id = get_node_id(entry)
        node_iri = kg2_util.BASE_URL_MIRBASE + node_id.split(':')[1]
        node_category = kg2_util.BIOLINK_CATEGORY_MICRORNA
        node_name = entry['DE'].strip()
        description = entry.get('CC', '').replace('\t', ' ').replace('  ', ' ')
        sequence = entry.get('SQ', '').replace('\t', ' ')
        publications = entry.get('RX', None)
        xrefs = entry.get('DR', None)
        if xrefs is not None:
            # Format each cross-reference once and keep the usable ones.
            formatted_xrefs = []
            for xref in xrefs.split('\t'):
                formatted = format_xref(xref)
                if formatted is not None:
                    formatted_xrefs.append(formatted)
            xrefs = formatted_xrefs
            all_xrefs[node_id] = xrefs
        if publications is not None:
            publications = [
                format_publication(publication)
                for publication in publications.split('\t')
            ]
        node = kg2_util.make_node(node_id, node_iri, node_name, node_category,
                                  None, MIRBASE_KB_CURIE_ID)
        node['description'] = description
        node['publications'] = publications
        # Drop only the leading "Sequence " label. str.strip('Sequence ')
        # removes any of those *characters* from both ends, which can also
        # eat leading or trailing characters of the sequence itself.
        sequence = sequence.strip()
        if sequence.startswith(sequence_label):
            sequence = sequence[len(sequence_label):]
        node['has_biological_sequence'] = sequence.strip()
        nodes.append(node)
        nodes_to_species[node_id] = species_id
    return [nodes, all_xrefs, nodes_to_species]
Example #17
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build the node and edge lists from a repoDB CSV file.

    Each CSV row becomes one edge from the DrugBank drug ('drug_id') to the
    UMLS indication ('ind_id'). The relation name is assembled from the
    row's 'status' and 'phase' columns, and a non-missing 'NCT' value is
    recorded as a publication of the edge.

    Args:
        input_file_name: path of the CSV file; its modification time is used
            as the update date of the knowledge-source node.
        test_mode: accepted but not used by this function.

    Returns:
        A dict with the keys 'nodes' and 'edges'; 'nodes' holds only the
        knowledge-source node.
    """
    update_date = os.path.getmtime(input_file_name)
    kp_node = kg2_util.make_node(
        id=REPODB_CURIE + ':',
        iri=REPODB_IRI,
        name='repoDB drug repositioning database',
        category_label=kg2_util.BIOLINK_CATEGORY_DATA_FILE,
        update_date=update_date,
        provided_by=REPODB_CURIE + ':')
    nodes = [kp_node]
    edges = []
    df = pd.read_csv(input_file_name)
    for idx in range(len(df)):
        # Missing status/phase values fall back to fixed placeholders.
        if df['status'].isna()[idx]:
            status = "unknown_status"
        else:
            status = df['status'][idx].lower()
        if df['phase'].isna()[idx]:
            phase = "unknown_phase"
        else:
            raw_phase = df['phase'][idx].lower()
            phase = raw_phase.replace(" ", "_").replace("/", "_or_")
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + ':' + df['drug_id'][idx],
            object_id=UMLS_CURIE + ':' + df['ind_id'][idx],
            relation_curie=REPODB_CURIE + ':' + relation,
            relation_label=relation,
            provided_by=REPODB_CURIE + ':',
            update_date=None)
        if not df['NCT'].isna()[idx]:
            trial_id = df['NCT'][idx]
            trial_curie = NCT_CURIE + trial_id
            edge_dict['publications'].append(trial_curie)
            edge_dict['publications_info'][trial_curie] = (
                CLINICALTRIALS_IRI + trial_id)
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
Example #18
0
        if not record_of_relation_curie_occurrences[relation_curie]:
            print(
                'relation curie is in the config file but was not used in any edge in the graph: '
                + relation_curie,
                file=sys.stderr)
    for relation_curie in relation_curies_not_in_nodes:
        print('could not find a node for relation curie: ' + relation_curie)
    update_date = datetime.now().strftime("%Y-%m-%d %H:%M")
    version_file = open(args.versionFile, 'r')
    build_name = str
    for line in version_file:
        test_flag = ""
        if test_mode:
            test_flag = "-TEST"
        build_name = "RTX KG" + line.rstrip() + test_flag
        break
    build_node = kg2_util.make_node(kg2_util.CURIE_PREFIX_RTX + ':' + 'KG2',
                                    kg2_util.BASE_URL_RTX + 'KG2', build_name,
                                    kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                    update_date,
                                    kg2_util.CURIE_PREFIX_RTX + ':')
    build_info = {
        'version': build_node['name'],
        'timestamp_utc': build_node['update_date']
    }
    pprint.pprint(build_info)
    graph["build"] = build_info
    graph["nodes"].append(build_node)
    kg2_util.save_json(graph, output_file_name, test_mode)
    del graph
        metabolite_count += 1

        if metabolite_count <= 10000:
            hmdb_id = metabolite["accession"]
            nodes.append(make_node(metabolite, hmdb_id))
            for edge in make_disease_edges(metabolite, hmdb_id):
                edges.append(edge)
            for edge in make_protein_edges(metabolite, hmdb_id):
                edges.append(edge)
            for edge in make_equivalencies(metabolite, hmdb_id):
                edges.append(edge)
            for edge in make_property_edges(metabolite, hmdb_id):
                edges.append(edge)
        else:
            break

    file_update_date = convert_date(os.path.getmtime(args.inputFile))
    hmdb_kp_node = kg2_util.make_node(HMDB_PROVIDED_BY_CURIE_ID, HMDB_KB_IRI,
                                      "Human Metabolome Database",
                                      kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                      file_update_date,
                                      HMDB_PROVIDED_BY_CURIE_ID)
    nodes.append(hmdb_kp_node)
    print("Saving JSON at", date())
    kg2_util.save_json({
        "nodes": nodes,
        "edges": edges
    }, args.outputFile, args.test)
    print("Finished saving JSON at", date())
    print("Script finished at", date())
def get_nodes(connection, test):
    """Query Reactome for its nodes and convert each row into a node.

    Args:
        connection: database connection handed to run_sql.
        test: when true, the query is capped at ROW_LIMIT_TEST_MODE rows.

    Returns:
        A list of nodes. Rows are dropped when only_include_certain_species
        returns None for their CURIE, when the category lookup returns None,
        or when the category lookup raises KeyError (which is also printed).
    """
    # StableIdentifier holds every Reactome node ID and is the left-most
    # table of the query. It is inner-joined to DatabaseObject, whose DB_ID
    # (unlike the stable identifier) can be linked to all the other tables;
    # the individual node properties are then attached with left joins.
    #
    # There are three general node types: events (pathways and reactions),
    # physical entities (polymers, drugs, complexes) and regulations.
    # Regulations are nodes that stand in for edges, so they are filtered
    # out during category assignment, but they are still retrieved here in
    # case they are wanted later.
    #
    # Each node type reaches its publications and description through
    # different tables, hence the left joins: every node picks up whichever
    # publications and description apply to it. Because a node can have
    # several publications (each producing another row), GROUP BY and
    # GROUP_CONCAT collapse those rows so that each node appears once with
    # all of its publications. DISTINCT inside GROUP_CONCAT avoids
    # repeating the other fields while collapsing.
    nodes_sql = "SELECT si.identifier as node_id, \
                 GROUP_CONCAT(DISTINCT dbobj._displayName) as node_name, \
                 GROUP_CONCAT(DISTINCT dbobj._timestamp) as update_date, \
                 GROUP_CONCAT(DISTINCT dbobj._class) as category, \
                 GROUP_CONCAT(DISTINCT lit_fr_e.pubMedIdentifier) as pmid_event, \
                 GROUP_CONCAT(DISTINCT lit_fr_p.pubMedIdentifier) as pmid_entity, \
                 GROUP_CONCAT(DISTINCT sum_fr_e.text) as description_event, \
                 GROUP_CONCAT(DISTINCT sum_fr_p.text) as description_entity, \
                 GROUP_CONCAT(DISTINCT sum_fr_r.text) as description_regulation, \
                 GROUP_CONCAT(DISTINCT ins_ed.dateTime) as created_date \
                 FROM StableIdentifier si \
                 INNER JOIN DatabaseObject dbobj \
                 ON si.DB_ID=dbobj.stableIdentifier \
                 LEFT JOIN InstanceEdit ins_ed \
                 ON dbobj.created=ins_ed.DB_ID \
                 LEFT JOIN Event_2_literatureReference ev_lit \
                 ON dbobj.DB_ID=ev_lit.DB_ID \
                 LEFT JOIN LiteratureReference lit_fr_e \
                 ON lit_fr_e.DB_ID=ev_lit.literatureReference \
                 LEFT JOIN Event_2_summation ev_sum \
                 ON ev_sum.DB_ID=dbobj.DB_ID \
                 LEFT JOIN Summation sum_fr_e \
                 ON ev_sum.summation=sum_fr_e.DB_ID \
                 LEFT JOIN PhysicalEntity_2_literatureReference pe_lit \
                 ON dbobj.DB_ID=pe_lit.DB_ID \
                 LEFT JOIN LiteratureReference lit_fr_p \
                 ON lit_fr_p.DB_ID=pe_lit.literatureReference \
                 LEFT JOIN PhysicalEntity_2_summation pe_sum \
                 ON dbobj.DB_ID=pe_sum.DB_ID \
                 LEFT JOIN Summation sum_fr_p \
                 ON pe_sum.summation = sum_fr_p.DB_ID \
                 LEFT JOIN Regulation_2_summation reg_sum \
                 on reg_sum.DB_ID=dbobj.DB_ID \
                 LEFT JOIN Summation sum_fr_r \
                 ON sum_fr_r.DB_ID=reg_sum.summation \
                 GROUP BY si.identifier"

    if test:
        nodes_sql += " LIMIT " + str(ROW_LIMIT_TEST_MODE)

    nodes = []
    for row in run_sql(nodes_sql, connection):
        node_id = only_include_certain_species(
            kg2_util.CURIE_PREFIX_REACTOME + ':' + row[0])
        if node_id is None:
            continue
        name = row[1]
        update_date = str(row[2])

        try:
            category_label = match_reactome_category_to_biolink(row[3])
        except KeyError:
            print("Category for", row[3],
                  "not in match_reactome_category_to_biolink")
            continue
        if category_label is None:
            continue

        iri = REACTOME_BASE_IRI + row[0]
        created_date = row[9]

        # Publications come from the event column when present, otherwise
        # from the physical-entity column; both are comma-separated PMIDs.
        pmid_csv = row[4] if row[4] is not None else row[5]
        if pmid_csv is None:
            publications = []
        else:
            publications = [
                PMID_PREFIX + ':' + pmid for pmid in pmid_csv.split(',')
            ]

        # Description preference: event, then physical entity, and finally
        # the regulation description (which may itself be None).
        description = row[8]
        for candidate in (row[6], row[7]):
            if candidate is not None:
                description = candidate
                break

        node = kg2_util.make_node(node_id, iri, name, category_label,
                                  update_date, REACTOME_KB_CURIE_ID)
        node['description'] = description
        node['publications'] = publications
        node['creation_date'] = str(created_date)
        nodes.append(node)

    return nodes
    for edge in get_physical_entity_characteristics(connection, test):
        edges.append(edge)
    for edge in get_members_of_set(connection, test):
        edges.append(edge)
    for edge in get_species(connection, test):
        edges.append(edge)
    return edges


# Script entry point: read Reactome from MySQL and save it as a JSON graph.
if __name__ == '__main__':
    args = get_args()

    # Connection settings come from the MySQL config file given on the
    # command line; the database name is a separate argument.
    connection = pymysql.connect(read_default_file=args.mysqlConfigFile,
                                 db=args.mysqlDBName)

    # Set the session's group_concat_max_len and sort_buffer_size before
    # running the node and edge queries.
    run_sql("SET SESSION group_concat_max_len=35000", connection)
    run_sql("SET SESSION sort_buffer_size=256000000", connection)

    nodes = get_nodes(connection, args.test)
    edges = get_edges(connection, args.test)

    # Add a node representing the Reactome knowledge source itself
    # (created with no update date).
    kp_node = kg2_util.make_node(REACTOME_KB_CURIE_ID, REACTOME_KB_IRI,
                                 'Reactome',
                                 kg2_util.BIOLINK_CATEGORY_DATA_FILE, None,
                                 REACTOME_KB_CURIE_ID)
    nodes.append(kp_node)

    graph = {'nodes': nodes, 'edges': edges}

    kg2_util.save_json(graph, args.outputFile, args.test)
Example #22
0
                                  relation,
                                  relation_label,
                                  INTACT_KB_CURIE_ID,
                                  update_date)
        edge['publications'] = publications
        return edge
    return None


# Script entry point: convert the IntAct input file into a JSON graph.
if __name__ == '__main__':
    args = get_args()
    with open(args.inputFile, 'r') as intact:
        edges = []
        nodes = []
        edge_count = 0
        # Each input line is offered to make_edge; lines that yield None are
        # ignored. In test mode only the first EDGE_LIMIT_TEST_MODE edges
        # are kept (the remaining lines are still read and parsed).
        for row in intact:
            edge = make_edge(row)
            if edge is not None and (args.test is False or
                                     edge_count < EDGE_LIMIT_TEST_MODE):
                edges.append(edge)
                edge_count += 1
        # The only node written is the one for the IntAct knowledge source
        # itself (created with no update date).
        kp_node = kg2_util.make_node(INTACT_KB_CURIE_ID,
                                     INTACT_KB_URI,
                                     "IntAct",
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     None,
                                     INTACT_KB_CURIE_ID)
        nodes.append(kp_node)
        graph = {'edges': edges, 'nodes': nodes}
        kg2_util.save_json(graph, args.outputFile, args.test)
Example #23
0
def make_nodes(records: list):
    """Build one KG2 protein node per parsed UniProtKB record.

    Each element of ``records`` is a dict keyed by two-letter line codes.
    The keys read here are:

    * ``'AC'`` (required): sequence of accessions; the first becomes the
      node identifier, the rest become synonyms.
    * ``'DE'`` (required): iterable of description strings (``RecName:``,
      ``Short=``, ``AltName:``, ``EC=``, ...).
    * ``'DT'`` (required): iterable of date strings; the first gives the
      creation date and the last the update date.
    * ``'CC'``, ``'RX'``, ``'GN'``, ``'OS'``, ``'SQ'`` (optional strings):
      free-text comments, publication references, gene names, species and
      sequence.

    The node name is the gene symbol if one was found, otherwise the short
    name, otherwise the full name.  Evidence codes are moved from the name
    into the description, and the species is appended to the name for
    non-human records (issue #1171).

    Returns:
        dict mapping node CURIE to the node dict built for it.
    """
    ret_dict = {}
    for record_dict in records:
        # Cross-references harvested from the catalytic-activity / cofactor
        # comment sections.
        xrefs = set()
        if 'CC' in record_dict:
            for comment_str in record_dict['CC'].split('-!-'):
                comment_str = comment_str.strip()
                if comment_str.startswith(('CATALYTIC ACTIVITY:',
                                           'COFACTOR:')):
                    xref_match_res = REGEX_XREF.search(comment_str)
                    if xref_match_res is not None:
                        # fix_xref may return a falsy value for xrefs that
                        # should be dropped; filter(None, ...) removes them.
                        xrefs |= set(
                            filter(None,
                                   map(fix_xref,
                                       xref_match_res[1].split(','))))

        accession_list = record_dict['AC']
        accession = accession_list[0]
        # Secondary accessions are kept as synonyms.
        synonyms = []
        synonyms += accession_list[1:]

        description_list = record_dict['DE']
        full_name = None
        short_name = None
        description = record_dict.get('CC', '')
        for desc_idx, description_str in enumerate(description_list):
            description_str = description_str.lstrip()
            if description_str.startswith('RecName: '):
                full_name = description_str.replace('RecName: Full=', '')
                # A 'Short=' entry directly after the RecName line is the
                # short form of the recommended name.
                if desc_idx < len(description_list) - 1:
                    next_desc = description_list[desc_idx + 1].lstrip()
                    if next_desc.startswith('Short='):
                        short_name = next_desc.replace('Short=', '')
                        synonyms += [short_name]
            elif description_str.startswith('AltName: Full='):
                synonyms.append(description_str.replace('AltName: Full=', ''))
            elif description_str.startswith('AltName: CD_antigen='):
                synonyms.append(
                    description_str.replace('AltName: CD_antigen=', ''))
            elif description_str.startswith('EC='):
                ec_match = REGEX_EC_XREF.search(description_str)
                if ec_match is not None:
                    xrefs.add(kg2_util.CURIE_PREFIX_KEGG + ':' + 'EC:' +
                              ec_match[1])
            elif not description_str.startswith(('Flags:', 'Contains:')):
                # Anything else (including the 'Short=' line itself) is
                # appended to the free-text description.
                description += '; ' + description_str

        date_fields = record_dict['DT']
        creation_date = None
        update_date = None
        for date_idx, date_str_raw in enumerate(date_fields):
            date_str = date_str_raw.split(',')[0]
            if date_idx == 0:
                creation_date = date_str
            if date_idx == len(date_fields) - 1:
                update_date = date_str

        publications = []
        publications_raw = record_dict.get('RX', None)
        if publications_raw is not None:
            for pub in publications_raw.split(';'):
                pub = pub.strip()
                if len(pub) > 0:
                    publications.append(
                        pub.replace('=', ':').replace(
                            'PubMed:', kg2_util.CURIE_PREFIX_PMID + ':'))
        assert isinstance(description, str)
        # Publications cited inline in the description are added as well.
        publications += [
            pub.replace('PubMed:', kg2_util.CURIE_PREFIX_PMID + ':')
            for pub in REGEX_PUBLICATIONS.findall(description)
        ]
        publications = sorted(set(publications))

        gene_names_str = record_dict.get('GN', None)
        gene_symbol = None
        if gene_names_str is not None:
            for gene_names_str_item in gene_names_str.split(';'):
                gene_names_match = REGEX_GENE_NAME.match(gene_names_str_item)
                if gene_names_match is not None:
                    gene_symbol = gene_names_match[1]
                    # The gene symbol goes first in the synonym list.
                    synonyms.insert(0, gene_symbol)
                else:
                    gene_synonyms_match = REGEX_GENE_SYNONYMS.match(
                        gene_names_str_item)
                    if gene_synonyms_match is not None:
                        # evidence codes from gene synonyms are not preserved
                        synonyms += [
                            seperate_evidence_codes(syn)[0].strip()
                            for syn in gene_synonyms_match[1].split(',')
                        ]

        if gene_symbol is not None:
            name = gene_symbol
        elif short_name is not None:
            name = short_name
        else:
            name = full_name

        # move evidence codes from name to description (issue #1171)
        name, ev_codes = seperate_evidence_codes(name)
        description += f"Evidence Codes from Name: {ev_codes} "

        # append species name to name if not human (issue #1171)
        species = record_dict.get('OS', 'unknown species').rstrip(".")
        # The literal here had been mangled to "h**o sapiens (human)", which
        # can never match, so human proteins were wrongly given a suffix.
        if "homo sapiens (human)" not in species.lower():
            name += f" ({species})"

        node_curie = kg2_util.CURIE_PREFIX_UNIPROT + ':' + accession
        iri = UNIPROTKB_IDENTIFIER_BASE_IRI + accession
        category_label = kg2_util.BIOLINK_CATEGORY_PROTEIN
        node_dict = kg2_util.make_node(node_curie, iri, name, category_label,
                                       update_date,
                                       UNIPROTKB_PROVIDED_BY_CURIE_ID)
        node_dict['full_name'] = full_name
        if not description.endswith(' '):
            description += ' '

        # Remove the leading 'SEQUENCE' keyword and surrounding spaces.
        # str.strip('SEQUENCE   ') must not be used for this: it treats its
        # argument as a set of characters and would also eat S/E/Q/U/N/C
        # residues from either end of the sequence itself.
        sequence = record_dict.get('SQ', '').strip(' ')
        if sequence.startswith('SEQUENCE'):
            sequence = sequence[len('SEQUENCE'):].lstrip(' ')
        node_dict['has_biological_sequence'] = sequence

        description = description.replace(LICENSE_TEXT, '')
        node_dict['description'] = description
        if len(synonyms) > 0:
            # De-duplicate while keeping the first synonym in first place.
            synonyms = [synonyms[0]] + list(set(synonyms) - {synonyms[0]})
        node_dict['synonym'] = synonyms
        node_dict['publications'] = publications
        node_dict['creation_date'] = creation_date
        if len(xrefs) == 0:
            xrefs = None
        node_dict['xrefs'] = xrefs
        ret_dict[node_curie] = node_dict
    return ret_dict
Example #24
0
        node_dict['creation_date'] = creation_date
        if len(xrefs) == 0:
            xrefs = None
        node_dict['xrefs'] = xrefs
        ret_dict[node_curie] = node_dict
    return ret_dict


# --------------- main starts here -------------------

if __name__ == '__main__':
    # Script entry point: parse the UniProtKB input file, build protein
    # nodes and their edges, prepend a node describing the UniProtKB source
    # itself, and save the graph as JSON.
    args = make_arg_parser().parse_args()
    input_file_name = args.inputFile
    output_file_name = args.outputFile
    test_mode = args.test

    uniprot_records, update_date = parse_records_from_uniprot_dat(
        input_file_name, DESIRED_SPECIES_INTS, test_mode)

    nodes_dict = make_nodes(uniprot_records)

    # The knowledge-source node uses its own CURIE as its provider.
    ontology_curie_id = UNIPROTKB_PROVIDED_BY_CURIE_ID
    ont_node = kg2_util.make_node(
        ontology_curie_id,
        UNIPROT_KB_URL,
        'UniProtKB',
        kg2_util.BIOLINK_CATEGORY_DATA_FILE,
        update_date,
        ontology_curie_id)

    nodes_list = [ont_node]
    nodes_list.extend(nodes_dict.values())
    edges_list = make_edges(uniprot_records, nodes_dict)

    output_graph = {'nodes': nodes_list, 'edges': edges_list}
    kg2_util.save_json(output_graph, output_file_name, test_mode)
Example #25
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a KG2 graph from a tab-separated gene-info file.

    Lines starting with '#' are skipped.  Every other line must have exactly
    16 tab-separated fields; a field equal to '-' (ignoring surrounding
    whitespace) is treated as missing (None).  Only rows whose taxon id
    equals ``kg2_util.NCBI_TAXON_ID_HUMAN`` produce output.  For each such
    row the function emits one gene node, one ``in_taxon`` edge and one
    ``owl:sameAs`` edge per database cross-reference.

    Args:
        input_file_name: path of the file to read.
        test_mode: if true, stop after 10000 non-comment lines.

    Returns:
        dict with keys ``'nodes'`` and ``'edges'`` (lists of dicts).
    """
    nodes = []
    edges = []
    gene_ctr = 0

    # NOTE(review): this is the raw value returned by os.path.getmtime;
    # confirm whether make_node expects an already-formatted date here.
    update_date = os.path.getmtime(input_file_name)
    ontology_curie_id = NCBI_KB_CURIE_ID
    ens_kp_node = kg2_util.make_node(ontology_curie_id, NCBI_KB_URL,
                                     'NCBI Genes',
                                     kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                     update_date, ontology_curie_id)
    nodes.append(ens_kp_node)

    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                continue
            gene_ctr += 1
            if test_mode and gene_ctr > 10000:
                break
            # '-' marks a missing value in this file format.
            fields = [(field if field.strip() != '-' else None)
                      for field in line.rstrip("\n").split("\t")]
            [
                taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
                synonyms_str, db_xrefs, chromosome, map_location, description,
                type_of_gene, symbol_auth, full_name_auth, nomenc_status,
                other_desig, modify_date, feature_type
            ] = fields
            taxon_id_int = int(taxon_id_str)
            if taxon_id_int != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue

            node_synonyms = list()
            if synonyms_str is not None:
                node_synonyms += synonyms_str.split('|')
            if other_desig is not None:
                node_synonyms += other_desig.split('|')
            if symbol_auth is not None and symbol_auth != gene_symbol:
                node_synonyms = [symbol_auth] + node_synonyms
            node_synonyms = list(set(node_synonyms))

            # Prefer the authority-approved full name; fall back to the
            # free-text description.  Both may be missing, in which case
            # full_name stays None.
            full_name = full_name_auth
            if full_name is None:
                full_name = description

            # A row is a plain gene unless its type is "unknown", it has no
            # official nomenclature status, and its xrefs start with an
            # OMIM ("MIM:") entry.
            if type_of_gene != "unknown" or (db_xrefs is None) or \
               (not db_xrefs.startswith("MIM:")) or \
               nomenc_status is not None:
                category_label = kg2_util.BIOLINK_CATEGORY_GENE
            else:
                # Guard against a missing name: concatenating None would
                # raise TypeError.
                if full_name is not None:
                    full_name = 'Genetic locus associated with ' + full_name
                category_label = kg2_util.BIOLINK_CATEGORY_GENOMIC_ENTITY
            # Guard against a missing name: None has no startswith().
            if full_name is not None and full_name.startswith('microRNA'):
                category_label = kg2_util.BIOLINK_CATEGORY_MICRORNA

            node_dict = make_node(ncbi_gene_id, full_name, gene_symbol,
                                  modify_date, category_label, node_synonyms)
            node_curie_id = node_dict['id']

            # Description layout:
            #   [<description>; ]Type:<type>[; Locus:<loc>]; NameStatus:<tag>
            type_str = 'Type:' + type_of_gene
            node_description = ''
            if description is not None and description != full_name_auth:
                node_description = description + '; '
            node_description += type_str
            if nomenc_status is not None:
                nomenc_tag = 'official'
            else:
                nomenc_tag = 'unofficial'
            if map_location is not None:
                node_description += '; Locus:' + map_location
            node_description += '; NameStatus:' + nomenc_tag
            node_dict['description'] = node_description
            nodes.append(node_dict)

            org_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str
            predicate_label = 'in_taxon'
            edge_dict = kg2_util.make_edge_biolink(node_curie_id, org_curie,
                                                   predicate_label,
                                                   NCBI_KB_CURIE_ID,
                                                   modify_date)
            edges.append(edge_dict)

            if db_xrefs is not None:
                for xref_curie in db_xrefs.split('|'):
                    # Rewrite the source's prefixes into KG2 CURIE prefixes;
                    # unrecognised prefixes are passed through unchanged.
                    if xref_curie.startswith('HGNC:HGNC:'):
                        xref_curie = kg2_util.CURIE_PREFIX_HGNC + ':' + \
                            xref_curie.replace('HGNC:', '')
                    elif xref_curie.startswith('Ensembl:'):
                        xref_curie = xref_curie.upper()
                    elif xref_curie.startswith('MIM:'):
                        xref_curie = kg2_util.CURIE_PREFIX_OMIM + ':' + \
                            xref_curie.replace('MIM:', '')
                    elif xref_curie.startswith('miRBase:'):
                        xref_curie = kg2_util.CURIE_PREFIX_MIRBASE + ':' + \
                            xref_curie.replace('miRBase:', '')
                    edges.append(
                        kg2_util.make_edge(node_curie_id, xref_curie,
                                           kg2_util.CURIE_ID_OWL_SAME_AS,
                                           kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                           NCBI_KB_CURIE_ID, modify_date))
    return {'nodes': nodes, 'edges': edges}
Example #26
0
    arg_parser.add_argument('inputFile', type=str)
    arg_parser.add_argument('outputFile', type=str)
    return arg_parser


if __name__ == '__main__':
    args = make_arg_parser().parse_args()
    input_file_name = args.inputFile
    output_file_name = args.outputFile
    test_mode = args.test
    edges = []
    nodes = []

    file_update_date = kg2_util.convert_date(os.path.getmtime(args.inputFile))
    unichem_kp_node = kg2_util.make_node(UNICHEM_KB_CURIE, UNICHEM_KB_IRI,
                                         "UniChem database",
                                         kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                         file_update_date, UNICHEM_KB_CURIE)
    nodes.append(unichem_kp_node)

    update_date = None
    line_ctr = 0
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                update_date = line.split('# ')[1].rstrip()
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            (subject_curie_id, object_curie_id) = line.rstrip().split('\t')
            edges.append(
Example #27
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a KG2 graph from a 10-column tab-separated interactions file.

    A line starting with '#' supplies the update date (the line with every
    '#' removed).  The header line (starting with ``gene_name<TAB>``) is
    skipped.  Rows without an Entrez gene id are ignored.  The drug
    (subject) identifier is taken from the ChEMBL id column when present;
    otherwise, for the "GuideToPharmacologyInteractions" and "TTD" claim
    sources, an identifier is built from the drug claim name and a
    chemical-substance node is emitted for it.  Rows for which no subject
    identifier can be determined are reported on stderr and skipped.  One
    edge is emitted per comma-separated interaction type ("affects" when
    the column is empty).

    Args:
        input_file_name: path of the TSV file to read.
        test_mode: if true, stop after 10000 data rows.

    Returns:
        dict with keys ``'nodes'`` and ``'edges'`` (lists of dicts).
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_chembl_id,
             PMIDs] = fields
            if entrez_id == "":
                continue
            object_curie_id = 'NCBIGene:' + entrez_id

            # Reset per row.  Previously this was only assigned inside some
            # branches, so a row with neither a ChEMBL id nor a drug claim
            # name either raised UnboundLocalError (first row) or silently
            # reused the previous row's drug identifier.
            subject_curie_id = None
            if drug_chembl_id != "":
                subject_curie_id = 'CHEMBL.COMPOUND:' + drug_chembl_id
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == "GuideToPharmacologyInteractions":
                    subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    # The primary name may embed a publication reference;
                    # RE_PMID splits it into (name, publication).
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = GTPI_IRI_BASE + GTPI_LIGAND_SUFFIX + drug_claim_name
                    provided_by = GTPI_IRI_BASE
                elif interaction_claim_source == "TTD":
                    subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_IRI_BASE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(subject_curie_id,
                                                   node_iri,
                                                   node_name,
                                                   'chemical_substance',
                                                   update_date,
                                                   provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " + drug_claim_primary_name + "; source DB: " + interaction_claim_source, file=sys.stderr)
                continue

            if interaction_types == "":
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [('PMID:' + pmid.strip()) for pmid in PMIDs.split(',')]
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_BASE_IRI + '/' +
                                               kg2_util.convert_snake_case_to_camel_case(interaction),
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_BASE_IRI,
                                               update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes,
            'edges': edges}
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a KG2 graph from an 11-column tab-separated interactions file.

    A line starting with '#' supplies the update date (the line with every
    '#' removed).  The header line (starting with ``gene_name<TAB>``) is
    skipped.  Rows without an Entrez gene id are ignored.  When a drug
    concept id is present it must be a ChEMBL id (contain "chembl"); other
    concept ids are reported on stderr and the row is skipped.  Without a
    concept id, an identifier is built from the drug claim name for the
    GTPI and TTD claim sources and a chemical-substance node is emitted for
    it.  Rows for which no subject identifier can be determined are
    reported on stderr and skipped.  One edge is emitted per
    comma-separated interaction type ("affects" when the column is empty).

    Args:
        input_file_name: path of the TSV file to read.
        test_mode: if true, stop after 10000 data rows.

    Returns:
        dict with keys ``'nodes'`` and ``'edges'`` (lists of dicts).
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            fields = line.split("\t")
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_concept_id,
             _,  # 12.5.2020 new field in tsv: interaction group score
             PMIDs] = fields
            if entrez_id == "":
                continue

            # Reset per row.  Previously this was only assigned inside some
            # branches, so a row with neither a concept id nor a drug claim
            # name either raised UnboundLocalError (first row) or silently
            # reused the previous row's drug identifier.
            subject_curie_id = None
            if drug_concept_id != "":
                if "chembl" in drug_concept_id:
                    # Split on the first colon only so that an id containing
                    # further colons does not break the unpacking.
                    _, chembl_id = drug_concept_id.split(":", 1)
                    subject_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_COMPOUND + ':' + chembl_id
                else:
                    print(f"DGIDB: Skipping row with drug concept id {drug_concept_id}", file=sys.stderr)
                    continue  # skipping over wikidata nodes, see #1185
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == INTERACTION_CLAIM_SOURCE_GTPI:
                    subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    # The primary name may embed a publication reference;
                    # RE_PMID splits it into (name, publication).
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = GTPI_BASE_URL + drug_claim_name
                    provided_by = GTPI_KB_CURIE
                elif interaction_claim_source == INTERACTION_CLAIM_SOURCE_TTD:
                    subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_KB_CURIE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(subject_curie_id,
                                                   node_iri,
                                                   node_name,
                                                   kg2_util.BIOLINK_CATEGORY_CHEMICAL_SUBSTANCE,
                                                   update_date,
                                                   provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " + drug_claim_primary_name +
                      "; source DB: " + interaction_claim_source, file=sys.stderr)
                continue

            # Only rows that will produce edges need the gene identifier.
            object_curie_id = kg2_util.CURIE_PREFIX_NCBI_GENE + ':' + entrez_id
            if interaction_types == "":
                print("DGIDB: interaction type was empty. Setting to 'affects'.", file=sys.stderr)
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [(kg2_util.CURIE_PREFIX_PMID + ':' + pmid.strip()) for pmid in PMIDs.split(',')]
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_KB_CURIE,
                                               update_date)
                edge_dict['publications'] = pmids_list
                edges.append(edge_dict)
    return {'nodes': nodes,
            'edges': edges}
Example #29
0
                [data,
                 update_date] = make_node_and_edges(article,
                                                    mesh_predicate_label)
                for node in data["nodes"]:
                    nodes.append(node)
                for edge in data["edges"]:
                    edges.append(edge)
                if date_to_num(update_date) > latest_date:
                    latest_date = date_to_num(update_date)

    latest_date = {
        "Year": str(latest_date)[0:4],
        "Month": str(latest_date)[4:6],
        "Day": str(latest_date)[6:]
    }
    pmid_kp_node = kg2_util.make_node(PMID_PROVIDED_BY_CURIE_ID, PMID_KB_IRI,
                                      "PubMed",
                                      kg2_util.BIOLINK_CATEGORY_DATA_FILE,
                                      extract_date(latest_date),
                                      PMID_PROVIDED_BY_CURIE_ID)
    nodes.append(pmid_kp_node)
    print("Saving JSON:", date())
    kg2_util.save_json({
        "nodes": nodes,
        "edges": edges
    }, args.outputFile, args.test)
    print("Finished saving JSON:", date())
    del nodes
    del edges
    print("Script Finished:", date())
            else:
                edge = kg2_util.make_edge_biolink(
                    node_id, xref_id, kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO,
                    MIRBASE_KB_CURIE_ID, None)
                edges.append(edge)
    taxon_edge_count = 0
    for node_id in nodes_to_species:
        taxon_edge_count += 1
        if test_mode and taxon_edge_count > 1000:
            break
        taxon_edge = kg2_util.make_edge_biolink(
            node_id, nodes_to_species[node_id],
            kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON, MIRBASE_KB_CURIE_ID, None)
        edges.append(taxon_edge)
    return edges


if __name__ == '__main__':
    # Script entry point: parse the miRBase input file, build nodes and
    # edges from its entries, add a node describing the miRBase source
    # itself, and save the graph as JSON.
    args = get_args()
    with open(args.inputFile, 'r') as mirbase:
        entries = format_data(mirbase)

    kp_node = kg2_util.make_node(
        MIRBASE_KB_CURIE_ID,
        MIRBASE_KB_URL,
        'miRBase',
        kg2_util.BIOLINK_CATEGORY_DATA_FILE,
        None,
        MIRBASE_KB_CURIE_ID)

    # make_nodes also returns the cross-references and the node-to-species
    # mapping that make_edges consumes.
    nodes, xrefs, nodes_to_species = make_nodes(entries, args.test)
    nodes.append(kp_node)
    edges = make_edges(xrefs, nodes_to_species, args.test)

    graph = {'nodes': nodes, 'edges': edges}
    kg2_util.save_json(graph, args.outputFile, args.test)