def make_edges(xrefs, nodes_to_species, test_mode):
    """Build cross-reference and taxon edges, attributed to
    MIRBASE_KB_CURIE_ID.

    :param xrefs: mapping of node ID to an iterable of cross-reference IDs
    :param nodes_to_species: mapping of node ID to the ID used as the object
        of that node's in-taxon edge
    :param test_mode: when true, only the first 1000 keys of each mapping
        are processed
    :returns: list of edges; cross-reference edges first, taxon edges after
    """
    max_nodes_in_test_mode = 1000
    result = []

    for position, source_id in enumerate(xrefs, start=1):
        if test_mode and position > max_nodes_in_test_mode:
            break
        for target_id in xrefs[source_id]:
            # HGNC and NCBI Gene cross-references are treated as identity
            # links; every other cross-reference is only "related to".
            if target_id.startswith((CURIE_PREFIX_HGNC,
                                     CURIE_PREFIX_NCBI_GENE)):
                label = kg2_util.EDGE_LABEL_BIOLINK_SAME_AS
            else:
                label = kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO
            result.append(
                kg2_util.make_edge_biolink(source_id, target_id, label,
                                           MIRBASE_KB_CURIE_ID, None))

    for position, source_id in enumerate(nodes_to_species, start=1):
        if test_mode and position > max_nodes_in_test_mode:
            break
        result.append(
            kg2_util.make_edge_biolink(source_id,
                                       nodes_to_species[source_id],
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       MIRBASE_KB_CURIE_ID, None))
    return result
Ejemplo n.º 2
0
def format_same_as_edge(kegg_id, external_id, update_date):
    """Return a same-as edge from a KEGG ID to an external ID.

    The subject is ``format_id(kegg_id)``; ``external_id`` is used as the
    object unchanged. The edge is attributed to KEGG_PROVIDED_BY and stamped
    with ``update_date``.
    """
    subject_curie = format_id(kegg_id)
    return kg2_util.make_edge_biolink(
        subject_curie, external_id, kg2_util.EDGE_LABEL_BIOLINK_SAME_AS,
        KEGG_PROVIDED_BY, update_date)
Ejemplo n.º 3
0
def make_edges(input_file: str, test_mode: bool):
    """Read a tab-separated association file and build gene-to-condition
    edges.

    The first row is treated as a header and skipped. Every following row
    must have exactly 15 columns; rows whose last column (the source) is
    ``'BEFREE'`` are ignored. Each remaining row yields one edge whose
    ``publications`` list holds the row's PMID as a CURIE.

    :param input_file: path of the TSV file to read
    :param test_mode: when true, stop once TEST_MODE_LIMIT non-BEFREE rows
        have been turned into edges
    :returns: list of edge dicts
    :raises ValueError: if a data row does not have exactly 15 columns
    """
    edges = []
    non_befree_count = 0
    # The csv module requires files to be opened with newline='' so that it
    # does its own line-ending handling (matters for quoted fields that
    # contain line breaks).
    with open(input_file, 'r', newline='') as input_tsv:
        tsvreader = csv.reader(input_tsv, delimiter='\t')
        next(tsvreader, None)  # skip the header row (no-op on empty file)
        for line in tsvreader:
            if test_mode and non_befree_count >= TEST_MODE_LIMIT:
                break
            (subject_id, _, _, _, object_id, _, _, _, _, _score,
             _evidence_score, _created_date, update_date, pmid,
             source) = line
            if source == 'BEFREE':
                continue
            non_befree_count += 1
            subject_id = format_id(subject_id,
                                   kg2_util.CURIE_PREFIX_NCBI_GENE)
            object_id = format_id(object_id, kg2_util.CURIE_PREFIX_UMLS)
            predicate = \
                kg2_util.EDGE_LABEL_BIOLINK_GENE_ASSOCIATED_WITH_CONDITION
            edge = kg2_util.make_edge_biolink(subject_id, object_id,
                                              predicate, DISGENET_KB_CURIE,
                                              update_date)
            edge['publications'] = [kg2_util.CURIE_PREFIX_PMID + ':' + pmid]
            edges.append(edge)
    return edges
Ejemplo n.º 4
0
def make_edges(records: list, nodes_dict: dict):
    """Build the edges for a set of UniProtKB records and their nodes.

    For every record this emits an in-taxon edge, plus one
    has-gene-product edge for each 'DR' cross-reference line matched by
    REGEX_HGNC or REGEX_NCBIGeneID (gene as subject, protein as object).
    Then, for every node in ``nodes_dict``, one physically-interacts-with
    edge is emitted per entry of the node's ``'xrefs'`` collection.

    Side effect: the ``'xrefs'`` key is deleted from every node dict in
    ``nodes_dict``.

    :param records: list of record dicts; each needs 'AC' (first element is
        the accession) and 'organism', and may have 'DR'
    :param nodes_dict: mapping of node CURIE to node dict; each node dict
        must have an 'xrefs' key, and the node of every record must have
        'update_date'
    :returns: list of edges
    """
    ret_list = []
    for record_dict in records:
        accession = record_dict['AC'][0]
        curie_id = kg2_util.CURIE_PREFIX_UNIPROT + ':' + accession
        organism_int = record_dict['organism']
        update_date = nodes_dict[curie_id]['update_date']
        ret_list.append(
            kg2_util.make_edge_biolink(
                curie_id,
                kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + str(organism_int),
                kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                UNIPROTKB_PROVIDED_BY_CURIE_ID, update_date))
        record_xrefs = record_dict.get('DR')
        if record_xrefs is None:
            continue
        for xref_str in record_xrefs:
            hgnc_match = REGEX_HGNC.match(xref_str)
            if hgnc_match is not None:
                ret_list.append(
                    kg2_util.make_edge_biolink(
                        hgnc_match[1], curie_id,
                        kg2_util.EDGE_LABEL_BIOLINK_HAS_GENE_PRODUCT,
                        UNIPROTKB_PROVIDED_BY_CURIE_ID, update_date))
            gene_id_match = REGEX_NCBIGeneID.match(xref_str)
            if gene_id_match is not None:
                ncbi_curie = kg2_util.CURIE_PREFIX_NCBI_GENE + \
                    ':' + gene_id_match[1]
                ret_list.append(
                    kg2_util.make_edge_biolink(
                        ncbi_curie, curie_id,
                        kg2_util.EDGE_LABEL_BIOLINK_HAS_GENE_PRODUCT,
                        UNIPROTKB_PROVIDED_BY_CURIE_ID, update_date))

    for node_id, node_dict in nodes_dict.items():
        xrefs = node_dict['xrefs']
        if xrefs:
            # Use this node's own update date. Previously the value left
            # over from the last record of the loop above was reused here,
            # which stamped every interaction edge with an unrelated date
            # (and failed outright when `records` was empty).
            node_update_date = node_dict.get('update_date')
            # sorted() keeps the edge order deterministic for set-like xrefs
            for xref_curie in sorted(xrefs):
                ret_list.append(
                    kg2_util.make_edge_biolink(
                        node_id, xref_curie,
                        kg2_util.EDGE_LABEL_BIOLINK_PHYSICALLY_INTERACTS_WITH,
                        UNIPROTKB_PROVIDED_BY_CURIE_ID, node_update_date))
        del node_dict['xrefs']
    return ret_list
Ejemplo n.º 5
0
def format_edge(subject_id: str, object_id: str, predicate_label: str):
    """Return an edge between two IDs, attributed to REACTOME_KB_CURIE_ID.

    A same-as predicate is built with ``kg2_util.make_edge_biolink`` (with
    no update date); any other predicate is built with
    ``kg2_util.make_edge`` using a relation CURIE derived from the label
    and REACTOME_RELATION_CURIE_PREFIX.
    """
    source_relation = kg2_util.predicate_label_to_curie(
        predicate_label, REACTOME_RELATION_CURIE_PREFIX)
    if predicate_label != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        return kg2_util.make_edge(subject_id, object_id, source_relation,
                                  predicate_label, REACTOME_KB_CURIE_ID)
    return kg2_util.make_edge_biolink(subject_id, object_id, predicate_label,
                                      REACTOME_KB_CURIE_ID, None)
Ejemplo n.º 6
0
def format_edge(subject_id, object_id, predicate, update_date):
    """Return an edge between two IDs, attributed to DRUGCENTRAL_SOURCE.

    A same-as predicate is built with ``kg2_util.make_edge_biolink``; any
    other predicate is built with ``kg2_util.make_edge`` using a relation
    CURIE derived from the predicate and DRUGCENTRAL_RELATION_CURIE_PREFIX.
    """
    source_relation = kg2_util.predicate_label_to_curie(
        predicate, DRUGCENTRAL_RELATION_CURIE_PREFIX)
    if predicate != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        return kg2_util.make_edge(subject_id, object_id, source_relation,
                                  predicate, DRUGCENTRAL_SOURCE, update_date)
    return kg2_util.make_edge_biolink(subject_id, object_id, predicate,
                                      DRUGCENTRAL_SOURCE, update_date)
Ejemplo n.º 7
0
def make_node_and_edges(article: dict, mesh_predicate_label: str):
    """Build the publication node and MeSH edges for one article record.

    The article's PMID CURIE is looked up in the module-level ``pmids``
    collection; when it is absent no node or edge is produced. Otherwise
    one publication node is created and one edge per MeSH heading is added
    from the publication to the heading's descriptor.

    :param article: parsed article record; must contain "MedlineCitation"
        with "PMID" and "DateRevised" (and "Article" when the PMID is kept)
    :param mesh_predicate_label: predicate label used for the
        publication-to-MeSH edges
    :returns: two-item list ``[{"nodes": [...], "edges": [...]},
        update_date]``
    """
    nodes = []
    edges = []

    article_citation = article["MedlineCitation"]
    pmid_text = article_citation["PMID"]["#text"]
    pmid = kg2_util.CURIE_PREFIX_PMID + ":" + pmid_text
    update_date = extract_date(article_citation["DateRevised"])

    if pmid not in pmids:
        return [{"nodes": nodes, "edges": edges}, update_date]

    # Authors and journal are intentionally not extracted yet; the original
    # author notes helpers were written and tested for that purpose:
    #authors = get_authors(article_citation)
    #journal = get_journal(article_citation)

    name = article_citation["Article"]["ArticleTitle"]
    if isinstance(name, dict):
        try:
            name = name["#text"]
        except KeyError:
            # No direct text on the title element: fall back to the text of
            # its children (the last child visited wins).
            temp_name = name
            for key in temp_name:
                name = temp_name[key]["#text"]

    # The article date is optional; any failure to read it leaves the
    # creation date unset rather than dropping the node.
    try:
        created_date = extract_date(
            article_citation["Article"]["ArticleDate"])
    except Exception:
        created_date = None

    iri = PMID_BASE_IRI + pmid_text

    node = kg2_util.make_node(pmid, iri, name,
                              BIOLINK_CATEGORY_PUBLICATION, update_date,
                              PMID_PROVIDED_BY_CURIE_ID)
    node["creation_date"] = created_date
    nodes.append(node)

    # MeSH headings are optional and best-effort: on any error, keep
    # whatever edges were built so far. `except Exception` (rather than a
    # bare except) lets KeyboardInterrupt/SystemExit propagate.
    try:
        mesh_headings = article_citation["MeshHeadingList"]["MeshHeading"]
        # NOTE(review): presumably the record comes from an xmltodict-style
        # parse, where a lone <MeshHeading> is a dict instead of a
        # one-element list; wrap it so it is not iterated key by key.
        if isinstance(mesh_headings, dict):
            mesh_headings = [mesh_headings]
        for mesh_topic in mesh_headings:
            mesh_id = kg2_util.CURIE_PREFIX_MESH + ":" + \
                      mesh_topic["DescriptorName"]["@UI"]
            edges.append(
                kg2_util.make_edge_biolink(pmid, mesh_id,
                                           mesh_predicate_label,
                                           PMID_PROVIDED_BY_CURIE_ID,
                                           update_date))
    except Exception:
        pass

    return [{"nodes": nodes, "edges": edges}, update_date]
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a JSON file of gene records into a nodes/edges graph dict.

    The update date for everything produced is the second '/'-separated
    part of the file's 'genebuild' string. The first node describes the
    knowledge source itself (ENSEMBL_KB_CURIE_ID); each gene then adds one
    node, one in-taxon edge, and one owl:sameAs edge per entry of its
    optional 'HGNC' list.

    :param input_file_name: path passed to ``kg2_util.load_json``
    :param test_mode: when true, only the first 10000 genes are processed
    :returns: ``{'nodes': [...], 'edges': [...]}``
    :raises AssertionError: if a gene's 'taxon_id' is not 9606
    """
    ensembl_data = kg2_util.load_json(input_file_name)
    update_date = ensembl_data['genebuild'].split('/')[1]

    kb_curie_id = ENSEMBL_KB_CURIE_ID
    nodes = [
        kg2_util.make_node(kb_curie_id, ENSEMBL_KB_URI, 'Ensembl Genes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date,
                           kb_curie_id)
    ]
    edges = []

    for gene_number, gene_dict in enumerate(ensembl_data['genes'], start=1):
        if test_mode and gene_number > 10000:
            break
        ensembl_gene_id = gene_dict['id']
        description = gene_dict.get('description')
        gene_symbol = gene_dict.get('name')

        # Synonyms are the distinct primary IDs of the cross-references,
        # excluding the gene's own ID.
        xrefs = gene_dict.get('xrefs')
        if xrefs is None:
            other_synonyms = []
        else:
            primary_ids = [
                xref['primary_id'] for xref in xrefs
                if xref['primary_id'] != ensembl_gene_id
            ]
            other_synonyms = list(set(primary_ids))

        node_dict = make_node(ensembl_gene_id, description, gene_symbol,
                              update_date, other_synonyms)
        nodes.append(node_dict)
        ensembl_gene_curie_id = node_dict['id']

        taxon_id_int = gene_dict.get('taxon_id')
        assert taxon_id_int == 9606, "unexpected taxon ID"
        taxon_curie = kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + \
            str(taxon_id_int)
        edges.append(
            kg2_util.make_edge_biolink(ensembl_gene_curie_id, taxon_curie,
                                       kg2_util.EDGE_LABEL_BIOLINK_IN_TAXON,
                                       ENSEMBL_KB_CURIE_ID, update_date))

        hgnc_list = gene_dict.get('HGNC')
        if hgnc_list is None:
            continue
        for hgnc_curie in hgnc_list:
            edges.append(
                kg2_util.make_edge(ensembl_gene_curie_id, hgnc_curie,
                                   kg2_util.CURIE_ID_OWL_SAME_AS,
                                   kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                   ENSEMBL_KB_CURIE_ID, update_date))
    return {'nodes': nodes, 'edges': edges}
Ejemplo n.º 9
0
def make_hmdb_edge(subject_id: str, object_id: str, subject_prefix: str,
                   object_prefix: str, predicate_label: str, update_date: str,
                   publications_info: dict):
    """Return an edge attributed to HMDB_PROVIDED_BY_CURIE_ID.

    The subject is ``subject_prefix:subject_id``. The object is
    ``object_prefix:object_id``, or ``object_id`` unchanged when
    ``object_prefix`` is None. A same-as predicate is built with
    ``kg2_util.make_edge_biolink``; any other predicate is built with
    ``kg2_util.make_edge`` using a relation CURIE derived from the label
    and CURIE_PREFIX_HMDB. ``publications_info`` is stored on the edge
    under the "publications_info" key.
    """
    source_relation = kg2_util.predicate_label_to_curie(predicate_label,
                                                        CURIE_PREFIX_HMDB)
    subject_curie = subject_prefix + ":" + subject_id
    if object_prefix is None:
        object_curie = object_id
    else:
        object_curie = object_prefix + ":" + object_id

    if predicate_label != kg2_util.EDGE_LABEL_BIOLINK_SAME_AS:
        edge = kg2_util.make_edge(subject_curie, object_curie,
                                  source_relation, predicate_label,
                                  HMDB_PROVIDED_BY_CURIE_ID, update_date)
    else:
        edge = kg2_util.make_edge_biolink(subject_curie, object_curie,
                                          predicate_label,
                                          HMDB_PROVIDED_BY_CURIE_ID,
                                          update_date)
    edge["publications_info"] = publications_info
    return edge
Ejemplo n.º 10
0
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a tab-separated gene file into a nodes/edges graph dict.

    Lines starting with '#' are skipped. Every other line must have exactly
    16 tab-separated fields; a field that is '-' (ignoring surrounding
    whitespace) is treated as missing. Rows whose taxon ID differs from
    ``kg2_util.NCBI_TAXON_ID_HUMAN`` are skipped. Each kept row yields one
    gene node, one in-taxon edge, and one owl:sameAs edge per
    '|'-separated database cross-reference.

    The first node describes the knowledge source (NCBI_KB_CURIE_ID) and
    uses the input file's modification time as its update date.

    :param input_file_name: path of the file to read
    :param test_mode: when true, stop after 10000 non-comment lines
    :returns: ``{'nodes': [...], 'edges': [...]}``
    """
    update_date = os.path.getmtime(input_file_name)
    kb_curie_id = NCBI_KB_CURIE_ID
    nodes = [
        kg2_util.make_node(kb_curie_id, NCBI_KB_URL, 'NCBI Genes',
                           kg2_util.BIOLINK_CATEGORY_DATA_FILE, update_date,
                           kb_curie_id)
    ]
    edges = []
    gene_ctr = 0

    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            if line.startswith('#'):
                continue
            gene_ctr += 1
            if test_mode and gene_ctr > 10000:
                break

            raw_values = line.rstrip("\n").split("\t")
            (taxon_id_str, ncbi_gene_id, gene_symbol, locus_tag,
             synonyms_str, db_xrefs, chromosome, map_location, description,
             type_of_gene, symbol_auth, full_name_auth, nomenc_status,
             other_desig, modify_date, feature_type) = [
                 None if value.strip() == '-' else value
                 for value in raw_values
             ]

            if int(taxon_id_str) != kg2_util.NCBI_TAXON_ID_HUMAN:
                # skip neanderthal- and denisovan-specific genes
                continue

            # Collect candidate synonyms (authority symbol first, then the
            # listed synonyms and other designations) and de-duplicate.
            synonym_candidates = []
            if symbol_auth is not None and symbol_auth != gene_symbol:
                synonym_candidates.append(symbol_auth)
            if synonyms_str is not None:
                synonym_candidates.extend(synonyms_str.split('|'))
            if other_desig is not None:
                synonym_candidates.extend(other_desig.split('|'))
            node_synonyms = list(set(synonym_candidates))

            if full_name_auth is None:
                full_name = description
            else:
                full_name = full_name_auth

            # A row is a generic genomic entity only when its type is
            # unknown, its cross-references start with a MIM entry, and it
            # has no nomenclature status; everything else is a gene.
            is_mim_locus = (type_of_gene == "unknown"
                            and db_xrefs is not None
                            and db_xrefs.startswith("MIM:")
                            and nomenc_status is None)
            if is_mim_locus:
                full_name = 'Genetic locus associated with ' + full_name
                category_label = kg2_util.BIOLINK_CATEGORY_GENOMIC_ENTITY
            else:
                category_label = kg2_util.BIOLINK_CATEGORY_GENE
            if full_name.startswith('microRNA'):
                category_label = kg2_util.BIOLINK_CATEGORY_MICRORNA

            node_dict = make_node(ncbi_gene_id, full_name, gene_symbol,
                                  modify_date, category_label, node_synonyms)
            node_curie_id = node_dict['id']

            # Description is a '; '-separated summary of the row.
            description_parts = []
            if description is not None and description != full_name_auth:
                description_parts.append(description)
            description_parts.append('Type:' + type_of_gene)
            if map_location is not None:
                description_parts.append('Locus:' + map_location)
            if nomenc_status is None:
                description_parts.append('NameStatus:' + 'unofficial')
            else:
                description_parts.append('NameStatus:' + 'official')
            node_dict['description'] = '; '.join(description_parts)
            nodes.append(node_dict)

            edges.append(
                kg2_util.make_edge_biolink(
                    node_curie_id,
                    kg2_util.CURIE_PREFIX_NCBI_TAXON + ':' + taxon_id_str,
                    'in_taxon', NCBI_KB_CURIE_ID, modify_date))

            if db_xrefs is None:
                continue
            for xref_curie in db_xrefs.split('|'):
                # Rewrite the source's prefixes to the CURIE prefixes used
                # by kg2_util; unrecognized prefixes pass through unchanged.
                if xref_curie.startswith('HGNC:HGNC:'):
                    local_id = xref_curie.replace('HGNC:', '')
                    xref_curie = kg2_util.CURIE_PREFIX_HGNC + ':' + local_id
                elif xref_curie.startswith('Ensembl:'):
                    xref_curie = xref_curie.upper()
                elif xref_curie.startswith('MIM:'):
                    local_id = xref_curie.replace('MIM:', '')
                    xref_curie = kg2_util.CURIE_PREFIX_OMIM + ':' + local_id
                elif xref_curie.startswith('miRBase:'):
                    local_id = xref_curie.replace('miRBase:', '')
                    xref_curie = kg2_util.CURIE_PREFIX_MIRBASE + ':' + \
                        local_id
                edges.append(
                    kg2_util.make_edge(node_curie_id, xref_curie,
                                       kg2_util.CURIE_ID_OWL_SAME_AS,
                                       kg2_util.EDGE_LABEL_OWL_SAME_AS,
                                       NCBI_KB_CURIE_ID, modify_date))
    return {'nodes': nodes, 'edges': edges}
Ejemplo n.º 11
0
        cursor.execute(sql)
        results = cursor.fetchall()
    for (action_type, description, parent_type) in results:
        name = action_type.lower()
        predicate_label = name.replace(' ', '_')
        curie_id = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + predicate_label
        node_dict = make_node(curie_id,
                              CHEMBL_BASE_IRI_PREDICATE + predicate_label,
                              name,
                              kg2_util.BIOLINK_CATEGORY_RELATIONSHIP_TYPE,
                              description, [], [], update_date)
        nodes.append(node_dict)
        parent_label = parent_type.lower().replace(' ', '_')
        parent_curie_id = kg2_util.CURIE_PREFIX_CHEMBL_MECHANISM + ':' + parent_label
        new_edge = kg2_util.make_edge_biolink(
            curie_id, parent_curie_id, kg2_util.EDGE_LABEL_BIOLINK_SUBCLASS_OF,
            CHEMBL_KB_CURIE_ID, update_date)
        edges.append(new_edge)

# get target-to-target subset_of relationships

    sql = '''select distinct
             t1.chembl_id,
             target_relations.relationship,
             t2.chembl_id
             from
             (target_dictionary as t1 inner join
             target_relations on t1.tid = target_relations.tid) inner join
             target_dictionary as t2 on t2.tid = target_relations.related_tid'''
    if test_mode:
        sql += str_sql_row_limit_test_mode