def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Build a KG2 graph dictionary from a drug/indication CSV file.

    One edge is created per CSV row, linking the row's ``drug_id`` (prefixed
    with ``DRUGBANK_CURIE``) to its ``ind_id`` (prefixed with ``UMLS_CURIE``).
    The predicate is ``clinically_tested_<status>_<phase>``, where a missing
    status or phase is replaced by ``unknown_status`` / ``unknown_phase``.
    When the ``NCT`` column is populated, the trial is recorded in the edge's
    ``publications`` list and ``publications info`` mapping.

    Args:
        input_file_name: Path of the CSV file; read with ``pandas.read_csv``.
            The columns ``status``, ``phase``, ``drug_id``, ``ind_id`` and
            ``NCT`` are read for every row.
        test_mode: Accepted for signature compatibility; not used here.

    Returns:
        A dict ``{'nodes': [...], 'edges': [...]}``. ``nodes`` is always
        empty because this function only creates edges.
    """
    nodes = []
    edges = []
    df = pd.read_csv(input_file_name)
    # Walk the rows once and test each cell with the scalar pd.isna();
    # recomputing a whole-column null mask per row would be quadratic.
    for row in df.to_dict('records'):
        if pd.isna(row['status']):
            status = "unknown_status"
        else:
            status = row['status'].lower()
        if pd.isna(row['phase']):
            phase = "unknown_phase"
        else:
            phase = row['phase'].lower().replace(" ", "_").replace("/", "_or_")
        relation = "clinically_tested_" + status + "_" + phase
        edge_dict = kg2_util.make_edge(
            subject_id=DRUGBANK_CURIE + row['drug_id'],
            object_id=UMLS_CURIE + row['ind_id'],
            relation=REPODB_IRI + '#' +
            kg2_util.convert_snake_case_to_camel_case(relation),
            relation_curie=REPODB_CURIE + relation,
            predicate_label=relation,
            provided_by=REPODB_IRI,
            update_date=None)
        nct_id = row['NCT']
        if not pd.isna(nct_id):
            # Record the trial both as a publication CURIE and as a
            # CURIE -> trial IRI entry in the publications info mapping.
            nct_curie = NCT_CUTRIE + nct_id
            edge_dict['publications'].append(nct_curie)
            edge_dict['publications info'][nct_curie] = \
                CLINICALTRIALS_IRI + nct_id
        edges.append(edge_dict)
    return {'nodes': nodes, 'edges': edges}
def make_edge(subject_curie_id: str, object_curie_id: str,
              predicate_label: str, update_date: str):
    """Create an edge via ``kg2_util.make_edge`` for the given predicate.

    The relation IRI is ``kg2_util.BIOLINK_CATEGORY_BASE_IRI`` followed by
    the camel-cased predicate label; the relation CURIE is
    ``kg2_util.BIOLINK_CURIE_PREFIX``, a colon, and the predicate label with
    spaces replaced by underscores. ``UNIPROTKB_BASE_IRI`` is passed as the
    sixth positional argument of ``kg2_util.make_edge``.

    Returns:
        Whatever ``kg2_util.make_edge`` returns for these arguments.
    """
    iri_base = kg2_util.BIOLINK_CATEGORY_BASE_IRI
    relation_iri = iri_base + kg2_util.convert_snake_case_to_camel_case(
        predicate_label)

    curie_prefix = kg2_util.BIOLINK_CURIE_PREFIX
    underscored_label = predicate_label.replace(' ', '_')
    curie = curie_prefix + ':' + underscored_label

    return kg2_util.make_edge(subject_curie_id,
                              object_curie_id,
                              relation_iri,
                              curie,
                              predicate_label,
                              UNIPROTKB_BASE_IRI,
                              update_date)
Beispiel #3
0
def make_edge(subject_id: str,
              object_id: str,
              predicate_label: str,
              update_date: str = None,
              publications: list = None):
    """Assemble an edge dictionary for a single relationship.

    Args:
        subject_id: Identifier stored under ``'subject'``.
        object_id: Identifier stored under ``'object'``.
        predicate_label: Stored as ``'edge label'``; also used to derive the
            relation IRI (camel-cased, appended to
            ``CHEMBL_BASE_IRI_PREDICATE``) and the relation CURIE
            (``'CHEMBL:'`` + label).
        update_date: Stored unchanged under ``'update date'``.
        publications: Stored under ``'publications'``; a new empty list is
            used when ``None`` is given.

    Returns:
        A dict describing the edge; ``'negated'`` is always ``False`` and
        ``'publications info'`` always starts as an empty dict.
    """
    camel_label = kg2_util.convert_snake_case_to_camel_case(predicate_label)
    relation_iri = CHEMBL_BASE_IRI_PREDICATE + camel_label
    pubs = [] if publications is None else publications

    edge = {}
    edge['subject'] = subject_id
    edge['object'] = object_id
    edge['edge label'] = predicate_label
    edge['relation'] = relation_iri
    edge['relation curie'] = 'CHEMBL:' + predicate_label
    edge['negated'] = False
    edge['publications'] = pubs
    edge['publications info'] = {}
    edge['update date'] = update_date
    edge['provided by'] = CHEMBL_KB_IRI
    return edge
Beispiel #4
0
def make_rel(preds_dict: dict, subject_curie: str, object_curie: str,
             predicate: str, pmid: str, pub_date: str, sentence: str,
             subject_score: str, object_score: str, negated: bool):
    """Add one publication-backed assertion to ``preds_dict`` in place.

    Edges are keyed by ``"<subject>-<predicate>-<object>"``. If the key is
    already present, the publication (``'PMID:' + pmid``) is added to that
    edge's ``'publications info'`` mapping and ``'publications'`` list.
    Otherwise a new edge is created with ``kg2_util.make_edge`` and stored
    under the key with this publication as its only one.

    For a new edge the lower-cased predicate is the edge label. The
    predicate ``xref`` maps to the CURIE ``OBO:xref`` (IRI expanded through
    ``prefixcommons``); any other predicate gets a ``SEMMEDDB:`` CURIE and a
    lower-camel-case fragment appended to ``SEMMEDDB_IRI``.

    Returns:
        None; ``preds_dict`` is modified.
    """
    edge_key = '-'.join((subject_curie, predicate, object_curie))
    existing_edge = preds_dict.get(edge_key)
    pub_curie = 'PMID:' + pmid
    pub_info = {
        'publication date': pub_date,
        'sentence': sentence,
        'subject score': subject_score,
        'object score': object_score
    }

    if existing_edge is not None:
        # Known edge: only attach the additional publication.
        existing_edge['publications info'][pub_curie] = pub_info
        existing_edge['publications'] = \
            existing_edge['publications'] + [pub_curie]
        return

    label = predicate.lower()
    if label == 'xref':
        curie = 'OBO:xref'
        iri = prefixcommons.expand_uri(curie)
    else:
        camel = kg2_util.convert_snake_case_to_camel_case(
            label.replace(' ', '_'))
        # Force the first character to lower case (lowerCamelCase).
        lower_camel = camel[0].lower() + camel[1:]
        iri = SEMMEDDB_IRI + '#' + lower_camel
        curie = 'SEMMEDDB:' + label

    new_edge = kg2_util.make_edge(subject_curie, object_curie,
                                  iri, curie,
                                  label, SEMMEDDB_IRI,
                                  curr_timestamp)
    new_edge['publications'] = [pub_curie]
    new_edge['publications info'] = {pub_curie: pub_info}
    new_edge['negated'] = negated
    preds_dict[edge_key] = new_edge
def make_kg2_graph(input_file_name: str, test_mode: bool = False):
    """Convert a tab-separated drug-gene interaction file into a KG2 graph.

    Input handling:
      * a line starting with ``#`` sets the update date (the line with all
        ``#`` characters removed) used for subsequently created nodes/edges;
      * the header line (starting with ``gene_name`` and a tab) is skipped;
      * every other line must contain exactly ten tab-separated fields,
        otherwise the unpacking raises ``ValueError``.

    Rows without an Entrez gene ID are ignored. The edge subject is the
    ChEMBL compound when a ChEMBL ID is present; otherwise, for the claim
    sources ``GuideToPharmacologyInteractions`` and ``TTD``, a node is
    created from the drug claim name and used as the subject. Rows for which
    no subject identifier can be determined are reported on stderr and
    skipped. One edge is emitted per comma-separated interaction type
    (``affects`` when none is given).

    Args:
        input_file_name: Path of the TSV file to read.
        test_mode: When true, stop after 10000 data rows.

    Returns:
        A dict ``{'nodes': [...], 'edges': [...]}``.
    """
    nodes = []
    edges = []
    line_ctr = 0
    update_date = None
    with open(input_file_name, 'r') as input_file:
        for line in input_file:
            line = line.rstrip("\n")
            if line.startswith('#'):
                update_date = line.replace('#', '')
                continue
            if line.startswith('gene_name\t'):
                continue
            line_ctr += 1
            if test_mode and line_ctr > 10000:
                break
            [gene_name,
             gene_claim_name,
             entrez_id,
             interaction_claim_source,
             interaction_types,
             drug_claim_name,
             drug_claim_primary_name,
             drug_name,
             drug_chembl_id,
             PMIDs] = line.split("\t")
            if entrez_id == "":
                continue
            object_curie_id = 'NCBIGene:' + entrez_id

            # Reset per row: without this, a row lacking both a ChEMBL ID and
            # a drug claim name would silently reuse the previous row's
            # subject (or hit an unbound local on the first such row).
            subject_curie_id = None
            if drug_chembl_id != "":
                subject_curie_id = 'CHEMBL.COMPOUND:' + drug_chembl_id
            elif drug_claim_name != "":
                node_pubs_list = []
                if interaction_claim_source == "GuideToPharmacologyInteractions":
                    subject_curie_id = GTPI_CURIE_PREFIX + ':' + drug_claim_name
                    pmid_match = RE_PMID.match(drug_claim_primary_name)
                    if pmid_match is not None:
                        # Group 1 is used as the node name, group 2 as its
                        # single publication (spaces removed).
                        node_pubs_list = [pmid_match[2].replace(' ', '').strip()]
                        node_name = pmid_match[1].strip()
                    else:
                        node_name = drug_claim_primary_name
                    node_iri = GTPI_IRI_BASE + GTPI_LIGAND_SUFFIX + drug_claim_name
                    provided_by = GTPI_IRI_BASE
                elif interaction_claim_source == "TTD":
                    subject_curie_id = TTD_CURIE_PREFIX + ':' + drug_claim_name
                    node_name = drug_claim_primary_name
                    node_iri = TTD_IRI_BASE + drug_claim_name
                    provided_by = TTD_IRI_BASE
                if subject_curie_id is not None:
                    node_dict = kg2_util.make_node(subject_curie_id,
                                                   node_iri,
                                                   node_name,
                                                   'chemical_substance',
                                                   update_date,
                                                   provided_by)
                    node_dict['publications'] = node_pubs_list
                    nodes.append(node_dict)

            if subject_curie_id is None:
                print("DGIDB: no controlled ID was provided for this drug: " + drug_claim_primary_name + "; source DB: " + interaction_claim_source, file=sys.stderr)
                continue

            if interaction_types == "":
                interaction_types = "affects"
            pmids_list = []
            if PMIDs.strip() != "":
                pmids_list = [('PMID:' + pmid.strip()) for pmid in PMIDs.split(',')]
            for interaction in interaction_types.split(','):
                interaction = interaction.replace(' ', '_')
                edge_dict = kg2_util.make_edge(subject_curie_id,
                                               object_curie_id,
                                               DGIDB_BASE_IRI + '/' +
                                               kg2_util.convert_snake_case_to_camel_case(interaction),
                                               DGIDB_CURIE_PREFIX + ':' + interaction,
                                               interaction,
                                               DGIDB_BASE_IRI,
                                               update_date)
                # Each edge gets its own list so that later edits to one
                # edge's publications cannot leak into its siblings.
                edge_dict['publications'] = list(pmids_list)
                edges.append(edge_dict)
    return {'nodes': nodes,
            'edges': edges}
Beispiel #6
0
        # Create the new edge(s) based on this SemMedDB row
        for rel_to_make in get_rels_to_make_for_row(subject_cui_str,
                                                    object_cui_str, predicate):
            subject_curie = rel_to_make[0]
            object_curie = rel_to_make[1]
            edge_label = rel_to_make[2]
            # Exclude self-edges for certain types of predicates
            if subject_curie != object_curie or edge_label.lower(
            ) not in EDGE_LABELS_EXCLUDE_FOR_LOOPS:
                make_rel(edges_dict, subject_curie, object_curie, edge_label,
                         pmid, pub_date, sentence, subject_score, object_score,
                         negated)

        if predicate not in nodes_dict:
            relation_iri = kg2_util.convert_snake_case_to_camel_case(
                predicate.lower().replace(' ', '_'))
            relation_iri = SEMMEDDB_IRI + '#' + relation_iri
            nodes_dict[predicate] = kg2_util.make_node(
                id='SEMMEDDB:' + predicate.lower(),
                iri=relation_iri,
                name=predicate.lower(),
                category_label="relationship type",
                update_date=curr_timestamp,
                provided_by=SEMMEDDB_IRI)
    out_graph = {
        'edges': [rel_dict for rel_dict in edges_dict.values()],
        'nodes': [node_dict for node_dict in nodes_dict.values()]
    }
    for rel_dict in out_graph['edges']:
        if len(rel_dict['publications']) > 1:
            rel_dict['publications'] = list(set(rel_dict['publications']))