def import_proteins(gene_level=False):
    """Convert the SMPDB protein CSV dump into Atomese (.scm) files.

    Downloads ``smpdb_proteins.csv.zip`` into ``raw_data/`` when it is not
    already listed there, extracts it to ``raw_data/smpdb_prot`` and, for
    every row of every extracted CSV, writes to
    ``dataset/smpdb_protein_<date>.scm``:

    * a member link between the gene and the SMPDB pathway,
    * an ``expresses`` link between the gene and the protein,
    * a ``has_name`` link for each pathway and each protein, the first time
      it is seen.

    Args:
        gene_level: when true, the gene/pathway member links are also written
            to ``gene-level/smpdb_gene_<date>.scm``.

    A row is abandoned (and reported on stdout) as soon as building or
    printing one of its atoms raises ``AttributeError``.  The number of
    genes, proteins and pathways seen is recorded with
    ``metadata.update_meta``.
    """
    source = "http://smpdb.ca/downloads/smpdb_proteins.csv.zip"
    # Only membership tests and the final sizes are needed, so sets replace
    # the original lists (constant-time lookups instead of linear scans).
    pathways = set()
    proteins = set()
    genes = set()

    if "smpdb_proteins.csv.zip" not in os.listdir("raw_data/"):
        print("Started downloading smpdb_proteins.csv, It will take some time to download \n")
        wget.download(source, "raw_data")

    with ZipFile("raw_data/smpdb_proteins.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_prot")
    pathway_prot = os.listdir("raw_data/smpdb_prot")

    print("Started importing {} files of smpdb_proteins".format(len(pathway_prot)))

    today = str(date.today())
    os.makedirs("dataset", exist_ok=True)
    g = None
    if gene_level:
        # Same safeguard import_metabolites applies before opening its file.
        os.makedirs("gene-level", exist_ok=True)
        g = open("gene-level/smpdb_gene_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_protein_{}.scm".format(today), 'w') as f:
            for filename in pathway_prot:
                data = pd.read_csv("raw_data/smpdb_prot/" + filename, low_memory=False)
                # iterrows already yields the row; no second iloc lookup needed.
                for _, row in data.iterrows():
                    protein = filter_nan(str(row['Uniprot ID']).split(".")[0].strip())
                    protein_name = filter_nan(str(row['Protein Name']).strip())
                    gene = filter_nan(str(row['Gene Name']).upper().strip())
                    smpdb_id = filter_nan(str(row['SMPDB ID']).strip())
                    smpdb_name = filter_nan(str(row['Pathway Name']).strip())
                    try:
                        member = CMemberLink(CGeneNode(gene), SMPNode(smpdb_id))
                        f.write(member.recursive_print() + "\n")
                        expression = CEvaluationLink(
                            CPredicateNode("expresses"),
                            CListLink(CGeneNode(gene), ProteinNode(protein)))
                        f.write(expression.recursive_print() + "\n")
                        if g is not None:
                            g.write(member.recursive_print() + "\n")
                        if smpdb_id not in pathways:
                            smp_name = CEvaluationLink(
                                CPredicateNode("has_name"),
                                CListLink(SMPNode(smpdb_id), CConceptNode(smpdb_name)))
                            f.write(smp_name.recursive_print() + "\n")
                            pathways.add(smpdb_id)
                        if protein not in proteins:
                            prot_name = CEvaluationLink(
                                CPredicateNode("has_name"),
                                CListLink(ProteinNode(protein), CConceptNode(protein_name)))
                            f.write(prot_name.recursive_print() + "\n")
                            proteins.add(protein)
                        genes.add(gene)
                    except AttributeError:
                        print("Null value detected")
                        continue
    finally:
        # The gene-level file used to be left open; always release it.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_proteins: Latest", source, script,
                         genes=len(genes), prot=len(proteins), pathways=num_pathways)
    print("Done. Check dataset/smpdb_protein.scm and gene-level/smpdb_gene.scm")
def import_metabolites(gene_level=False):
    """Convert the SMPDB metabolite CSV dump into Atomese (.scm) files.

    Downloads ``smpdb_metabolites.csv.zip`` into ``raw_data/`` when it is not
    already listed there, extracts it to ``raw_data/smpdb_chebi`` and, for
    every row of every extracted CSV, writes to
    ``dataset/smpdb_chebi_<date>.scm`` a member link between the ChEBI
    molecule and the SMPDB pathway, plus a ``has_name`` link (using the
    ``IUPAC`` column) the first time a ChEBI id is seen.

    Args:
        gene_level: when true, the member links (without the names) are also
            written to ``gene-level/smpdb_chebi_<date>.scm``.

    A row is abandoned (and reported on stdout) as soon as building or
    printing one of its atoms raises ``AttributeError``.  The number of ChEBI
    ids and pathways seen is recorded with ``metadata.update_meta``.
    """
    source = "http://smpdb.ca/downloads/smpdb_metabolites.csv.zip"
    # Sets give constant-time membership; only their sizes are reported.
    pathways = set()
    chebis = set()

    if "smpdb_metabolites.csv.zip" not in os.listdir("raw_data/"):
        print("Started downloading smpdb_metabolites.csv, it will take some time to download")
        wget.download(source, "raw_data/")

    with ZipFile("raw_data/smpdb_metabolites.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_chebi")
    pathway_chebi = os.listdir("raw_data/smpdb_chebi")

    print("Started importing {} files of smpdb_metabolites".format(len(pathway_chebi)))

    today = str(date.today())
    os.makedirs("dataset", exist_ok=True)
    # For a gene level dataset, exclude the name
    g = None
    if gene_level:
        os.makedirs("gene-level", exist_ok=True)
        g = open("gene-level/smpdb_chebi_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_chebi_{}.scm".format(today), 'w') as f:
            for filename in pathway_chebi:
                data = pd.read_csv("raw_data/smpdb_chebi/" + filename, low_memory=False)
                # iterrows already yields the row; no second iloc lookup needed.
                for _, row in data.iterrows():
                    chebi_id = filter_nan(str(row['ChEBI ID']).split(".")[0].strip())
                    smpdb_id = filter_nan(str(row['SMPDB ID']).strip())
                    chebi_name = filter_nan(str(row['IUPAC']).strip())
                    try:
                        if chebi_id:
                            chebi_id = "ChEBI:" + chebi_id
                        member = CMemberLink(ChebiNode(chebi_id), SMPNode(smpdb_id))
                        f.write(member.recursive_print() + "\n")
                        if g is not None:
                            g.write(member.recursive_print() + "\n")
                        if chebi_id not in chebis:
                            ch_name = CEvaluationLink(
                                CPredicateNode("has_name"),
                                CListLink(ChebiNode(chebi_id), CConceptNode(chebi_name)))
                            f.write(ch_name.recursive_print() + "\n")
                            chebis.add(chebi_id)
                        pathways.add(smpdb_id)
                    except AttributeError:
                        print("Null value detected")
                        continue
    finally:
        # The gene-level file used to be left open; always release it.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_metabolites: Latest", source, script,
                         chebi=len(chebis), pathways=num_pathways)
    print("Done. Check dataset/smpdb_chebi.scm")
Example #3
0
def to_atomese(data):
    """Write the BioGRID gene/UniProt mapping as Atomese.

    Rows of ``data`` containing missing values are dropped.  For every
    remaining row with a non-empty gene symbol and UniProt id, three links
    are written to ``dataset/biogridgene2uniprot_<date>.scm``: an
    ``expresses`` link (gene, protein) and a ``has_biogridID`` link for the
    protein and for the gene.  The number of distinct genes and proteins is
    then recorded with ``metadata.update_meta``.

    Args:
        data: pandas DataFrame with the columns ``gene_symbol``,
            ``biogrid_id`` and ``uniprot``.
    """
    print("importing the data")
    df = data.dropna()
    # Only the number of distinct entries is reported, so sets suffice.
    genes = set()
    proteins = set()
    os.makedirs('dataset', exist_ok=True)
    output_file = "dataset/biogridgene2uniprot_{}.scm".format(str(
        date.today()))
    with open(output_file, 'w') as f:
        for _, row in df.iterrows():
            gene = row['gene_symbol'].upper().strip()
            biogrid_id = str(row['biogrid_id'])
            prot = row['uniprot'].strip()
            if not (gene and biogrid_id and prot):
                continue
            genes.add(gene)
            proteins.add(prot)
            expresion = CEvaluationLink(
                CPredicateNode("expresses"),
                CListLink(CGeneNode(gene), ProteinNode(prot)))
            bio_prot = CEvaluationLink(
                CPredicateNode("has_biogridID"),
                CListLink(ProteinNode(prot),
                          CConceptNode("Bio:" + biogrid_id)))
            bio_gene = CEvaluationLink(
                CPredicateNode("has_biogridID"),
                CListLink(CGeneNode(gene),
                          CConceptNode("Bio:" + biogrid_id)))
            # Terminate every atom with a newline.  Joining the three atoms
            # with "\n" left the last one unterminated, so it ran straight
            # into the first atom of the following row.
            for link in (expresion, bio_prot, bio_gene):
                f.write(link.recursive_print() + "\n")

    metadata.update_meta("Biogrid-Gene2uniprot:latest",
                         "uniprot2biogrid.csv, gene2biogrid.csv",
                         script,
                         genes=str(len(genes)),
                         prot=len(proteins))
    print("Done, check {}".format(output_file))
def to_atomese(data):
    """Write the BioGRID gene/UniProt mapping as hand-formatted Atomese.

    Rows of ``data`` with missing values are dropped.  Each remaining row
    with a non-empty gene symbol and UniProt id contributes three
    EvaluationLinks to ``dataset/biogridgene2uniprot_<date>.scm``:
    ``expresses`` (gene, protein) and ``has_biogridID`` for the protein and
    for the gene.  Distinct gene and protein counts are passed to
    ``metadata.update_meta``.
    """
    # Text emitted per row; the placeholders are filled in with str.format.
    entry = (
        '(EvaluationLink \n'
        '\t(PredicateNode "expresses")\n'
        '\t(ListLink \n'
        '\t\t(GeneNode "{gene}")\n'
        '\t\t(MoleculeNode "Uniprot:{prot}")))\n\n'
        '(EvaluationLink \n'
        '\t(PredicateNode "has_biogridID")\n'
        '\t(ListLink \n'
        '\t\t(MoleculeNode "Uniprot:{prot}")\n'
        '\t\t(ConceptNode "Bio:{bio}")))\n\n'
        '(EvaluationLink \n'
        '\t(PredicateNode "has_biogridID")\n'
        '\t(ListLink \n'
        '\t\t(GeneNode "{gene}")\n'
        '\t\t(ConceptNode "Bio:{bio}")))\n\n'
    )

    print("importing the data")
    df = data.dropna()
    genes = []
    proteins = []
    dataset_dir = os.path.join(os.getcwd(), 'dataset')
    if not os.path.exists(dataset_dir):
        os.makedirs('dataset')
    output_file = "dataset/biogridgene2uniprot_{}.scm".format(
        str(date.today()))
    with open(output_file, 'w') as f:
        for _, record in df.iterrows():
            gene = record['gene_symbol'].upper().strip()
            biogrid_id = str(record['biogrid_id'])
            prot = record['uniprot'].strip()
            if not (gene and biogrid_id and prot):
                continue
            if gene not in genes:
                genes.append(gene)
            if prot not in proteins:
                proteins.append(prot)
            f.write(entry.format(gene=gene, prot=prot, bio=biogrid_id))
    gene_count = str(len(genes))
    metadata.update_meta("Biogrid-Gene2uniprot:latest",
                         "uniprot2biogrid.csv, gene2biogrid.csv",
                         script,
                         genes=gene_count,
                         prot=len(proteins))
    print("Done, check {}".format(output_file))
def import_data(data, source, version, gene_level=False, form='tab2'):
    """Write the BioGRID coronavirus interaction table as Atomese.

    For every row of ``data`` that has both official symbols, the function
    writes to ``dataset/COVID-19-biogrid_<version>_<date>.scm``:

    * ``has_entrez_id`` links for genes not seen before,
    * one ``interacts_with`` link per new (gene A, gene B) pair and one per
      new (protein A, protein B) pair, carrying the row's score as a truth
      value when the score is neither "-" nor NaN,
    * ``from_organism`` links for interactors whose taxonomy id is 2697049,
    * whatever ``add_protein_interaction`` emits for a new protein pair.

    A ``has_name`` link naming taxid 2697049 "SARS-CoV-2" closes the file,
    the counts are recorded with ``metadata.update_meta`` and the symbols of
    the taxid-2697049 genes are written to the file ``Covid19-genes``.

    Args:
        data: pandas DataFrame holding the BioGRID columns read below.
        source: description of the raw data, passed on to ``metadata``.
        version: BioGRID version string, used in file names and metadata.
        gene_level: when true, the gene interactions, gene organism links and
            the organism name are also written to
            ``gene-level/COVID-19-biogrid_<version>_gene-level_<date>.scm``.
        form: accepted for interface compatibility; not used here.
    """
    # Set the gene_level to True to get only the GGI without extra entrez and pubmedID info
    print("started importing")
    os.makedirs('dataset', exist_ok=True)

    today = str(date.today())
    output_path = 'dataset/COVID-19-biogrid_' + version + "_" + today + '.scm'
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open(
            'gene-level/COVID-19-biogrid_' + version + "_gene-level_" +
            today + '.scm', 'w')

    # Sets replace the original lists: they are only used for membership
    # tests and for counting distinct entries.
    gene_pairs = set()
    protein_pairs = set()
    entrez = set()
    covid_genes = set()
    # Handed to add_protein_interaction (defined elsewhere), so it stays a
    # list -- presumably the helper appends to it; verify against the helper.
    proteins = []

    try:
        with open(output_path, 'w') as f:
            for _, row in data.iterrows():
                symbol_a = row['Official Symbol Interactor A']
                symbol_b = row['Official Symbol Interactor B']
                if pd.isnull(symbol_a) or pd.isnull(symbol_b):
                    continue

                gene1 = str(symbol_a).upper().strip()
                gene2 = str(symbol_b).upper().strip()
                prot1 = str(row['SWISS-PROT Accessions Interactor A']).strip()
                prot2 = str(row['SWISS-PROT Accessions Interactor B']).strip()
                score = row['Score']
                entrez1 = str(row['Entrez Gene Interactor A']).strip()
                entrez2 = str(row['Entrez Gene Interactor B']).strip()
                taxonomy_id_1 = int(row['Organism Interactor A'])
                taxonomy_id_2 = int(row['Organism Interactor B'])

                gene_node_1 = CGeneNode(gene1)
                gene_node_2 = CGeneNode(gene2)
                prot_node_1 = ProteinNode(prot1)
                prot_node_2 = ProteinNode(prot2)

                stv_node = None
                if str(score) not in ("-", "nan"):
                    stv_node = CStv(1.0, round(float(score), 3))

                if (gene1, gene2) not in gene_pairs:
                    for gene, gene_node, entrez_id in (
                            (gene1, gene_node_1, entrez1),
                            (gene2, gene_node_2, entrez2)):
                        if gene in entrez:
                            continue
                        entrez_ln = CEvaluationLink(
                            CPredicateNode("has_entrez_id"),
                            CListLink(gene_node, Entrez(entrez_id)))
                        f.write(entrez_ln.recursive_print() + "\n")
                        entrez.add(gene)

                    interacts_ln = CEvaluationLink(
                        CPredicateNode("interacts_with"),
                        CSetLink(gene_node_1, gene_node_2),
                        stv=stv_node)
                    f.write(interacts_ln.recursive_print() + "\n")
                    if g is not None:
                        g.write(interacts_ln.recursive_print() + "\n")

                    # Tag interactors whose taxonomy id is 2697049 (the id
                    # named "SARS-CoV-2" at the end of the file).
                    for taxonomy_id, gene, gene_node, prot_node in (
                            (taxonomy_id_1, gene1, gene_node_1, prot_node_1),
                            (taxonomy_id_2, gene2, gene_node_2, prot_node_2)):
                        if taxonomy_id != 2697049:
                            continue
                        covid_genes.add(gene)
                        taxid = "taxid:{}".format(str(taxonomy_id))
                        gene_org_ln = CEvaluationLink(
                            CPredicateNode("from_organism"),
                            CListLink(gene_node, NcbiTaxonomy(taxid)))
                        prot_org_ln = CEvaluationLink(
                            CPredicateNode("from_organism"),
                            CListLink(prot_node, NcbiTaxonomy(taxid)))
                        f.write(gene_org_ln.recursive_print() + "\n")
                        f.write(prot_org_ln.recursive_print() + "\n")
                        if g is not None:
                            g.write(gene_org_ln.recursive_print() + "\n")

                    gene_pairs.add((gene1, gene2))

                if (prot1, prot2) not in protein_pairs:
                    interacts_ln = CEvaluationLink(
                        CPredicateNode("interacts_with"),
                        CSetLink(prot_node_1, prot_node_2),
                        stv=stv_node)
                    f.write(interacts_ln.recursive_print() + "\n")

                    bio_1 = str(row['BioGRID ID Interactor A']).strip()
                    bio_2 = str(row['BioGRID ID Interactor B']).strip()
                    add_protein_interaction(proteins, prot_node_1, gene_node_1,
                                            prot_node_2, gene_node_2, bio_1,
                                            bio_2, f)
                    protein_pairs.add((prot1, prot2))

            org_name_ln = CEvaluationLink(
                CPredicateNode("has_name"),
                CListLink(NcbiTaxonomy("taxid:2697049"),
                          CConceptNode("SARS-CoV-2")))
            f.write(org_name_ln.recursive_print() + "\n")
            # This write used to be unconditional and raised NameError
            # whenever gene_level was false (g was never created).
            if g is not None:
                g.write(org_name_ln.recursive_print() + "\n")
    finally:
        # The gene-level file used to be left open; always release it.
        if g is not None:
            g.close()

    # Count A-B and B-A as one interaction.
    undirected_pairs = set((a, b) if a <= b else (b, a) for a, b in gene_pairs)
    number_of_interactions = len(undirected_pairs)
    script = "https://github.com/MOZI-AI/knowledge-import/coronavirus_biogrid.py"
    metadata.update_meta("Coronavirus Biogrid:" + version,
                         source,
                         script,
                         genes=str(len(entrez)),
                         prot=len(set(proteins)),
                         interactions=str(number_of_interactions))
    print("Done, check " + output_path)
    with open("Covid19-genes", "w") as co:
        # Sorted so the file content is reproducible between runs.
        co.write("\n".join(sorted(covid_genes)))
Example #6
0
# Build dataset/uniprot2GO_<date>.scm: one member link per UniProtKB line of
# the GOA human annotation file, linking the protein to its GO term.
prot = []
go = []
gaf_path = 'raw_data/goa_human.gaf.gz'
if not os.path.isfile(gaf_path):
    print("Downloading dataset")
    # The path returned by wget.download is the file that gets opened below.
    gaf_path = wget.download(dataset_url, "raw_data/")
# The annotation file is gzip-compressed in both cases.  The cached copy used
# to be read with plain open(), which yields compressed bytes, not GAF lines.
with gzip.open(gaf_path) as gaf:
    lines = [raw.decode("utf-8") for raw in gaf]

with open("raw_data/go-namespace.json", "r") as ns:
    go_namespace = json.load(ns)
# Remembered so the namespace cache is rewritten only if it grew.
init_namespace = len(go_namespace)

with open("dataset/uniprot2GO_{}.scm".format(str(date.today())), 'w') as f:
    print("\nStarted importing")
    for line in lines:
        if 'UniProtKB' not in line:
            continue
        fields = line.split('\t')
        # find_type returns the (possibly extended) namespace mapping and the
        # GO term node for the id in column 5; the term may be falsy.
        go_namespace, go_term = find_gons.find_type(fields[4], go_namespace)
        protein = ProteinNode(fields[1])
        if go_term:
            f.write(CMemberLink(protein, go_term).recursive_print() + "\n")
        prot.append(fields[1])
        go.append(go_term)

if len(go_namespace) > init_namespace:
    with open("raw_data/go-namespace.json", "w") as ns:
        json.dump(go_namespace, ns, indent=2)

script = "https://github.com/MOZI-AI/knowledge-import/uniprot2GO.py"
metadata.update_meta("Uniprot-GO:latest", dataset_url, script,
                     prot=len(set(prot)), goterms={"go-terms": len(set(go))})
print("Done, check dataset/uniprot2GO.scm")
Example #7
0
    # For each row of df, write the gene -> RNA -> protein links to f.
    # (df, f and the rnas/genes/proteins lists are created before this
    # fragment, outside the visible code.)
    for i in range(len(df)):
        rna = df.iloc[i]["transcript_stable_id"]
        gene = df.iloc[i]['Approved symbol'].strip().upper()
        prot = df.iloc[i]["xref"]
        rnas.append(rna)
        genes.append(gene)
        proteins.append(prot)
        if gene:
            trans = CEvaluationLink(CPredicateNode("transcribed_to"),
                                    CListLink(CGeneNode(gene), CRNANode(rna)))
            f.write(trans.recursive_print() + "\n")
        if rna:
            trans = CEvaluationLink(
                CPredicateNode("translated_to"),
                CListLink(CRNANode(rna), ProteinNode(prot)))
            f.write(trans.recursive_print() + "\n")
            # NOTE(review): this link is guarded by `rna`, not `gene`, so it
            # is also written when the gene symbol is empty -- confirm that
            # is intended.
            expr = CEvaluationLink(
                CPredicateNode("expresses"),
                CListLink(CGeneNode(gene), ProteinNode(prot)))
            f.write(expr.recursive_print() + "\n")

# The version label is the second dot-separated piece of the dataset name.
version = dataset.split(".")[1]
script = "https://github.com/MOZI-AI/knowledge-import/codingRNA.py"

# Only distinct gene and RNA counts are reported; `proteins` is collected
# above but not passed on here.
metadata.update_meta("codingRNA:{}".format(version),
                     dataset,
                     script,
                     genes=len(set(genes)),
                     rna=len(set(rnas)))

print("Done")
Example #8
0
      os.makedirs('gene-level')
# Make sure the output folder exists, then open both output files in append
# mode so they stay open while the helper calls below run.
if not os.path.exists(os.path.join(os.getcwd(), 'dataset')):
      os.makedirs('dataset') 
f_annotation = open('dataset/GO_annotation_{}.scm'.format(str(date.today())), 'a')
g_annotation = open('gene-level/GO_annotation_gene-level_{}.scm'.format(str(date.today())), 'a')

# Add the GOC validation date: the text between '!' and '$' on the first two
# input lines is written as ';' comment lines at the top of the dataset file.
f_annotation.write(";"+((lines[0]).split('!')[1]).split('$')[0]+ "\n")
f_annotation.write(";"+((lines[1]).split('!')[1]).split('$')[0]+ "\n\n")

genes = []
go = []
# Loop through the tab-separated annotation lines.
# NOTE(review): memberLink and evaLink are defined outside this fragment;
# presumably they write to f_annotation / g_annotation -- verify.
for l in lines_annotate:
    db_object_symbol =l.split('\t')[2]
    # Keep only the part of the GO id after the ':'.
    go_id = (l.split('\t')[4]).split(':')[1]
    qualifier = l.split('\t')[3]
    gene_name = l.split('\t')[9]
    memberLink(db_object_symbol,go_id,qualifier)
    go.append(go_id)
    # evaLink is called once per symbol, the first time the symbol is seen.
    if not db_object_symbol in genes:
        genes.append(db_object_symbol)
        evaLink(db_object_symbol,gene_name, qualifier)
f_annotation.close()
g_annotation.close()
script = "https://github.com/MOZI-AI/knowledge-import/GO_Annotation_scm.py"
# Report the number of distinct gene symbols and distinct GO ids.
metadata.update_meta("GO_Annotation:latest", source,script,genes=len(genes), goterms={"go-terms":len(set(go))})
print("Done, check dataset/GO_annotation.scm and gene-level/GO_annotation.scm")


Example #9
0
    # Write the gene -> protein ("expresses") and gene -> entrez id links
    # for every row of `data` to dataset/entrez_to_protein_<date>.scm.
    # (`data`, `expres` and `script` come from outside this fragment.)
    prot = []
    genes = []
    if not os.path.exists(os.getcwd() + '/dataset'):
        os.makedirs('dataset')
    with open("dataset/entrez_to_protein_{}.scm".format(str(date.today())),
              'w') as f:
        for i in range(len(data)):
            try:
                g = data.iloc[i]['symbol']
                p = data.iloc[i]['uniprot'].strip()
                genes.append(g)
                prot.append(p)
                f.write(
                    expres("expresses", '(GeneNode ' + '"' + g + '")\n',
                           '(MoleculeNode "' + 'Uniprot:' + p + '")\n'))
            # NOTE(review): the bare except skips the row on any error at
            # all (e.g. a non-string symbol or uniprot value) -- consider
            # narrowing it.
            except:
                continue
            # The entrez link is written only when the entrez value is not
            # NaN; the value is printed as an integer.
            if not math.isnan(data.iloc[i]['entrez']):
                f.write(
                    expres(
                        "has_entrez_id", '(GeneNode ' + '"' + g + '")\n',
                        '(ConceptNode "' + 'entrez:' +
                        str(int(data.iloc[i]['entrez'])) + '")\n'))
        # Report the number of distinct gene symbols and UniProt ids.
        metadata.update_meta("gene2proteinMapping:latest",
                             "entrez2uniprot.csv",
                             script,
                             genes=len(set(genes)),
                             prot=len(set(prot)))

        print("Done")
Example #10
0
            goterm[namespace].append(idd)
        # go_definition(idd, definition)
        # if len(synonym) != 0:
        #     sy_len = 0
        #     while sy_len < len(synonym):
        #         go_synonyms(idd, synonym[sy_len], synonym_type[sy_len])
        #         sy_len = sy_len + 1
        if len(is_a) != 0:
            isa_len = 0
            while isa_len < len(is_a):
                go_isa(idd, is_a[isa_len])
                isa_len = isa_len + 1
        # if len(alt_id) != 0:
        #     altid_len = 0
        #     while altid_len < len(alt_id):
        #         go_altid(idd, alt_id[altid_len])
        #         altid_len = altid_len + 1
        # if len(relationship) != 0:
        #     parts_len = 0
        #     while parts_len < len(relationship):
        #         go_relationship(idd, relationship[parts_len], relationship_type[parts_len])
        #         parts_len = parts_len + 1
    i= i + 1
# All terms have been processed: close the output file and report, for each
# key of goterm, how many distinct ids were collected under it.
f_go.close()
ns = {}
for k in goterm.keys():
    ns[k] = len(set(goterm[k]))
script = "https://github.com/MOZI-AI/agi-bio/blob/master/knowledge-import/SNET/GO_scm.py"
metadata.update_meta("GO Obo:latest", source,script,goterms=ns)
print("Done, check dataset/GO.scm")
Example #11
0
def import_metabolites(gene_level=False):
    """Convert the SMPDB metabolite CSV dump into Atomese (.scm) files.

    Downloads ``smpdb_metabolites.csv.zip`` into ``raw_data/`` when it is not
    already listed there, extracts it to ``raw_data/smpdb_chebi`` and, for
    every row of every extracted CSV, writes to
    ``dataset/smpdb_chebi_<date>.scm`` the ``atomese`` text relating the
    ChEBI molecule to its SMPDB pathway, followed by the ``has_name`` text
    built from the ``IUPAC`` column.

    Args:
        gene_level: when true, the molecule/pathway text (without the names)
            is also written to ``gene-level/smpdb_chebi_<date>.scm``.

    The number of distinct ChEBI ids and pathways is recorded with
    ``metadata.update_meta``.
    """
    source = "http://smpdb.ca/downloads/smpdb_metabolites.csv.zip"
    # Only the number of distinct ids is reported, so sets suffice.
    pathways = set()
    chebis = set()

    if "smpdb_metabolites.csv.zip" not in os.listdir("raw_data/"):
        print(
            "Started downloading smpdb_metabolites.csv, it will take some time to download"
        )
        wget.download(source, "raw_data/")

    with ZipFile("raw_data/smpdb_metabolites.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_chebi")
    pathway_chebi = os.listdir("raw_data/smpdb_chebi")

    print("Started importing {} files of smpdb_metabolites".format(
        len(pathway_chebi)))

    today = str(date.today())
    os.makedirs('dataset', exist_ok=True)
    # For a gene level dataset, exclude the name
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open("gene-level/smpdb_chebi_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_chebi_{}.scm".format(today), 'w') as f:
            for filename in pathway_chebi:
                data = pd.read_csv("raw_data/smpdb_chebi/" + filename,
                                   low_memory=False)

                # iterrows already yields the row; no second iloc lookup.
                for _, row in data.iterrows():
                    chebi_id = str(row['ChEBI ID']).split(".")[0].strip()
                    smpdb_id = str(row['SMPDB ID']).strip()
                    chebi_name = str(row['IUPAC']).strip()

                    chebis.add(chebi_id)
                    pathways.add(smpdb_id)

                    member = atomese(chebi_id,
                                     'MoleculeNode',
                                     smpdb_id,
                                     'ConceptNode',
                                     node1_prefix='ChEBI:')
                    f.write(member)
                    # This write used to be unconditional and raised
                    # NameError with the default gene_level=False, because
                    # g was never created.
                    if g is not None:
                        g.write(member)
                    f.write(
                        atomese(chebi_id,
                                'MoleculeNode',
                                chebi_name,
                                'ConceptNode',
                                node1_prefix='ChEBI:',
                                predicate='has_name'))
    finally:
        # The gene-level file used to be left open; always release it.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_metabolites: Latest",
                         source,
                         script,
                         chebi=len(chebis),
                         pathways=num_pathways)
    print("Done. Check dataset/smpdb_chebi.scm")
Example #12
0
def import_proteins(gene_level=False):
    """Convert the SMPDB protein CSV dump into Atomese (.scm) files.

    Downloads ``smpdb_proteins.csv.zip`` into ``raw_data/`` when it is not
    already listed there, extracts it to ``raw_data/smpdb_prot`` and, for
    every row of every extracted CSV, writes to
    ``dataset/smpdb_protein_<date>.scm`` the ``atomese`` text for:

    * gene ``expresses`` protein,
    * gene / pathway and protein / pathway relations,
    * the ``has_name`` relations of the pathway and of the protein.

    Args:
        gene_level: when true, the gene / pathway text is also written to
            ``gene-level/smpdb_gene_<date>.scm``.

    The number of distinct genes, proteins and pathways is recorded with
    ``metadata.update_meta``.
    """
    source = "http://smpdb.ca/downloads/smpdb_proteins.csv.zip"
    # Only the number of distinct ids is reported, so sets suffice.
    pathways = set()
    proteins = set()
    genes = set()

    if "smpdb_proteins.csv.zip" not in os.listdir("raw_data/"):
        print(
            "Started downloading smpdb_proteins.csv, It will take some time to download \n"
        )
        wget.download(source, "raw_data")

    with ZipFile("raw_data/smpdb_proteins.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_prot")
    pathway_prot = os.listdir("raw_data/smpdb_prot")

    print("Started importing {} files of smpdb_proteins".format(
        len(pathway_prot)))

    today = str(date.today())
    os.makedirs('dataset', exist_ok=True)
    g = None
    if gene_level:
        # Same safeguard import_metabolites applies before opening its file.
        os.makedirs('gene-level', exist_ok=True)
        g = open("gene-level/smpdb_gene_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_protein_{}.scm".format(today), 'w') as f:
            for filename in pathway_prot:
                data = pd.read_csv("raw_data/smpdb_prot/" + filename,
                                   low_memory=False)
                # iterrows already yields the row; no second iloc lookup.
                for _, row in data.iterrows():
                    protein = str(row['Uniprot ID']).split(".")[0].strip()
                    protein_name = str(row['Protein Name']).strip()
                    gene = str(row['Gene Name']).upper().strip()
                    smpdb_id = str(row['SMPDB ID']).strip()
                    smpdb_name = str(row['Pathway Name']).strip()

                    proteins.add(protein)
                    genes.add(gene)
                    pathways.add(smpdb_id)

                    f.write(
                        atomese(gene,
                                'GeneNode',
                                protein,
                                'MoleculeNode',
                                node2_prefix='Uniprot:',
                                predicate='expresses'))
                    # Built once and shared by both output files.
                    gene_member = atomese(gene, 'GeneNode', smpdb_id,
                                          'ConceptNode')
                    f.write(gene_member)
                    if g is not None:
                        g.write(gene_member)
                    f.write(
                        atomese(protein,
                                'MoleculeNode',
                                smpdb_id,
                                'ConceptNode',
                                node1_prefix='Uniprot:'))
                    f.write(
                        atomese(smpdb_id,
                                'ConceptNode',
                                smpdb_name,
                                'ConceptNode',
                                predicate='has_name'))
                    f.write(
                        atomese(protein,
                                'MoleculeNode',
                                protein_name,
                                'ConceptNode',
                                node1_prefix='Uniprot:',
                                predicate='has_name'))
    finally:
        # The gene-level file used to be left open; always release it.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_proteins: Latest",
                         source,
                         script,
                         genes=len(genes),
                         prot=len(proteins),
                         pathways=num_pathways)
    print(
        "Done. Check dataset/smpdb_protein.scm and gene-level/smpdb_gene.scm")
Example #13
0
def import_data(data, source, version, gene_level=False, form='tab2'):
    """Write the BioGRID gene-gene interaction table as Atomese.

    For every row of ``data`` that has both official symbols, a
    ``has_entrez_id`` link is written for each gene not seen before and the
    row's publication is collected under its (unordered) gene pair.  One
    ``has_pubmedID`` link per pair, wrapping the ``interacts_with`` link and
    the list of collected publication nodes, is then written to
    ``dataset/biogrid_gene_gene_<version>_<date>.scm``.  Gene and interaction
    counts are recorded with ``metadata.update_meta``.

    Args:
        data: pandas DataFrame with the BioGRID columns selected below.
        source: description of the raw data, passed on to ``metadata``.
        version: BioGRID version string, used in file names and metadata.
        gene_level: when true, the ``has_pubmedID`` links are also written to
            ``gene-level/biogrid_gene_gene_<version>_gene-level_<date>.scm``.
        form: ``'tab2'`` reads publications from the ``Pubmed ID`` column,
            ``'tab3'`` from ``Publication Source``.

    Raises:
        RuntimeError: if ``form`` is neither ``'tab2'`` nor ``'tab3'``.
    """
    # Set the gene_level to True to get only the GGI without extra entrez and pubmedID info
    if form == 'tab2':
        pubsource = 'Pubmed ID'
    elif form == 'tab3':
        pubsource = 'Publication Source'
    else:
        raise RuntimeError("format {0} is not yet supported".format(form))
    data = data[[
        'Entrez Gene Interactor A', 'Entrez Gene Interactor B',
        'Official Symbol Interactor A', 'Official Symbol Interactor B',
        pubsource
    ]]
    print("started importing")
    dataset_path = os.path.join(os.getcwd(), 'dataset')
    os.makedirs(dataset_path, exist_ok=True)

    today = str(date.today())
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open(
            'gene-level/biogrid_gene_gene_' + version + "_gene-level_" +
            today + '.scm', 'w')

    biogrid_path = os.path.join(
        dataset_path,
        'biogrid_gene_gene_' + version + '_' + today + '.scm')
    pairs = collections.defaultdict(list)
    # Sets: the table is scanned row by row, so constant-time membership
    # tests replace the linear list scans used before.
    entrez = set()
    number_of_genes = set()
    try:
        with open(biogrid_path, 'w') as f:
            # Each row is fetched once instead of once per column access.
            for _, row in data.iterrows():
                symbol_a = row['Official Symbol Interactor A']
                symbol_b = row['Official Symbol Interactor B']
                if pd.isnull(symbol_a) or pd.isnull(symbol_b):
                    continue
                node1 = str(symbol_a).upper().strip()
                node2 = str(symbol_b).upper().strip()
                # Key with the larger symbol first so A-B and B-A share one
                # entry.
                if node1 > node2:
                    interactors = node1 + ':' + node2
                else:
                    interactors = node2 + ':' + node1

                pubmed_link = ('https://www.ncbi.nlm.nih.gov/pubmed/?term=' +
                               str(row[pubsource]))
                pairs[interactors].append(CConceptNode(pubmed_link))

                for symbol, column in (
                        (node1, 'Entrez Gene Interactor A'),
                        (node2, 'Entrez Gene Interactor B')):
                    if symbol in entrez:
                        continue
                    eval_ln = CEvaluationLink(
                        CPredicateNode("has_entrez_id"),
                        CListLink(CGeneNode(symbol), Entrez(str(row[column]))))
                    f.write(eval_ln.recursive_print() + "\n")
                    entrez.add(symbol)

            for p in pairs:
                first, second = p.split(':')[0], p.split(':')[1]
                gene_1 = CGeneNode(first)
                gene_2 = CGeneNode(second)
                interacts_ln = CEvaluationLink(
                    CPredicateNode("interacts_with"),
                    CSetLink(gene_1, gene_2))
                eval_ln = CEvaluationLink(
                    CPredicateNode("has_pubmedID"),
                    CListLink(interacts_ln, CListLink(*pairs[p])))
                f.write(eval_ln.recursive_print() + "\n")
                if g is not None:
                    g.write(eval_ln.recursive_print() + "\n")

                number_of_genes.add(gene_1.name)
                number_of_genes.add(gene_2.name)
    finally:
        # The gene-level file used to be left open; always release it.
        if g is not None:
            g.close()

    number_of_interactions = len(pairs)
    script = "https://github.com/MOZI-AI/knowledge-import/biogrid.py"
    metadata.update_meta("Biogrid:" + version,
                         source,
                         script,
                         genes=str(len(number_of_genes)),
                         interactions=str(number_of_interactions))
    print("Done, check " + 'dataset/biogrid_gene_gene_' + version + "_" +
          today + '.scm')
Example #14
0
                               names=["ID", "name", "Species"])

# Keep only the human pathways.  The species literal had been mangled to
# 'H**o sapiens', which can never match a row; the species name is
# "Homo sapiens".
pathway_list = pathway_list[pathway_list['Species'] == 'Homo sapiens']
list_len = len(pathway_list)
relation_len = len(pathway_relation)
max_len = max(list_len, relation_len)

print("Started importing")

script = "https://github.com/MOZI-AI/agi-bio/blob/master/knowledge-import/SNET/reactome_pathway.py"
# Collect every parent and child id.  Adding the two ``.values`` arrays with
# "+" combined them element-wise (parent string + child string) instead of
# concatenating them, so the count below was of relations, not pathways.
pathways = (list(pathway_relation['parent'].values) +
            list(pathway_relation['child'].values))
os.makedirs('dataset', exist_ok=True)
with open("dataset/reactome.scm", 'a') as f:
    # Walk both tables in step, writing from each while it still has rows.
    # The previous try/except IndexError retried the relation lookup and
    # crashed when pathway_list was the longer of the two tables.
    for i in range(max_len):
        if i < list_len:
            f.write(
                eva(pathway_list.iloc[i]['name'], pathway_list.iloc[i]['ID']))
        if i < relation_len:
            f.write(
                inherit(pathway_relation.iloc[i]['parent'],
                        pathway_relation.iloc[i]['child']))
num_pathways = {"Reactome Pathway": len(set(pathways))}
metadata.update_meta("Reactome Pathways relationship:latest",
                     pathway_rln + " " + pathway,
                     script,
                     pathways=num_pathways)

print("Done")
def import_dataset(dataset, delim, without_location=False):
    """Export the human rows of a Reactome mapping file to ``dataset/*.scm``.

    The kind of mapping is chosen by a substring of ``dataset``:

    * ``"NCBI"``    - ``db_id`` is resolved to a gene symbol through
      ``raw_data/entrez.txt``; when that lookup fails the symbol parsed from
      ``R_PE_name`` is used if it is a known approved symbol.
    * ``"UniProt"`` - the non-numeric part of ``db_id`` is used as protein id.
    * ``"ChEBI"``   - ``db_id`` is prefixed with ``"ChEBI:"``; rows without an
      id are skipped.

    For every kept row a context link (pathway membership + location) is
    written, and the collected counts are reported via
    ``metadata.update_meta``.

    Relies on the module-level names ``ncbi``, ``script``, ``metadata``,
    ``find_location`` and the node/link classes.

    Args:
        dataset: Path of the mapping file; its last ``/`` component is used
            to name the output files.
        delim: Column delimiter handed to ``pandas.read_csv``.
        without_location: When true, the bare membership links are also
            written to ``gene-level-without-location/``.

    Returns:
        None.
    """
    print("Started importing " + dataset)
    columns = [
        "db_id", "R_PE_id", "R_PE_name", "pathway",
        "url", "event_name", "evidence_code", "species"
    ]
    if "UniProt" in dataset or "ChEBI" in dataset:
        # These two mappings are read with six extra placeholder columns.
        columns = columns + ["un1", "un2", "un3", "un4", "un5", "un6"]
    data = pd.read_csv(dataset,
                       low_memory=False,
                       delimiter=delim,
                       names=columns)
    mapping_entrez = pd.read_csv("raw_data/entrez.txt",
                                 low_memory=False,
                                 sep="\t")
    # Take only symbols of Human species.  The label used to be the censored
    # literal 'H**o sapiens'; the scientific name is restored so the filter
    # can match (NOTE(review): confirm against the input file).
    data_human = data[data['species'] == 'Homo sapiens'][[
        'db_id', 'R_PE_name', 'pathway'
    ]]

    base_name = dataset.split("/")[-1]

    # Optional second output holding only the membership links.
    file_name = None
    if without_location:
        os.makedirs('gene-level-without-location', exist_ok=True)
        file_name = open(
            "gene-level-without-location/" + base_name +
            "_without_location_{}.scm".format(str(date.today())), "w")

    try:
        os.makedirs('dataset', exist_ok=True)
        with open(
                "dataset/" + base_name +
                "_{}.scm".format(str(date.today())), 'w') as f:
            if "NCBI" in dataset:
                genes = set()
                pathways = set()
                # db_id -> symbol accepted from a single-word entity name, so
                # multi-word names of the same db_id can reuse it later.
                infered = {}
                gene_symbols = set(mapping_entrez["Approved symbol"].values)
                for row in data_human.itertuples(index=False):
                    gene_sym, location = find_location(row.R_PE_name)
                    pathway = row.pathway
                    db_id = row.db_id
                    try:
                        gene = mapping_entrez[
                            mapping_entrez["NCBI Gene ID"] == int(
                                db_id)]["Approved symbol"].values[0]
                    except (IndexError, TypeError, ValueError,
                            OverflowError):
                        # db_id is not an integer or has no entry in the
                        # mapping: fall back to the parsed symbol.
                        key = str(db_id)
                        if len(gene_sym.split(" ")) > 1:
                            if key not in infered:
                                continue
                            gene = infered[key]
                        elif gene_sym in gene_symbols:
                            gene = gene_sym
                            infered[key] = gene
                        else:
                            continue
                    if (gene.isdigit() or len(gene) == 1
                            or gene in ["", " "]):
                        continue
                    gene = gene.strip()
                    member = CMemberLink(CGeneNode(gene),
                                         ReactomeNode(pathway))
                    eva = CEvaluationLink(
                        CPredicateNode("has_location"),
                        CListLink(CGeneNode(gene), CConceptNode(location)))
                    cont = CContextLink(member, eva)
                    f.write(cont.recursive_print() + "\n")
                    if file_name is not None:
                        file_name.write(member.recursive_print() + "\n")
                    genes.add(gene)
                    pathways.add(pathway)
                version = "NCBI2reactome_pathway_mapping:latest"
                num_pathways = {"Reactome Pathway": len(pathways)}
                metadata.update_meta(version,
                                     ncbi,
                                     script,
                                     genes=len(genes),
                                     pathways=num_pathways)
            elif "UniProt" in dataset:
                molecules = set()
                pathways = set()
                for row in data_human.itertuples(index=False):
                    prot = str(row.R_PE_name)
                    # Text between the first "[" and first "]" is the
                    # location; what precedes it is the protein name.
                    loc = prot[prot.find("[") + 1:prot.find("]")]
                    prot_name = prot.split("[" + loc + "]")[0]
                    pathway = row.pathway
                    # Drop purely numeric "-" separated parts of the id and
                    # keep the last remaining one.
                    protein = [
                        part for part in str(row.db_id).split("-")
                        if not part.strip().isdigit()
                    ][-1]
                    protein = protein.strip()
                    member = CMemberLink(ProteinNode(protein),
                                         ReactomeNode(pathway))
                    eva_loc = CEvaluationLink(
                        CPredicateNode("has_location"),
                        CListLink(ProteinNode(protein), CConceptNode(loc)))
                    eva_name = CEvaluationLink(
                        CPredicateNode("has_name"),
                        CListLink(ProteinNode(protein),
                                  CConceptNode(prot_name)))
                    cont = CContextLink(member, eva_loc)
                    f.write(cont.recursive_print() + "\n")
                    if file_name is not None:
                        file_name.write(member.recursive_print() + "\n")
                    if protein not in molecules:
                        # The name link is written once per protein.
                        molecules.add(protein)
                        f.write(eva_name.recursive_print() + "\n")
                    pathways.add(pathway)
                version = "Uniprot2reactome_pathway_mapping:latest"
                num_pathways = {"Reactome Pathway": len(pathways)}
                metadata.update_meta(version,
                                     ncbi,
                                     script,
                                     prot=len(molecules),
                                     pathways=num_pathways)
            elif "ChEBI" in dataset:
                molecules = set()
                pathways = set()
                for row in data_human.itertuples(index=False):
                    chebi = str(row.R_PE_name)
                    loc = chebi[chebi.find("[") + 1:chebi.find("]")]
                    chebi_name = chebi.split("[" + loc + "]")[0].replace(
                        '"', "")
                    chebi_id = str(row.db_id)
                    # A missing id stringifies to "nan".  This used to be an
                    # identity test (``is``), which never skipped anything.
                    if chebi_id == "nan":
                        continue
                    chebi_id = "ChEBI:" + str(chebi_id.strip())
                    pathway = row.pathway
                    member = CMemberLink(ChebiNode(chebi_id),
                                         ReactomeNode(pathway))
                    eva_loc = CEvaluationLink(
                        CPredicateNode("has_location"),
                        CListLink(ChebiNode(chebi_id), CConceptNode(loc)))
                    eva_name = CEvaluationLink(
                        CPredicateNode("has_name"),
                        CListLink(ChebiNode(chebi_id),
                                  CConceptNode(chebi_name)))
                    cont = CContextLink(member, eva_loc)
                    f.write(cont.recursive_print() + "\n")
                    if file_name is not None:
                        file_name.write(member.recursive_print() + "\n")
                    if chebi_id not in molecules:
                        # The name link is written once per compound.
                        molecules.add(chebi_id)
                        f.write(eva_name.recursive_print() + "\n")
                    pathways.add(pathway)
                version = "Chebi2reactome_pathway_mapping:latest"
                num_pathways = {"Reactome Pathway": len(pathways)}
                metadata.update_meta(version,
                                     ncbi,
                                     script,
                                     chebi=len(molecules),
                                     pathways=num_pathways)
    finally:
        # The secondary output was previously left open.
        if file_name is not None:
            file_name.close()
    print("Done")
Example #16
0
def import_dataset(dataset, delim):
    """Export the human rows of a Reactome mapping file to ``dataset/*.scm``.

    The kind of mapping is chosen by a substring of ``dataset`` (``"NCBI"``,
    ``"UniProt"`` or ``"ChEBI"``).  For every kept row an ``AndLink`` holding
    the pathway membership and the location is written; UniProt and ChEBI
    entities additionally get one name entry the first time they are seen.
    The collected counts are reported via ``metadata.update_meta``.

    Relies on the module-level names ``ncbi``, ``script``, ``metadata``,
    ``member``, ``eva``, ``find_location`` and ``findstem``.

    Args:
        dataset: Path of the mapping file; its last ``/`` component names the
            output file.
        delim: Column delimiter handed to ``pandas.read_csv``.

    Returns:
        None.
    """
    print("Started importing " + dataset)
    columns = [
        "db_id", "R_PE_id", "R_PE_name", "pathway",
        "url", "event_name", "evidence_code", "species"
    ]
    if "UniProt" in dataset or "ChEBI" in dataset:
        # These two mappings are read with six extra placeholder columns.
        columns = columns + ["un1", "un2", "un3", "un4", "un5", "un6"]
    data = pd.read_csv(dataset,
                       low_memory=False,
                       delimiter=delim,
                       names=columns)

    # Take only symbols of Human species.  The label used to be the censored
    # literal 'H**o sapiens'; the scientific name is restored so the filter
    # can match (NOTE(review): confirm against the input file).
    data_human = data[data['species'] == 'Homo sapiens'][[
        'db_id', 'R_PE_name', 'pathway'
    ]]
    os.makedirs('dataset', exist_ok=True)
    with open("dataset/" + dataset.split("/")[-1] + ".scm", 'w') as f:
        if "NCBI" in dataset:
            genes = set()
            pathways = set()
            db_ids = {}  # db_id -> symbol derived from all names of that id
            for row in data_human.itertuples(index=False):
                gene, location = find_location(row.R_PE_name)
                pathway = row.pathway
                db_id = row.db_id
                # If a gene symbol is not one word, collect all gene symbols
                # of the same db_id and find the common word in the list
                # (which is the gene symbol in most cases)
                # e.g "proKLK5" "KLK5" "propeptide KLK5"
                if len(gene.split(" ")) > 1:
                    if db_id in db_ids:
                        gene = db_ids[db_id]
                    else:
                        same_id_names = data_human[data_human['db_id'] ==
                                                   db_id]['R_PE_name'].values
                        candidates = [
                            find_location(name, True)
                            for name in same_id_names
                        ]
                        if len(set(candidates)) > 1:
                            stemed = findstem(candidates)
                        else:
                            stemed = candidates[-1]
                        # Always cached: the former guard around this update
                        # could never be false.  Unusable stems are rejected
                        # by the validity check below.
                        db_ids[db_id] = stemed
                        gene = stemed
                if gene.isdigit() or len(gene) == 1 or gene in ["", " "]:
                    continue
                f.write("(AndLink\n")
                f.write(member(gene, pathway))
                f.write(eva('l', gene, location))
                f.write(")\n")
                genes.add(gene)
                pathways.add(pathway)
            version = "NCBI2reactome_pathway_mapping:latest"
            num_pathways = {"Reactome Pathway": len(pathways)}
            metadata.update_meta(version,
                                 ncbi,
                                 script,
                                 genes=len(genes),
                                 pathways=num_pathways)
        elif "UniProt" in dataset:
            molecules = set()
            pathways = set()
            for row in data_human.itertuples(index=False):
                prot = str(row.R_PE_name)
                # Text between the first "[" and first "]" is the location;
                # what precedes it is the protein name.
                loc = prot[prot.find("[") + 1:prot.find("]")]
                prot_name = prot.split("[" + loc + "]")[0]
                pathway = row.pathway
                # Drop purely numeric "-" separated parts of the id and keep
                # the last remaining one.
                protein = [
                    part for part in str(row.db_id).split("-")
                    if not part.strip().isdigit()
                ][-1]
                node = "Uniprot:" + str(protein)
                f.write("(AndLink\n")
                f.write(member(node, pathway))
                f.write(eva('l', node, loc))
                f.write(")\n")
                if protein not in molecules:
                    # The name entry is written once per protein.
                    molecules.add(protein)
                    f.write(eva("n", node, prot_name))
                pathways.add(pathway)
            version = "Uniprot2reactome_pathway_mapping:latest"
            num_pathways = {"Reactome Pathway": len(pathways)}
            metadata.update_meta(version,
                                 ncbi,
                                 script,
                                 prot=len(molecules),
                                 pathways=num_pathways)
        elif "ChEBI" in dataset:
            molecules = set()
            pathways = set()
            for row in data_human.itertuples(index=False):
                chebi = str(row.R_PE_name)
                loc = chebi[chebi.find("[") + 1:chebi.find("]")]
                chebi_name = chebi.split("[" + loc + "]")[0].replace('"', "")
                chebi_id = str(row.db_id)
                pathway = row.pathway
                node = "ChEBI:" + chebi_id
                f.write("(AndLink \n")
                f.write(member(node, pathway))
                f.write(eva('l', node, loc))
                f.write(")\n")
                if chebi_id not in molecules:
                    # The name entry is written once per compound.
                    molecules.add(chebi_id)
                    f.write(eva("n", node, chebi_name))
                pathways.add(pathway)
            version = "Chebi2reactome_pathway_mapping:latest"
            num_pathways = {"Reactome Pathway": len(pathways)}
            metadata.update_meta(version,
                                 ncbi,
                                 script,
                                 chebi=len(molecules),
                                 pathways=num_pathways)
    print("Done")