def import_proteins(gene_level=False):
    """Convert the SMPDB protein dump into Atomese ``.scm`` files.

    Downloads ``smpdb_proteins.csv.zip`` into ``raw_data/`` when it is not
    already there, unpacks it into ``raw_data/smpdb_prot`` and walks every
    extracted CSV.  For each row a gene -> pathway ``CMemberLink`` and a
    gene ``expresses`` protein ``CEvaluationLink`` are written to
    ``dataset/smpdb_protein_<date>.scm``; a ``has_name`` link is written the
    first time a pathway or a protein is seen.  The number of distinct
    genes, proteins and pathways is reported via ``metadata.update_meta``.

    ``script`` is a module-level name that is not defined in this block.

    Args:
        gene_level: when true, the gene -> pathway member links are also
            written to ``gene-level/smpdb_gene_<date>.scm``.
    """
    source = "http://smpdb.ca/downloads/smpdb_proteins.csv.zip"
    # Only membership and the final sizes are needed, so sets replace the
    # original lists (O(1) lookups instead of a scan per row).
    pathways = set()
    proteins = set()
    genes = set()

    os.makedirs("raw_data", exist_ok=True)
    if "smpdb_proteins.csv.zip" not in os.listdir("raw_data/"):
        print("Started downloading smpdb_proteins.csv, It will take some time to download \n")
        wget.download(source, "raw_data")
    # Always (re-)extract so the CSV directory exists even when the archive
    # was downloaded by an earlier, interrupted run.
    with ZipFile("raw_data/smpdb_proteins.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_prot")

    pathway_prot = os.listdir("raw_data/smpdb_prot")
    print("Started importing {} files of smpdb_proteins".format(len(pathway_prot)))

    today = str(date.today())
    os.makedirs("dataset", exist_ok=True)
    g = None
    if gene_level:
        os.makedirs("gene-level", exist_ok=True)
        g = open("gene-level/smpdb_gene_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_protein_{}.scm".format(today), 'w') as f:
            for filename in pathway_prot:
                data = pd.read_csv("raw_data/smpdb_prot/" + filename, low_memory=False)
                # Use the yielded row directly; indexing ``data.iloc`` with
                # the index label only works for a default RangeIndex.
                for _, row in data.iterrows():
                    protein = filter_nan(str(row['Uniprot ID']).split(".")[0].strip())
                    protein_name = filter_nan(str(row['Protein Name']).strip())
                    gene = filter_nan(str(row['Gene Name']).upper().strip())
                    smpdb_id = filter_nan(str(row['SMPDB ID']).strip())
                    smpdb_name = filter_nan(str(row['Pathway Name']).strip())
                    try:
                        member = CMemberLink(CGeneNode(gene), SMPNode(smpdb_id))
                        f.write(member.recursive_print() + "\n")
                        expression = CEvaluationLink(
                            CPredicateNode("expresses"),
                            CListLink(CGeneNode(gene), ProteinNode(protein)))
                        f.write(expression.recursive_print() + "\n")
                        if g is not None:
                            g.write(member.recursive_print() + "\n")
                        if smpdb_id not in pathways:
                            smp_name = CEvaluationLink(
                                CPredicateNode("has_name"),
                                CListLink(SMPNode(smpdb_id), CConceptNode(smpdb_name)))
                            f.write(smp_name.recursive_print() + "\n")
                            pathways.add(smpdb_id)
                        if protein not in proteins:
                            prot_name = CEvaluationLink(
                                CPredicateNode("has_name"),
                                CListLink(ProteinNode(protein), CConceptNode(protein_name)))
                            f.write(prot_name.recursive_print() + "\n")
                            proteins.add(protein)
                        genes.add(gene)
                    except AttributeError:
                        # Rows with missing values are skipped, as before.
                        print("Null value detected")
                        continue
    finally:
        # The gene-level file was never closed in the original.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_proteins: Latest", source, script,
                         genes=len(genes), prot=len(proteins),
                         pathways=num_pathways)
    print("Done. Check dataset/smpdb_protein.scm and gene-level/smpdb_gene.scm")
def import_metabolites(gene_level=False):
    """Convert the SMPDB metabolite dump into Atomese ``.scm`` files.

    Downloads ``smpdb_metabolites.csv.zip`` into ``raw_data/`` when missing,
    unpacks it into ``raw_data/smpdb_chebi`` and, for every row of every
    extracted CSV, writes a ChEBI -> pathway ``CMemberLink`` to
    ``dataset/smpdb_chebi_<date>.scm``.  A ``has_name`` link (IUPAC name) is
    written the first time a ChEBI id is seen.  Distinct ChEBI and pathway
    counts are reported via ``metadata.update_meta``.

    ``script`` is a module-level name that is not defined in this block.

    Args:
        gene_level: when true, the member links (without names) are also
            written to ``gene-level/smpdb_chebi_<date>.scm``.
    """
    source = "http://smpdb.ca/downloads/smpdb_metabolites.csv.zip"
    # Sets: only membership and the final sizes are used.
    pathways = set()
    chebis = set()

    os.makedirs("raw_data", exist_ok=True)
    if "smpdb_metabolites.csv.zip" not in os.listdir("raw_data/"):
        print("Started downloading smpdb_metabolites.csv, it will take some time to download")
        wget.download(source, "raw_data/")
    # Always (re-)extract so the CSV directory is guaranteed to exist.
    with ZipFile("raw_data/smpdb_metabolites.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_chebi")

    pathway_chebi = os.listdir("raw_data/smpdb_chebi")
    print("Started importing {} files of smpdb_metabolites".format(len(pathway_chebi)))

    today = str(date.today())
    os.makedirs("dataset", exist_ok=True)
    # For a gene level dataset, exclude the name
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open("gene-level/smpdb_chebi_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_chebi_{}.scm".format(today), 'w') as f:
            for filename in pathway_chebi:
                data = pd.read_csv("raw_data/smpdb_chebi/" + filename, low_memory=False)
                for _, row in data.iterrows():
                    chebi_id = filter_nan(str(row['ChEBI ID']).split(".")[0].strip())
                    smpdb_id = filter_nan(str(row['SMPDB ID']).strip())
                    chebi_name = filter_nan(str(row['IUPAC']).strip())
                    try:
                        # Only prefix real ids; a falsy id is passed through
                        # unchanged to the node constructor below.
                        if chebi_id:
                            chebi_id = "ChEBI:" + chebi_id
                        member = CMemberLink(ChebiNode(chebi_id), SMPNode(smpdb_id))
                        f.write(member.recursive_print() + "\n")
                        if g is not None:
                            g.write(member.recursive_print() + "\n")
                        if chebi_id not in chebis:
                            ch_name = CEvaluationLink(
                                CPredicateNode("has_name"),
                                CListLink(ChebiNode(chebi_id), CConceptNode(chebi_name)))
                            f.write(ch_name.recursive_print() + "\n")
                            chebis.add(chebi_id)
                        pathways.add(smpdb_id)
                    except AttributeError:
                        # Rows with missing values are skipped, as before.
                        print("Null value detected")
                        continue
    finally:
        # The gene-level file was never closed in the original.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_metabolites: Latest", source, script,
                         chebi=len(chebis), pathways=num_pathways)
    print("Done. Check dataset/smpdb_chebi.scm")
def to_atomese(data):
    """Write gene/protein/BioGRID-id mappings as Atomese links.

    Rows containing any missing value are dropped.  For every remaining row
    with a non-empty gene symbol and UniProt id, three ``CEvaluationLink``
    records are written to ``dataset/biogridgene2uniprot_<date>.scm``:
    gene ``expresses`` protein, protein ``has_biogridID`` and gene
    ``has_biogridID``.  Distinct gene and protein counts are reported via
    ``metadata.update_meta``.

    ``script`` is a module-level name that is not defined in this block.

    Args:
        data: DataFrame with ``gene_symbol``, ``biogrid_id`` and ``uniprot``
            columns (the only columns read here).
    """
    print("importing the data")
    df = data.dropna()
    # Sets: only membership and the final sizes are used.
    genes = set()
    proteins = set()
    os.makedirs('dataset', exist_ok=True)
    output_file = "dataset/biogridgene2uniprot_{}.scm".format(str(date.today()))
    with open(output_file, 'w') as f:
        for _, row in df.iterrows():
            gene = row['gene_symbol'].upper().strip()
            biogrid_id = str(row['biogrid_id'])
            prot = row['uniprot'].strip()
            if not (gene and biogrid_id and prot):
                continue
            genes.add(gene)
            proteins.add(prot)
            expresion = CEvaluationLink(
                CPredicateNode("expresses"),
                CListLink(CGeneNode(gene), ProteinNode(prot)))
            bio_prot = CEvaluationLink(
                CPredicateNode("has_biogridID"),
                CListLink(ProteinNode(prot), CConceptNode("Bio:" + biogrid_id)))
            bio_gene = CEvaluationLink(
                CPredicateNode("has_biogridID"),
                CListLink(CGeneNode(gene), CConceptNode("Bio:" + biogrid_id)))
            # Terminate every link with a newline.  The original joined the
            # three links with "\n" but wrote nothing after the last one, so
            # consecutive rows ran together on one line.
            for link in (expresion, bio_prot, bio_gene):
                f.write(link.recursive_print() + "\n")
    metadata.update_meta("Biogrid-Gene2uniprot:latest",
                         "uniprot2biogrid.csv, gene2biogrid.csv",
                         script,
                         genes=str(len(genes)),
                         prot=len(proteins))
    print("Done, check {}".format(output_file))
def to_atomese(data):
    """Emit gene/protein/BioGRID-id mappings as hand-formatted Atomese text.

    Rows with missing values are dropped first.  Each remaining row with a
    non-empty gene symbol and UniProt id yields three ``EvaluationLink``
    s-expressions (``expresses``, and ``has_biogridID`` for the protein and
    for the gene) in ``dataset/biogridgene2uniprot_<date>.scm``.  Distinct
    gene and protein counts go to ``metadata.update_meta``.

    ``script`` is a module-level name that is not defined in this block.
    """
    print("importing the data")
    cleaned = data.dropna()
    genes = []
    proteins = []
    dataset_dir = os.path.join(os.getcwd(), 'dataset')
    if not os.path.exists(dataset_dir):
        os.makedirs('dataset')
    output_file = "dataset/biogridgene2uniprot_{}.scm".format(str(date.today()))
    with open(output_file, 'w') as out:
        for _, record in cleaned.iterrows():
            gene = record['gene_symbol'].upper().strip()
            biogrid_id = str(record['biogrid_id'])
            prot = record['uniprot'].strip()
            if not (gene and biogrid_id and prot):
                continue
            if gene not in genes:
                genes.append(gene)
            if prot not in proteins:
                proteins.append(prot)

            gene_node = '(GeneNode "' + gene + '")'
            prot_node = '(MoleculeNode "Uniprot:' + prot + '")'
            bio_node = '(ConceptNode "Bio:' + biogrid_id + '")'
            # Same three links, in the same order, as one concatenated text.
            triples = (
                ("expresses", gene_node, prot_node),
                ("has_biogridID", prot_node, bio_node),
                ("has_biogridID", gene_node, bio_node),
            )
            out.write("".join(
                '(EvaluationLink \n'
                + '\t(PredicateNode "' + predicate + '")\n'
                + '\t(ListLink \n'
                + '\t\t' + left + '\n'
                + '\t\t' + right + '))\n\n'
                for predicate, left, right in triples))
    metadata.update_meta("Biogrid-Gene2uniprot:latest",
                         "uniprot2biogrid.csv, gene2biogrid.csv",
                         script,
                         genes=str(len(genes)),
                         prot=len(proteins))
    print("Done, check {}".format(output_file))
def import_data(data, source, version, gene_level=False, form='tab2'):
    """Convert a COVID-19 BioGRID interaction table into Atomese.

    For every row with both official symbols present, and for each
    (gene A, gene B) pair not seen before, this writes ``has_entrez_id``
    links for genes not yet recorded, a gene ``interacts_with`` link
    (carrying a ``CStv`` when the score is numeric) and ``from_organism``
    links for interactors whose taxonomy id is 2697049.  For each
    (protein A, protein B) pair not seen before it writes a protein
    ``interacts_with`` link and calls ``add_protein_interaction``.  Output
    goes to ``dataset/COVID-19-biogrid_<version>_<date>.scm``; the names of
    the genes with taxonomy id 2697049 are written to ``Covid19-genes``.

    Args:
        data: DataFrame with the BioGRID columns read below.
        source: description of the data source, forwarded to the metadata.
        version: BioGRID version string, used in file names and metadata.
        gene_level: when true, gene-level links are also written to
            ``gene-level/COVID-19-biogrid_<version>_gene-level_<date>.scm``.
        form: accepted for interface compatibility; not used here.
    """
    # Set the gene_level to True to get only the GGI without extra entrez and pubmedID info
    print("started importing")
    sars_cov_2_taxid = 2697049
    today = str(date.today())

    os.makedirs('dataset', exist_ok=True)
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open('gene-level/COVID-19-biogrid_' + version + "_gene-level_"
                 + today + '.scm', 'w')

    # Sets give O(1) "already seen" checks; only sizes are reported.
    gene_pairs = set()
    protein_pairs = set()
    entrez = set()
    covid_genes = set()
    # Handed to add_protein_interaction on every new protein pair;
    # presumably that helper appends to it -- confirm against its definition.
    proteins = []

    try:
        with open('dataset/COVID-19-biogrid_' + version + "_" + today + '.scm',
                  'w') as f:
            for _, row in data.iterrows():
                if (pd.isnull(row['Official Symbol Interactor A'])
                        or pd.isnull(row['Official Symbol Interactor B'])):
                    continue

                gene1 = str(row['Official Symbol Interactor A']).upper().strip()
                gene2 = str(row['Official Symbol Interactor B']).upper().strip()
                prot1 = str(row['SWISS-PROT Accessions Interactor A']).strip()
                prot2 = str(row['SWISS-PROT Accessions Interactor B']).strip()
                score = row['Score']
                entrez1 = str(row['Entrez Gene Interactor A']).strip()
                entrez2 = str(row['Entrez Gene Interactor B']).strip()
                taxonomy_id_1 = int(row['Organism Interactor A'])
                taxonomy_id_2 = int(row['Organism Interactor B'])

                gene_node_1 = CGeneNode(gene1)
                gene_node_2 = CGeneNode(gene2)
                prot_node_1 = ProteinNode(prot1)
                prot_node_2 = ProteinNode(prot2)

                # "-" and NaN both mean "no score": leave the link without stv.
                stv_node = None
                if str(score) not in ["-", "nan"]:
                    stv_node = CStv(1.0, round(float(score), 3))

                if (gene1, gene2) not in gene_pairs:
                    for gene, gene_node, entrez_id in (
                            (gene1, gene_node_1, entrez1),
                            (gene2, gene_node_2, entrez2)):
                        if gene not in entrez:
                            entrez_ln = CEvaluationLink(
                                CPredicateNode("has_entrez_id"),
                                CListLink(gene_node, Entrez(entrez_id)))
                            f.write(entrez_ln.recursive_print() + "\n")
                            entrez.add(gene)

                    interacts_ln = CEvaluationLink(
                        CPredicateNode("interacts_with"),
                        CSetLink(gene_node_1, gene_node_2),
                        stv=stv_node)
                    f.write(interacts_ln.recursive_print() + "\n")
                    if g is not None:
                        g.write(interacts_ln.recursive_print() + "\n")

                    for taxonomy_id, gene, gene_node, prot_node in (
                            (taxonomy_id_1, gene1, gene_node_1, prot_node_1),
                            (taxonomy_id_2, gene2, gene_node_2, prot_node_2)):
                        if taxonomy_id != sars_cov_2_taxid:
                            continue
                        covid_genes.add(gene)
                        gene_organism_ln = CEvaluationLink(
                            CPredicateNode("from_organism"),
                            CListLink(
                                gene_node,
                                NcbiTaxonomy("taxid:{}".format(str(taxonomy_id)))))
                        prot_organism_ln = CEvaluationLink(
                            CPredicateNode("from_organism"),
                            CListLink(
                                prot_node,
                                NcbiTaxonomy("taxid:{}".format(str(taxonomy_id)))))
                        f.write(gene_organism_ln.recursive_print() + "\n")
                        f.write(prot_organism_ln.recursive_print() + "\n")
                        if g is not None:
                            g.write(gene_organism_ln.recursive_print() + "\n")
                    gene_pairs.add((gene1, gene2))

                if (prot1, prot2) not in protein_pairs:
                    interacts_ln = CEvaluationLink(
                        CPredicateNode("interacts_with"),
                        CSetLink(prot_node_1, prot_node_2),
                        stv=stv_node)
                    f.write(interacts_ln.recursive_print() + "\n")
                    bio_1 = str(row['BioGRID ID Interactor A']).strip()
                    bio_2 = str(row['BioGRID ID Interactor B']).strip()
                    add_protein_interaction(proteins, prot_node_1, gene_node_1,
                                            prot_node_2, gene_node_2,
                                            bio_1, bio_2, f)
                    protein_pairs.add((prot1, prot2))

            org_name_ln = CEvaluationLink(
                CPredicateNode("has_name"),
                CListLink(NcbiTaxonomy("taxid:2697049"),
                          CConceptNode("SARS-CoV-2")))
            f.write(org_name_ln.recursive_print() + "\n")
            # The original wrote to ``g`` unconditionally, which raised
            # NameError whenever gene_level was False (the default).
            if g is not None:
                g.write(org_name_ln.recursive_print() + "\n")
    finally:
        # The gene-level file was never closed in the original.
        if g is not None:
            g.close()

    # (A, B) and (B, A) describe the same interaction: count them once.
    number_of_interactions = len(
        {(a, b) if a <= b else (b, a) for a, b in gene_pairs})
    script = "https://github.com/MOZI-AI/knowledge-import/coronavirus_biogrid.py"
    metadata.update_meta("Coronavirus Biogrid:" + version,
                         source,
                         script,
                         genes=str(len(entrez)),
                         prot=len(set(proteins)),
                         interactions=str(number_of_interactions))
    print("Done, check " + 'dataset/COVID-19-biogrid_' + version + "_"
          + str(date.today()) + '.scm')
    with open("Covid19-genes", "w") as co:
        # Sorted so the file is reproducible; set order was arbitrary before.
        co.write("\n".join(sorted(covid_genes)))
# Import UniProt -> GO annotations from the human GOA file (GAF format).
# ``dataset_url`` must be supplied by the enclosing scope; the header of
# this routine is not part of the visible source.
prot = []
go = []
gaf_path = 'raw_data/goa_human.gaf.gz'
if not os.path.isfile(gaf_path):
    print("Downloading dataset")
    gaf_path = wget.download(dataset_url, "raw_data/")
# The cached file is gzip-compressed as well: the original read it with a
# plain open(), which cannot yield GAF text.  Both paths now decompress,
# and the handle is closed afterwards.
with gzip.open(gaf_path) as gaf:
    lines = [raw.decode("utf-8") for raw in gaf.readlines()]

with open("raw_data/go-namespace.json", "r") as ns:
    go_namespace = json.load(ns)
# Remember the size so the cache is rewritten only when it has grown.
init_namespace = len(go_namespace)

with open("dataset/uniprot2GO_{}.scm".format(str(date.today())), 'w') as f:
    print("\nStarted importing")
    for line in lines:
        if 'UniProtKB' not in line:
            continue
        fields = line.split('\t')
        # Column 5 (index 4) is passed to find_type as the GO id, column 2
        # (index 1) is used as the protein id.
        go_namespace, go_term = find_gons.find_type(fields[4], go_namespace)
        protein = ProteinNode(fields[1])
        if go_term:
            f.write(CMemberLink(protein, go_term).recursive_print() + "\n")
            prot.append(fields[1])
            go.append(go_term)

if len(go_namespace) > init_namespace:
    with open("raw_data/go-namespace.json", "w") as ns:
        json.dump(go_namespace, ns, indent=2)

script = "https://github.com/MOZI-AI/knowledge-import/uniprot2GO.py"
metadata.update_meta("Uniprot-GO:latest", dataset_url, script,
                     prot=len(set(prot)),
                     goterms={"go-terms": len(set(go))})
print("Done, check dataset/uniprot2GO.scm")
for i in range(len(df)): rna = df.iloc[i]["transcript_stable_id"] gene = df.iloc[i]['Approved symbol'].strip().upper() prot = df.iloc[i]["xref"] rnas.append(rna) genes.append(gene) proteins.append(prot) if gene: trans = CEvaluationLink(CPredicateNode("transcribed_to"), CListLink(CGeneNode(gene), CRNANode(rna))) f.write(trans.recursive_print() + "\n") if rna: trans = CEvaluationLink( CPredicateNode("translated_to"), CListLink(CRNANode(rna), ProteinNode(prot))) f.write(trans.recursive_print() + "\n") expr = CEvaluationLink( CPredicateNode("expresses"), CListLink(CGeneNode(gene), ProteinNode(prot))) f.write(expr.recursive_print() + "\n") version = dataset.split(".")[1] script = "https://github.com/MOZI-AI/knowledge-import/codingRNA.py" metadata.update_meta("codingRNA:{}".format(version), dataset, script, genes=len(set(genes)), rna=len(set(rnas))) print("Done")
os.makedirs('gene-level') if not os.path.exists(os.path.join(os.getcwd(), 'dataset')): os.makedirs('dataset') f_annotation = open('dataset/GO_annotation_{}.scm'.format(str(date.today())), 'a') g_annotation = open('gene-level/GO_annotation_gene-level_{}.scm'.format(str(date.today())), 'a') #add GOC Validation Date f_annotation.write(";"+((lines[0]).split('!')[1]).split('$')[0]+ "\n") f_annotation.write(";"+((lines[1]).split('!')[1]).split('$')[0]+ "\n\n") genes = [] go = [] #loop through lines for l in lines_annotate: db_object_symbol =l.split('\t')[2] go_id = (l.split('\t')[4]).split(':')[1] qualifier = l.split('\t')[3] gene_name = l.split('\t')[9] memberLink(db_object_symbol,go_id,qualifier) go.append(go_id) if not db_object_symbol in genes: genes.append(db_object_symbol) evaLink(db_object_symbol,gene_name, qualifier) f_annotation.close() g_annotation.close() script = "https://github.com/MOZI-AI/knowledge-import/GO_Annotation_scm.py" metadata.update_meta("GO_Annotation:latest", source,script,genes=len(genes), goterms={"go-terms":len(set(go))}) print("Done, check dataset/GO_annotation.scm and gene-level/GO_annotation.scm")
# Write gene -> protein ("expresses") and gene -> entrez id mappings.
# ``data``, ``expres`` and ``script`` must be supplied by the enclosing
# scope; the header of this routine is not part of the visible source.
prot = []
genes = []
os.makedirs('dataset', exist_ok=True)
with open("dataset/entrez_to_protein_{}.scm".format(str(date.today())), 'w') as f:
    for _, row in data.iterrows():
        try:
            g = row['symbol']
            p = row['uniprot'].strip()
            genes.append(g)
            prot.append(p)
            f.write(
                expres("expresses",
                       '(GeneNode ' + '"' + g + '")\n',
                       '(MoleculeNode "' + 'Uniprot:' + p + '")\n'))
        except (AttributeError, TypeError):
            # A missing symbol/uniprot value is a float NaN: ``.strip()`` or
            # the string concatenation fails and the row is skipped.  The
            # original bare ``except:`` also hid unrelated errors and
            # KeyboardInterrupt.
            continue
        if not math.isnan(row['entrez']):
            f.write(
                expres("has_entrez_id",
                       '(GeneNode ' + '"' + g + '")\n',
                       '(ConceptNode "' + 'entrez:' + str(int(row['entrez'])) + '")\n'))
metadata.update_meta("gene2proteinMapping:latest",
                     "entrez2uniprot.csv",
                     script,
                     genes=len(set(genes)),
                     prot=len(set(prot)))
print("Done")
goterm[namespace].append(idd) # go_definition(idd, definition) # if len(synonym) != 0: # sy_len = 0 # while sy_len < len(synonym): # go_synonyms(idd, synonym[sy_len], synonym_type[sy_len]) # sy_len = sy_len + 1 if len(is_a) != 0: isa_len = 0 while isa_len < len(is_a): go_isa(idd, is_a[isa_len]) isa_len = isa_len + 1 # if len(alt_id) != 0: # altid_len = 0 # while altid_len < len(alt_id): # go_altid(idd, alt_id[altid_len]) # altid_len = altid_len + 1 # if len(relationship) != 0: # parts_len = 0 # while parts_len < len(relationship): # go_relationship(idd, relationship[parts_len], relationship_type[parts_len]) # parts_len = parts_len + 1 i= i + 1 f_go.close() ns = {} for k in goterm.keys(): ns[k] = len(set(goterm[k])) script = "https://github.com/MOZI-AI/agi-bio/blob/master/knowledge-import/SNET/GO_scm.py" metadata.update_meta("GO Obo:latest", source,script,goterms=ns) print("Done, check dataset/GO.scm")
def import_metabolites(gene_level=False):
    """Convert the SMPDB metabolite dump into Atomese text via ``atomese``.

    Downloads ``smpdb_metabolites.csv.zip`` into ``raw_data/`` when missing,
    unpacks it into ``raw_data/smpdb_chebi`` and, for every row of every
    extracted CSV, writes a ChEBI/pathway record and a ``has_name`` record
    (IUPAC name) to ``dataset/smpdb_chebi_<date>.scm``.  Distinct ChEBI and
    pathway counts are reported via ``metadata.update_meta``.

    ``script`` is a module-level name that is not defined in this block.

    Args:
        gene_level: when true, the ChEBI/pathway records (without names)
            are also written to ``gene-level/smpdb_chebi_<date>.scm``.
    """
    source = "http://smpdb.ca/downloads/smpdb_metabolites.csv.zip"
    # Sets: only membership and the final sizes are used.
    pathways = set()
    chebis = set()

    os.makedirs("raw_data", exist_ok=True)
    if "smpdb_metabolites.csv.zip" not in os.listdir("raw_data/"):
        print(
            "Started downloading smpdb_metabolites.csv, it will take some time to download"
        )
        wget.download(source, "raw_data/")
    # Always (re-)extract so the CSV directory is guaranteed to exist.
    with ZipFile("raw_data/smpdb_metabolites.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_chebi")

    pathway_chebi = os.listdir("raw_data/smpdb_chebi")
    print("Started importing {} files of smpdb_metabolites".format(
        len(pathway_chebi)))

    today = str(date.today())
    os.makedirs("dataset", exist_ok=True)
    # For a gene level dataset, exclude the name
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open("gene-level/smpdb_chebi_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_chebi_{}.scm".format(today), 'w') as f:
            for filename in pathway_chebi:
                data = pd.read_csv("raw_data/smpdb_chebi/" + filename,
                                   low_memory=False)
                for _, row in data.iterrows():
                    chebi_id = str(row['ChEBI ID']).split(".")[0].strip()
                    smpdb_id = str(row['SMPDB ID']).strip()
                    chebi_name = str(row['IUPAC']).strip()
                    chebis.add(chebi_id)
                    pathways.add(smpdb_id)

                    member_text = atomese(chebi_id,
                                          'MoleculeNode',
                                          smpdb_id,
                                          'ConceptNode',
                                          node1_prefix='ChEBI:')
                    f.write(member_text)
                    # The original wrote to ``g`` unconditionally, which
                    # raised NameError with the default gene_level=False.
                    if g is not None:
                        g.write(member_text)
                    f.write(
                        atomese(chebi_id,
                                'MoleculeNode',
                                chebi_name,
                                'ConceptNode',
                                node1_prefix='ChEBI:',
                                predicate='has_name'))
    finally:
        # The gene-level file was never closed in the original.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_metabolites: Latest",
                         source,
                         script,
                         chebi=len(chebis),
                         pathways=num_pathways)
    print("Done. Check dataset/smpdb_chebi.scm")
def import_proteins(gene_level=False):
    """Convert the SMPDB protein dump into Atomese text via ``atomese``.

    Downloads ``smpdb_proteins.csv.zip`` into ``raw_data/`` when missing,
    unpacks it into ``raw_data/smpdb_prot`` and, for every row of every
    extracted CSV, writes five records to
    ``dataset/smpdb_protein_<date>.scm``: gene ``expresses`` protein,
    gene/pathway, protein/pathway, pathway ``has_name`` and protein
    ``has_name``.  Distinct gene, protein and pathway counts are reported
    via ``metadata.update_meta``.

    ``script`` is a module-level name that is not defined in this block.

    Args:
        gene_level: when true, the gene/pathway records are also written to
            ``gene-level/smpdb_gene_<date>.scm``.
    """
    source = "http://smpdb.ca/downloads/smpdb_proteins.csv.zip"
    # Sets: only membership and the final sizes are used.
    pathways = set()
    proteins = set()
    genes = set()

    os.makedirs("raw_data", exist_ok=True)
    if "smpdb_proteins.csv.zip" not in os.listdir("raw_data/"):
        print(
            "Started downloading smpdb_proteins.csv, It will take some time to download \n"
        )
        wget.download(source, "raw_data")
    # Always (re-)extract so the CSV directory is guaranteed to exist.
    with ZipFile("raw_data/smpdb_proteins.csv.zip") as archive:
        archive.extractall("raw_data/smpdb_prot")

    pathway_prot = os.listdir("raw_data/smpdb_prot")
    print("Started importing {} files of smpdb_proteins".format(
        len(pathway_prot)))

    today = str(date.today())
    os.makedirs("dataset", exist_ok=True)
    g = None
    if gene_level:
        # The original assumed this directory already existed.
        os.makedirs("gene-level", exist_ok=True)
        g = open("gene-level/smpdb_gene_{}.scm".format(today), "w")

    try:
        with open("dataset/smpdb_protein_{}.scm".format(today), 'w') as f:
            for filename in pathway_prot:
                data = pd.read_csv("raw_data/smpdb_prot/" + filename,
                                   low_memory=False)
                # Use the yielded row directly; indexing ``data.iloc`` with
                # the index label only works for a default RangeIndex.
                for _, row in data.iterrows():
                    protein = str(row['Uniprot ID']).split(".")[0].strip()
                    protein_name = str(row['Protein Name']).strip()
                    gene = str(row['Gene Name']).upper().strip()
                    smpdb_id = str(row['SMPDB ID']).strip()
                    smpdb_name = str(row['Pathway Name']).strip()
                    proteins.add(protein)
                    genes.add(gene)
                    pathways.add(smpdb_id)

                    f.write(
                        atomese(gene,
                                'GeneNode',
                                protein,
                                'MoleculeNode',
                                node2_prefix='Uniprot:',
                                predicate='expresses'))
                    gene_member_text = atomese(gene, 'GeneNode', smpdb_id,
                                               'ConceptNode')
                    f.write(gene_member_text)
                    if g is not None:
                        g.write(gene_member_text)
                    f.write(
                        atomese(protein,
                                'MoleculeNode',
                                smpdb_id,
                                'ConceptNode',
                                node1_prefix='Uniprot:'))
                    f.write(
                        atomese(smpdb_id,
                                'ConceptNode',
                                smpdb_name,
                                'ConceptNode',
                                predicate='has_name'))
                    f.write(
                        atomese(protein,
                                'MoleculeNode',
                                protein_name,
                                'ConceptNode',
                                node1_prefix='Uniprot:',
                                predicate='has_name'))
    finally:
        # The gene-level file was never closed in the original.
        if g is not None:
            g.close()

    num_pathways = {"SMPDB Pathway": len(pathways)}
    metadata.update_meta("smpdb_proteins: Latest",
                         source,
                         script,
                         genes=len(genes),
                         prot=len(proteins),
                         pathways=num_pathways)
    print(
        "Done. Check dataset/smpdb_protein.scm and gene-level/smpdb_gene.scm")
def import_data(data, source, version, gene_level=False, form='tab2'):
    """Convert a BioGRID gene-gene interaction table into Atomese.

    While scanning the rows, a ``has_entrez_id`` link is written for every
    gene symbol the first time it appears, and the publication links of
    each unordered gene pair are collected.  Afterwards one
    ``has_pubmedID`` link per pair (wrapping the ``interacts_with`` link and
    the list of publication nodes) is written to
    ``dataset/biogrid_gene_gene_<version>_<date>.scm``.

    Args:
        data: DataFrame holding the BioGRID columns selected below.
        source: description of the data source, forwarded to the metadata.
        version: BioGRID version string, used in file names and metadata.
        gene_level: when true, the ``has_pubmedID`` links are also written
            to ``gene-level/biogrid_gene_gene_<version>_gene-level_<date>.scm``.
        form: ``'tab2'`` or ``'tab3'``; selects the publication column.

    Raises:
        RuntimeError: if ``form`` is neither ``'tab2'`` nor ``'tab3'``.
    """
    # Set the gene_level to True to get only the GGI without extra entrez and pubmedID info
    if form == 'tab2':
        pubsource = 'Pubmed ID'
    elif form == 'tab3':
        pubsource = 'Publication Source'
    else:
        raise RuntimeError("format {0} is not yet supported".format(form))

    data = data[[
        'Entrez Gene Interactor A', 'Entrez Gene Interactor B',
        'Official Symbol Interactor A', 'Official Symbol Interactor B',
        pubsource
    ]]
    print("started importing")
    today = str(date.today())
    dataset_path = os.path.join(os.getcwd(), 'dataset')
    os.makedirs(dataset_path, exist_ok=True)
    g = None
    if gene_level:
        os.makedirs('gene-level', exist_ok=True)
        g = open(
            'gene-level/biogrid_gene_gene_' + version + "_gene-level_"
            + today + '.scm', 'w')
    biogrid_path = os.path.join(
        dataset_path, 'biogrid_gene_gene_' + version + '_' + today + '.scm')

    # Keyed by an ordered (symbol, symbol) tuple.  The original joined the
    # symbols with ':' and split the key again later, which breaks for a
    # symbol that itself contains a colon.
    pairs = collections.defaultdict(list)
    entrez = set()
    number_of_genes = set()

    try:
        with open(biogrid_path, 'w') as f:
            for _, row in data.iterrows():
                if (pd.isnull(row['Official Symbol Interactor A'])
                        or pd.isnull(row['Official Symbol Interactor B'])):
                    continue
                node1 = str(row['Official Symbol Interactor A']).upper().strip()
                node2 = str(row['Official Symbol Interactor B']).upper().strip()
                gene_1 = CGeneNode(node1)
                gene_2 = CGeneNode(node2)
                pubmed = row[pubsource]
                # Larger symbol first, so A-B and B-A share one key.
                if node1 > node2:
                    interactors = (node1, node2)
                else:
                    interactors = (node2, node1)
                pubmed_link = 'https://www.ncbi.nlm.nih.gov/pubmed/?term=' + str(
                    pubmed)
                pairs[interactors].append(CConceptNode(pubmed_link))

                for symbol, gene_node, column in (
                        (node1, gene_1, 'Entrez Gene Interactor A'),
                        (node2, gene_2, 'Entrez Gene Interactor B')):
                    if symbol not in entrez:
                        eval_ln = CEvaluationLink(
                            CPredicateNode("has_entrez_id"),
                            CListLink(gene_node, Entrez(str(row[column]))))
                        f.write(eval_ln.recursive_print() + "\n")
                        entrez.add(symbol)

            for (symbol_1, symbol_2), pubmed_nodes in pairs.items():
                gene_1 = CGeneNode(symbol_1)
                gene_2 = CGeneNode(symbol_2)
                interacts_ln = CEvaluationLink(
                    CPredicateNode("interacts_with"),
                    CSetLink(gene_1, gene_2))
                eval_ln = CEvaluationLink(
                    CPredicateNode("has_pubmedID"),
                    CListLink(interacts_ln, CListLink(*pubmed_nodes)))
                f.write(eval_ln.recursive_print() + "\n")
                if g is not None:
                    g.write(eval_ln.recursive_print() + "\n")
                number_of_genes.add(gene_1.name)
                number_of_genes.add(gene_2.name)
    finally:
        # The gene-level file was never closed in the original.
        if g is not None:
            g.close()

    number_of_interactions = len(pairs)
    script = "https://github.com/MOZI-AI/knowledge-import/biogrid.py"
    metadata.update_meta("Biogrid:" + version,
                         source,
                         script,
                         genes=str(len(number_of_genes)),
                         interactions=str(number_of_interactions))
    print("Done, check " + 'dataset/biogrid_gene_gene_' + version + "_"
          + str(date.today()) + '.scm')
names=["ID", "name", "Species"]) pathway_list = pathway_list[pathway_list['Species'] == 'H**o sapiens'] max_len = max(len(pathway_list), len(pathway_relation)) print("Started importing") script = "https://github.com/MOZI-AI/agi-bio/blob/master/knowledge-import/SNET/reactome_pathway.py" pathways = pathway_relation['parent'].values + pathway_relation['child'].values if not os.path.exists(os.path.join(os.getcwd(), 'dataset')): os.makedirs('dataset') with open("dataset/reactome.scm", 'a') as f: for i in range(max_len): try: f.write( eva(pathway_list.iloc[i]['name'], pathway_list.iloc[i]['ID'])) f.write( inherit(pathway_relation.iloc[i]['parent'], pathway_relation.iloc[i]['child'])) except IndexError: f.write( inherit(pathway_relation.iloc[i]['parent'], pathway_relation.iloc[i]['child'])) num_pathways = {"Reactome Pathway": len(set(pathways))} metadata.update_meta("Reactome Pathways relationship:latest", pathway_rln + " " + pathway, script, pathways=num_pathways) print("Done")
def import_dataset(dataset, delim, without_location=False):
    """Convert a Reactome mapping file (NCBI, UniProt or ChEBI) to Atomese.

    The kind of file is decided by the substring ``NCBI``, ``UniProt`` or
    ``ChEBI`` in ``dataset``.  Only rows whose species is Homo sapiens are
    used.  For each one a ``CContextLink`` combining the pathway
    ``CMemberLink`` with a ``has_location`` link is written to
    ``dataset/<file name>_<date>.scm``; UniProt and ChEBI entries also get
    a ``has_name`` link the first time they are seen.  Counts are reported
    via ``metadata.update_meta``.

    ``ncbi`` and ``script`` are module-level names not defined in this
    block.

    Args:
        dataset: path of the Reactome mapping file.
        delim: column delimiter of that file.
        without_location: when true, the bare member links are also written
            to ``gene-level-without-location/<file name>_without_location_<date>.scm``.
    """
    print("Started importing " + dataset)
    columns = [
        "db_id", "R_PE_id", "R_PE_name", "pathway", "url", "event_name",
        "evidence_code", "species"
    ]
    if "UniProt" in dataset or "ChEBI" in dataset:
        # These dumps are read with six extra trailing columns, unused here.
        columns = columns + ["un1", "un2", "un3", "un4", "un5", "un6"]
    data = pd.read_csv(dataset, low_memory=False, delimiter=delim,
                       names=columns)

    # Take only symbols of Human species.  The species literal had been
    # mangled to 'H**o sapiens', which matches no row.
    data_human = data[data['species'] == 'Homo sapiens'][[
        'db_id', 'R_PE_name', 'pathway'
    ]]

    base_name = dataset.split("/")[-1]
    today = str(date.today())
    file_name = None
    if without_location:
        os.makedirs('gene-level-without-location', exist_ok=True)
        file_name = open(
            "gene-level-without-location/" + base_name
            + "_without_location_{}.scm".format(today), "w")
    os.makedirs('dataset', exist_ok=True)

    try:
        with open("dataset/" + base_name + "_{}.scm".format(today), 'w') as f:
            if "NCBI" in dataset:
                # Only the NCBI branch needs the entrez mapping; the
                # original read it for every kind of dataset.
                mapping_entrez = pd.read_csv("raw_data/entrez.txt",
                                             low_memory=False,
                                             sep="\t")
                gene_symbols = set(mapping_entrez["Approved symbol"].values)
                genes = set()
                pathways = set()
                infered = {}  # db_id -> symbol accepted from the entity name
                for _, row in data_human.iterrows():
                    gene_sym, location = find_location(row['R_PE_name'])
                    pathway = row['pathway']
                    db_id = row['db_id']
                    try:
                        gene = mapping_entrez[
                            mapping_entrez["NCBI Gene ID"] == int(
                                db_id)]["Approved symbol"].values[0]
                    except (ValueError, TypeError, IndexError):
                        # No usable entrez mapping (non-numeric id or no
                        # match): fall back to the entity name.
                        key = str(db_id)
                        if len(gene_sym.split(" ")) > 1:
                            # Multi-word names are only usable when a clean
                            # symbol was already inferred for this id.
                            if key not in infered:
                                continue
                            gene = infered[key]
                        elif gene_sym in gene_symbols:
                            gene = gene_sym
                            infered[key] = gene
                        else:
                            continue
                    if gene.isdigit() or len(gene) == 1 or gene in ["", " "]:
                        continue
                    gene = gene.strip()
                    member = CMemberLink(CGeneNode(gene),
                                         ReactomeNode(pathway))
                    eva = CEvaluationLink(
                        CPredicateNode("has_location"),
                        CListLink(CGeneNode(gene), CConceptNode(location)))
                    cont = CContextLink(member, eva)
                    f.write(cont.recursive_print() + "\n")
                    if file_name is not None:
                        file_name.write(member.recursive_print() + "\n")
                    genes.add(gene)
                    pathways.add(pathway)
                version = "NCBI2reactome_pathway_mapping:latest"
                num_pathways = {"Reactome Pathway": len(pathways)}
                metadata.update_meta(version,
                                     ncbi,
                                     script,
                                     genes=len(genes),
                                     pathways=num_pathways)
            elif "UniProt" in dataset:
                molecules = set()
                pathways = set()
                for _, row in data_human.iterrows():
                    prot = str(row['R_PE_name'])
                    # "name [location]": the bracketed part is the location.
                    loc = prot[prot.find("[") + 1:prot.find("]")]
                    prot_name = prot.split("[" + loc + "]")[0]
                    pathway = row['pathway']
                    # Last non-numeric piece of the '-'-separated db_id.
                    protein = [
                        part for part in str(row['db_id']).split("-")
                        if not part.strip().isdigit()
                    ][-1]
                    protein = protein.strip()
                    member = CMemberLink(ProteinNode(protein),
                                         ReactomeNode(pathway))
                    eva_loc = CEvaluationLink(
                        CPredicateNode("has_location"),
                        CListLink(ProteinNode(protein), CConceptNode(loc)))
                    eva_name = CEvaluationLink(
                        CPredicateNode("has_name"),
                        CListLink(ProteinNode(protein),
                                  CConceptNode(prot_name)))
                    cont = CContextLink(member, eva_loc)
                    f.write(cont.recursive_print() + "\n")
                    if file_name is not None:
                        file_name.write(member.recursive_print() + "\n")
                    if protein not in molecules:
                        molecules.add(protein)
                        f.write(eva_name.recursive_print() + "\n")
                    pathways.add(pathway)
                version = "Uniprot2reactome_pathway_mapping:latest"
                num_pathways = {"Reactome Pathway": len(pathways)}
                metadata.update_meta(version,
                                     ncbi,
                                     script,
                                     prot=len(molecules),
                                     pathways=num_pathways)
            elif "ChEBI" in dataset:
                molecules = set()
                pathways = set()
                for _, row in data_human.iterrows():
                    chebi = str(row['R_PE_name'])
                    loc = chebi[chebi.find("[") + 1:chebi.find("]")]
                    chebi_name = chebi.split("[" + loc + "]")[0].replace('"', "")
                    chebi_id = str(row['db_id'])
                    # Value comparison: the original used ``is "nan"``, an
                    # identity test that is always true here, so missing ids
                    # were prefixed to "ChEBI:nan".
                    if chebi_id != "nan":
                        chebi_id = "ChEBI:" + str(chebi_id.strip())
                    pathway = row['pathway']
                    member = CMemberLink(ChebiNode(chebi_id),
                                         ReactomeNode(pathway))
                    eva_loc = CEvaluationLink(
                        CPredicateNode("has_location"),
                        CListLink(ChebiNode(chebi_id), CConceptNode(loc)))
                    eva_name = CEvaluationLink(
                        CPredicateNode("has_name"),
                        CListLink(ChebiNode(chebi_id),
                                  CConceptNode(chebi_name)))
                    cont = CContextLink(member, eva_loc)
                    f.write(cont.recursive_print() + "\n")
                    if file_name is not None:
                        file_name.write(member.recursive_print() + "\n")
                    if chebi_id not in molecules:
                        molecules.add(chebi_id)
                        f.write(eva_name.recursive_print() + "\n")
                    pathways.add(pathway)
                version = "Chebi2reactome_pathway_mapping:latest"
                num_pathways = {"Reactome Pathway": len(pathways)}
                metadata.update_meta(version,
                                     ncbi,
                                     script,
                                     chebi=len(molecules),
                                     pathways=num_pathways)
    finally:
        # The without-location file was never closed in the original.
        if file_name is not None:
            file_name.close()
    print("Done")
def import_dataset(dataset, delim):
    """Convert a Reactome mapping file (NCBI, UniProt or ChEBI) to Atomese.

    The kind of file is decided by the substring ``NCBI``, ``UniProt`` or
    ``ChEBI`` in ``dataset``.  Only rows whose species is Homo sapiens are
    used.  For each one an ``AndLink`` wrapping the ``member`` and
    ``eva('l', ...)`` records is written to ``dataset/<file name>.scm``;
    UniProt and ChEBI entries also get an ``eva('n', ...)`` record the
    first time they are seen.  Counts are reported via
    ``metadata.update_meta``.

    ``ncbi``, ``script``, ``member``, ``eva``, ``find_location`` and
    ``findstem`` are names defined outside this block.

    Args:
        dataset: path of the Reactome mapping file.
        delim: column delimiter of that file.
    """
    print("Started importing " + dataset)
    columns = [
        "db_id", "R_PE_id", "R_PE_name", "pathway", "url", "event_name",
        "evidence_code", "species"
    ]
    if "UniProt" in dataset or "ChEBI" in dataset:
        # These dumps are read with six extra trailing columns, unused here.
        columns = columns + ["un1", "un2", "un3", "un4", "un5", "un6"]
    data = pd.read_csv(dataset, low_memory=False, delimiter=delim,
                       names=columns)

    # Take only symbols of Human species.  The species literal had been
    # mangled to 'H**o sapiens', which matches no row.
    data_human = data[data['species'] == 'Homo sapiens'][[
        'db_id', 'R_PE_name', 'pathway'
    ]]
    os.makedirs('dataset', exist_ok=True)

    with open("dataset/" + dataset.split("/")[-1] + ".scm", 'w') as f:
        if "NCBI" in dataset:
            genes = set()
            pathways = set()
            db_ids = {}  # db_id -> symbol derived for multi-word names
            for _, row in data_human.iterrows():
                gene, location = find_location(row['R_PE_name'])
                pathway = row['pathway']
                db_id = row['db_id']
                # If a gene symbol is not one word, collect all gene symbols of the same db_id
                # and find the common word in the list (which is the gene symbol in most cases)
                # e.g "proKLK5" "KLK5" "propeptide KLK5"
                if len(gene.split(" ")) > 1:
                    if db_id in db_ids:
                        gene = db_ids[db_id]
                    else:
                        names = data_human[data_human['db_id'] ==
                                           db_id]['R_PE_name'].values
                        gene_symbols = [
                            find_location(name, True) for name in names
                        ]
                        if len(set(gene_symbols)) > 1:
                            stemed = findstem(gene_symbols)
                        else:
                            stemed = gene_symbols[-1]
                        # The original guarded this with
                        # ``isdigit() and in ["", " "] and len == 1``, which
                        # can never hold, so every stem was cached.  Invalid
                        # stems are rejected by the check below.
                        db_ids[db_id] = stemed
                        gene = stemed
                if gene.isdigit() or len(gene) == 1 or gene in ["", " "]:
                    continue
                f.write("(AndLink\n")
                f.write(member(gene, pathway))
                f.write(eva('l', gene, location))
                f.write(")\n")
                genes.add(gene)
                pathways.add(pathway)
            version = "NCBI2reactome_pathway_mapping:latest"
            num_pathways = {"Reactome Pathway": len(pathways)}
            metadata.update_meta(version,
                                 ncbi,
                                 script,
                                 genes=len(genes),
                                 pathways=num_pathways)
        elif "UniProt" in dataset:
            molecules = set()
            pathways = set()
            for _, row in data_human.iterrows():
                prot = str(row['R_PE_name'])
                # "name [location]": the bracketed part is the location.
                loc = prot[prot.find("[") + 1:prot.find("]")]
                prot_name = prot.split("[" + loc + "]")[0]
                pathway = row['pathway']
                # Last non-numeric piece of the '-'-separated db_id.
                protein = [
                    part for part in str(row['db_id']).split("-")
                    if not part.strip().isdigit()
                ][-1]
                f.write("(AndLink\n")
                f.write(member("Uniprot:" + str(protein), pathway))
                f.write(eva('l', "Uniprot:" + str(protein), loc))
                f.write(")\n")
                if protein not in molecules:
                    molecules.add(protein)
                    f.write(eva("n", "Uniprot:" + str(protein), prot_name))
                pathways.add(pathway)
            version = "Uniprot2reactome_pathway_mapping:latest"
            num_pathways = {"Reactome Pathway": len(pathways)}
            metadata.update_meta(version,
                                 ncbi,
                                 script,
                                 prot=len(molecules),
                                 pathways=num_pathways)
        elif "ChEBI" in dataset:
            molecules = set()
            pathways = set()
            for _, row in data_human.iterrows():
                chebi = str(row['R_PE_name'])
                loc = chebi[chebi.find("[") + 1:chebi.find("]")]
                chebi_name = chebi.split("[" + loc + "]")[0].replace('"', "")
                chebi_id = str(row['db_id'])
                pathway = row['pathway']
                f.write("(AndLink \n")
                f.write(member("ChEBI:" + chebi_id, pathway))
                f.write(eva('l', "ChEBI:" + chebi_id, loc))
                f.write(")\n")
                if chebi_id not in molecules:
                    molecules.add(chebi_id)
                    f.write(eva("n", "ChEBI:" + chebi_id, chebi_name))
                pathways.add(pathway)
            version = "Chebi2reactome_pathway_mapping:latest"
            num_pathways = {"Reactome Pathway": len(pathways)}
            metadata.update_meta(version,
                                 ncbi,
                                 script,
                                 chebi=len(molecules),
                                 pathways=num_pathways)
    print("Done")