db="TESTING") cursor = conn_obj.cursor() cursor.execute("SELECT NAME FROM CSA_catalyticCofactors") print cursor.fetchall() cursor.execute("SHOW TABLES FROM TESTING") for table in cursor.fetchall(): print list(table) cursor.execute("SHOW FIELDS FROM CSA_catalyticCofactors") print cursor.fetchall() cursor.execute( "SELECT column_name FROM information_schema.columns WHERE table_name='CSA_catalyticCofactors'" ) print cursor.fetchall() cursor.execute( "SELECT PDBID, CHAIN,NUMBER, UPNUMBER , TYPE FROM CSA_CATALYTICRESIDUES") RESULTS = [list(X) for X in cursor.fetchall()] print len(RESULTS) RESULTS = unique(RESULTS) print RESULTS print len(RESULTS) RESULTS_NULL = [X for X in RESULTS if X[3] == None] print len(RESULTS_NULL), RESULTS_NULL
#CANCER_NETWORK_MAPPING_2 - DONE IN NETWORK WITH PRE-JULY DATA
#Check how many cancer genes are actually present in the network
import pickle
from FUNCTIONS import UNIPROT_TO_GENES_CLASS, unique

#NOTE: every multi-value print below is pre-formatted into one string so the
#output is identical whether print is a statement (Python 2) or a function.

#Load CANCER_to_Gene dict; the context manager closes the file even on error
with open("DATABASES/OBJECTS/DICT_CANCER_TO_GENES.pi") as FILE_IN1:
    CANCER_DICT = pickle.load(FILE_IN1)

#Get all cancer genes: pool the per-cancer lists, then pass through unique()
#(extend() avoids re-copying the growing list on every iteration)
ALL_CANCER_GENES = []
for genes in CANCER_DICT.values():
    ALL_CANCER_GENES.extend(genes)
ALL_CANCER_GENES = unique(ALL_CANCER_GENES)
print(len(ALL_CANCER_GENES))

#Load NETWORK UNIPROTS (one entry per line); previously this handle was
#never closed
with open("NETWORK/ENDOGENOUS_UNIPROT") as FILE_IN2:
    UNIPROTS = FILE_IN2.read().splitlines()

#Run class
UNIPROT_CLASS = UNIPROT_TO_GENES_CLASS(UNIPROTS)
NETWORK_GENES = UNIPROT_CLASS.get_genes()
NETWORK_UNIPROT_GENES = UNIPROT_CLASS.uniprot_gene()
print("%s %s" % ("uniprot in network has match",
                 len(UNIPROT_CLASS.has_genes_uniprots())))
#Count matches directly instead of building a throwaway list
print("%s %s" % ("total cancer genes in network",
                 sum(1 for gene in ALL_CANCER_GENES if gene in NETWORK_GENES)))

#Check per cancer and make UNIPROT_DICT: one empty list per cancer type
CANCER_TO_UNIPROT_DICT = dict((cancer, []) for cancer in CANCER_DICT)
#Split the zero-in-degree nodes of METABO_EN into those with outgoing edges
#(appended to the pre-existing METABOLITES list) and fully isolated ones.
METABOLITES_ZERO = []
for metabolite in METABO_EN.nodes():
    if METABO_EN.in_degree(metabolite) != 0:
        continue
    if METABO_EN.out_degree(metabolite) > 0:
        METABOLITES.append(metabolite)
    elif METABO_EN.out_degree(metabolite) == 0:
        METABOLITES_ZERO.append(metabolite)

#Multi-value prints are pre-formatted into one string so the output is the
#same whether print is a statement or a function.
print("%s %s" % ("METABOLITES:", len(METABOLITES)))
print(len(METABOLITES) + len(METABOLITES_ZERO))
print(METABOLITES[:20])

#Get proteins only: every node that has at least one incoming edge
PROTEINS = [protein for protein in METABO_EN.nodes()
            if METABO_EN.in_degree(protein) > 0]
print("%s %s" % ("PROTEINS:", len(PROTEINS)))
print("%s %s" % ("UNIQUE PROTEINS:", len(unique(PROTEINS))))
print(PROTEINS[:5])

#DRAW NETWORK
plt.figure(figsize=(25, 12))
pos = NX.spectral_layout(METABO_EN)
#Metabolites: large red nodes
NX.draw_networkx_nodes(
    METABO_EN,
    pos,
    nodelist=METABOLITES,
    node_color="r",
    node_size=100,
)
#METABOLITES_ZERO is deliberately not drawn (zeroes excluded altogether, for now)
#Proteins: small green nodes
NX.draw_networkx_nodes(
    METABO_EN,
    pos,
    nodelist=PROTEINS,
    node_color="g",
    node_size=10,
)
NX.draw_networkx_edges(METABO_EN, pos, width=0.2)
plt.show()
#Separate cancer genes by cancer type
import pickle
import subprocess
from FUNCTIONS import unique

#Get genes in stats: one file name per line of `ls` output
GENES_STATS = subprocess.check_output(
    "ls", cwd="DATABASES/CANCER_DATA/geneStats").splitlines()

#CANCER TYPES
CANCER_DICT = {
    "LUSC": [],
    "READ": [],
    "GBM": [],
    "KIRC": [],
    "UCEC": [],
    "OV": [],
    "BRCA": [],
    "COAD": [],
}

#Assign to CANCER TYPES
for gene in GENES_STATS:
    #Context manager closes each stats file even if reading fails
    with open("DATABASES/CANCER_DATA/geneStats/%s" % gene) as FILE_IN:
        RECORDS = FILE_IN.read().splitlines()
    #Gene name is the file name up to the first "."
    gene_name = gene.split(".")[0]
    for record in RECORDS:
        #The 5th whitespace-separated field selects the cancer type; an
        #unknown type still raises KeyError, as before. append() replaces the
        #old "list + [item]" rebuild, which copied the list on every record.
        CANCER_DICT[record.split()[4].strip()].append(gene_name)

for cancer in CANCER_DICT:
    CANCER_DICT[cancer] = unique(CANCER_DICT[cancer])
    #Single pre-formatted string: same output under print statement/function
    print("%s %s" % (cancer, CANCER_DICT[cancer]))

#Store as object. The handle used to be left open, so the pickle was only
#flushed to disk whenever the interpreter happened to release it.
with open("DATABASES/OBJECTS/DICT_CANCER_TO_GENES.pi", "w") as PICKLE_OUT:
    pickle.dump(CANCER_DICT, PICKLE_OUT)

for cancer in CANCER_DICT:
    print("%s %s" % (cancer, len(CANCER_DICT[cancer])))
metabolite) == 0 and METABO_EN.out_degree(metabolite) > 0: METABOLITES.append(metabolite) elif METABO_EN.in_degree(metabolite) == 0 and METABO_EN.out_degree( metabolite) == 0: METABOLITES_ZERO.append(metabolite) print "METABOLITES:", len(METABOLITES) print len(METABOLITES) + len(METABOLITES_ZERO) print METABOLITES[:20] #Get proteins only PROTEINS = [] for protein in METABO_EN.nodes(): if METABO_EN.in_degree(protein) > 0: PROTEINS.append(protein) print "PROTEINS:", len(PROTEINS) print "UNIQUE PROTEINS:", len(unique(PROTEINS)) print PROTEINS[:5] #DRAW NETWORK plt.figure(figsize=(25, 12)) pos = NX.spectral_layout(METABO_EN) NX.draw_networkx_nodes(METABO_EN, pos, nodelist=METABOLITES, node_color="r", node_size=100) #NX.draw_networkx_nodes(METABO_EN, pos, nodelist=METABOLITES_ZERO, node_color="b", node_size=100) #EXCLUDE ZEROES ALTOGETHER #FOR NOW NX.draw_networkx_nodes(METABO_EN, pos,
#Map the first "|"-separated field of every record to its second field
#(presumably UNIPROT -> GENE, going by the variable name - confirm)
UNIPROT_GENE_DICT = {}
for cancer in CANCER_DICT.values():
    for record in cancer:
        fields = record.split("|")
        UNIPROT_GENE_DICT[fields[0]] = fields[1]

#BREAK DOWN NODES BY CANCER
#Build the OV identifier set once; the original rebuilt this list for every
#protein, making the filter O(len(PROTEINS) * len(CANCER_DICT["OV"])).
_OV_IDS = set(entry.split("|")[0] for entry in CANCER_DICT["OV"])
OV_PROTEINS = [node for node in PROTEINS if node.split("|")[1] in _OV_IDS]
#Single pre-formatted string: same output under print statement/function
print("%s %s" % ("OV_PROTEINS", len(OV_PROTEINS)))

#Collect the neighbours of every OV protein, then pass through unique()
OV_METABOLITES = []
for protein in OV_PROTEINS:
    OV_METABOLITES.extend(METABO_EN.neighbors(protein))
OV_METABOLITES = unique(OV_METABOLITES)
print("%s %s" % ("OV_METABOLITES", len(OV_METABOLITES)))

OV_EDGES = unique(METABO_EN.edges(OV_PROTEINS))

#Label each protein node via UNIPROT_GENE_DICT, keyed on its second "|" field
OV_LABELS_PROTEINS = {}
for protein in OV_PROTEINS:
    OV_LABELS_PROTEINS[protein] = UNIPROT_GENE_DICT[protein.split("|")[1]]
#Metabolite nodes are labelled with their own identifier
OV_LABELS_METABOLITES = dict(
    (metabolite, metabolite) for metabolite in OV_METABOLITES)

#DRAW NETWORK
plt.figure(figsize=(25, 12))
pos = NX.spring_layout(METABO_EN)
#GET ALL ENDOGENOUS UNIPROTS FROM NETWORK
from FUNCTIONS import unique

#Read the adjacency file; the original open(...).read() never closed the handle
with open("NETWORK/ENDOGENOUS.ADJ") as adjacency_file:
    FILE_IN = adjacency_file.read().splitlines()

ENDOGENOUS_UNIPROT = []
for record in FILE_IN:
    fields = record.split()
    #Only lines with at least one entry after the first column contribute
    if len(fields) > 1:
        #Keep the second "|"-separated field of every entry after the first
        #column; extend() avoids re-copying the growing list for each line
        ENDOGENOUS_UNIPROT.extend(
            entry.split("|")[1].strip() for entry in fields[1:])
#Single pre-formatted string: same output under print statement/function
print("%s %s" % (len(ENDOGENOUS_UNIPROT), ENDOGENOUS_UNIPROT))

ENDOGENOUS_UNIPROT = unique(ENDOGENOUS_UNIPROT)
print(len(ENDOGENOUS_UNIPROT))

#Write one identifier per line; the file is closed even if a write fails
with open("NETWORK/ENDOGENOUS_UNIPROT", "w") as FILE_OUT:
    for record in ENDOGENOUS_UNIPROT:
        FILE_OUT.write(record + "\n")
#Create connection object that represents database
#NOTE(review): credentials are hard-coded here; consider loading them from
#configuration or the environment instead.
conn_obj = MQ.connect(host="localhost", user="******", passwd="mysql",
                      db="TESTING")
cursor = conn_obj.cursor()

#Dump the cofactor names
cursor.execute("SELECT NAME FROM CSA_catalyticCofactors")
print(cursor.fetchall())

#List every table in the TESTING database, one row per line
cursor.execute("SHOW TABLES FROM TESTING")
for table in cursor.fetchall():
    print(list(table))

#Show the cofactor table's columns in two ways
cursor.execute("SHOW FIELDS FROM CSA_catalyticCofactors")
print(cursor.fetchall())
cursor.execute("SELECT column_name FROM information_schema.columns WHERE table_name='CSA_catalyticCofactors'")
print(cursor.fetchall())

#Fetch the catalytic-residue rows as lists and pass them through unique()
cursor.execute("SELECT PDBID, CHAIN,NUMBER, UPNUMBER , TYPE FROM CSA_CATALYTICRESIDUES")
RESULTS = [list(row) for row in cursor.fetchall()]
print(len(RESULTS))
RESULTS = unique(RESULTS)
print(RESULTS)
print(len(RESULTS))

#Rows whose UPNUMBER (4th selected column) is SQL NULL; identity comparison
#is the correct test for None
RESULTS_NULL = [row for row in RESULTS if row[3] is None]
#Single pre-formatted string: same output under print statement/function
print("%s %s" % (len(RESULTS_NULL), RESULTS_NULL))
#GET ALL ENDOGENOUS UNIPROTS FROM NETWORK
from FUNCTIONS import unique

#Read the adjacency file; the original open(...).read() never closed the handle
with open("NETWORK/ENDOGENOUS.ADJ") as adjacency_file:
    FILE_IN = adjacency_file.read().splitlines()

ENDOGENOUS_UNIPROT = []
for record in FILE_IN:
    fields = record.split()
    #Only lines with at least one entry after the first column contribute
    if len(fields) > 1:
        #Keep the second "|"-separated field of every entry after the first
        #column; extend() avoids re-copying the growing list for each line
        ENDOGENOUS_UNIPROT.extend(
            entry.split("|")[1].strip() for entry in fields[1:])
#Single pre-formatted string: same output under print statement/function
print("%s %s" % (len(ENDOGENOUS_UNIPROT), ENDOGENOUS_UNIPROT))

ENDOGENOUS_UNIPROT = unique(ENDOGENOUS_UNIPROT)
print(len(ENDOGENOUS_UNIPROT))

#Write one identifier per line; the file is closed even if a write fails
with open("NETWORK/ENDOGENOUS_UNIPROT", "w") as FILE_OUT:
    for record in ENDOGENOUS_UNIPROT:
        FILE_OUT.write(record + "\n")
#Export the out-degree of every node in CLEAN_TC_LIST (one value per line)
#and gather the second "|" field of each of its neighbours.
HUBS_90_DEGREE = []
HUBS_90_PROTEINS = []
TO_R2 = open("/Users/jzamalloa/Desktop/FOLDER/LAB/ECLIPSE_TO_R/HUBS_90_DEGREE", "w")
for metabolite in CLEAN_TC_LIST:
    degree = METABO_NT.out_degree(metabolite)
    HUBS_90_DEGREE.append(degree)
    TO_R2.write(str(degree) + "\n")
    PROTEINS = METABO_NT.neighbors(metabolite)
    HUBS_90_PROTEINS.extend(prot.split("|")[1] for prot in PROTEINS)
TO_R2.close()

#Sizes before de-duplication, plus a small sample
print(len(HUBS_90_DEGREE))
print(len(HUBS_90_PROTEINS))
print(HUBS_90_PROTEINS[:5])
HUBS_90_PROTEINS = unique(HUBS_90_PROTEINS)
print(len(HUBS_90_PROTEINS))

#Plot degree distribution
plt.hist(HUBS_90_DEGREE)
plt.show()

#Get gene names out of these proteins, use uniprot_sprot.dat
#Each entry is stored as "<protein>$<gene>"
HUBS_90_GENES = []
for protein in HUBS_90_PROTEINS:
    PROT_GENE = UNIPROT_GENE_PLUS(protein)
    HUBS_90_GENES.extend(protein + "$" + gene for gene in PROT_GENE)
print(HUBS_90_GENES[0:5])
print(len(HUBS_90_GENES))
#FILE_IN1 is opened before this point. NOTE(review): pickle.load must only be
#used on trusted files.
CANCER_DICT = pickle.load(FILE_IN1)

#Make {UNIPROT:GENE} dictionary from the "UNIPROT|GENE" style records
UNIPROT_GENE_DICT = {}
for cancer in CANCER_DICT.values():
    for record in cancer:
        fields = record.split("|")
        UNIPROT_GENE_DICT[fields[0]] = fields[1]

#BREAK DOWN NODES BY CANCER
#Build the OV identifier set once; the original rebuilt this list for every
#protein, making the filter O(len(PROTEINS) * len(CANCER_DICT["OV"])).
_OV_IDS = set(entry.split("|")[0] for entry in CANCER_DICT["OV"])
OV_PROTEINS = [node for node in PROTEINS if node.split("|")[1] in _OV_IDS]
#Single pre-formatted string: same output under print statement/function
print("%s %s" % ("OV_PROTEINS", len(OV_PROTEINS)))

#Collect the neighbours of every OV protein, then pass through unique()
OV_METABOLITES = []
for protein in OV_PROTEINS:
    OV_METABOLITES.extend(METABO_EN.neighbors(protein))
OV_METABOLITES = unique(OV_METABOLITES)
print("%s %s" % ("OV_METABOLITES", len(OV_METABOLITES)))

OV_EDGES = unique(METABO_EN.edges(OV_PROTEINS))

#Label each protein node with the gene mapped from its second "|" field
OV_LABELS_PROTEINS = {}
for protein in OV_PROTEINS:
    OV_LABELS_PROTEINS[protein] = UNIPROT_GENE_DICT[protein.split("|")[1]]
#Metabolite nodes are labelled with their own identifier
OV_LABELS_METABOLITES = dict(
    (metabolite, metabolite) for metabolite in OV_METABOLITES)

#DRAW NETWORK
plt.figure(figsize=(25, 12))
pos = NX.spring_layout(METABO_EN)
#CANCER_NETWORK_MAPPING_2 - DONE IN NETWORK WITH PRE-JULY DATA
#Check how many cancer genes are actually present in the network
import pickle
from FUNCTIONS import UNIPROT_TO_GENES_CLASS, unique

#NOTE: every multi-value print below is pre-formatted into one string so the
#output is identical whether print is a statement (Python 2) or a function.

#Load CANCER_to_Gene dict; the context manager closes the file even on error
with open("DATABASES/OBJECTS/DICT_CANCER_TO_GENES.pi") as FILE_IN1:
    CANCER_DICT = pickle.load(FILE_IN1)

#Get all cancer genes: pool the per-cancer lists, then pass through unique()
#(extend() avoids re-copying the growing list on every iteration)
ALL_CANCER_GENES = []
for genes in CANCER_DICT.values():
    ALL_CANCER_GENES.extend(genes)
ALL_CANCER_GENES = unique(ALL_CANCER_GENES)
print(len(ALL_CANCER_GENES))

#Load NETWORK UNIPROTS (one entry per line); previously this handle was
#never closed
with open("NETWORK/ENDOGENOUS_UNIPROT") as FILE_IN2:
    UNIPROTS = FILE_IN2.read().splitlines()

#Run class
UNIPROT_CLASS = UNIPROT_TO_GENES_CLASS(UNIPROTS)
NETWORK_GENES = UNIPROT_CLASS.get_genes()
NETWORK_UNIPROT_GENES = UNIPROT_CLASS.uniprot_gene()
print("%s %s" % ("uniprot in network has match",
                 len(UNIPROT_CLASS.has_genes_uniprots())))
#Count matches directly instead of building a throwaway list
print("%s %s" % ("total cancer genes in network",
                 sum(1 for gene in ALL_CANCER_GENES if gene in NETWORK_GENES)))

#Check per cancer and make UNIPROT_DICT
"LUSC": [], "READ": [], "GBM": [], "KIRC": [], "UCEC": [], "OV": [], "BRCA": [], "COAD": [] } #Assign to CANCER TYPES for gene in GENES_STATS: FILE_IN = open("DATABASES/CANCER_DATA/geneStats/%s" % gene) RECORDS = FILE_IN.read().splitlines() FILE_IN.close() for record in RECORDS: CANCER_DICT[record.split()[4].strip( )] = CANCER_DICT[record.split()[4].strip()] + [gene.split(".")[0]] for cancer in CANCER_DICT.iterkeys(): CANCER_DICT[cancer] = unique(CANCER_DICT[cancer]) print cancer, CANCER_DICT[cancer] #Store as object PICKLE_OUT = open("DATABASES/OBJECTS/DICT_CANCER_TO_GENES.pi", "w") pickle.dump(CANCER_DICT, PICKLE_OUT) for cancer in CANCER_DICT.iterkeys(): print cancer, len(CANCER_DICT[cancer])