def dropviews(label):
    # Note: despite the name, this drops the per-dataset pathways table.
    # `config`, `db_util` and `os` are expected to be available at module scope.
    tlines = "DROP TABLE IF EXISTS #REPLACE#_feature_pathways"
    ds_lines = tlines.replace("#REPLACE#", label)
    print ds_lines
    cmd = "mysql -h %s --port %s -u%s -p%s -e \"%s\" %s" % (
        db_util.getDBHost(config), db_util.getDBPort(config),
        db_util.getDBUser(config), db_util.getDBPassword(config),
        ds_lines, db_util.getDBSchema(config))
    os.system(cmd)
def executeHide(schemafile, config):
    schemafile_path = schemafile.name
    cmd = "mysql -h %s --port %s -u%s -p%s < %s" % (
        db_util.getDBHost(config), db_util.getDBPort(config),
        db_util.getDBUser(config), db_util.getDBPassword(config),
        schemafile_path)
    os.system(cmd)
def fix(label):
    # Rebuilds the categorical-labels view for CLIN/SAMP/PRDM features.
    # `config` is expected to be available at module scope.
    tlines = """DROP VIEW IF EXISTS v_#REPLACE#_feature_categorical_labels;
CREATE VIEW v_#REPLACE#_feature_categorical_labels as
  SELECT DISTINCT label, alias, source, interesting_score from #REPLACE#_features where source = 'CLIN'
  UNION
  SELECT DISTINCT label, alias, source, interesting_score from #REPLACE#_features where source = 'SAMP'
  UNION
  SELECT DISTINCT label, alias, source, interesting_score from #REPLACE#_features where source = 'PRDM';
"""
    ds_lines = tlines.replace("#REPLACE#", label)
    print ds_lines
    cmd = "mysql -h %s --port %s -u%s -p%s -e \"%s\" %s" % (
        db_util.getDBHost(config), db_util.getDBPort(config),
        db_util.getDBUser(config), db_util.getDBPassword(config),
        ds_lines, db_util.getDBSchema(config))
    os.system(cmd)
#!/usr/bin/python
"""
Process features from the input data matrix and the gexp "interesting" list.
Sample values and identifiers are also extracted.
The required dataset label selects the set of schemas to load into.
"""
import sys
import os
import time
import math
import decimal
import db_util

mydb = db_util.getDBSchema()      # config.get("mysql_jdbc_configs", "db")
myuser = db_util.getDBUser()      # config.get("mysql_jdbc_configs", "username")
mypw = db_util.getDBPassword()    # config.get("mysql_jdbc_configs", "password")
myhost = db_util.getDBHost()
myport = db_util.getDBPort()

features_hash = {}
edges_hash = {}
gene_interesting_hash = {}
dataset_label = ""
load_desc = ""
feature_table = ""
sample_table = ""
feature_table_label = "tcga.gbm_pw"


def is_numeric(val):
    # True if val can be parsed as a float, False otherwise.
    try:
        float(val)
        return True
    except ValueError:
        return False
#!/usr/bin/python
import db_util
import math
import sys
import os
import time

myhost = db_util.getDBHost()      # config.get("mysql_jdbc_configs", "host")
myport = db_util.getDBPort()
mydb = db_util.getDBSchema()      # config.get("mysql_jdbc_configs", "db")
myuser = db_util.getDBUser()      # config.get("mysql_jdbc_configs", "username")
mypw = db_util.getDBPassword()    # config.get("mysql_jdbc_configs", "password")

print "Parsing samples/patients and values kicked off %s" % time.ctime()
if (len(sys.argv) != 3):
    print 'Usage is py2.6 parse_sample_values.py data_associations.csv dataset_label'
    sys.exit(1)

infile = open(sys.argv[1])
dataset_label = sys.argv[2]
schema = mydb
feature_table = schema + "." + dataset_label + "_features"
samples_table = schema + "." + dataset_label + "_patients"

feature_helper = {}
feature_helper["IMP2"] = "C:GENO:IMP2:chrIX:53981:55021:-"
feature_helper["AAP1"] = "C:GENO:AAP1:chrVIII:198740:201310:-"

features_hash = {}
fshout = open('load_strain_values_' + dataset_label + '.sh', 'w')
outfile = open('./strain_values_out_' + dataset_label + '.sql', 'w')
#fshout = open('loadKruglyakValuesSql_pv.sh','w')
def executeSchema(schemafile_path, config):
    cmd = "mysql -h %s --port %s -u%s -p%s < %s" % (
        db_util.getDBHost(config), db_util.getDBPort(config),
        db_util.getDBUser(config), db_util.getDBPassword(config),
        schemafile_path)
    print "Running system call %s" % (cmd)
    os.system(cmd)
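# The execute* helpers above interpolate the DB password into a shell string
# passed to os.system. A minimal alternative sketch (not part of the original
# pipeline; the helper name below is hypothetical, only the db_util getters
# are assumed) runs the mysql client via subprocess with the schema file on
# stdin, so no shell and no "< file" redirection are needed.
import subprocess

import db_util


def execute_schema_subprocess(schemafile_path, config):
    # Build the argument vector; mysql reads the SQL from stdin.
    cmd = [
        "mysql",
        "-h", db_util.getDBHost(config),
        "--port", str(db_util.getDBPort(config)),
        "-u" + db_util.getDBUser(config),
        "-p" + db_util.getDBPassword(config),
    ]
    with open(schemafile_path) as schema_in:
        subprocess.check_call(cmd, stdin=schema_in)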
def process_feature_matrix(dataset_label, matrix_file, persist_sample_meta, config,
                           annotations, quantileFeatures, results_path, interestingFile):
    global features_hash
    print("processing feature set: matrix file %s annotation file %s" % (matrix_file, annotations))
    out_hash = {}
    features_hash = {}
    summary_hash = {}
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    if (not os.path.isfile(matrix_file)):
        print "ERROR\n" + matrix_file + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    feature_matrix_file = open(matrix_file, "r")
    feature_table = mydb + "." + dataset_label + "_features"
    sample_table = mydb + "." + dataset_label + "_patients"
    fshout = open(results_path + dataset_label + '_load_features.sh', 'w')
    outfile = open(results_path + dataset_label + '_features_out.tsv', 'w')
    alidfile = open(results_path + dataset_label + '_features_alias_id.tsv', 'w')
    sampleOutfile = open(results_path + dataset_label + '_sample_values_out.tsv', 'w')
    featureId = 0
    annotation_hash, ftypes = process_feature_annotations(annotations)
    sub_afm_out = {}
    for q in quantileFeatures.split(","):
        sub_afm_out[q] = open(results_path + dataset_label + '_' + q + '.afm', 'w')
    fIntHash = get_feature_interest_hash(interestingFile)
    for line in feature_matrix_file:
        tokens = line.strip().split('\t')
        afmid = ""
        ftype = ""
        interesting_score = 0
        if (featureId == 0):
            # header row: sample/patient identifiers
            sampleIds = tokens
            #not part of core function in RE import pipeline
            #if (persist_sample_meta == 1):
            #    populate_sample_meta(sampleIds.split(":"), config)
            #sampleOutfile.write(sampleIds + "\n")
            featureId += 1
            continue
        if (not features_hash.get(tokens[0]) and len(tokens[0]) > 1):
            valuesArray = []
            alias = tokens[0]
            originalAFM = tokens[0]
            data = alias.split(':')
            if (len(data) < 4):
                #afmid = alias
                annotated_feature = annotation_hash.get(alias)
                if (annotated_feature == None):
                    print "ERROR: AFM feature %s is not in annotation" % (alias)
                    sys.exit(-1)
                #hasAnnotations = True
                #put features in sub afm files for quantile calculation
                ftype = annotated_feature.split("\t")[1]
                alias = annotated_feature.replace("\t", ":")
                featureId = int(alias.split(":")[-1])
                data = alias.split(':')
            if (ftype == ""):
                ftype = data[1]
            afmid = alias
            #if (fIntHash[alias] != None):
            try:
                interesting_score = fIntHash[originalAFM]
            except KeyError:
                #print "Key error with fInterestingHash " + alias
                interesting_score = 0
            if (sub_afm_out.get(ftype) != None):
                sub_afm_out[ftype].write(alias + "\t" + "\t".join(tokens[1:]) + "\n")
            features_hash[tokens[0]] = featureId
            if (len(data) <= 7):
                if (data[1] == 'CLIN' or data[1] == 'SAMP'):
                    alias = ":".join(data[0:3]) + ":::::"
                    data = alias.split(':')
                else:
                    data.append("")
            if len(data[3]) > 3:
                data[3] = data[3][3:]
            patient_values = ":".join(tokens[1:])
            for val in tokens[1:]:
                if (db_util.is_numeric(val)):
                    valuesArray.append(float(val))
                else:
                    valuesArray.append(0.0)
            #make sure that the number of patient ids matches the values
            if (featureId == 1):
                start = len(sampleIds) - len(valuesArray)
                sampleStr = ":".join(sampleIds[start:])
                sampleOutfile.write(sampleStr + "\n")
            patient_value_mean = sum(valuesArray) / len(valuesArray)
            accumulate_summary_counts(summary_hash, data[1])
            alidfile.write(originalAFM + "\t" + str(featureId) + "\t" + alias + "\n")
            out_hash[afmid] = str(featureId) + "\t" + alias + "\t" + "\t".join(data) + "\t" + patient_values + "\t" + str(patient_value_mean) + "\t" + str(interesting_score)
        else:
            print "duplicated feature in feature set:" + tokens[0]
        featureId += 1
    for val in out_hash.values():
        outfile.write(val + "\n")
    summary_out = open(results_path + "feature_summary_" + dataset_label + ".json", "w")
    summary_json = "{"
    for feature_type in summary_hash:
        summary_json = summary_json + '"%s":%i,' % (feature_type, summary_hash[feature_type])
    summary_out.write(summary_json[0:-1] + "}\n")
    summary_out.close()
    feature_matrix_file.close()
    outfile.close()
    alidfile.close()
    sampleOutfile.close()
    fshout.write("#!/bin/bash\n")
    fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + outfile.name + "' replace INTO TABLE " + feature_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("load data local infile '" + sampleOutfile.name + "' replace INTO TABLE " + sample_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("\ncommit;\nEOFMYSQL")
    fshout.close()
    print "processing done, running bulk load on %i features %s" % (len(features_hash), time.ctime())
    if (persist_sample_meta == 1):
        os.system("sh " + fshout.name)
    return annotation_hash
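# The alias handling above assumes the AFM-style feature identifier used by
# Regulome Explorer, roughly type:source:label:chr:start:end:strand with an
# optional trailing annotation field. Illustrative trace with a made-up alias
# (coordinates are not real) showing the slicing the loader performs:
alias = "N:GEXP:TP53:chr17:7565097:7590856:-"
data = alias.split(':')          # [type, source, label, chr, start, end, strand]
if (len(data) <= 7):
    if (data[1] == 'CLIN' or data[1] == 'SAMP'):
        alias = ":".join(data[0:3]) + ":::::"    # clinical/sample features carry no coordinates
        data = alias.split(':')
    else:
        data.append("")                          # pad the optional trailing field
if len(data[3]) > 3:
    data[3] = data[3][3:]                        # strip the "chr" prefix
print data    # ['N', 'GEXP', 'TP53', '17', '7565097', '7590856', '-', '']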
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda,
                           config, results_path, do_pubcrawl, contacts,
                           keep_unmapped, featureInterestingFile):
    """
    Include only edges whose nodes are in the original feature set. Direction
    does not matter, so an edge A->B is not added when B->A is already in the
    hash. Expected tab-delimited columns (11 required, optional 12th is the
    link distance): nodeA, nodeB, correlation, numNonNA, pvalue, bonf,
    pvalue_bonf, numNonNA_f1, pvalue_f1, numNonNA_f2, pvalue_f2.
    """
    edges_hash = {}
    max_pv = -1000.0
    max_pv_corr = -1000.0
    mydb = db_util.getDBSchema(config)      #config.get("mysql_jdbc_configs", "db")
    myuser = db_util.getDBUser(config)      #config.get("mysql_jdbc_configs", "username")
    mypw = db_util.getDBPassword(config)    #config.get("mysql_jdbc_configs", "password")
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    edges_file = open(pairwised_file)
    fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
    edge_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    efshout = open(results_path + 'load_edges_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh', 'w')
    edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_pw_re.tsv', 'w')
    edges_out_pc = open(results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv', 'w')
    edges_meta_json = open(results_path + 'edges_out_' + dataset_label + '_meta.json', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv', 'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()
    validEdgeId = 1
    invalidEdges = 0
    dupeEdges = 0
    totalEdges = 0
    cnan = 0
    pcc = 0
    unMapped = 0
    for line in edges_file:
        totalEdges += 1
        line = line.strip()
        tokens = line.split('\t')
        if (len(tokens) < 11):
            if (validEdgeId == 1):
                print "Skipping header/line 1 for insufficient token reasons"
                continue
            print "ERROR: requires 11 tokens, found:" + str(len(tokens)) + " Skipping line\n" + line
            continue
        nodeA = tokens[0]
        nodeB = tokens[1]
        try:
            f1genescore = fIntHash[nodeA]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[nodeB]
        except KeyError:
            f2genescore = 0
        if (db_util.isUnmappedAssociation(nodeA, nodeB) and keep_unmapped == 0):
            unmappedout.write(nodeA + "\t" + nodeB + "\n")
            unMapped += 1
            continue
        #nodeA = nodeA.replace('|', '_')
        #nodeB = nodeB.replace('|', '_')
        try:
            features_hash[nodeA]
        except KeyError:
            print "key error in resolving featureId for " + nodeA + " skipping edge."
            continue
        try:
            features_hash[nodeB]
        except KeyError:
            print "key error in resolving featureId for " + nodeB + " skipping edge."
            continue
        if (features_hash[nodeA] and features_hash[nodeB]):
            # check both orientations so A->B is skipped when B->A was already stored
            if (not edges_hash.get(nodeA + "_" + nodeB) and not edges_hash.get(nodeB + "_" + nodeA)):
                feature1id = ""  #str(features_hash[nodeA])
                feature2id = ""  #str(features_hash[nodeB])
                #This will need to be improved once all pairs have annotations
                try:
                    feature1id = str(features_hash[nodeA][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeA
                try:
                    feature2id = str(features_hash[nodeB][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeB
                edges_hash[nodeA + "_" + nodeB] = validEdgeId
                validEdgeId += 1
                dataA = process_feature_alias(nodeA)
                label1_desc = ""
                dataB = process_feature_alias(nodeB)
                label2_desc = ""
                if (len(dataA) == 7):
                    dataA.append("")
                    nodeA = nodeA + ":"
                if (len(dataB) == 7):
                    dataB.append("")
                    nodeB = nodeB + ":"
                correlation_str = tokens[2]
                try:
                    correlation = float(correlation_str)
                except ValueError:
                    #Align correlation value to NaN
                    cnan += 1
                    correlation = float('nan')
                    correlation_str = ''
                numna = tokens[3]
                pv_str = tokens[4]
                bonf = tokens[5]
                pv_bonf_str = tokens[6]
                numnaf1 = tokens[7]
                pvf1_str = tokens[8]
                numnaf2 = tokens[9]
                pvf2_str = tokens[10]
                try:
                    pv = str(pvlambda(float(pv_str)))
                    pv_bonf = str(pvlambda(float(pv_bonf_str)))
                    pvf1 = str(pvlambda(float(pvf1_str)))
                    pvf2 = str(pvlambda(float(pvf2_str)))
                except ValueError:
                    #error in pairwise script, ignore these associations for now
                    continue
                if (float(pv) > max_pv):
                    max_pv = float(pv)
                if (float(pv_bonf) > max_pv_corr):
                    max_pv_corr = float(pv_bonf)
                rho = str(db_util.sign(correlation) * abs(float(pv)))
                link_distance = 500000000
                if (len(tokens) >= 12):
                    link_distance = int(tokens[11])
                else:
                    if (len(dataA) >= 5 and len(dataB) >= 5 and db_util.is_numeric(dataA[4]) >= 1 and db_util.is_numeric(dataB[4]) >= 1 and dataA[3] == dataB[3]):
                        link_distance = abs(int(dataB[4]) - int(dataA[4]))
                edges_out_re.write(feature1id + "\t" + feature2id + "\t" + nodeA + "\t" + "\t".join(dataA) + "\t" + nodeB + "\t" + "\t".join(dataB) + "\t" + correlation_str + "\t" + numna + "\t" + pv + "\t" + bonf + "\t" + pv_bonf + "\t" + numnaf1 + "\t" + pvf1 + "\t" + numnaf2 + "\t" + pvf2 + "\t" + rho + "\t" + str(link_distance) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\n")
                if (do_pubcrawl == "yes"):
                    #call andrea code
                    getPairwiseInfo.processLine(line, edges_out_pc)
                    pcc += 1
            else:
                print "duplicated edge:" + nodeA + "_" + nodeB
                dupeEdges += 1
        else:
            print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
            invalidEdges += 1
    print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" % (validEdgeId - 1, dupeEdges, cnan, unMapped, unmappedPath, totalEdges, max_pv, max_pv_corr)
    edges_meta_json.write('{"max_logpv":%f}' % (max_pv))
    edges_file.close()
    edges_out_re.close()
    edges_out_pc.close()
    edges_meta_json.close()
    unmappedout.close()
    efshout.write("#!/bin/bash\n")
    efshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    efshout.write("load data local infile '" + edges_out_re.name + "' replace INTO TABLE " + edge_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    efshout.write("\ncommit;")
    efshout.write("\nEOFMYSQL")
    efshout.close()
    print "Begin pairwise db bulk upload " + time.ctime()
    os.system("sh " + efshout.name)
    #create sharded association files for solr import
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createPWShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
    # one Solr core per shard: clear any previous rows for this dataset, then
    # bulk-load the sharded TSV files produced by createPWShardedDataset.py
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) +
                        "/update/?commit=true' -H 'Content-type:text/xml' --data-binary "
                        "'<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) +
                        "/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' "
                        "--data-binary @" + edges_out_re.name + "_core" + str(core) +
                        "_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin pairwise solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == "yes"):
        print "sending Pubcrawl notification to " + contacts
        smtp.main("*****@*****.**", contacts,
                  "Notification - New Pairwise Associations for PubCrawl",
                  "New pairwise associations ready for PubCrawl load\n" + edges_out_pc.name +
                  "\n\n" + str(pcc) + " Total Edges\n\n" + edges_out_re.name +
                  " loaded into RegulomeExplorer, dataset label is " + dataset_label + " \n\n")
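# Because edge direction is irrelevant here, the duplicate check above has to
# probe both "A_B" and "B_A". A small illustrative sketch (not part of the
# pipeline; aliases are made up) of a direction-agnostic key that makes the
# double lookup unnecessary:
def edge_key(node_a, node_b):
    # sort the pair so A->B and B->A produce the same key
    return tuple(sorted((node_a, node_b)))

edges_hash = {}
key = edge_key("N:GEXP:TP53", "N:METH:MDM2")
if (not edges_hash.get(key)):
    edges_hash[key] = len(edges_hash) + 1   # next valid edge id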
def executeDrop(schemafile, config):
    schemafile_path = schemafile.name
    cmd = "mysql -h %s --port %s -u%s -p%s < %s" % (
        db_util.getDBHost(config), db_util.getDBPort(config),
        db_util.getDBUser(config), db_util.getDBPassword(config),
        schemafile_path)
    os.system(cmd)
def createMysqlDumps(self):
    print "Creating MySQL dumps"
    config = self.config
    for dslabel in self.labels:
        print "Creating MySQL dumps for labelname '%s'" % (dslabel)
        dump_edges = getDumpEdgesFile(config, dslabel)
        dump_nodes = getDumpNodesFile(config, dslabel)
        dump_patients = getDumpPatientsFile(config, dslabel)
        table_feat = dslabel + "_features"
        table_edge = "mv_" + dslabel + "_feature_networks"
        table_patient = dslabel + "_patients"
        mydb = db_util.getDBSchema(config)
        myuser = db_util.getDBUser(config)
        mypw = db_util.getDBPassword(config)
        myhost = db_util.getDBHost(config)
        myport = str(db_util.getDBPort(config))
        # Get nodes
        if not os.path.isfile(dump_nodes):
            nodeColumnTypes = dict()
            columns = [
                "alias", "type", "source", "label", "chr",
                "start", "end", "strand", "label_desc",
                "patient_values", "patient_values_mean", "quantile_val",
                "quantile", "gene_interesting_score",
            ]
            cursor = db_util.getCursor(config)
            for column in columns:
                cursor.execute(
                    "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'"
                    % (table_feat, column))
                result = cursor.fetchone()[0]  # tuple -> [0]
                nodeColumnTypes[column] = getDataType(result)
            cursor.close()
            # pvalue AS pvalue__varchar => dump header columns look like pvalue__varchar
            os.system(
                'mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw '
                '-e " SELECT %s FROM %s;" > %s'
                % (myhost, myport, myuser, mypw, mydb,
                   ", ".join(["%s AS %s__%s" % (key, key, value)
                              for (key, value) in nodeColumnTypes.items()]),
                   table_feat, dump_nodes))
        else:
            print "MySQL dump file at %s exists, skipping." % (dump_nodes)
        # Get edges
        if not os.path.isfile(dump_edges):
            columns = [
                "pvalue", "importance", "correlation", "patientct",
                "alias1", "alias2", "f1chr", "f1start", "f1end",
                "f2chr", "f2start", "f2end",
            ]
            edgeColumnTypes = dict()
            cursor = db_util.getCursor(config)
            for column in columns:
                cursor.execute(
                    "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'"
                    % (table_edge, column))
                result = cursor.fetchone()[0]  # tuple -> [0]
                edgeColumnTypes[column] = getDataType(result)
            cursor.close()
            os.system(
                'mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw '
                '-e "select %s FROM %s;" > %s'
                % (myhost, myport, myuser, mypw, mydb,
                   ", ".join(["%s AS %s__%s" % (key, key, value)
                              for (key, value) in edgeColumnTypes.items()]),
                   table_edge, dump_edges))
        else:
            print "MySQL dump file at %s exists, skipping." % (dump_edges)
        # Get patient barcodes
        if not os.path.isfile(dump_patients):
            columns = ["barcode"]
            patientColumnTypes = dict()
            cursor = db_util.getCursor(config)
            for column in columns:
                cursor.execute(
                    "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'"
                    % (table_patient, column))
                result = cursor.fetchone()[0]  # tuple -> [0]
                patientColumnTypes[column] = getDataType(result)
            cursor.close()
            os.system(
                'mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw '
                '-e "select %s FROM %s;" > %s'
                % (myhost, myport, myuser, mypw, mydb,
                   ", ".join(["%s AS %s__%s" % (key, key, value)
                              for (key, value) in patientColumnTypes.items()]),
                   table_patient, dump_patients))
        else:
            print "MySQL dump file at %s exists, skipping." % (dump_patients)
    print "Finished creating MySQL dumps"
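# getDataType() is defined elsewhere; the dumps above only need it to turn a
# MySQL DATA_TYPE into the "__suffix" appended to each dump header column.
# A hypothetical stand-in sketch (the real helper's grouping may differ):
def get_data_type_sketch(mysql_data_type):
    numeric_types = ("int", "bigint", "smallint", "tinyint", "decimal", "double", "float")
    if mysql_data_type.lower() in numeric_types:
        return "float"
    return "varchar"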
def process_associations_rfex(dataset_label, matrixfile, associationsfile, config,
                              annotations, collapse_direction, reverse_direction,
                              results_path, pv_lambda, do_pubcrawl, contacts,
                              keep_unmapped, featureInterestingFile):
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    if (not os.path.isfile(associationsfile)):
        print associationsfile + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    associations_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    print "Begin processing associations %s Applying processing_pubcrawl %s" % (time.ctime(), do_pubcrawl)
    fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
    # edges_out_re only supplies the file name handed to the Solr loader below;
    # tsvout (opened on the same path) is what actually receives the rows.
    edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv', 'w')
    associations_in = open(associationsfile, 'r')
    annotation_hash, ftype = parse_features_rfex.process_feature_annotations(annotations)
    fshout = open(results_path + 'load_sql_associations_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_assocations_' + dataset_label + '.sh', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_rface_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv', 'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()
    aliasid_file = open(results_path + dataset_label + '_features_alias_id.tsv', 'r')
    aliasid_hash = {}
    for fl in aliasid_file.readlines():
        ftk = fl.strip().split("\t")
        aliasid_hash[ftk[0]] = ftk
    aliasid_file.close()
    tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv', 'w')
    pubcrawl_tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_pc.tsv', 'w')
    lc = 0
    edgeCount = 0
    pcc = 0
    unMapped = 0
    pvalueCutCount = 0
    impCut = 0
    lines = associations_in.readlines()
    associations_in.close()
    associations_dic = {}
    for line in lines:
        lc = lc + 1
        columns = line.strip().split('\t')
        if (len(columns) < 5):
            print "Missing required tokens in associations lineIndex %i lineValue %s" % (lc, line)
            continue
        f1alias = columns[0]
        #afm_ids will be used for directionality collapsing, if needed
        f1afm_id = columns[0]
        f2afm_id = columns[1]
        if (len(f1alias.split(":")) < 3):
            annotated_feature = annotation_hash.get(f1alias)
            if (annotated_feature == None):
                print "ERROR: Target feature %s is not in afm/annotation %i" % (f1alias, len(annotation_hash))
                continue
            f1alias = annotated_feature.replace("\t", ":")
        f2alias = columns[1]
        if (len(f2alias.split(":")) < 3):
            annotated_feature = annotation_hash.get(f2alias)
            if (annotated_feature == None):
                print "ERROR: Predictor feature %s is not in afm/annotation" % (f2alias)
                continue
            f2alias = annotated_feature.replace("\t", ":")
        try:
            f1genescore = fIntHash[f1alias]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[f2alias]
        except KeyError:
            f2genescore = 0
        f1data = f1alias.split(':')
        f2data = f2alias.split(':')
        if len(f1data) > 4:
            f1data[3] = f1data[3][3:]
        if len(f2data) > 4:
            f2data[3] = f2data[3][3:]
        if (len(f1data) <= 7 and (f1data[1] == 'CLIN' or f1data[1] == 'SAMP')):
            f1alias = ":".join(f1data[0:3]) + ":::::"
            f1data = f1alias.split(':')
        elif (len(f1data) == 7):
            f1data.append("")
        if (len(f2data) <= 7 and (f2data[1] == 'CLIN' or f2data[1] == 'SAMP')):
            f2alias = ":".join(f2data[0:3]) + ":::::"
            f2data = f2alias.split(':')
        elif (len(f2data) == 7):
            f2data.append("")
        f1aliasOmic = f1alias
        f2aliasOmic = f2alias
        #for annotations
        try:
            f1id = features_hash[f1alias][0]
        except KeyError:
            try:
                f1id = aliasid_hash[f1alias][1]
                f1aliasOmic = aliasid_hash[f1alias][2]
                f1data = f1aliasOmic.split(':')
                f1data[3] = f1data[3][3:]
            except KeyError:
                print "Skipping Key error with alias1 " + f1alias
                continue
        try:
            f2id = features_hash[f2alias][0]  #f2alias.split(":")[-1]
        except KeyError:
            try:
                f2id = aliasid_hash[f2alias][1]
                f2aliasOmic = aliasid_hash[f2alias][2]
                f2data = f2aliasOmic.split(':')
                f2data[3] = f2data[3][3:]
            except KeyError:
                print "Skipping Key error with alias2 " + f2alias
                continue
        pvalue = float(columns[2])
        pvalue = str(pv_lambda(pvalue))
        importance = columns[3]
        correlation = columns[4]
        patientct = columns[5]
        if (db_util.isUnmappedAssociation(f1alias, f2alias) and keep_unmapped == 0):
            unmappedout.write(f1alias + "\t" + f2alias + "\n")
            unMapped += 1
            continue
        rhoscore = ""
        link_distance = -1
        if (len(f1data) >= 5 and len(f2data) >= 5 and db_util.is_numeric(f1data[4]) >= 1 and db_util.is_numeric(f2data[4]) >= 1 and f1data[3] == f2data[3]):
            link_distance = abs(int(f2data[4]) - int(f1data[4]))
        forward_row = (f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n")
        if (collapse_direction == 0):
            associations_dic[f1afm_id + "_" + f2afm_id] = forward_row
        else:
            #check whether (f1 -> f2 or f2 -> f1) exists; if yes, keep the more important
            #if not, store the pair
            if ((associations_dic.get(f1afm_id + "_" + f2afm_id) == None) and (associations_dic.get(f2afm_id + "_" + f1afm_id) == None)):
                associations_dic[f1afm_id + "_" + f2afm_id] = forward_row
            else:
                existingLink = associations_dic.get(f1afm_id + "_" + f2afm_id)
                ekey = f1afm_id + "_" + f2afm_id
                if (existingLink == None):
                    existingLink = associations_dic.get(f2afm_id + "_" + f1afm_id)
                    ekey = f2afm_id + "_" + f1afm_id
                prevImportance = existingLink.split("\t")[3]
                if (float(importance) > float(prevImportance)):
                    associations_dic[ekey] = forward_row
        if (reverse_direction == 1):
            associations_dic[f2afm_id + "_" + f1afm_id] = (f2aliasOmic + "\t" + f1aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + str(f2genescore) + "\t" + str(f1genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n")
            edgeCount = edgeCount + 1
        edgeCount = edgeCount + 1
        if (do_pubcrawl == "yes"):
            getRFACEInfo.processLine(line, pubcrawl_tsvout)
            pcc += 1
    for ei in associations_dic:
        tsvout.write(associations_dic[ei])
    fshout.write("#!/bin/bash\n")
    fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + tsvout.name + "' replace INTO TABLE " + associations_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';")
    fshout.write("\nEOFMYSQL\n")
    tsvout.close()
    unmappedout.close()
    pubcrawl_tsvout.close()
    fshout.close()
    print "\nReport: ValidEdges %i ImportanceCutoff %i edges filtered %i \nunMapped Edges %i Saved to %s" % (len(associations_dic), impCut, pvalueCutCount, unMapped, unmappedPath)
    print "Begin RF-ACE db bulk upload %s os.system sh %s" % (time.ctime(), fshout.name)
    os.system("sh " + fshout.name)
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createRFShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
    # one Solr core per shard: clear previous rows for this dataset, then load the sharded TSVs
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) +
                        "/update/?commit=true' -H 'Content-type:text/xml' --data-binary "
                        "'<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) +
                        "/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' "
                        "--data-binary @" + edges_out_re.name + "_core" + str(core) +
                        "_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin rface solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == 'yes'):
        smtp.main("*****@*****.**", contacts,
                  "Notification - New RFAce " + dataset_label + " Associations for PubCrawl",
                  "New RFAce associations ready for PubCrawl load\n" + pubcrawl_tsvout.name +
                  "\n" + str(pcc) + " Total Edges\n" + tsvout.name +
                  " loaded into RegulomeExplorer, dataset label is " + dataset_label + "\n\n")
    print "Done processing associations %s" % (time.ctime())
    associations_dic = None
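# Both process_pairwise_edges (pvlambda) and process_associations_rfex
# (pv_lambda) treat the p-value transform as a caller-supplied callable that
# takes and returns a float; the actual transform is defined by the caller and
# is not shown in this file. Given the "max_logpv" metadata written earlier, a
# hypothetical example of a compatible transform is a capped negative log10:
import math

def neg_log10_pvalue(pvalue, cap=300.0):
    # map a raw p-value to -log10(p), capping so p == 0 stays finite
    if pvalue <= 0.0:
        return cap
    return min(-math.log10(pvalue), cap)

# e.g. process_associations_rfex(..., pv_lambda=neg_log10_pvalue, ...)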