def process_feature_matrix(dataset_label, matrix_file, persist_sample_meta, config, annotations, quantileFeatures, results_path, interestingFile):
    global features_hash
    print("processing feature set: matrix file %s annotation file %s" % (matrix_file, annotations))
    out_hash = {}
    features_hash = {}
    summary_hash = {}
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    if (not os.path.isfile(matrix_file)):
        print "ERROR\n" + matrix_file + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    feature_matrix_file = open(matrix_file, "r")
    feature_table = mydb + "." + dataset_label + "_features"
    sample_table = mydb + "." + dataset_label + "_patients"
    fshout = open(results_path + dataset_label + '_load_features.sh', 'w')
    outfile = open(results_path + dataset_label + '_features_out.tsv', 'w')
    alidfile = open(results_path + dataset_label + '_features_alias_id.tsv', 'w')
    sampleOutfile = open(results_path + dataset_label + '_sample_values_out.tsv', 'w')
    featureId = 0
    annotation_hash, ftypes = process_feature_annotations(annotations)
    # One sub-AFM file per requested feature type, used downstream for quantile calculation.
    sub_afm_out = {}
    for q in quantileFeatures.split(","):
        sub_afm_out[q] = open(results_path + dataset_label + '_' + q + '.afm', 'w')
    fIntHash = get_feature_interest_hash(interestingFile)
    for line in feature_matrix_file:
        tokens = line.strip().split('\t')
        afmid = ""
        ftype = ""
        interesting_score = 0
        if (featureId == 0):
            # Header row: the first line of the matrix holds the sample ids.
            sampleIds = tokens
            #not part of core function in RE import pipeline
            #if (persist_sample_meta == 1):
            #    populate_sample_meta(sampleIds.split(":"), config)
            #sampleOutfile.write(sampleIds + "\n");
            featureId += 1
            continue
        if (not features_hash.get(tokens[0]) and len(tokens[0]) > 1):
            valuesArray = []
            alias = tokens[0]
            originalAFM = tokens[0]
            data = alias.split(':')
            if (len(data) < 4):
                # Short aliases must be resolved through the annotation file.
                annotated_feature = annotation_hash.get(alias)
                if (annotated_feature == None):
                    print "ERROR: AFM feature %s is not in annotation" % (alias)
                    sys.exit(-1)
                ftype = annotated_feature.split("\t")[1]
                alias = annotated_feature.replace("\t", ":")
                featureId = int(alias.split(":")[-1])
                data = alias.split(':')
            if (ftype == ""):
                ftype = data[1]
            afmid = alias
            try:
                interesting_score = fIntHash[originalAFM]
            except KeyError:
                interesting_score = 0
            # Route the feature into its sub-AFM file for quantile calculation.
            if (sub_afm_out.get(ftype) != None):
                sub_afm_out[ftype].write(alias + "\t" + "\t".join(tokens[1:]) + "\n")
            features_hash[tokens[0]] = featureId
            if (len(data) <= 7):
                if (data[1] == 'CLIN' or data[1] == 'SAMP'):
                    alias = ":".join(data[0:3]) + ":::::"
                    data = alias.split(':')
                else:
                    data.append("")
            if len(data[3]) > 3:
                # Strip the leading "chr" from the chromosome token.
                data[3] = data[3][3:]
            patient_values = ":".join(tokens[1:])
            for val in tokens[1:]:
                if (db_util.is_numeric(val)):
                    valuesArray.append(float(val))
                else:
                    valuesArray.append(0.0)
            # Make sure the number of patient ids matches the number of values.
            if (featureId == 1):
                start = len(sampleIds) - len(valuesArray)
                sampleStr = ":".join(sampleIds[start:])
                sampleOutfile.write(sampleStr + "\n")
            patient_value_mean = sum(valuesArray) / len(valuesArray)
            accumulate_summary_counts(summary_hash, data[1])
            alidfile.write(originalAFM + "\t" + str(featureId) + "\t" + alias + "\n")
            out_hash[afmid] = (str(featureId) + "\t" + alias + "\t" + "\t".join(data) + "\t" +
                               patient_values + "\t" + str(patient_value_mean) + "\t" + str(interesting_score))
        else:
            print "duplicated feature in feature set:" + tokens[0]
        featureId += 1
    for val in out_hash.values():
        outfile.write(val + "\n")
    # Per-feature-type counts, written as a small JSON summary.
    summary_out = open(results_path + "feature_summary_" + dataset_label + ".json", "w")
    summary_json = "{"
    for feature_type in summary_hash:
        summary_json = summary_json + '"%s":%i,' % (feature_type, summary_hash[feature_type])
    summary_out.write(summary_json[0:-1] + "}\n")
    summary_out.close()
    feature_matrix_file.close()
    outfile.close()
    alidfile.close()
    sampleOutfile.close()
    # Build the MySQL bulk-load script for the feature and sample tables.
    fshout.write("#!/bin/bash\n")
    fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + outfile.name + "' replace INTO TABLE " + feature_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("load data local infile '" + sampleOutfile.name + "' replace INTO TABLE " + sample_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("\ncommit;\nEOFMYSQL")
    fshout.close()
    print "processing done, running bulk load on %i features %s" % (len(features_hash), time.ctime())
    if (persist_sample_meta == 1):
        os.system("sh " + fshout.name)
    return annotation_hash
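# NOTE (assumption, for readers of this loader; not taken from the pipeline's own docs):
# the colon-delimited AFM feature ids parsed above are expected to look roughly like
# TYPE:SOURCE:NAME:CHROMOSOME:START:STOP:STRAND[:EXTRA], e.g. "N:GEXP:TP53:chr17:<start>:<stop>:-".
# data[1] is the feature source (CLIN and SAMP are handled specially above; genomic sources
# carry the full positional form), data[3] is the chromosome with its leading "chr" stripped,
# and data[4] is the start coordinate later used for link distances.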
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda, config, results_path, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
    """
    Include edges only when both nodes are in the original feature set.  Direction does not
    matter, so an edge is not stored for A->B if B->A is already in the hash.
    Expected tab-delimited columns are nodeA nodeB pvalue correlation numNonNA
    """
    edges_hash = {}
    max_pv = -1000.0
    max_pv_corr = -1000.0
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    edges_file = open(pairwised_file)
    fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
    edge_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    efshout = open(results_path + 'load_edges_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh', 'w')
    edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_pw_re.tsv', 'w')
    edges_out_pc = open(results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv', 'w')
    edges_meta_json = open(results_path + 'edges_out_' + dataset_label + '_meta.json', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    # Feature ids written earlier by process_feature_matrix, keyed by feature alias.
    features_file = open(results_path + dataset_label + '_features_out.tsv', 'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()
    validEdgeId = 1
    invalidEdges = 0
    dupeEdges = 0
    totalEdges = 0
    cnan = 0
    pcc = 0
    unMapped = 0
    for line in edges_file:
        totalEdges += 1
        line = line.strip()
        tokens = line.split('\t')
        if (len(tokens) < 11):
            if (validEdgeId == 1):
                print "Skipping header/line 1 for insufficient token reasons"
                continue
            print "ERROR: requires 11 tokens, found:" + str(len(tokens)) + " Skipping line\n" + line
            continue
        nodeA = tokens[0]
        nodeB = tokens[1]
        try:
            f1genescore = fIntHash[nodeA]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[nodeB]
        except KeyError:
            f2genescore = 0
        if (db_util.isUnmappedAssociation(nodeA, nodeB) and keep_unmapped == 0):
            unmappedout.write(nodeA + "\t" + nodeB + "\n")
            unMapped += 1
            continue
        try:
            features_hash[nodeA]
        except KeyError:
            print "key error in resolving featureId for " + nodeA + " skipping edge."
            continue
        try:
            features_hash[nodeB]
        except KeyError:
            print "key error in resolving featureId for " + nodeB + " skipping edge."
            continue
        if (features_hash[nodeA] and features_hash[nodeB]):
            # Direction does not matter: skip the edge if either A_B or B_A is already stored.
            if (not edges_hash.get(nodeA + "_" + nodeB) and not edges_hash.get(nodeB + "_" + nodeA)):
                feature1id = ""
                feature2id = ""
                # This will need to be improved once all pairs have annotations.
                try:
                    feature1id = str(features_hash[nodeA][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeA
                try:
                    feature2id = str(features_hash[nodeB][0])
                except:
                    print "ERROR: key error in resolving featureId for " + nodeB
                edges_hash[nodeA + "_" + nodeB] = validEdgeId
                validEdgeId += 1
                dataA = process_feature_alias(nodeA)
                label1_desc = ""
                dataB = process_feature_alias(nodeB)
                label2_desc = ""
                if (len(dataA) == 7):
                    dataA.append("")
                    nodeA = nodeA + ":"
                if (len(dataB) == 7):
                    dataB.append("")
                    nodeB = nodeB + ":"
                correlation_str = tokens[2]
                try:
                    correlation = float(correlation_str)
                except ValueError:
                    # Align non-numeric correlation values to NaN.
                    cnan += 1
                    correlation = float('nan')
                    correlation_str = ''
                numna = tokens[3]
                pv_str = tokens[4]
                bonf = tokens[5]
                pv_bonf_str = tokens[6]
                numnaf1 = tokens[7]
                pvf1_str = tokens[8]
                numnaf2 = tokens[9]
                pvf2_str = tokens[10]
                try:
                    pv = str(pvlambda(float(pv_str)))
                    pv_bonf = str(pvlambda(float(pv_bonf_str)))
                    pvf1 = str(pvlambda(float(pvf1_str)))
                    pvf2 = str(pvlambda(float(pvf2_str)))
                except ValueError:
                    # Error in the pairwise script output; ignore these associations for now.
                    continue
                if (float(pv) > max_pv):
                    max_pv = float(pv)
                if (float(pv_bonf) > max_pv_corr):
                    max_pv_corr = float(pv_bonf)
                rho = str(db_util.sign(correlation) * abs(float(pv)))
                link_distance = 500000000
                if (len(tokens) >= 12):
                    link_distance = int(tokens[11])
                else:
                    # Same-chromosome features with numeric start coordinates get a real distance.
                    if (len(dataA) >= 5 and len(dataB) >= 5 and db_util.is_numeric(dataA[4]) >= 1 and db_util.is_numeric(dataB[4]) >= 1 and dataA[3] == dataB[3]):
                        link_distance = abs(int(dataB[4]) - int(dataA[4]))
                edges_out_re.write(feature1id + "\t" + feature2id + "\t" + nodeA + "\t" + "\t".join(dataA) + "\t" +
                                   nodeB + "\t" + "\t".join(dataB) + "\t" + correlation_str + "\t" + numna + "\t" +
                                   pv + "\t" + bonf + "\t" + pv_bonf + "\t" + numnaf1 + "\t" + pvf1 + "\t" +
                                   numnaf2 + "\t" + pvf2 + "\t" + rho + "\t" + str(link_distance) + "\t" +
                                   str(f1genescore) + "\t" + str(f2genescore) + "\n")
                if (do_pubcrawl == "yes"):
                    # Hand the line to the PubCrawl helper.
                    getPairwiseInfo.processLine(line, edges_out_pc)
                    pcc += 1
            else:
                print "duplicated edge:" + nodeA + "_" + nodeB
                dupeEdges += 1
        else:
            print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
            invalidEdges += 1
    print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" % (validEdgeId - 1, dupeEdges, cnan, unMapped, unmappedPath, totalEdges, max_pv, max_pv_corr)
    edges_meta_json.write('{"max_logpv":%f}' % (max_pv))
    edges_file.close()
    edges_out_re.close()
    edges_out_pc.close()
    edges_meta_json.close()
    unmappedout.close()
    # Build and run the MySQL bulk-load script for the edge table.
    efshout.write("#!/bin/bash\n")
    efshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    efshout.write("load data local infile '" + edges_out_re.name + "' replace INTO TABLE " + edge_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    efshout.write("\ncommit;")
    efshout.write("\nEOFMYSQL")
    efshout.close()
    print "Begin pairwise db bulk upload " + time.ctime()
    os.system("sh " + efshout.name)
    # Create sharded association files and the Solr import script: clear any previous documents
    # for this dataset from each core, then bulk-load the per-core TSV shards in the background.
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createPWShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core" + str(core) + "_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin pairwise solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == "yes"):
        print "sending Pubcrawl notification to " + contacts
        smtp.main("*****@*****.**", contacts, "Notification - New Pairwise Associations for PubCrawl",
                  "New pairwise associations ready for PubCrawl load\n" + edges_out_pc.name + "\n\n" +
                  str(pcc) + " Total Edges\n\n" + edges_out_re.name +
                  " loaded into RegulomeExplorer, dataset label is " + dataset_label + " \n\n")
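# --- Hedged example (not part of the original pipeline) -------------------------------------
# process_pairwise_edges() and process_associations_rfex() leave the p-value transform to the
# caller via the pvlambda / pv_lambda argument, and the edge metadata is written as
# {"max_logpv": ...}, which suggests a negative-log10 transform.  A minimal sketch of such a
# callable, assuming plain p-values as input:
import math


def example_neg_log10_pvlambda(pvalue):
    """Illustrative -log10 transform for the pvlambda / pv_lambda arguments."""
    # Clamp tiny p-values so math.log10 never sees zero.
    return -math.log10(max(pvalue, 1e-300))
# ---------------------------------------------------------------------------------------------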
def process_associations_rfex(dataset_label, matrixfile, associationsfile, config, annotations, collapse_direction, reverse_direction, results_path, pv_lambda, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    if (not os.path.isfile(associationsfile)):
        print associationsfile + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    associations_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    print "Begin processing associations %s Applying processing_pubcrawl %s" % (time.ctime(), do_pubcrawl)
    fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
    edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv', 'w')
    associations_in = open(associationsfile, 'r')
    annotation_hash, ftype = parse_features_rfex.process_feature_annotations(annotations)
    fshout = open(results_path + 'load_sql_associations_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_assocations_' + dataset_label + '.sh', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_rface_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    # Feature ids and alias-id mappings written earlier by process_feature_matrix.
    features_file = open(results_path + dataset_label + '_features_out.tsv', 'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()
    aliasid_file = open(results_path + dataset_label + '_features_alias_id.tsv', 'r')
    aliasid_hash = {}
    for fl in aliasid_file.readlines():
        ftk = fl.strip().split("\t")
        aliasid_hash[ftk[0]] = ftk
    aliasid_file.close()
    # Note: tsvout reopens the same path as edges_out_re; tsvout is the handle actually written to.
    tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv', 'w')
    pubcrawl_tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_pc.tsv', 'w')
    lc = 0
    edgeCount = 0
    pcc = 0
    unMapped = 0
    pvalueCutCount = 0
    impCut = 0
    lines = associations_in.readlines()
    associations_in.close()
    associations_dic = {}
    for line in lines:
        lc = lc + 1
        columns = line.strip().split('\t')
        if (len(columns) < 6):
            # columns[0..5] are required below (target, predictor, pvalue, importance, correlation, patient count).
            print "Missing required tokens in associations lineIndex %i lineValue %s" % (lc, line)
            continue
        f1alias = columns[0]
        # afm_ids will be used for directionality collapsing, if needed
        f1afm_id = columns[0]
        f2afm_id = columns[1]
        if (len(f1alias.split(":")) < 3):
            annotated_feature = annotation_hash.get(f1alias)
            if (annotated_feature == None):
                print "ERROR: Target feature %s is not in afm/annotation %i" % (f1alias, len(annotation_hash))
                continue
            f1alias = annotated_feature.replace("\t", ":")
        f2alias = columns[1]
        if (len(f2alias.split(":")) < 3):
            annotated_feature = annotation_hash.get(f2alias)
            if (annotated_feature == None):
                print "ERROR: Predictor feature %s is not in afm/annotation" % (f2alias)
                continue
            f2alias = annotated_feature.replace("\t", ":")
        try:
            f1genescore = fIntHash[f1alias]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[f2alias]
        except KeyError:
            f2genescore = 0
        f1data = f1alias.split(':')
        f2data = f2alias.split(':')
        if len(f1data) > 4:
            f1data[3] = f1data[3][3:]
        if len(f2data) > 4:
            f2data[3] = f2data[3][3:]
        if (len(f1data) <= 7 and (f1data[1] == 'CLIN' or f1data[1] == 'SAMP')):
            f1alias = ":".join(f1data[0:3]) + ":::::"
            f1data = f1alias.split(':')
        elif (len(f1data) == 7):
            f1data.append("")
        if (len(f2data) <= 7 and (f2data[1] == 'CLIN' or f2data[1] == 'SAMP')):
            f2alias = ":".join(f2data[0:3]) + ":::::"
            f2data = f2alias.split(':')
        elif (len(f2data) == 7):
            f2data.append("")
        f1aliasOmic = f1alias
        f2aliasOmic = f2alias
        # Resolve feature ids, falling back to the alias-id mapping for annotated features.
        try:
            f1id = features_hash[f1alias][0]
        except KeyError:
            try:
                f1id = aliasid_hash[f1alias][1]
                f1aliasOmic = aliasid_hash[f1alias][2]
                f1data = f1aliasOmic.split(':')
                f1data[3] = f1data[3][3:]
            except KeyError:
                print "Skipping Key error with alias1 " + f1alias
                continue
        try:
            f2id = features_hash[f2alias][0]
        except KeyError:
            try:
                f2id = aliasid_hash[f2alias][1]
                f2aliasOmic = aliasid_hash[f2alias][2]
                f2data = f2aliasOmic.split(':')
                f2data[3] = f2data[3][3:]
            except KeyError:
                print "Skipping Key error with alias2 " + f2alias
                continue
        pvalue = float(columns[2])
        pvalue = str(pv_lambda(pvalue))
        importance = columns[3]
        correlation = columns[4]
        patientct = columns[5]
        if (db_util.isUnmappedAssociation(f1alias, f2alias) and keep_unmapped == 0):
            unmappedout.write(f1alias + "\t" + f2alias + "\n")
            unMapped += 1
            continue
        rhoscore = ""
        link_distance = -1
        if (len(f1data) >= 5 and len(f2data) >= 5 and db_util.is_numeric(f1data[4]) >= 1 and db_util.is_numeric(f2data[4]) >= 1 and f1data[3] == f2data[3]):
            link_distance = abs(int(f2data[4]) - int(f1data[4]))
        association_row = (f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" +
                           correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" +
                           f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) +
                           "\t" + rhoscore + "\t" + str(link_distance) + "\n")
        if (collapse_direction == 0):
            associations_dic[f1afm_id + "_" + f2afm_id] = association_row
        else:
            # Check whether f1 -> f2 or f2 -> f1 already exists; if so keep the more important
            # link, otherwise store the pair.
            if ((associations_dic.get(f1afm_id + "_" + f2afm_id) == None) and (associations_dic.get(f2afm_id + "_" + f1afm_id) == None)):
                associations_dic[f1afm_id + "_" + f2afm_id] = association_row
            else:
                existingLink = associations_dic.get(f1afm_id + "_" + f2afm_id)
                ekey = f1afm_id + "_" + f2afm_id
                if (existingLink == None):
                    existingLink = associations_dic.get(f2afm_id + "_" + f1afm_id)
                    ekey = f2afm_id + "_" + f1afm_id
                prevImportance = existingLink.split("\t")[3]
                if (float(importance) > float(prevImportance)):
                    associations_dic[ekey] = association_row
        if (reverse_direction == 1):
            associations_dic[f2afm_id + "_" + f1afm_id] = (f2aliasOmic + "\t" + f1aliasOmic + "\t" + pvalue + "\t" +
                                                           importance + "\t" + correlation + "\t" + patientct + "\t" +
                                                           f2id + "\t" + "\t".join(f2data) + "\t" + f1id + "\t" +
                                                           "\t".join(f1data) + "\t" + str(f2genescore) + "\t" +
                                                           str(f1genescore) + "\t" + rhoscore + "\t" +
                                                           str(link_distance) + "\n")
            edgeCount = edgeCount + 1
        edgeCount = edgeCount + 1
        if (do_pubcrawl == "yes"):
            getRFACEInfo.processLine(line, pubcrawl_tsvout)
            pcc += 1
    for ei in associations_dic:
        tsvout.write(associations_dic[ei])
    # Build and run the MySQL bulk-load script for the association table.
    fshout.write("#!/bin/bash\n")
    fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + tsvout.name + "' replace INTO TABLE " + associations_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';")
    fshout.write("\nEOFMYSQL\n")
    tsvout.close()
    unmappedout.close()
    pubcrawl_tsvout.close()
    fshout.close()
    print "\nReport: ValidEdges %i ImportanceCutoff %i edges filtered %i \nunMapped Edges %i Saved to %s" % (len(associations_dic), impCut, pvalueCutCount, unMapped, unmappedPath)
    print "Begin RF-ACE db bulk upload %s os.system sh %s" % (time.ctime(), fshout.name)
    os.system("sh " + fshout.name)
    # Create sharded association files and the Solr import script, mirroring the pairwise loader.
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createRFShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core" + str(core) + "_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin rface solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == 'yes'):
        smtp.main("*****@*****.**", contacts, "Notification - New RFAce " + dataset_label + " Associations for PubCrawl",
                  "New RFAce associations ready for PubCrawl load\n" + pubcrawl_tsvout.name + "\n" + str(pcc) +
                  " Total Edges\n" + tsvout.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + "\n\n")
    print "Done processing associations %s" % (time.ctime())
    associations_dic = None