Example 1
def dropviews(label):

    tlines="DROP TABLE IF EXISTS #REPLACE#_feature_pathways"
    ds_lines = tlines.replace("#REPLACE#", label)
    print ds_lines
    cmd = "mysql -h %s --port %s -u%s -p%s -e \"%s\" %s" %(db_util.getDBHost(config), db_util.getDBPort(config), db_util.getDBUser(config), db_util.getDBPassword(config), ds_lines, db_util.getDBSchema(config))
    os.system(cmd)
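A minimal usage sketch for the example above, assuming `config` has already been loaded at module scope (dropviews reads it as a global); the dataset label is hypothetical:

# Hypothetical invocation; "tcga_gbm" stands in for a real dataset label.
dropviews("tcga_gbm")
# The template expands to the single statement passed to `mysql -e`:
#   DROP TABLE IF EXISTS tcga_gbm_feature_pathways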
Example 2
def executeHide(schemafile, config):
    schemafile_path = schemafile.name
    cmd = "mysql -h %s --port %s -u%s -p%s < %s" % (
        db_util.getDBHost(config), db_util.getDBPort(config),
        db_util.getDBUser(config), db_util.getDBPassword(config),
        schemafile_path)
    os.system(cmd)
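A usage sketch, assuming a hypothetical schema file on disk and a `config` object that db_util can read connection settings from; executeHide only uses the file's .name attribute:

# Hypothetical file name; the schema file is streamed into the mysql client.
with open("hide_views.sql") as schemafile:
    executeHide(schemafile, config)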
Example 3
def fix(label):
    
    tlines="""DROP VIEW IF EXISTS v_#REPLACE#_feature_categorical_labels;
    CREATE VIEW v_#REPLACE#_feature_categorical_labels as
    SELECT DISTINCT label, alias, source, interesting_score from #REPLACE#_features where source = 'CLIN'
    UNION SELECT DISTINCT label, alias, source, interesting_score from #REPLACE#_features where source = 'SAMP'
    UNION SELECT DISTINCT label, alias, source, interesting_score from #REPLACE#_features where source = 'PRDM';
    """
    ds_lines = tlines.replace("#REPLACE#", label)
    print ds_lines
    cmd = "mysql -h %s --port %s -u%s -p%s -e \"%s\" %s" %(db_util.getDBHost(config), db_util.getDBPort(config), db_util.getDBUser(config), db_util.getDBPassword(config), ds_lines, db_util.getDBSchema(config))
    os.system(cmd)
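For a hypothetical label such as tcga_gbm, the template above expands to essentially the following statements before being sent through `mysql -e`:

DROP VIEW IF EXISTS v_tcga_gbm_feature_categorical_labels;
CREATE VIEW v_tcga_gbm_feature_categorical_labels as
SELECT DISTINCT label, alias, source, interesting_score from tcga_gbm_features where source = 'CLIN'
UNION SELECT DISTINCT label, alias, source, interesting_score from tcga_gbm_features where source = 'SAMP'
UNION SELECT DISTINCT label, alias, source, interesting_score from tcga_gbm_features where source = 'PRDM';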
#!/usr/bin/python
"""
Process features from the input data_matrix and the gexp "interesting" file. Sample values and identifiers are also extracted.
The required dataset label indicates the set of schemas.
"""
import sys
import os
import time
import math
import decimal
import db_util

mydb = db_util.getDBSchema()  #config.get("mysql_jdbc_configs", "db")
myuser = db_util.getDBUser()  #config.get("mysql_jdbc_configs", "username")
mypw = db_util.getDBPassword()  #config.get("mysql_jdbc_configs", "password")
myhost = db_util.getDBHost()
myport = db_util.getDBPort()

features_hash = {}
edges_hash = {}
gene_interesting_hash = {}
dataset_label = ""
load_desc = ""
feature_table = ""
sample_table = ""
feature_table_label = "tcga.gbm_pw"


def is_numeric(val):
    try:
        float(val)
        return 1
    except ValueError:
        return 0
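The body of is_numeric is cut off in the source listing; the completion shown above (return 1/0 after attempting float conversion) is an assumption based on how db_util.is_numeric is used elsewhere in these examples, e.g.:

# Quick sanity check of the assumed behavior.
assert is_numeric("3.14")
assert not is_numeric("chrIX")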
#!/usr/bin/python
import db_util
import math
import sys
import os
import time

myhost = db_util.getDBHost()  #config.get("mysql_jdbc_configs", "host")
myport = db_util.getDBPort()
mydb = db_util.getDBSchema()  #config.get("mysql_jdbc_configs", "db")
myuser = db_util.getDBUser()  #config.get("mysql_jdbc_configs", "username")
mypw = db_util.getDBPassword()  #config.get("mysql_jdbc_configs", "password")

print "Parsing samples/patients and values kicked off %s" % time.ctime()
if (len(sys.argv) != 3):
    print 'Usage is py2.6 parse_sample_values.py data_associations.csv dataset_label'
    sys.exit(1)

infile = open(sys.argv[1])
dataset_label = sys.argv[2]
schema = mydb
feature_table = schema + "." + dataset_label + "_features"
samples_table = schema + "." + dataset_label + "_patients"

feature_helper = {}
feature_helper["IMP2"] = "C:GENO:IMP2:chrIX:53981:55021:-"
feature_helper["AAP1"] = "C:GENO:AAP1:chrVIII:198740:201310:-"
features_hash = {}
fshout = open('load_strain_values_' + dataset_label + '.sh', 'w')
outfile = open('./strain_values_out_' + dataset_label + '.sql', 'w')
#fshout = open('loadKruglyakValuesSql_pv.sh','w')
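For illustration, with a hypothetical invocation `py2.6 parse_sample_values.py data_associations.csv tcga_gbm` and a schema named `regulome`, the names built above resolve to:

# feature_table  -> "regulome.tcga_gbm_features"
# samples_table  -> "regulome.tcga_gbm_patients"
# fshout         -> load_strain_values_tcga_gbm.sh
# outfile        -> ./strain_values_out_tcga_gbm.sql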
Example 6
def executeSchema(schemafile_path, config):
	cmd = "mysql -h %s --port %s -u%s -p%s < %s" %(db_util.getDBHost(config), db_util.getDBPort(config), db_util.getDBUser(config), db_util.getDBPassword(config), schemafile_path)
	print "Running system call %s" %(cmd)
	os.system(cmd)	
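A usage sketch with a hypothetical schema path; unlike executeHide above, this function takes the path itself rather than a file object:

executeSchema("./schemas/tcga_gbm_schema.sql", config)
# runs: mysql -h <host> --port <port> -u<user> -p<password> < ./schemas/tcga_gbm_schema.sql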
def process_feature_matrix(dataset_label, matrix_file, persist_sample_meta,
                           config, annotations, quantileFeatures, results_path,
                           interestingFile):
    global features_hash
    print("processing feature set: matrix file %s annotation file %s" %
          (matrix_file, annotations))
    out_hash = {}
    features_hash = {}
    summary_hash = {}
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    if (not os.path.isfile(matrix_file)):
        print "ERROR\n" + matrix_file + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    feature_matrix_file = open(matrix_file, "r")
    feature_table = mydb + "." + dataset_label + "_features"
    sample_table = mydb + "." + dataset_label + "_patients"
    fshout = open(results_path + dataset_label + '_load_features.sh', 'w')
    outfile = open(results_path + dataset_label + '_features_out.tsv', 'w')
    alidfile = open(results_path + dataset_label + '_features_alias_id.tsv',
                    'w')
    sampleOutfile = open(
        results_path + dataset_label + '_sample_values_out.tsv', 'w')
    featureId = 0
    annotation_hash, ftypes = process_feature_annotations(annotations)
    sub_afm_out = {}
    for q in quantileFeatures.split(","):
        sub_afm_out[q] = open(results_path + dataset_label + '_' + q + '.afm',
                              'w')
    fIntHash = get_feature_interest_hash(interestingFile)
    for line in feature_matrix_file:
        tokens = line.strip().split('\t')
        afmid = ""
        ftype = ""
        interesting_score = 0
        if (featureId == 0):
            sampleIds = tokens
            #not part of core function in RE import pipeline
            #if (persist_sample_meta == 1):
            #	populate_sample_meta(sampleIds.split(":"), config)
            #sampleOutfile.write(sampleIds + "\n");
            featureId += 1
            continue
        if (not features_hash.get(tokens[0]) and len(tokens[0]) > 1):
            valuesArray = []
            alias = tokens[0]
            originalAFM = tokens[0]
            data = alias.split(':')
            if (len(data) < 4):
                #afmid = alias
                annotated_feature = annotation_hash.get(alias)
                if (annotated_feature == None):
                    print "ERROR: AFM feature %s is not in annotation" % (
                        alias)
                    sys.exit(-1)
                #hasAnnotations = True
                #put features in sub afm files for quantile calculation
                ftype = annotated_feature.split("\t")[1]
                alias = annotated_feature.replace("\t", ":")
                featureId = int(alias.split(":")[-1])
            data = alias.split(':')
            if (ftype == ""):
                ftype = data[1]
            afmid = alias
            #if (fIntHash[alias] != None):
            try:
                interesting_score = fIntHash[originalAFM]
            except KeyError:
                #print "Key error with fInterestingHash " + alias
                interesting_score = 0
            if (sub_afm_out.get(ftype) != None):
                sub_afm_out[ftype].write(alias + "\t" + "\t".join(tokens[1:]) +
                                         "\n")
            features_hash[tokens[0]] = featureId
            if (len(data) <= 7):
                if (data[1] == 'CLIN' or data[1] == 'SAMP'):
                    alias = ":".join(data[0:3]) + ":::::"
                    data = alias.split(':')
                else:
                    data.append("")
            if len(data[3]) > 3:
                data[3] = data[3][3:]
            patient_values = ":".join(tokens[1:])
            for val in tokens[1:]:
                if (db_util.is_numeric(val)):
                    valuesArray.append(float(val))
                else:
                    valuesArray.append(0.0)
            #make sure the number of patient ids matches the number of values
            if (featureId == 1):
                start = len(sampleIds) - len(valuesArray)
                sampleStr = ":".join(sampleIds[start:])
                sampleOutfile.write(sampleStr + "\n")

            patient_value_mean = sum(valuesArray) / len(valuesArray)
            accumulate_summary_counts(summary_hash, data[1])
            alidfile.write(originalAFM + "\t" + str(featureId) + "\t" + alias +
                           "\n")
            out_hash[afmid] = str(featureId) + "\t" + alias + "\t" + "\t".join(
                data) + "\t" + patient_values + "\t" + str(
                    patient_value_mean) + "\t" + str(interesting_score)
        else:
            print "duplicated feature in feature set:" + tokens[0]
        featureId += 1
    for val in out_hash.values():
        outfile.write(val + "\n")

    summary_out = open(
        results_path + "feature_summary_" + dataset_label + ".json", "w")
    summary_json = "{"

    for feature_type in summary_hash:
        summary_json = summary_json + '"%s":%i,' % (feature_type,
                                                    summary_hash[feature_type])
    summary_out.write(summary_json[0:-1] + "}\n")
    summary_out.close()
    feature_matrix_file.close()
    outfile.close()
    alidfile.close()
    sampleOutfile.close()
    fshout.write("#!/bin/bash\n")
    fshout.write(
        "mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n"
        % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + outfile.name +
                 "' replace INTO TABLE " + feature_table +
                 " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("load data local infile '" + sampleOutfile.name +
                 "' replace INTO TABLE " + sample_table +
                 " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("\ncommit;\nEOFMYSQL")
    fshout.close()
    print "processing done, running bulk load on %i features  %s" % (
        len(features_hash), time.ctime())
    if (persist_sample_meta == 1):
        os.system("sh " + fshout.name)
    return annotation_hash
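A sketch of the load script this function writes (one per dataset label). The angle-bracket placeholders stand in for the connection values db_util supplies, and tcga_gbm / ./results/ are hypothetical label and results_path values:

#!/bin/bash
mysql -h <host> --port <port> --user=<user> --password=<password> --database=<schema><<EOFMYSQL
load data local infile './results/tcga_gbm_features_out.tsv' replace INTO TABLE <schema>.tcga_gbm_features fields terminated by '\t' LINES TERMINATED BY '\n';
load data local infile './results/tcga_gbm_sample_values_out.tsv' replace INTO TABLE <schema>.tcga_gbm_patients fields terminated by '\t' LINES TERMINATED BY '\n';

commit;
EOFMYSQL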
Example 8
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda, config, results_path, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
	"""
	Include edges whose nodes are in the original feature set. Direction does not matter, so an edge A->B is not added when B->A is already in the hash.
	Expected tab-delimited columns are nodeA nodeB pvalue correlation numNonNA
	"""
	edges_hash = {}
	max_pv = -1000.0
	max_pv_corr = -1000.0
	mydb = db_util.getDBSchema(config) #config.get("mysql_jdbc_configs", "db")
	myuser = db_util.getDBUser(config) #config.get("mysql_jdbc_configs", "username")
	mypw = db_util.getDBPassword(config) #config.get("mysql_jdbc_configs", "password")
	myhost = db_util.getDBHost(config)
	myport = db_util.getDBPort(config)
	mysolr = db_util.getSolrPath(config)
	edges_file = open(pairwised_file)
	fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
	edge_table = mydb + ".mv_" + dataset_label + "_feature_networks" 
	efshout = open(results_path + 'load_edges_' + dataset_label + '.sh','w')
	solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh','w')
	edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_pw_re.tsv','w')
	edges_out_pc = open(results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv','w')
	edges_meta_json = open(results_path + 'edges_out_' + dataset_label + '_meta.json','w')
	unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
	unmappedout = open(unmappedPath,'w')
	features_file = open(results_path + dataset_label + '_features_out.tsv','r')
	features_hash = {}
	for fl in features_file.readlines():
		ftk = fl.strip().split("\t")
		features_hash[ftk[1]] = ftk
	features_file.close()

	validEdgeId = 1
	invalidEdges = 0
	dupeEdges = 0
	totalEdges = 0
	cnan = 0
	pcc = 0
	unMapped = 0
	for line in edges_file:
		totalEdges += 1 
		line = line.strip()
		tokens = line.split('\t')
		if (len(tokens) < 11):
			if (validEdgeId == 1):
				print "Skipping header/line 1 for insufficient token reasons"
				continue
			print "ERROR: requires 11 tokens, found:" + str(len(tokens)) + " Skipping line\n" + line
			continue
		nodeA = tokens[0]
		nodeB = tokens[1]

		try:
			f1genescore = fIntHash[nodeA]
		except KeyError:
			f1genescore = 0
		try:
			f2genescore = fIntHash[nodeB]
		except KeyError:
			f2genescore = 0

		if (db_util.isUnmappedAssociation(nodeA, nodeB) and keep_unmapped == 0):
			unmappedout.write(nodeA + "\t" + nodeB + "\n")
			unMapped += 1
			continue
		#nodeA = nodeA.replace('|', '_')
		#nodeB = nodeB.replace('|', '_')
		try:
			features_hash[nodeA]
		except KeyError:
			print "key error in resolving featureId for " + nodeA + " skipping edge."
			continue
		try:
			features_hash[nodeB]
		except KeyError:
			print "key error in resolving featureId for " + nodeB + " skipping edge."
			continue

		if (features_hash[nodeA] and features_hash[nodeB]):
			if (not edges_hash.get(nodeA + "_" + nodeB) and not edges_hash.get(nodeB + "_" + nodeA)):
				feature1id = ""#str(features_hash[nodeA]) 
				feature2id = ""#str(features_hash[nodeB])
				#This will need to be improved once all pairs have annotations
				try:
					feature1id = str(features_hash[nodeA][0])
				except KeyError:
					print "ERROR: key error in resolving featureId for " + nodeA
				try:
					feature2id = str(features_hash[nodeB][0])
				except:
					print "ERROR: key error in resolving featureId for " + nodeB

				edges_hash[nodeA + "_" + nodeB] = validEdgeId
				validEdgeId += 1
				dataA = process_feature_alias(nodeA)
				label1_desc = ""
				dataB = process_feature_alias(nodeB)
				label2_desc = ""
				if (len(dataA) == 7):
					dataA.append("")
					nodeA = nodeA + ":"
				if (len(dataB) == 7):
					dataB.append("")
					nodeB = nodeB + ":"
				correlation_str = tokens[2]
				try:
					correlation = float(correlation_str)
				except ValueError:
					#Align correlation value to NaN
					cnan += 1
					correlation = float('nan')
					correlation_str = ''
				numna = tokens[3]
				pv_str = tokens[4]
				bonf = tokens[5]
				pv_bonf_str = tokens[6]
				numnaf1 = tokens[7]
				pvf1_str = tokens[8]
				numnaf2 = tokens[9]
				pvf2_str = tokens[10]
				try:
					pv = str(pvlambda(float(pv_str)))
					pv_bonf = str(pvlambda(float(pv_bonf_str)))
					pvf1 = str(pvlambda(float(pvf1_str)))
					pvf2 = str(pvlambda(float(pvf2_str)))
				except ValueError:
					#error in pairwise script, ignore these associations for now
					continue

				if (float(pv) > max_pv):
					max_pv = float(pv)
				
				if (float(pv_bonf) > max_pv_corr):
					max_pv_corr = float(pv_bonf)

				rho = str(db_util.sign(correlation)*abs(float(pv)))
				
				link_distance = 500000000
				if (len(tokens) >= 12):
					link_distance = int(tokens[11])
				else:
					if (len(dataA) >=5 and len(dataB)>=5 and db_util.is_numeric(dataA[4]) >= 1 and db_util.is_numeric(dataB[4]) >= 1 and dataA[3] == dataB[3]):
						link_distance = abs(int(dataB[4]) - int(dataA[4]))
				edges_out_re.write(feature1id + "\t" + feature2id + "\t" + nodeA + "\t" + "\t".join(dataA) + "\t" + nodeB + "\t" + "\t".join(dataB) + "\t" + correlation_str + "\t" + numna + "\t" + pv + "\t" + bonf + "\t" + pv_bonf + "\t" + numnaf1 + "\t" + pvf1 + "\t" + numnaf2 + "\t" + pvf2 + "\t" + rho + "\t" + str(link_distance) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\n")
				if (do_pubcrawl == "yes"):
					#call andrea code
					getPairwiseInfo.processLine(line, edges_out_pc)
					pcc += 1
			else:
				print "duplicated edge:" + nodeA + "_" + nodeB
				dupeEdges += 1
		else:
			print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
			invalidEdges += 1
	print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" %(validEdgeId-1, dupeEdges, cnan, unMapped,unmappedPath, totalEdges, max_pv, max_pv_corr)	
	edges_meta_json.write('{"max_logpv":%f}' %(max_pv))
	edges_file.close()
	edges_out_re.close()
	edges_out_pc.close()
	edges_meta_json.close()	
	unmappedout.close()
	efshout.write("#!/bin/bash\n")
	efshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	efshout.write("load data local infile '" + edges_out_re.name + "' replace INTO TABLE " + edge_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	efshout.write("\ncommit;")
	efshout.write("\nEOFMYSQL")
	efshout.close()
	print "Begin pairwise db bulk upload " + time.ctime() 
	os.system("sh " + efshout.name)
	#create sharded association files for solr import
	solrshout.write("#!/bin/bash\n");
	solrshout.write("python createPWShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n") 
	solrshout.write("curl '" + mysolr + "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core2_final.tsv  -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.close()
	print "Begin pairwise solr upload " + time.ctime()
	os.system("sh " + solrshout.name)
	if (do_pubcrawl == "yes"):
		print "senting Pubcrawl notification to " + contacts
		smtp.main("*****@*****.**", contacts, "Notification - New Pairwise Associations for PubCrawl", "New pairwise associations ready for PubCrawl load\n" + edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" + edges_out_re.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + " \n\n")
Example 9
def executeDrop(schemafile, config):
    schemafile_path = schemafile.name
    cmd = "mysql -h %s --port %s -u%s -p%s < %s" %(db_util.getDBHost(config), db_util.getDBPort(config), db_util.getDBUser(config), db_util.getDBPassword(config), schemafile_path)
    os.system(cmd)
#!/usr/bin/python
import db_util
import math
import sys
import os
import time

myhost = db_util.getDBHost()  # config.get("mysql_jdbc_configs", "host")
myport = db_util.getDBPort()
mydb = db_util.getDBSchema()  # config.get("mysql_jdbc_configs", "db")
myuser = db_util.getDBUser()  # config.get("mysql_jdbc_configs", "username")
mypw = db_util.getDBPassword()  # config.get("mysql_jdbc_configs", "password")

print "Parsing samples/patients and values kicked off %s" % time.ctime()
if len(sys.argv) != 3:
    print "Usage is py2.6 parse_sample_values.py data_associations.csv dataset_label"
    sys.exit(1)

infile = open(sys.argv[1])
dataset_label = sys.argv[2]
schema = mydb
feature_table = schema + "." + dataset_label + "_features"
samples_table = schema + "." + dataset_label + "_patients"

feature_helper = {}
feature_helper["IMP2"] = "C:GENO:IMP2:chrIX:53981:55021:-"
feature_helper["AAP1"] = "C:GENO:AAP1:chrVIII:198740:201310:-"
features_hash = {}
fshout = open("load_strain_values_" + dataset_label + ".sh", "w")
outfile = open("./strain_values_out_" + dataset_label + ".sql", "w")
# fshout = open('loadKruglyakValuesSql_pv.sh','w')
Example 11
    def createMysqlDumps(self):
        print "Creating MySQL dumps"
        config = self.config

        for dslabel in self.labels:
            print "Creating MySQL dumps for labelname '%s'" % (dslabel)
            dump_edges = getDumpEdgesFile(config, dslabel)
            dump_nodes = getDumpNodesFile(config, dslabel)
            dump_patients = getDumpPatientsFile(config, dslabel)

            table_feat = dslabel + "_features"
            table_edge = "mv_" + dslabel + "_feature_networks"
            table_patient = dslabel + "_patients"

            mydb = db_util.getDBSchema(config)
            myuser = db_util.getDBUser(config)
            mypw = db_util.getDBPassword(config)
            myhost = db_util.getDBHost(config)
            myport = str(db_util.getDBPort(config))

            # Get nodes
            if not os.path.isfile(dump_nodes):
                nodeColumnTypes = dict()
                columns = [
                    "alias",
                    "type",
                    "source",
                    "label",
                    "chr",
                    "start",
                    "end",
                    "strand",
                    "label_desc",
                    "patient_values",
                    "patient_values_mean",
                    "quantile_val",
                    "quantile",
                    "gene_interesting_score",
                ]

                cursor = db_util.getCursor(config)
                for column in columns:
                    rows = cursor.execute(
                        "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'"
                        % (table_feat, column)
                    )
                    result = cursor.fetchone()[0]  # tuple -> [0]
                    nodeColumnTypes[column] = getDataType(result)
                cursor.close()

                # pvalue AS pvalue__varchar => dump header columns format is like pvalue__varchar
                os.system(
                    'mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw \
					-e " SELECT %s FROM %s;" > %s'
                    % (
                        myhost,
                        myport,
                        myuser,
                        mypw,
                        mydb,
                        ", ".join(["%s AS %s__%s" % (key, key, value) for (key, value) in nodeColumnTypes.items()]),
                        table_feat,
                        dump_nodes,
                    )
                )
            else:
                print "MySQL dump file at %s exists, skipping." % (dump_nodes)

            # Get edges
            if not os.path.isfile(dump_edges):
                columns = [
                    "pvalue",
                    "importance",
                    "correlation",
                    "patientct",
                    "alias1",
                    "alias2",
                    "f1chr",
                    "f1start",
                    "f1end",
                    "f2chr",
                    "f2start",
                    "f2end",
                ]
                edgeColumnTypes = dict()

                cursor = db_util.getCursor(config)
                for column in columns:
                    cursor.execute(
                        "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'"
                        % (table_edge, column)
                    )
                    result = cursor.fetchone()[0]  # tuple -> [0]
                    edgeColumnTypes[column] = getDataType(result)
                cursor.close()

                os.system(
                    'mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw \
					-e "select %s FROM %s;" > %s'
                    % (
                        myhost,
                        myport,
                        myuser,
                        mypw,
                        mydb,
                        ", ".join(["%s AS %s__%s" % (key, key, value) for (key, value) in edgeColumnTypes.items()]),
                        table_edge,
                        dump_edges,
                    )
                )
            else:
                print "MySQL dump file at %s exists, skipping." % (dump_edges)

            # Get patient barcodes:
            if not os.path.isfile(dump_patients):
                columns = ["barcode"]
                patientColumnTypes = dict()

                cursor = db_util.getCursor(config)
                for column in columns:
                    cursor.execute(
                        "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'"
                        % (table_patient, column)
                    )
                    result = cursor.fetchone()[0]  # tuple -> [0]
                    patientColumnTypes[column] = getDataType(result)
                cursor.close()

                os.system(
                    'mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw \
					-e "select %s FROM %s;" > %s'
                    % (
                        myhost,
                        myport,
                        myuser,
                        mypw,
                        mydb,
                        ", ".join(["%s AS %s__%s" % (key, key, value) for (key, value) in patientColumnTypes.items()]),
                        table_patient,
                        dump_patients,
                    )
                )
            else:
                print "MySQL dump file at %s exists, skipping." % (dump_patients)

        print "Finished creating MySQL dumps"
#!/usr/bin/python
"""
Process features from the input data_matrix and the gexp "interesting" file. Sample values and identifiers are also extracted.
The required dataset label indicates the set of schemas.
"""
import sys
import os
import time
import math
import decimal
import db_util

mydb = db_util.getDBSchema()  # config.get("mysql_jdbc_configs", "db")
myuser = db_util.getDBUser()  # config.get("mysql_jdbc_configs", "username")
mypw = db_util.getDBPassword()  # config.get("mysql_jdbc_configs", "password")
myhost = db_util.getDBHost()
myport = db_util.getDBPort()


features_hash = {}
edges_hash = {}
gene_interesting_hash = {}
dataset_label = ""
load_desc = ""
feature_table = ""
sample_table = ""
feature_table_label = "tcga.gbm_pw"


def is_numeric(val):
    try:
        float(val)
        return 1
    except ValueError:
        return 0
Example 13
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda,
                           config, results_path, do_pubcrawl, contacts,
                           keep_unmapped, featureInterestingFile):
    """
	Include edges whose nodes are in the original feature set. Direction does not matter, so an edge A->B is not added when B->A is already in the hash.
	Expected tab-delimited columns are nodeA nodeB pvalue correlation numNonNA
	"""
    edges_hash = {}
    max_pv = -1000.0
    max_pv_corr = -1000.0
    mydb = db_util.getDBSchema(config)  #config.get("mysql_jdbc_configs", "db")
    myuser = db_util.getDBUser(
        config)  #config.get("mysql_jdbc_configs", "username")
    mypw = db_util.getDBPassword(
        config)  #config.get("mysql_jdbc_configs", "password")
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    edges_file = open(pairwised_file)
    fIntHash = parse_features_rfex.get_feature_interest_hash(
        featureInterestingFile)
    edge_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    efshout = open(results_path + 'load_edges_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh', 'w')
    edges_out_re = open(
        results_path + 'edges_out_' + dataset_label + '_pw_re.tsv', 'w')
    edges_out_pc = open(
        results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv', 'w')
    edges_meta_json = open(
        results_path + 'edges_out_' + dataset_label + '_meta.json', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv',
                         'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()

    validEdgeId = 1
    invalidEdges = 0
    dupeEdges = 0
    totalEdges = 0
    cnan = 0
    pcc = 0
    unMapped = 0
    for line in edges_file:
        totalEdges += 1
        line = line.strip()
        tokens = line.split('\t')
        if (len(tokens) < 11):
            if (validEdgeId == 1):
                print "Skipping header/line 1 for insufficient token reasons"
                continue
            print "ERROR: requires 11 tokens, found:" + str(
                len(tokens)) + " Skipping line\n" + line
            continue
        nodeA = tokens[0]
        nodeB = tokens[1]

        try:
            f1genescore = fIntHash[nodeA]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[nodeB]
        except KeyError:
            f2genescore = 0

        if (db_util.isUnmappedAssociation(nodeA, nodeB)
                and keep_unmapped == 0):
            unmappedout.write(nodeA + "\t" + nodeB + "\n")
            unMapped += 1
            continue
        #nodeA = nodeA.replace('|', '_')
        #nodeB = nodeB.replace('|', '_')
        try:
            features_hash[nodeA]
        except KeyError:
            print "key error in resolving featureId for " + nodeA + " skipping edge."
            continue
        try:
            features_hash[nodeB]
        except KeyError:
            print "key error in resolving featureId for " + nodeB + " skipping edge."
            continue

        if (features_hash[nodeA] and features_hash[nodeB]):
            if (not edges_hash.get(nodeA + "_" + nodeB)
                    and not edges_hash.get(nodeB + "_" + nodeA)):
                feature1id = ""  #str(features_hash[nodeA])
                feature2id = ""  #str(features_hash[nodeB])
                #This will need to be improved once all pairs have annotations
                try:
                    feature1id = str(features_hash[nodeA][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeA
                try:
                    feature2id = str(features_hash[nodeB][0])
                except:
                    print "ERROR: key error in resolving featureId for " + nodeB

                edges_hash[nodeA + "_" + nodeB] = validEdgeId
                validEdgeId += 1
                dataA = process_feature_alias(nodeA)
                label1_desc = ""
                dataB = process_feature_alias(nodeB)
                label2_desc = ""
                if (len(dataA) == 7):
                    dataA.append("")
                    nodeA = nodeA + ":"
                if (len(dataB) == 7):
                    dataB.append("")
                    nodeB = nodeB + ":"
                correlation_str = tokens[2]
                try:
                    correlation = float(correlation_str)
                except ValueError:
                    #Align correlation value to NaN
                    cnan += 1
                    correlation = float('nan')
                    correlation_str = ''
                numna = tokens[3]
                pv_str = tokens[4]
                bonf = tokens[5]
                pv_bonf_str = tokens[6]
                numnaf1 = tokens[7]
                pvf1_str = tokens[8]
                numnaf2 = tokens[9]
                pvf2_str = tokens[10]
                try:
                    pv = str(pvlambda(float(pv_str)))
                    pv_bonf = str(pvlambda(float(pv_bonf_str)))
                    pvf1 = str(pvlambda(float(pvf1_str)))
                    pvf2 = str(pvlambda(float(pvf2_str)))
                except ValueError:
                    #error in pairwise script, ignore these associations for now
                    continue

                if (float(pv) > max_pv):
                    max_pv = float(pv)

                if (float(pv_bonf) > max_pv_corr):
                    max_pv_corr = float(pv_bonf)

                rho = str(db_util.sign(correlation) * abs(float(pv)))

                link_distance = 500000000
                if (len(tokens) >= 12):
                    link_distance = int(tokens[11])
                else:
                    if (len(dataA) >= 5 and len(dataB) >= 5
                            and db_util.is_numeric(dataA[4]) >= 1
                            and db_util.is_numeric(dataB[4]) >= 1
                            and dataA[3] == dataB[3]):
                        link_distance = abs(int(dataB[4]) - int(dataA[4]))
                edges_out_re.write(feature1id + "\t" + feature2id + "\t" +
                                   nodeA + "\t" + "\t".join(dataA) + "\t" +
                                   nodeB + "\t" + "\t".join(dataB) + "\t" +
                                   correlation_str + "\t" + numna + "\t" + pv +
                                   "\t" + bonf + "\t" + pv_bonf + "\t" +
                                   numnaf1 + "\t" + pvf1 + "\t" + numnaf2 +
                                   "\t" + pvf2 + "\t" + rho + "\t" +
                                   str(link_distance) + "\t" +
                                   str(f1genescore) + "\t" + str(f2genescore) +
                                   "\n")
                if (do_pubcrawl == "yes"):
                    #call andrea code
                    getPairwiseInfo.processLine(line, edges_out_pc)
                    pcc += 1
            else:
                print "duplicated edge:" + nodeA + "_" + nodeB
                dupeEdges += 1
        else:
            print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
            invalidEdges += 1
    print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" % (
        validEdgeId - 1, dupeEdges, cnan, unMapped, unmappedPath, totalEdges,
        max_pv, max_pv_corr)
    edges_meta_json.write('{"max_logpv":%f}' % (max_pv))
    edges_file.close()
    edges_out_re.close()
    edges_out_pc.close()
    edges_meta_json.close()
    unmappedout.close()
    efshout.write("#!/bin/bash\n")
    efshout.write(
        "mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n"
        % (myhost, myport, myuser, mypw, mydb))
    efshout.write("load data local infile '" + edges_out_re.name +
                  "' replace INTO TABLE " + edge_table +
                  " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    efshout.write("\ncommit;")
    efshout.write("\nEOFMYSQL")
    efshout.close()
    print "Begin pairwise db bulk upload " + time.ctime()
    os.system("sh " + efshout.name)
    #create sharded association files for solr import
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createPWShardedDataset.py " + edges_out_re.name +
                    " " + dataset_label + "\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core2_final.tsv  -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin pairwise solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == "yes"):
        print "senting Pubcrawl notification to " + contacts
        smtp.main(
            "*****@*****.**", contacts,
            "Notification - New Pairwise Associations for PubCrawl",
            "New pairwise associations ready for PubCrawl load\n" +
            edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" +
            edges_out_re.name +
            " loaded into RegulomeExplorer, dataset label is " +
            dataset_label + " \n\n")
def process_feature_matrix(dataset_label, matrix_file, persist_sample_meta, config, annotations, quantileFeatures, results_path, interestingFile):	
	global features_hash
	print ("processing feature set: matrix file %s annotation file %s"%(matrix_file, annotations))
	out_hash = {}
	features_hash = {}
	summary_hash = {}
	mydb = db_util.getDBSchema(config) 
	myuser = db_util.getDBUser(config) 
	mypw = db_util.getDBPassword(config)
	myhost = db_util.getDBHost(config)
	myport = db_util.getDBPort(config)
	if (not os.path.isfile(matrix_file)):
		print "ERROR\n" + matrix_file + " does not exist; unrecoverable ERROR"
		sys.exit(-1)
	feature_matrix_file = open(matrix_file, "r")
	feature_table = mydb + "." + dataset_label + "_features"
	sample_table = mydb + "." + dataset_label + "_patients"
	fshout = open(results_path + dataset_label + '_load_features.sh','w')
	outfile = open(results_path + dataset_label + '_features_out.tsv','w')
	alidfile = open(results_path + dataset_label + '_features_alias_id.tsv','w')
	sampleOutfile = open(results_path + dataset_label + '_sample_values_out.tsv','w')
	featureId = 0
	annotation_hash, ftypes = process_feature_annotations(annotations)
	sub_afm_out = {}
	for q in quantileFeatures.split(","):
		sub_afm_out[q] = open(results_path + dataset_label + '_' + q + '.afm','w')
	fIntHash = get_feature_interest_hash(interestingFile) 
	for line in feature_matrix_file:
		tokens = line.strip().split('\t')
		afmid = ""
		ftype = ""
		interesting_score = 0
		if (featureId == 0):
			sampleIds = tokens
			#not part of core function in RE import pipeline
			#if (persist_sample_meta == 1):
			#	populate_sample_meta(sampleIds.split(":"), config)	
			#sampleOutfile.write(sampleIds + "\n");
			featureId += 1
			continue
		if (not features_hash.get(tokens[0]) and len(tokens[0]) > 1):
			valuesArray = []
			alias = tokens[0]
			originalAFM = tokens[0]
			data = alias.split(':')
			if (len(data) < 4):
				#afmid = alias
				annotated_feature = annotation_hash.get(alias)
				if (annotated_feature == None):
					print "ERROR: AFM feature %s is not in annotation" %(alias)
					sys.exit(-1)
				#hasAnnotations = True
				#put features in sub afm files for quantile calculation
				ftype = annotated_feature.split("\t")[1]
				alias = annotated_feature.replace("\t", ":")
				featureId = int(alias.split(":")[-1])
			data = alias.split(':')
			if (ftype == ""):
				ftype = data[1]
			afmid = alias
			#if (fIntHash[alias] != None):
			try:
				interesting_score = fIntHash[originalAFM]
			except KeyError:
				#print "Key error with fInterestingHash " + alias
				interesting_score = 0
			if (sub_afm_out.get(ftype) != None):
				sub_afm_out[ftype].write(alias + "\t" + "\t".join(tokens[1:]) + "\n")
			features_hash[tokens[0]] = featureId
			if (len(data) <= 7):
				if (data[1] == 'CLIN' or data[1] == 'SAMP'):
					alias = ":".join(data[0:3]) + ":::::"
					data = alias.split(':')
				else:
					data.append("")
			if len(data[3]) > 3:
				data[3] = data[3][3:]
			patient_values = ":".join(tokens[1:])
			for val in tokens[1:]:
				if (db_util.is_numeric(val)):
					valuesArray.append(float(val))
				else:
					valuesArray.append(0.0)
			#make sure the number of patient ids matches the number of values
			if (featureId == 1):
				start = len(sampleIds) - len(valuesArray)
				sampleStr = ":".join(sampleIds[start:])
				sampleOutfile.write(sampleStr + "\n")

			patient_value_mean = sum(valuesArray)/len(valuesArray)
			accumulate_summary_counts(summary_hash,data[1])
			alidfile.write(originalAFM + "\t" + str(featureId) + "\t" +  alias + "\n")
			out_hash[afmid] = str(featureId) + "\t" + alias + "\t" + "\t".join(data) + "\t" + patient_values + "\t" + str(patient_value_mean) + "\t" + str(interesting_score)
		else:
			print "duplicated feature in feature set:" + tokens[0]
		featureId += 1
	for val in out_hash.values():
		outfile.write(val + "\n")

	summary_out = open(results_path + "feature_summary_" + dataset_label + ".json", "w")
	summary_json = "{"

	for feature_type in summary_hash:
		summary_json = summary_json + '"%s":%i,' %(feature_type, summary_hash[feature_type])
	summary_out.write(summary_json[0:-1] + "}\n")
	summary_out.close()
	feature_matrix_file.close()
	outfile.close()
	alidfile.close()
	sampleOutfile.close()
	fshout.write("#!/bin/bash\n")
	fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	fshout.write("load data local infile '" + outfile.name + "' replace INTO TABLE " + feature_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	fshout.write("load data local infile '" + sampleOutfile.name + "' replace INTO TABLE " + sample_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	fshout.write("\ncommit;\nEOFMYSQL")
	fshout.close()
	print "processing done, running bulk load on %i features  %s" %(len(features_hash), time.ctime())
	if (persist_sample_meta == 1):
		os.system("sh " + fshout.name)
	return annotation_hash
Example 15
    def createMysqlDumps(self):
        print "Creating MySQL dumps"
        config = self.config

        for dslabel in self.labels:
            print "Creating MySQL dumps for labelname '%s'" % (dslabel)
            dump_edges = getDumpEdgesFile(config, dslabel)
            dump_nodes = getDumpNodesFile(config, dslabel)
            dump_patients = getDumpPatientsFile(config, dslabel)

            table_feat = dslabel + "_features"
            table_edge = "mv_" + dslabel + "_feature_networks"
            table_patient = dslabel + "_patients"

            mydb = db_util.getDBSchema(config)
            myuser = db_util.getDBUser(config)
            mypw = db_util.getDBPassword(config)
            myhost = db_util.getDBHost(config)
            myport = str(db_util.getDBPort(config))

            # Get nodes
            if (not os.path.isfile(dump_nodes)):
                nodeColumnTypes = dict()
                columns = [ "alias", "type", "source", "label", "chr", \
                "start", "end", "strand", "label_desc", \
                "patient_values", "patient_values_mean", "quantile_val", "quantile", "gene_interesting_score" ]

                cursor = db_util.getCursor(config)
                for column in columns:
                    rows = cursor.execute(
                     "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'" \
                     % ( table_feat, column ) )
                    result = cursor.fetchone()[0]  #tuple -> [0]
                    nodeColumnTypes[column] = getDataType(result)
                cursor.close()

                # pvalue AS pvalue__varchar => dump header columns format is like pvalue__varchar
                os.system( "mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw \
					-e \" SELECT %s FROM %s;\" > %s"                                                   % (myhost, myport, myuser, mypw, mydb, \
                  ", ".join(['%s AS %s__%s' % (key,key,value) for (key,value) in nodeColumnTypes.items() ] ), table_feat, dump_nodes) )
            else:
                print "MySQL dump file at %s exists, skipping." % (dump_nodes)

            # Get edges
            if (not os.path.isfile(dump_edges)):
                columns = ["pvalue", "importance", "correlation", "patientct", \
                "alias1", "alias2", "f1chr", "f1start", "f1end", "f2chr", "f2start", "f2end"]
                edgeColumnTypes = dict()

                cursor = db_util.getCursor(config)
                for column in columns:
                    cursor.execute( "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'" \
                     % ( table_edge, column ) )
                    result = cursor.fetchone()[0]  #tuple -> [0]
                    edgeColumnTypes[column] = getDataType(result)
                cursor.close()

                os.system( "mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw \
					-e \"select %s FROM %s;\" > %s"             \
                 % (myhost, myport, myuser, mypw, mydb, \
                  ", ".join(['%s AS %s__%s' % (key,key,value) \
                   for (key,value) in edgeColumnTypes.items() ] ), table_edge, dump_edges ) )
            else:
                print "MySQL dump file at %s exists, skipping." % (dump_edges)

            # Get patient barcodes:
            if (not os.path.isfile(dump_patients)):
                columns = ["barcode"]
                patientColumnTypes = dict()

                cursor = db_util.getCursor(config)
                for column in columns:
                    cursor.execute( "select DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = '%s' AND COLUMN_NAME = '%s'" \
                     % ( table_patient, column ) )
                    result = cursor.fetchone()[0]  #tuple -> [0]
                    patientColumnTypes[column] = getDataType(result)
                cursor.close()

                os.system( "mysql --host=%s --port=%s --user=%s --password=%s --database=%s --batch --raw \
					-e \"select %s FROM %s;\" > %s"             \
                 % (myhost, myport, myuser, mypw, mydb, \
                  ", ".join(['%s AS %s__%s' % (key,key,value) \
                   for (key,value) in patientColumnTypes.items() ] ), table_patient, dump_patients ) )
            else:
                print "MySQL dump file at %s exists, skipping." % (
                    dump_patients)

        print "Finished creating MySQL dumps"
Example 16
def process_associations_rfex(dataset_label, matrixfile, associationsfile, config, annotations, collapse_direction, reverse_direction, results_path, pv_lambda, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
	mydb = db_util.getDBSchema(config) 
	myuser = db_util.getDBUser(config) 
	mypw = db_util.getDBPassword(config) 
	myhost = db_util.getDBHost(config) 
	myport = db_util.getDBPort(config)
	mysolr = db_util.getSolrPath(config)
	if (not os.path.isfile(associationsfile)):
		print associationsfile + " does not exist; unrecoverable ERROR"
		sys.exit(-1)
	associations_table = mydb + ".mv_" + dataset_label + "_feature_networks"
	print "Begin processing associations %s Applying processing_pubcrawl %s" %(time.ctime(), do_pubcrawl)
	fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)

	edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv','w')
	associations_in = open(associationsfile,'r')
	annotation_hash, ftype = parse_features_rfex.process_feature_annotations(annotations)
	fshout = open(results_path + 'load_sql_associations_' + dataset_label + '.sh','w')
	solrshout = open(results_path + 'load_solr_assocations_' + dataset_label + '.sh','w')
	unmappedPath = results_path  + 'edges_out_' + dataset_label + '_rface_unmapped.tsv'
	unmappedout = open(unmappedPath,'w')
	features_file = open(results_path + dataset_label + '_features_out.tsv','r')
	features_hash = {}
	for fl in features_file.readlines():
		ftk = fl.strip().split("\t")
		features_hash[ftk[1]] = ftk
	features_file.close()
		
	aliasid_file = open(results_path + dataset_label + '_features_alias_id.tsv','r')
	aliasid_hash = {}
	for fl in aliasid_file.readlines():
		ftk = fl.strip().split("\t")
		aliasid_hash[ftk[0]] = ftk
	aliasid_file.close()	
	
	tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv','w')
	pubcrawl_tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_pc.tsv','w')
	lc = 0
	edgeCount = 0
	pcc = 0
	unMapped = 0
	pvalueCutCount = 0
	impCut = 0
	lines = associations_in.readlines()
	associations_in.close()
	associations_dic = {}
	for line in lines:
		lc = lc + 1
		columns = line.strip().split('\t')
		if (len(columns) < 5):
			print "Missing required tokens in associations lineIndex %i lineValue %s" %(lc, line)
			continue
		f1alias = columns[0]
		#afm_ids will be used for directionality collapsing, if needed
		f1afm_id = columns[0]
		f2afm_id = columns[1]
		if (len(f1alias.split(":")) < 3):
			annotated_feature = annotation_hash.get(f1alias)
			if (annotated_feature == None):
				print "ERROR: Target feature %s is not in afm/annotation %i" %(f1alias, len(annotation_hash))
				continue
			f1alias = annotated_feature.replace("\t", ":")
		f2alias = columns[1]
		if (len(f2alias.split(":")) < 3):
			annotated_feature = annotation_hash.get(f2alias)
			if (annotated_feature == None):
				print "ERROR: Predictor feature %s is not in afm/annotation" %(f2alias)
				continue
			f2alias = annotated_feature.replace("\t", ":")
		try:
			f1genescore = fIntHash[f1alias]
		except KeyError:
			f1genescore = 0
		try:
			f2genescore = fIntHash[f2alias]
		except KeyError:
			f2genescore = 0
		
		f1data = f1alias.split(':')
		f2data = f2alias.split(':')

		if len(f1data) > 4:
			f1data[3] = f1data[3][3:]
		if len(f2data) > 4:
			f2data[3] = f2data[3][3:]
		
		if (len(f1data) <= 7 and (f1data[1] == 'CLIN' or f1data[1] == 'SAMP')):
			f1alias = ":".join(f1data[0:3]) + ":::::"
			f1data = f1alias.split(':')
		elif (len(f1data) == 7):
			f1data.append("")
		if (len(f2data) <= 7 and (f2data[1] == 'CLIN' or f2data[1] == 'SAMP')):
			f2alias = ":".join(f2data[0:3]) + ":::::"
			f2data = f2alias.split(':')
		elif (len(f2data) == 7):
			f2data.append("") 
		f1aliasOmic = f1alias
		f2aliasOmic = f2alias
		#for annotations
		try:    
			f1id = features_hash[f1alias][0]
		except KeyError:
			try:
				f1id = aliasid_hash[f1alias][1]
				f1aliasOmic = aliasid_hash[f1alias][2]
				f1data = f1aliasOmic.split(':')
				f1data[3] = f1data[3][3:]
			except KeyError:
				print "Skipping Key error with alias1 " + f1alias
				continue
			
		try:
			f2id = features_hash[f2alias][0]#f2alias.split(":")[-1]
		except KeyError:
			try:
				f2id = aliasid_hash[f2alias][1]
				f2aliasOmic = aliasid_hash[f2alias][2]
				f2data = f2aliasOmic.split(':')
				f2data[3] = f2data[3][3:]	
			except KeyError:
				print "Skipping Key error with alias2 " + f2alias
				continue		

		pvalue = float(columns[2])
		pvalue = str(pv_lambda(pvalue))
		
		importance = columns[3]
		correlation = columns[4]
		patientct = columns[5]
		if (db_util.isUnmappedAssociation(f1alias, f2alias) and keep_unmapped == 0):
			unmappedout.write(f1alias + "\t" + f2alias + "\n")
			unMapped += 1
			continue	
		rhoscore = ""
		link_distance = -1
		if (len(f1data) >=5 and len(f2data)>=5 and db_util.is_numeric(f1data[4]) >= 1 and db_util.is_numeric(f2data[4]) >= 1 and f1data[3] == f2data[3]):
			link_distance = abs(int(f2data[4]) - int(f1data[4]))
		if (collapse_direction == 0):
			associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
		else:
			#check whether (f1 -> f2 or f2 -> f1) exists, if yes, take the more important
			#if not, store pair
			if ((associations_dic.get(f1afm_id + "_" + f2afm_id) == None) and (associations_dic.get(f2afm_id + "_" + f1afm_id) == None)):
				associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
			else:
				existingLink = associations_dic.get(f1afm_id + "_" + f2afm_id)
				ekey = f1afm_id + "_" + f2afm_id
				if (existingLink == None):
					existingLink = associations_dic.get(f2afm_id + "_" + f1afm_id) 
					ekey = f2afm_id + "_" + f1afm_id
				prevImportance = existingLink.split("\t")[3]
				if (float(importance) > float(prevImportance)):
					associations_dic[ekey] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"					 			 
		if (reverse_direction == 1):
			associations_dic[f2afm_id + "_" + f1afm_id] = f2aliasOmic + "\t" + f1aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + str(f2genescore) + "\t" + str(f1genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
			edgeCount = edgeCount + 1
		edgeCount = edgeCount + 1
		if (do_pubcrawl == "yes"):
			getRFACEInfo.processLine(line, pubcrawl_tsvout)
			pcc += 1
	for ei in associations_dic:
		tsvout.write(associations_dic[ei])
	fshout.write("#!/bin/bash\n")
	fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	fshout.write("load data local infile '" + tsvout.name + "' replace INTO TABLE " + associations_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';")
	fshout.write("\nEOFMYSQL\n")
	tsvout.close()
	unmappedout.close()
	pubcrawl_tsvout.close()
	fshout.close()
	print "\nReport: ValidEdges %i ImportanceCutoff %i edges filtered %i \nunMapped Edges %i Saved to %s" %(len(associations_dic), impCut, pvalueCutCount, unMapped, unmappedPath)
	print "Begin RF-ACE db bulk upload %s os.system sh %s" %(time.ctime(), fshout.name)
	os.system("sh " + fshout.name)
	solrshout.write("#!/bin/bash\n")
	solrshout.write("python createRFShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
	solrshout.write("curl '" + mysolr + "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core2_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.close()
	print "Begin rface solr upload " + time.ctime()
	os.system("sh " + solrshout.name)

	if (do_pubcrawl == 'yes'):
		smtp.main("*****@*****.**", contacts, "Notification - New RFAce " + dataset_label + " Associations for PubCrawl", "New RFAce associations ready for PubCrawl load\n" + pubcrawl_tsvout.name + "\n" + str(pcc) + " Total Edges\n" + tsvout.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + "\n\n")
	print "Done processing associations %s" %(time.ctime())
	associations_dic = None
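For reference, the column layout the loop above expects in the RF-ACE associations file (0-based indices, per the parsing code):

#  0: target feature (f1alias)     1: predictor feature (f2alias)
#  2: pvalue                       3: importance
#  4: correlation                  5: patientct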