def process_feature_matrix(dataset_label, matrix_file, persist_sample_meta,
                           config, annotations, quantileFeatures, results_path,
                           interestingFile):
    global features_hash
    print("processing feature set: matrix file %s annotation file %s" %
          (matrix_file, annotations))
    out_hash = {}
    features_hash = {}
    summary_hash = {}
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    if (not os.path.isfile(matrix_file)):
        print "ERROR\n" + matrix_file + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    feature_matrix_file = open(matrix_file, "r")
    feature_table = mydb + "." + dataset_label + "_features"
    sample_table = mydb + "." + dataset_label + "_patients"
    fshout = open(results_path + dataset_label + '_load_features.sh', 'w')
    outfile = open(results_path + dataset_label + '_features_out.tsv', 'w')
    alidfile = open(results_path + dataset_label + '_features_alias_id.tsv',
                    'w')
    sampleOutfile = open(
        results_path + dataset_label + '_sample_values_out.tsv', 'w')
    featureId = 0
    annotation_hash, ftypes = process_feature_annotations(annotations)
    sub_afm_out = {}
    for q in quantileFeatures.split(","):
        sub_afm_out[q] = open(results_path + dataset_label + '_' + q + '.afm',
                              'w')
    fIntHash = get_feature_interest_hash(interestingFile)
    for line in feature_matrix_file:
        tokens = line.strip().split('\t')
        afmid = ""
        ftype = ""
        interesting_score = 0
        if (featureId == 0):
            sampleIds = tokens
            #not part of core function in RE import pipeline
            #if (persist_sample_meta == 1):
            #	populate_sample_meta(sampleIds.split(":"), config)
            #sampleOutfile.write(sampleIds + "\n");
            featureId += 1
            continue
        if (not features_hash.get(tokens[0]) and len(tokens[0]) > 1):
            valuesArray = []
            alias = tokens[0]
            originalAFM = tokens[0]
            data = alias.split(':')
            if (len(data) < 4):
                #afmid = alias
                annotated_feature = annotation_hash.get(alias)
                if (annotated_feature == None):
                    print "ERROR: AFM feature %s is not in annotation" % (
                        alias)
                    sys.exit(-1)
                #hasAnnotations = True
                #put features in sub afm files for quantile calculation
                ftype = annotated_feature.split("\t")[1]
                alias = annotated_feature.replace("\t", ":")
                featureId = int(alias.split(":")[-1])
            data = alias.split(':')
            if (ftype == ""):
                ftype = data[1]
            afmid = alias
            #if (fIntHash[alias] != None):
            try:
                interesting_score = fIntHash[originalAFM]
            except KeyError:
                #print "Key error with fInterestingHash " + alias
                interesting_score = 0
            if (sub_afm_out.get(ftype) != None):
                sub_afm_out[ftype].write(alias + "\t" + "\t".join(tokens[1:]) +
                                         "\n")
            features_hash[tokens[0]] = featureId
            if (len(data) <= 7):
                if (data[1] == 'CLIN' or data[1] == 'SAMP'):
                    alias = ":".join(data[0:3]) + ":::::"
                    data = alias.split(':')
                else:
                    data.append("")
            if len(data[3]) > 3:
                data[3] = data[3][3:]
            patient_values = ":".join(tokens[1:])
            for val in tokens[1:]:
                if (db_util.is_numeric(val)):
                    valuesArray.append(float(val))
                else:
                    valuesArray.append(0.0)
            #make sure that the number patient ids match values
            if (featureId == 1):
                start = len(sampleIds) - len(valuesArray)
                sampleStr = ":".join(sampleIds[start:])
                sampleOutfile.write(sampleStr + "\n")

            patient_value_mean = sum(valuesArray) / len(valuesArray)
            accumulate_summary_counts(summary_hash, data[1])
            alidfile.write(originalAFM + "\t" + str(featureId) + "\t" + alias +
                           "\n")
            out_hash[afmid] = str(featureId) + "\t" + alias + "\t" + "\t".join(
                data) + "\t" + patient_values + "\t" + str(
                    patient_value_mean) + "\t" + str(interesting_score)
        else:
            print "duplicated feature in feature set:" + tokens[0]
        featureId += 1
    for val in out_hash.values():
        outfile.write(val + "\n")

    summary_out = open(
        resultsPath + "feature_summary_" + dataset_label + ".json", "w")
    summary_json = "{"

    for feature_type in summary_hash:
        summary_json = summary_json + '"%s":%i,' % (feature_type,
                                                    summary_hash[feature_type])
    summary_out.write(summary_json[0:-1] + "}\n")
    summary_out.close()
    feature_matrix_file.close()
    outfile.close()
    alidfile.close()
    sampleOutfile.close()
    fshout.write("#!/bin/bash\n")
    fshout.write(
        "mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n"
        % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + outfile.name +
                 "' replace INTO TABLE " + feature_table +
                 " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("load data local infile '" + sampleOutfile.name +
                 "' replace INTO TABLE " + sample_table +
                 " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    fshout.write("\ncommit;\nEOFMYSQL")
    fshout.close()
    print "processing done, running bulk load on %i features  %s" % (
        len(features_hash), time.ctime())
    if (persist_sample_meta == 1):
        os.system("sh " + fshout.name)
    return annotation_hash
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda, config, results_path, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
	"""
	Include edges where nodes are in original set, direction does not matter so do not populate edge if A->B if B->A are in hash
	Expected tab delimited columns are nodeA nodeB pvalue correlation numNonNA	
	"""
	edges_hash = {}
	max_pv = -1000.0
	max_pv_corr = -1000.0
	mydb = db_util.getDBSchema(config) #config.get("mysql_jdbc_configs", "db")
	myuser = db_util.getDBUser(config) #config.get("mysql_jdbc_configs", "username")
	mypw = db_util.getDBPassword(config) #config.get("mysql_jdbc_configs", "password")
	myhost = db_util.getDBHost(config)
	myport = db_util.getDBPort(config)
	mysolr = db_util.getSolrPath(config)
	edges_file = open(pairwised_file)
	fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
	edge_table = mydb + ".mv_" + dataset_label + "_feature_networks" 
	efshout = open(results_path + 'load_edges_' + dataset_label + '.sh','w')
	solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh','w')
	edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_pw_re.tsv','w')
	edges_out_pc = open(results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv','w')
	edges_meta_json = open(results_path + 'edges_out_' + dataset_label + '_meta.json','w')
	unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
	unmappedout = open(unmappedPath,'w')
	features_file = open(results_path + dataset_label + '_features_out.tsv','r')
	features_hash = {}
	for fl in features_file.readlines():
		ftk = fl.strip().split("\t")
		features_hash[ftk[1]] = ftk
		features_file.close()

	validEdgeId = 1
	invalidEdges = 0
	dupeEdges = 0
	totalEdges = 0
	cnan = 0
	pcc = 0
	unMapped = 0
	for line in edges_file:
		totalEdges += 1 
		line = line.strip()
		tokens = line.split('\t')
		if (len(tokens) < 11):
			if (validEdgeId == 1):
				print "Skipping header/line 1 for insufficient token reasons"
				continue
			print "ERROR: requires 11 tokens, found:" + str(len(tokens)) + " Skipping line\n" + line
			continue
		nodeA = tokens[0]
		nodeB = tokens[1]

		try:
			f1genescore = fIntHash[nodeA]
		except KeyError:
			f1genescore = 0
		try:
			f2genescore = fIntHash[nodeB]
		except KeyError:
			f2genescore = 0

		if (db_util.isUnmappedAssociation(nodeA, nodeB) and keep_unmapped == 0):
			unmappedout.write(nodeA + "\t" + nodeB + "\n")
			unMapped += 1
			continue
		#nodeA = nodeA.replace('|', '_')
		#nodeB = nodeB.replace('|', '_')
		try:
			features_hash[nodeA]
		except KeyError:
			print "key error in resolving featureId for " + nodeA + " skipping edge."
			continue
		try:
			features_hash[nodeB]
		except KeyError:
			print "key error in resolving featureId for " + nodeB + " skipping edge."
			continue

		if (features_hash[nodeA] and features_hash[nodeB]):
			if (not edges_hash.get(nodeA + "_" + nodeB) and not edges_hash.get(nodeA + "_" + nodeB)):
				feature1id = ""#str(features_hash[nodeA]) 
				feature2id = ""#str(features_hash[nodeB])
				#This will need to be improve once all pairs has annotations 
				try:
					feature1id = str(features_hash[nodeA][0])
				except KeyError:
					print "ERROR: key error in resolving featureId for " + nodeA
				try:
					feature2id = str(features_hash[nodeB][0])
				except:
					print "ERROR: key error in resolving featureId for " + nodeB

				edges_hash[nodeA + "_" + nodeB] = validEdgeId
				validEdgeId += 1
				dataA = process_feature_alias(nodeA)
				label1_desc = ""
				dataB = process_feature_alias(nodeB)
				label2_desc = ""
				if (len(dataA) == 7):
					dataA.append("")
					nodeA = nodeA + ":"
				if (len(dataB) == 7):
					dataB.append("")
					nodeB = nodeB + ":"
				correlation_str = tokens[2]
				try:
					correlation = float(correlation_str)
				except ValueError:
					#Align correlation value to NaN
					cnan += 1
					correlation = float('nan')
					correlation_str = ''
				numna = tokens[3]
				pv_str = tokens[4]
				bonf = tokens[5]
				pv_bonf_str = tokens[6]
				numnaf1 = tokens[7]
				pvf1_str = tokens[8]
				numnaf2 = tokens[9]
				pvf2_str = tokens[10]
				try:
					pv = str(pvlambda(float(pv_str)))
					pv_bonf = str(pvlambda(float(pv_bonf_str)))
					pvf1 = str(pvlambda(float(pvf1_str)))
					pvf2 = str(pvlambda(float(pvf2_str)))
				except ValueError:
					#error in pairwise script, ignore these associations for now
					continue;

				if (float(pv) > max_pv):
					max_pv = float(pv)
				
				if (float(pv_bonf) > max_pv_corr):
					max_pv_corr = float(pv_bonf)

				rho = str(db_util.sign(correlation)*abs(float(pv)))
				
				link_distance = 500000000
				if ( len(tokens) >= 12 ):
  					link_distance = int(tokens[11])
				else:
					if (len(dataA) >=5 and len(dataB)>=5 and db_util.is_numeric(dataA[4]) >= 1 and db_util.is_numeric(dataB[4]) >= 1 and dataA[3] == dataB[3]):
						link_distance = abs(int(dataB[4]) - int(dataA[4]))
				edges_out_re.write(feature1id + "\t" + feature2id + "\t" + nodeA + "\t" + "\t".join(dataA) + "\t" + nodeB + "\t" + "\t".join(dataB) + "\t" + correlation_str + "\t" + numna + "\t" + pv + "\t" + bonf + "\t" + pv_bonf + "\t" + numnaf1 + "\t" + pvf1 + "\t" + numnaf2 + "\t" + pvf2 + "\t" + rho + "\t" + str(link_distance) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\n")
				if (do_pubcrawl == "yes"):
					#call andrea code
					getPairwiseInfo.processLine(line, edges_out_pc)
					pcc += 1
			else:
				print "duplicated edge:" + nodeA + "_" + nodeB
				dupeEdges += 1
		else:
			print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
			invalidEdges += 1
	print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" %(validEdgeId-1, dupeEdges, cnan, unMapped,unmappedPath, totalEdges, max_pv, max_pv_corr)	
	edges_meta_json.write('{"max_logpv":%f}' %(max_pv))
	edges_file.close()
	edges_out_re.close()
	edges_out_pc.close()
	edges_meta_json.close()	
	unmappedout.close()
	efshout.write("#!/bin/bash\n")
	efshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	efshout.write("load data local infile '" + edges_out_re.name + "' replace INTO TABLE " + edge_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	efshout.write("\ncommit;")
	efshout.write("\nEOFMYSQL")
	efshout.close()
	print "Begin pairwise db bulk upload " + time.ctime() 
	os.system("sh " + efshout.name)
	#create sharded association files for solr import
	solrshout.write("#!/bin/bash\n");
	solrshout.write("python createPWShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n") 
	solrshout.write("curl '" + mysolr + "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core2_final.tsv  -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.close()
	print "Begin pairwise solr upload " + time.ctime()
	os.system("sh " + solrshout.name)
	if (do_pubcrawl == "yes"):
		print "senting Pubcrawl notification to " + contacts
		smtp.main("*****@*****.**", contacts, "Notification - New Pairwise Associations for PubCrawl", "New pairwise associations ready for PubCrawl load\n" + edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" + edges_out_re.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + " \n\n")
def process_feature_matrix(dataset_label, matrix_file, persist_sample_meta, config, annotations, quantileFeatures, results_path, interestingFile):	
	global features_hash
	print ("processing feature set: matrix file %s annotation file %s"%(matrix_file, annotations))
	out_hash = {}
	features_hash = {}
	summary_hash = {}
	mydb = db_util.getDBSchema(config) 
	myuser = db_util.getDBUser(config) 
	mypw = db_util.getDBPassword(config)
	myhost = db_util.getDBHost(config)
	myport = db_util.getDBPort(config)
	if (not os.path.isfile(matrix_file)):
	        print "ERROR\n" + matrix_file + " does not exist; unrecoverable ERROR"
		sys.exit(-1)
	feature_matrix_file = open(matrix_file, "r")
	feature_table = mydb + "." + dataset_label + "_features"
	sample_table = mydb + "." + dataset_label + "_patients"
	fshout = open(results_path + dataset_label + '_load_features.sh','w')
	outfile = open(results_path + dataset_label + '_features_out.tsv','w')
	alidfile = open(results_path + dataset_label + '_features_alias_id.tsv','w')
	sampleOutfile = open(results_path + dataset_label + '_sample_values_out.tsv','w')
	featureId = 0
	annotation_hash, ftypes = process_feature_annotations(annotations)
	sub_afm_out = {}
	for q in quantileFeatures.split(","):
		sub_afm_out[q] = open(results_path + dataset_label + '_' + q + '.afm','w')
	fIntHash = get_feature_interest_hash(interestingFile) 
	for line in feature_matrix_file:
		tokens = line.strip().split('\t')
		afmid = ""
		ftype = ""
		interesting_score = 0
		if (featureId == 0):               
                	sampleIds = tokens
			#not part of core function in RE import pipeline
			#if (persist_sample_meta == 1):
			#	populate_sample_meta(sampleIds.split(":"), config)	
			#sampleOutfile.write(sampleIds + "\n");
			featureId += 1
			continue
		if (not features_hash.get(tokens[0]) and len(tokens[0]) > 1):
			valuesArray = []
			alias = tokens[0]
			originalAFM = tokens[0]
			data = alias.split(':')
			if (len(data) < 4):
				#afmid = alias
				annotated_feature = annotation_hash.get(alias)
				if (annotated_feature == None):
					print "ERROR: AFM feature %s is not in annotation" %(alias)
					sys.exit(-1)
				#hasAnnotations = True
				#put features in sub afm files for quantile calculation
				ftype = annotated_feature.split("\t")[1]
				alias = annotated_feature.replace("\t", ":")
				featureId = int(alias.split(":")[-1])
			data = alias.split(':')
			if (ftype == ""):
				ftype = data[1]
			afmid = alias
			#if (fIntHash[alias] != None):
			try:
				interesting_score = fIntHash[originalAFM]
			except KeyError:
				#print "Key error with fInterestingHash " + alias
				interesting_score = 0
			if (sub_afm_out.get(ftype) != None):
				sub_afm_out[ftype].write(alias + "\t" + "\t".join(tokens[1:]) + "\n")
			features_hash[tokens[0]] = featureId
			if (len(data) <= 7):
				if (data[1] == 'CLIN' or data[1] == 'SAMP'):
					alias = ":".join(data[0:3]) + ":::::"
					data = alias.split(':')
				else:
					data.append("")
			if len(data[3]) > 3:
				data[3] = data[3][3:]
			patient_values = ":".join(tokens[1:])
			for val in tokens[1:]:
				if (db_util.is_numeric(val)):
					valuesArray.append(float(val))
				else:
					valuesArray.append(0.0)
			#make sure that the number patient ids match values
			if (featureId == 1):
				start = len(sampleIds) - len(valuesArray)
				sampleStr = ":".join(sampleIds[start:])
				sampleOutfile.write(sampleStr + "\n");

			patient_value_mean = sum(valuesArray)/len(valuesArray)
			accumulate_summary_counts(summary_hash,data[1])
			alidfile.write(originalAFM + "\t" + str(featureId) + "\t" +  alias + "\n")
			out_hash[afmid] = str(featureId) + "\t" + alias + "\t" + "\t".join(data) + "\t" + patient_values + "\t" + str(patient_value_mean) + "\t" + str(interesting_score)
		else:
			print "duplicated feature in feature set:" + tokens[0]
		featureId += 1
	for val in out_hash.values():
		outfile.write(val + "\n")

	summary_out = open(resultsPath + "feature_summary_" + dataset_label + ".json", "w")
	summary_json = "{"

	for feature_type in summary_hash:
		summary_json = summary_json + '"%s":%i,' %(feature_type, summary_hash[feature_type])
	summary_out.write(summary_json[0:-1] + "}\n")
	summary_out.close()
	feature_matrix_file.close()
	outfile.close()
	alidfile.close()
	sampleOutfile.close()
	fshout.write("#!/bin/bash\n")
	fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	fshout.write("load data local infile '" + outfile.name + "' replace INTO TABLE " + feature_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	fshout.write("load data local infile '" + sampleOutfile.name + "' replace INTO TABLE " + sample_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	fshout.write("\ncommit;\nEOFMYSQL")
	fshout.close()
	print "processing done, running bulk load on %i features  %s" %(len(features_hash), time.ctime())
	if (persist_sample_meta == 1):
		os.system("sh " + fshout.name)
	return annotation_hash
Beispiel #4
0
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda,
                           config, results_path, do_pubcrawl, contacts,
                           keep_unmapped, featureInterestingFile):
    """
	Include edges where nodes are in original set, direction does not matter so do not populate edge if A->B if B->A are in hash
	Expected tab delimited columns are nodeA nodeB pvalue correlation numNonNA	
	"""
    edges_hash = {}
    max_pv = -1000.0
    max_pv_corr = -1000.0
    mydb = db_util.getDBSchema(config)  #config.get("mysql_jdbc_configs", "db")
    myuser = db_util.getDBUser(
        config)  #config.get("mysql_jdbc_configs", "username")
    mypw = db_util.getDBPassword(
        config)  #config.get("mysql_jdbc_configs", "password")
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    edges_file = open(pairwised_file)
    fIntHash = parse_features_rfex.get_feature_interest_hash(
        featureInterestingFile)
    edge_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    efshout = open(results_path + 'load_edges_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh', 'w')
    edges_out_re = open(
        results_path + 'edges_out_' + dataset_label + '_pw_re.tsv', 'w')
    edges_out_pc = open(
        results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv', 'w')
    edges_meta_json = open(
        results_path + 'edges_out_' + dataset_label + '_meta.json', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv',
                         'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
        features_file.close()

    validEdgeId = 1
    invalidEdges = 0
    dupeEdges = 0
    totalEdges = 0
    cnan = 0
    pcc = 0
    unMapped = 0
    for line in edges_file:
        totalEdges += 1
        line = line.strip()
        tokens = line.split('\t')
        if (len(tokens) < 11):
            if (validEdgeId == 1):
                print "Skipping header/line 1 for insufficient token reasons"
                continue
            print "ERROR: requires 11 tokens, found:" + str(
                len(tokens)) + " Skipping line\n" + line
            continue
        nodeA = tokens[0]
        nodeB = tokens[1]

        try:
            f1genescore = fIntHash[nodeA]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[nodeB]
        except KeyError:
            f2genescore = 0

        if (db_util.isUnmappedAssociation(nodeA, nodeB)
                and keep_unmapped == 0):
            unmappedout.write(nodeA + "\t" + nodeB + "\n")
            unMapped += 1
            continue
        #nodeA = nodeA.replace('|', '_')
        #nodeB = nodeB.replace('|', '_')
        try:
            features_hash[nodeA]
        except KeyError:
            print "key error in resolving featureId for " + nodeA + " skipping edge."
            continue
        try:
            features_hash[nodeB]
        except KeyError:
            print "key error in resolving featureId for " + nodeB + " skipping edge."
            continue

        if (features_hash[nodeA] and features_hash[nodeB]):
            if (not edges_hash.get(nodeA + "_" + nodeB)
                    and not edges_hash.get(nodeA + "_" + nodeB)):
                feature1id = ""  #str(features_hash[nodeA])
                feature2id = ""  #str(features_hash[nodeB])
                #This will need to be improve once all pairs has annotations
                try:
                    feature1id = str(features_hash[nodeA][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeA
                try:
                    feature2id = str(features_hash[nodeB][0])
                except:
                    print "ERROR: key error in resolving featureId for " + nodeB

                edges_hash[nodeA + "_" + nodeB] = validEdgeId
                validEdgeId += 1
                dataA = process_feature_alias(nodeA)
                label1_desc = ""
                dataB = process_feature_alias(nodeB)
                label2_desc = ""
                if (len(dataA) == 7):
                    dataA.append("")
                    nodeA = nodeA + ":"
                if (len(dataB) == 7):
                    dataB.append("")
                    nodeB = nodeB + ":"
                correlation_str = tokens[2]
                try:
                    correlation = float(correlation_str)
                except ValueError:
                    #Align correlation value to NaN
                    cnan += 1
                    correlation = float('nan')
                    correlation_str = ''
                numna = tokens[3]
                pv_str = tokens[4]
                bonf = tokens[5]
                pv_bonf_str = tokens[6]
                numnaf1 = tokens[7]
                pvf1_str = tokens[8]
                numnaf2 = tokens[9]
                pvf2_str = tokens[10]
                try:
                    pv = str(pvlambda(float(pv_str)))
                    pv_bonf = str(pvlambda(float(pv_bonf_str)))
                    pvf1 = str(pvlambda(float(pvf1_str)))
                    pvf2 = str(pvlambda(float(pvf2_str)))
                except ValueError:
                    #error in pairwise script, ignore these associations for now
                    continue

                if (float(pv) > max_pv):
                    max_pv = float(pv)

                if (float(pv_bonf) > max_pv_corr):
                    max_pv_corr = float(pv_bonf)

                rho = str(db_util.sign(correlation) * abs(float(pv)))

                link_distance = 500000000
                if (len(tokens) >= 12):
                    link_distance = int(tokens[11])
                else:
                    if (len(dataA) >= 5 and len(dataB) >= 5
                            and db_util.is_numeric(dataA[4]) >= 1
                            and db_util.is_numeric(dataB[4]) >= 1
                            and dataA[3] == dataB[3]):
                        link_distance = abs(int(dataB[4]) - int(dataA[4]))
                edges_out_re.write(feature1id + "\t" + feature2id + "\t" +
                                   nodeA + "\t" + "\t".join(dataA) + "\t" +
                                   nodeB + "\t" + "\t".join(dataB) + "\t" +
                                   correlation_str + "\t" + numna + "\t" + pv +
                                   "\t" + bonf + "\t" + pv_bonf + "\t" +
                                   numnaf1 + "\t" + pvf1 + "\t" + numnaf2 +
                                   "\t" + pvf2 + "\t" + rho + "\t" +
                                   str(link_distance) + "\t" +
                                   str(f1genescore) + "\t" + str(f2genescore) +
                                   "\n")
                if (do_pubcrawl == "yes"):
                    #call andrea code
                    getPairwiseInfo.processLine(line, edges_out_pc)
                    pcc += 1
            else:
                print "duplicated edge:" + nodeA + "_" + nodeB
                dupeEdges += 1
        else:
            print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
            invalidEdges += 1
    print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" % (
        validEdgeId - 1, dupeEdges, cnan, unMapped, unmappedPath, totalEdges,
        max_pv, max_pv_corr)
    edges_meta_json.write('{"max_logpv":%f}' % (max_pv))
    edges_file.close()
    edges_out_re.close()
    edges_out_pc.close()
    edges_meta_json.close()
    unmappedout.close()
    efshout.write("#!/bin/bash\n")
    efshout.write(
        "mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n"
        % (myhost, myport, myuser, mypw, mydb))
    efshout.write("load data local infile '" + edges_out_re.name +
                  "' replace INTO TABLE " + edge_table +
                  " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    efshout.write("\ncommit;")
    efshout.write("\nEOFMYSQL")
    efshout.close()
    print "Begin pairwise db bulk upload " + time.ctime()
    os.system("sh " + efshout.name)
    #create sharded association files for solr import
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createPWShardedDataset.py " + edges_out_re.name +
                    " " + dataset_label + "\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core2_final.tsv  -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin pairwise solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == "yes"):
        print "senting Pubcrawl notification to " + contacts
        smtp.main(
            "*****@*****.**", contacts,
            "Notification - New Pairwise Associations for PubCrawl",
            "New pairwise associations ready for PubCrawl load\n" +
            edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" +
            edges_out_re.name +
            " loaded into RegulomeExplorer, dataset label is " +
            dataset_label + " \n\n")
Beispiel #5
0
def process_associations_rfex(dataset_label, matrixfile, associationsfile, config, annotations, collapse_direction, reverse_direction, results_path, pv_lambda, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
	mydb = db_util.getDBSchema(config) 
	myuser = db_util.getDBUser(config) 
	mypw = db_util.getDBPassword(config) 
	myhost = db_util.getDBHost(config) 
	myport = db_util.getDBPort(config)
	mysolr = db_util.getSolrPath(config)
	if (not os.path.isfile(associationsfile)):
		print associationsfile + " does not exist; unrecoverable ERROR"
		sys.exit(-1)
	associations_table = mydb + ".mv_" + dataset_label + "_feature_networks"
	print "Begin processing associations %s Applying processing_pubcrawl %s" %(time.ctime(), do_pubcrawl)
	fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)

	edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv','w')
	associations_in = open(associationsfile,'r')
	annotation_hash, ftype = parse_features_rfex.process_feature_annotations(annotations)
	fshout = open(results_path + 'load_sql_associations_' + dataset_label + '.sh','w')
	solrshout = open(results_path + 'load_solr_assocations_' + dataset_label + '.sh','w')
	unmappedPath = results_path  + 'edges_out_' + dataset_label + '_rface_unmapped.tsv'
	unmappedout = open(unmappedPath,'w')
	features_file = open(results_path + dataset_label + '_features_out.tsv','r')
	features_hash = {}
	for fl in features_file.readlines():
		ftk = fl.strip().split("\t")
		features_hash[ftk[1]] = ftk
	features_file.close()
		
	aliasid_file = open(results_path + dataset_label + '_features_alias_id.tsv','r')
	aliasid_hash = {}
	for fl in aliasid_file.readlines():
		ftk = fl.strip().split("\t")
		aliasid_hash[ftk[0]] = ftk
	aliasid_file.close()	
	
	tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv','w')
	pubcrawl_tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_pc.tsv','w')
	lc = 0
	edgeCount = 0
	pcc = 0
	unMapped = 0
	pvalueCutCount = 0
	impCut = 0
	lines = associations_in.readlines()
	associations_in.close()
	associations_dic = {}
	for line in lines:
		lc = lc + 1
		columns = line.strip().split('\t')
		if (len(columns) < 5):
			print "Missing required tokens in associations lineIndex %i lineValue %s" %(lc, line)
			continue
		f1alias = columns[0]
		#afm_ids will be used for directionality collapsing, if needed
		f1afm_id = columns[0]
		f2afm_id = columns[1]
		if (len(f1alias.split(":")) < 3):
			annotated_feature = annotation_hash.get(f1alias)
			if (annotated_feature == None):
				print "ERROR: Target feature %s is not in afm/annotation %i" %(f1alias, len(annotation_hash))
				continue
			f1alias = annotated_feature.replace("\t", ":")
		f2alias = columns[1]
		if (len(f2alias.split(":")) < 3):
			annotated_feature = annotation_hash.get(f2alias)
			if (annotated_feature == None):
				print "ERROR: Predictor feature %s is not in afm/annotation" %(f2alias)
				continue
			f2alias = annotated_feature.replace("\t", ":")
		try:
			f1genescore = fIntHash[f1alias]
		except KeyError:
			f1genescore = 0
		try:
			f2genescore = fIntHash[f2alias]
		except KeyError:
			f2genescore = 0
		
		f1data = f1alias.split(':')
		f2data = f2alias.split(':')

		if len(f1data) > 4:
			f1data[3] = f1data[3][3:]
		if len(f2data) > 4:
			f2data[3] = f2data[3][3:]
		
		if (len(f1data) <= 7 and (f1data[1] == 'CLIN' or f1data[1] == 'SAMP')):
			f1alias = ":".join(f1data[0:3]) + ":::::"
			f1data = f1alias.split(':')
		elif (len(f1data) == 7):
			f1data.append("")
		if (len(f2data) <= 7 and (f2data[1] == 'CLIN' or f2data[1] == 'SAMP')):
			f2alias = ":".join(f2data[0:3]) + ":::::"
			f2data = f2alias.split(':')
		elif (len(f2data) == 7):
			f2data.append("") 
		f1aliasOmic = f1alias
		f2aliasOmic = f2alias
		#for annotations
		try:    
			f1id = features_hash[f1alias][0]
		except KeyError:
			try:
				f1id = aliasid_hash[f1alias][1]
				f1aliasOmic = aliasid_hash[f1alias][2]
				f1data = f1aliasOmic.split(':')
				f1data[3] = f1data[3][3:]
			except KeyError:
				print "Skipping Key error with alias1 " + f1alias
                                continue
			
		try:
			f2id = features_hash[f2alias][0]#f2alias.split(":")[-1]
		except KeyError:
			try:
				f2id = aliasid_hash[f2alias][1]
				f2aliasOmic = aliasid_hash[f2alias][2]
				f2data = f2aliasOmic.split(':')
				f2data[3] = f2data[3][3:]	
			except KeyError:
				print "Skipping Key error with alias2 " + f2alias
				continue		

		pvalue = float(columns[2])
		pvalue = str(pv_lambda(pvalue))
		
		importance = columns[3]
		correlation = columns[4]
		patientct = columns[5]
		if (db_util.isUnmappedAssociation(f1alias, f2alias) and keep_unmapped == 0):
			unmappedout.write(f1alias + "\t" + f2alias + "\n")
			unMapped += 1
			continue	
		rhoscore = ""
		link_distance = -1
		if (len(f1data) >=5 and len(f2data)>=5 and db_util.is_numeric(f1data[4]) >= 1 and db_util.is_numeric(f2data[4]) >= 1 and f1data[3] == f2data[3]):
			link_distance = abs(int(f2data[4]) - int(f1data[4]))
		if (collapse_direction == 0):
			associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
		else:
			#check whether (f1 -> f2 or f2 -> f1) exists, if yes, take the more important
			#if not, store pair
			if ((associations_dic.get(f1afm_id + "_" + f2afm_id) == None) and (associations_dic.get(f2afm_id + "_" + f1afm_id) == None)):
				associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
			else:
				existingLink = associations_dic.get(f1afm_id + "_" + f2afm_id)
				ekey = f1afm_id + "_" + f2afm_id
				if (existingLink == None):
					existingLink = associations_dic.get(f2afm_id + "_" + f1afm_id) 
					ekey = f2afm_id + "_" + f1afm_id
				prevImportance = existingLink.split("\t")[3]
				if (float(importance) > float(prevImportance)):
					associations_dic[ekey] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"					 			 
		if (reverse_direction == 1):
			associations_dic[f2afm_id + "_" + f1afm_id] = f2aliasOmic + "\t" + f1aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + str(f2genescore) + "\t" + str(f1genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
			edgeCount = edgeCount + 1
		edgeCount = edgeCount + 1
		if (do_pubcrawl == "yes"):
			getRFACEInfo.processLine(line, pubcrawl_tsvout)
			pcc += 1
	for ei in associations_dic:
		tsvout.write(associations_dic[ei])
	fshout.write("#!/bin/bash\n")
	fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	fshout.write("load data local infile '" + tsvout.name + "' replace INTO TABLE " + associations_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';")
	fshout.write("\nEOFMYSQL\n")
	tsvout.close()
	unmappedout.close()
	pubcrawl_tsvout.close()
	fshout.close()
	print "\nReport: ValidEdges %i ImportanceCutoff %i edges filtered %i \nunMapped Edges %i Saved to %s" %(len(associations_dic), impCut, pvalueCutCount, unMapped, unmappedPath)
	print "Begin RF-ACE db bulk upload %s os.system sh %s" %(time.ctime(), fshout.name)
	os.system("sh " + fshout.name)
	solrshout.write("#!/bin/bash\n")
	solrshout.write("python createRFShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
	solrshout.write("curl '" + mysolr + "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core2_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.close()
	print "Begin rface solr upload " + time.ctime()
	os.system("sh " + solrshout.name)

	if (do_pubcrawl == 'yes'):
		smtp.main("*****@*****.**", contacts, "Notification - New RFAce " + dataset_label + " Associations for PubCrawl", "New RFAce associations ready for PubCrawl load\n" + pubcrawl_tsvout.name + "\n" + str(pcc) + " Total Edges\n" + tsvout.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + "\n\n")
	print "Done processing associations %s" %(time.ctime())
	associations_dic = None