def _evaluate_and_print(dataset, cutoff, predicted_rel, tt_source_rel,
                        total_rel, TG_TF_ref):
    """Run one confusion-matrix evaluation and print its results.

    These are the steps main() performs for every noise scenario: split
    the predictions into true and false positives, count the false
    negatives, compute the confusion matrix and print it.

    Args:
        dataset: Label handed to print_results().
        cutoff: Cutoff / noise percentage handed to print_results().
        predicted_rel: Predicted [TG, TF] relations to evaluate.
        tt_source_rel: Relations whose first element (the TG) forms the
            gene list given to count_false_negatives().
        total_rel: Relations handed to calculate_confusion() as the total.
        TG_TF_ref: Reference [TG, TF] relations.
    """
    # Derived here, i.e. after the caller has generated its noise, so the
    # gene list reflects the same state of the data as it did inline.
    tt_genes = [info[0] for info in tt_source_rel]

    true_pred_rel, false_pred_rel = identify_true_false_positives(
        predicted_rel, TG_TF_ref
    )
    unpredicted_rel = count_false_negatives(TG_TF_ref, true_pred_rel, tt_genes)
    TP, FP, FN, TN, recall, specif, precis = calculate_confusion(
        total_rel, true_pred_rel, false_pred_rel, unpredicted_rel
    )
    print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)


def main():
    """Check the confusion-matrix pipeline on an artificial dataset.

    A deep copy of the AtRegNet reference relations serves as a perfect
    "prediction".  Depending on the flags ``true_P``, ``false_P`` and
    ``false_N`` the copy is evaluated as-is and/or after adding false
    positives, false negatives or both, at noise levels of 20, 40, 50 and
    60 percent.  Every evaluation is printed through print_results().

    NOTE(review): ``true_P``, ``false_P`` and ``false_N`` are not defined in
    this function; they are presumably module-level switches -- confirm.
    """
    # Get test data from AtRegNet.txt
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)

    # The reference is stored as [TF, TG]; the pipeline works on [TG, TF].
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]

    # Unique target genes and transcription factors, used as the pool
    # handed to add_false_positives().
    sh_TG_list_ref = list(set([info[0] for info in TG_TF_ref]))
    sh_TF_list_ref = list(set([info[1] for info in TG_TF_ref]))

    artificial_data = copy.deepcopy(TG_TF_ref)
    dataset = "artificial dataset"
    percentages = [20, 40, 50, 60]

    if true_P:
        # Noise-free baseline: prediction and reference are identical.
        _evaluate_and_print(
            dataset, 0, artificial_data, artificial_data,
            artificial_data, TG_TF_ref
        )

    if not false_N and false_P:
        # Add some noise in the form of False Positives
        for perc in percentages:
            noised_artificial_data = add_false_positives(
                artificial_data, sh_TG_list_ref, sh_TF_list_ref, perc
            )
            _evaluate_and_print(
                dataset, perc, noised_artificial_data, artificial_data,
                noised_artificial_data, TG_TF_ref
            )

    if false_N and not false_P:
        # Add some noise in the form of False Negatives
        for perc in percentages:
            noised_artificial_data = add_false_negatives(
                artificial_data, TG_TF_ref, perc
            )
            _evaluate_and_print(
                dataset, perc, noised_artificial_data, artificial_data,
                artificial_data, TG_TF_ref
            )

    if false_P and false_N:
        # Add some noise in the form of False Positives
        # and then add some False Negatives
        for perc in percentages:
            print("ad: %s" % len(artificial_data))
            noised_artificial_data = add_false_positives(
                artificial_data, sh_TG_list_ref, sh_TF_list_ref, perc
            )
            print("nad: %s" % len(noised_artificial_data))
            more_noised_artificial_data = add_false_negatives(
                noised_artificial_data, TG_TF_ref, perc
            )
            print("mnad: %s" % len(more_noised_artificial_data))

            # The doubly-noised data is the prediction; the data with only
            # false positives supplies the gene list and the total.
            _evaluate_and_print(
                dataset, perc, more_noised_artificial_data,
                noised_artificial_data, noised_artificial_data, TG_TF_ref
            )
def main():
    """Evaluate eQTL-based TG-TF predictions against the AtRegNet reference.

    For every (dataset, cutoff) combination configured below, the stored
    genelist and enrichment files are read, predictions are derived for the
    true traits that have at least one eQTL, and the resulting confusion
    matrix is printed through print_results().
    """
    # Get test data from AtRegNet.txt
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)

    # The reference is stored as [TF, TG]; the pipeline works on [TG, TF].
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ["all"])
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]

    TG_list_ref = [info[0] for info in TG_TF_ref]
    TF_list_ref = [info[1] for info in TG_TF_ref]
    TF_set_ref = set(TF_list_ref)

    # -----------------------------------------------------------
    # Other datasets seen in this configuration: 'Ligterink_2014_gxe',
    # 'Keurentjes_2007', 'Snoek_2012'; other cutoffs: 4.3, 6.7.
    exp_list = ["Ligterink_2014"]
    cutoff_list = [3]
    # -----------------------------------------------------------

    for dataset in exp_list:
        for cutoff in cutoff_list:
            print("Analysing %s %s" % (dataset, cutoff))

            # Extract the true TG-TF relations and the total possible
            # relations from the stored datafiles
            filelocation = "%s/%s/genelist_%s/genelist_%s_co%s.txt" % (
                fa.mr_folder,
                fa.gfolder,
                dataset,
                dataset,
                cutoff,
            )
            true_rel, total_rel = get_TGTF_from_genelist(
                filelocation, TG_TF_ref, TG_list_ref, TF_list_ref
            )

            # The TGs in the true TG-TF relations are the true traits (tt).
            # Kept as a set: it is only used for membership tests and for
            # counting the distinct genes.
            tt_genes = set(info[0] for info in true_rel)

            # Get for each true trait the number of eQTLs
            enriched_fn = "%s/%s/enriched_%s/enriched_%s_co%s.txt" % (
                fa.mr_folder,
                fa.enriched_folder,
                dataset,
                dataset,
                cutoff,
            )
            trait_eqtl_genelist = get_info(enriched_fn)

            # Select the true traits that have at least one eQTL
            tt_trait_eqtl_genelist = [
                [t[0], t[1], t[2]]
                for t in trait_eqtl_genelist
                if t[0] in tt_genes and t[1] > 0
            ]
            trait_with_eqtl = [info[0] for info in tt_trait_eqtl_genelist]

            TG_TF_pred = process_enrichment(tt_trait_eqtl_genelist, TF_set_ref)

            true_pred_rel, false_pred_rel = identify_true_false_positives(
                TG_TF_pred, TG_TF_ref
            )

            unpredicted_rel = count_false_negatives(
                TG_TF_ref, true_pred_rel, trait_with_eqtl
            )

            TP, FP, FN, TN, recall, specif, precis = calculate_confusion(
                total_rel, true_pred_rel, false_pred_rel, unpredicted_rel
            )

            print("true_traits: %s" % len(tt_genes))
            print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)
_COUNT_KEYS = (
    "lower_recall", "higher_recall",
    "lower_precision", "higher_precision",
    "lower_F1", "higher_F1",
)


def _tally_against_reference(value, reference, counts, metric):
    """Count one permuted metric as lower or higher-or-equal than its reference.

    ``counts`` is updated in place under the keys ``"lower_<metric>"`` and
    ``"higher_<metric>"``.  A value of ``None`` is not counted at all.
    """
    if value is None:
        return
    if value < reference:
        counts["lower_" + metric] += 1
    elif value >= reference:
        counts["higher_" + metric] += 1


def _write_permutation_report(path, dataset, cutoff, counts, permutated_confusion):
    """Write the permutation counts and every permuted confusion matrix to *path*.

    Writing is best-effort: an I/O failure is reported on stdout and the
    analysis continues.
    """
    separator = "-------------------------\n"
    lines = [
        separator,
        "dataset: %s\n" % dataset,
        "cutoff: %s\n" % cutoff,
        separator,
        "lower_F1: \t%s\n" % counts["lower_F1"],
        "higher_F1: \t%s\n" % counts["higher_F1"],
        separator,
        "lower_recall: %s\n" % counts["lower_recall"],
        "higher_recall: %s\n" % counts["higher_recall"],
        "lower_precision: %s\n" % counts["lower_precision"],
        "higher_precision: %s\n" % counts["higher_precision"],
        separator,
    ]
    for TP, FP, FN, TN, recall, specif, precision, F1 in permutated_confusion:
        lines.extend([
            separator,
            "TP\t%s\tFN\t%s\n" % (TP, FN),
            "FP\t%s\tTN\t%s\n" % (FP, TN),
            separator,
            "recall\t%s\n" % recall,
            "specificity\t%s\n" % specif,
            "precision\t%s\n" % precision,
            "F1\t%s\n" % F1,
            separator,
        ])

    print("Writing to file %s" % path)
    try:
        with open(path, 'w') as fo:
            fo.writelines(lines)
    except (IOError, OSError) as err:
        print("Could not write %s: %s" % (path, err))


def _print_permutation_report(dataset, cutoff, eQTL_threshold, counts,
                              permutated_confusion):
    """Print the permutation counts for one (dataset, cutoff, threshold) run.

    The confusion matrix shown is that of the last permutation; it is left
    out when there were no permutations.
    """
    separator = "-------------------------"
    print(separator)
    if permutated_confusion:
        TP, FP, FN, TN = permutated_confusion[-1][:4]
        print("TP\t%s\tFN\t%s" % (TP, FN))
        print("FP\t%s\tTN\t%s" % (FP, TN))
    print(separator)
    print("dataset: %s" % dataset)
    print("cutoff: %s" % cutoff)
    print("eQTL: %s" % eQTL_threshold)
    print(separator)
    print("lower_F1:\t%s" % counts["lower_F1"])
    print("higher_F1:\t%s" % counts["higher_F1"])
    print(separator)
    print("lower_recall: %s" % counts["lower_recall"])
    print("higher_recall: %s" % counts["higher_recall"])
    print("lower_precision: %s" % counts["lower_precision"])
    print("higher_precision: %s" % counts["higher_precision"])
    print(separator)


def _write_permutation_summary(path, summary):
    """Write one block per summary entry to *path* (best-effort).

    Each entry is ``[dataset, cutoff, lower_recall, higher_recall,
    lower_precision, higher_precision, lower_F1, higher_F1]``.
    """
    separator = "-------------------------\n"
    lines = []
    for (s_dataset, s_cutoff, lower_recall, higher_recall,
         lower_precision, higher_precision, lower_F1, higher_F1) in summary:
        lines.extend([
            separator,
            "dataset: %s\n" % s_dataset,
            "cutoff: %s\n" % s_cutoff,
            separator,
            "lower_F1: \t%s\n" % lower_F1,
            "higher_F1: \t%s\n" % higher_F1,
            separator,
            "recall:\n",
            "lower: %s\n" % lower_recall,
            "higher: %s\n" % higher_recall,
            "precision:\n",
            "lower: %s\n" % lower_precision,
            "higher: %s\n" % higher_precision,
            separator,
        ])
    try:
        with open(path, 'w') as fo:
            fo.writelines(lines)
    except (IOError, OSError) as err:
        print("Could not write %s: %s" % (path, err))


def main():
    """Permutation test of the stored recall, precision and F1 scores.

    For every (dataset, cutoff, eQTL threshold) combination the reference
    scores are read from the stored validation results.  Then, once per
    seed in the seed file, each eligible trait gets a random gene sample of
    the same size as its enriched gene set, the sample is pushed through
    the confusion-matrix pipeline, and the resulting scores are counted as
    lower or higher-or-equal than the reference.

    Combinations without readable reference results are skipped entirely,
    so nothing is written or printed for them apart from a skip message.
    """
    # Get test data from AtRegNet.txt
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)

    # The reference is stored as [TF, TG]; the pipeline works on [TG, TF].
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]

    TG_list_ref = [info[0] for info in TG_TF_ref]
    TF_list_ref = [info[1] for info in TG_TF_ref]
    TF_set_ref = set(TF_list_ref)

    # -----------------------------------------------------------
    # Other datasets seen in this configuration: 'Ligterink_2014',
    # 'Ligterink_2014_gxe', 'Keurentjes_2007'; other cutoffs: 4.3, 6.7.
    exp_list = ['Snoek_2012']
    cutoff_list = [3]
    eQTL_threshold_list = [0, 1, 2, 3]
    # -----------------------------------------------------------

    # get the premade 1000 distinct seeds of 8 digits each
    seedfile = "%s/%s/random_seeds.txt" % (fa.mr_folder, fa.numfolder)
    seeds = read_seeds(seedfile)

    write_summary = False
    write_conf = False
    print_conf = True

    summary = []

    for dataset in exp_list:
        for cutoff in cutoff_list:
            for eQTL_threshold in eQTL_threshold_list:
                print("Initializing analysis for dataset %s with cutoff %s" % (dataset, cutoff))

                # Retrieve original confusion matrix results
                subfolder_F1 = "/eqtl_%s/valnum_%s" % (eQTL_threshold, dataset)
                F1_fn = "%s/%s/%s/valnum_results_%s_co%s" % (
                    fa.mr_folder, fa.numfolder,
                    subfolder_F1, dataset, cutoff
                )
                try:
                    ref_recall, ref_precision, ref_F1 = read_predicted_confusion_data(F1_fn)
                except Exception:
                    # The reader's failure modes are not visible from here;
                    # any failure is treated as "no reference available".
                    ref_F1 = None

                if ref_F1 is None:
                    # Without a reference there is nothing to compare with.
                    # Skip reporting too, so that counters left over from a
                    # previous threshold are never shown under this one.
                    print("No reference results in %s, skipping eQTL threshold %s" % (F1_fn, eQTL_threshold))
                    continue

                # Retrieve genelist
                genelist_fn = "%s/%s/genelist_%s/genelist_%s_co%s.txt" % (
                    fa.mr_folder, fa.gfolder,
                    dataset, dataset, cutoff
                )
                trait_genelist_list = get_genelist(genelist_fn)
                true_rel, total_rel = get_TGTF_from_genelist(
                    genelist_fn, TG_TF_ref, TG_list_ref, TF_list_ref
                )
                tt_genes = list(set([info[0] for info in true_rel]))
                tt_gene_set = set(tt_genes)

                # Retrieve enriched list
                enriched_fn = "%s/%s/enriched_%s/enriched_%s_co%s.txt" % (
                    fa.mr_folder, fa.enriched_folder,
                    dataset, dataset, cutoff
                )
                trait_eqtl_genelist, dict_trait_enriched = get_enriched(enriched_fn)

                # get all true traits that have more than X eQTLs,
                # where X = eQTL_threshold
                trait_with_eqtl = [
                    t[0] for t in trait_eqtl_genelist
                    if t[0] in tt_gene_set and t[1] > eQTL_threshold
                ]
                print("traits with eQTL %s" % len(trait_with_eqtl))
                # Set copy for the per-seed membership tests below.
                eqtl_trait_set = set(trait_with_eqtl)

                counts = dict.fromkeys(_COUNT_KEYS, 0)
                permutated_confusion = []

                for seedling in seeds:
                    # create [trait, 0, random gene sample] entries; each
                    # sample has the size of the trait's enriched gene set
                    trait_randomsample = []
                    for g_trait, g_genelist in trait_genelist_list:
                        if g_trait in eqtl_trait_set and g_trait in dict_trait_enriched:
                            sample_size = len(dict_trait_enriched[g_trait])
                            rsamp = select_random_sample(g_genelist, sample_size, seedling)
                            trait_randomsample.append([g_trait, 0, rsamp])

                    # TG_TF_pred is summed over all traits in a
                    # (dataset, cutoff) combination
                    TG_TF_pred = process_enrichment(trait_randomsample, TF_set_ref)

                    # proceed with the random sample to the confusion matrix
                    true_pred_rel, false_pred_rel = identify_true_false_positives(
                        TG_TF_pred, TG_TF_ref
                    )
                    unpredicted_rel = count_false_negatives(
                        TG_TF_ref, true_pred_rel, tt_genes
                    )
                    TP, FP, FN, TN, recall, specif, precision, F1 = calculate_confusion(
                        total_rel, true_pred_rel,
                        false_pred_rel, unpredicted_rel
                    )
                    permutated_confusion.append(
                        [TP, FP, FN, TN, recall, specif, precision, F1]
                    )

                    _tally_against_reference(recall, ref_recall, counts, "recall")
                    _tally_against_reference(precision, ref_precision, counts, "precision")
                    _tally_against_reference(F1, ref_F1, counts, "F1")

                summary.append([dataset, cutoff] + [counts[key] for key in _COUNT_KEYS])

                if write_conf:
                    substorage = "%s/%s/%s" % (fa.mr_folder, fa.numfolder, dataset)
                    if not os.path.exists(substorage):
                        os.mkdir(substorage)

                    resultsfolder_conf = "%s/permutate_eqtl_%s_%s_co%s.txt" % (
                        substorage, eQTL_threshold,
                        dataset, cutoff
                    )
                    _write_permutation_report(
                        resultsfolder_conf, dataset, cutoff,
                        counts, permutated_confusion
                    )

                if print_conf:
                    _print_permutation_report(
                        dataset, cutoff, eQTL_threshold,
                        counts, permutated_confusion
                    )

            if write_summary:
                # The summary is rewritten in full after every cutoff.  Its
                # file name carries the last eQTL threshold of the list,
                # which is the value the threshold loop leaves behind.
                summfolder_conf = "%s/%s/permutate_summary_eqtl_%s.txt" % (
                    fa.mr_folder, fa.numfolder,
                    eQTL_threshold_list[-1]
                )
                _write_permutation_summary(summfolder_conf, summary)
Beispiel #4
0
												)
	###########################################################	
	tt_genes = list(set([info[0] for info in true_relations]))
	###########################################################	
	TG_TF_pred = process_enrichment(
									dataset, cutoff, chromosome, 
									TF_set_ref, tt_genes
									)
	###########################################################										
	true_pred_rel, false_pred_rel = identify_true_false_positives(
														TG_TF_pred,
														TG_TF_ref
														)
	###########################################################	
	unpredicted_rel = count_false_negatives(
											TG_TF_ref, true_pred_rel, 
											tt_genes
											)			

	#TFloc_list contains predicted datapoints
	#TFloc_list = [[info[1],info[0]] for info in total_rel_major]
	
	regus_list = sorted(list(set([info[1] for info in total_rel_major])))
	locus_list = sorted(list(set([info[0] for info in total_rel_major])))

	regus_dict, TF_chr_len_list = create_reg_dict(regus_list)
	
	locus_dict, chr_len_list = get_loci_from_dataset(locus_list)
	
	#Prepare data for scatter plots
	moddata_array, labels_mod = link_data_in_array(total_rel_major, locus_dict, regus_dict, {})