def main():
    """Collect the permutation counters of all runs into one summary file.

    For every (eQTL threshold, dataset, cutoff) combination the file
    ``permutate_eqtl_<eQTL>_<dataset>_co<cutoff>.txt`` is read, its six
    ``lower_*`` / ``higher_*`` counters are parsed as integers and a record
    is appended to ``summary_validation_numerics.txt``.

    The counters are collected afresh for each input file. A file that
    lacks one of the counters is skipped with a warning instead of being
    reported with values left over from the previously read file. Failures
    to write the summary are reported on stderr rather than silently
    discarded.
    """
    import sys  # local import: only needed for the warnings below

    #-----------------------------------------------------------
    exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Snoek_2012','Keurentjes_2007']
    cutoff_list = [3, 4.3, 6.7]
    eQTL_threshold = [0,1,2,3]
    #-----------------------------------------------------------
    # Counters in the order in which they are written to the summary.
    counter_names = [
        "lower_F1", "higher_F1",
        "lower_recall", "higher_recall",
        "lower_precision", "higher_precision",
    ]

    outputpath = "%s/%s/summary_validation_numerics.txt"%(
        fa.mr_folder, fa.numfolder
    )

    for eQTL in eQTL_threshold:
        for dataset in exp_list:
            substorage = "%s/%s/%s"%(fa.mr_folder, fa.numfolder, dataset)
            for cutoff in cutoff_list:
                inputpath = "%s/permutate_eqtl_%s_%s_co%s.txt"%(
                    substorage, eQTL, dataset, cutoff
                )

                # Fresh container per file: nothing can leak between files.
                counters = {}
                for line in read_data(inputpath):
                    for name in counter_names:
                        prefix = name + ":"
                        if line.startswith(prefix):
                            counters[name] = int(line[len(prefix):].strip())

                missing = [name for name in counter_names if name not in counters]
                if missing:
                    sys.stderr.write(
                        "skipping %s: missing %s\n"%(inputpath, ", ".join(missing))
                    )
                    continue

                record = [
                    "dataset %s\n"%dataset,
                    "cutoff %s\n"%cutoff,
                    "eQTLs %s\n"%eQTL,
                ]
                for name in counter_names:
                    record.append("%s %s\n"%(name, counters[name]))
                record.append("\n")

                try:
                    with open(outputpath, 'a') as fo:
                        fo.writelines(record)
                except EnvironmentError as err:
                    # Keep going with the other combinations, but say so.
                    sys.stderr.write(
                        "could not write to %s: %s\n"%(outputpath, err)
                    )
def read_predicted_confusion_data(fn):
    """Read recall, precision and F1 from a stored confusion-matrix file.

    The lines returned by ``read_data(fn)`` are scanned for the prefixes
    ``dataset:``, ``cutoff:``, ``recall``, ``precis`` and ``F1``. A score
    spelled as the literal text ``None`` becomes ``None``; any other text is
    converted with ``float``.

    Returns ``(recall, precision, F1)`` when F1 is truthy. When F1 is
    ``None`` or ``0.0`` nothing is returned explicitly, so the caller
    receives ``None``. A file without an ``F1`` line leaves F1 unassigned
    and the final check raises.
    """
    def _score(text):
        # Missing scores are stored as the literal word "None".
        if text == "None":
            return None
        return float(text)

    for entry in read_data(fn):
        if entry.startswith("dataset:"):
            # Parsed but not part of the returned tuple.
            dataset = entry[9:].strip()
        elif entry.startswith("cutoff:"):
            # Parsed (and validated as a float) but not returned.
            cutoff = float(entry[8:].strip())
        elif entry.startswith("recall"):
            recall = _score(entry[7:].strip())
        elif entry.startswith("precis"):
            precision = _score(entry[7:].strip())
        elif entry.startswith("F1"):
            F1 = _score(entry[3:].strip())

    if F1:
        return recall, precision, F1
def main(): """ """ #Datasets exp_list = ['Ligterink_2014', 'Ligterink_2014_gxe', 'Snoek_2012','Keurentjes_2007'] #Variables chromosome = [1,2,3,4,5] cutoff_list = [6.7, 4.3, 3] reduce_traits = False reduce_traits_even_more = True for dataset in exp_list: storage_folder = "%s/%s/genelist_%s"%(fa.mr_folder, fa.gfolder, dataset) for cutoff in cutoff_list: if reduce_traits: print "retrieving traits for %s %s"%(dataset, cutoff) traits = get_trait_with_genelist(dataset, cutoff, chromosome) fileloc = "%s/%s/reduced_traitlist_%s_co%s.txt"%( fa.mr_folder, fa.trait_folder, dataset, cutoff ) print "writing file to %s"%fileloc write_trait_to_file(fileloc, traits) if reduce_traits_even_more: #emr = even more reduced emr_traits = [] fname = "%s/genelist_%s_co%s.txt"%( storage_folder, dataset, cutoff ) traitdata = read_data(fname) for line in traitdata: if line.startswith("trait:"): trait = line[7:16].strip() emr_traits.append(trait) #make a new emr_traitfilename #and store the traits emr_fileloc = "%s/%s/emr_traitlist_%s_co%s.txt"%( fa.mr_folder, fa.trait_folder, dataset, cutoff ) print "writing file to %s"%emr_fileloc write_trait_to_file(emr_fileloc, emr_traits)
def get_truetraits(fn):
    """Return the stripped lines of file ``fn`` that start with ``AT``.

    The lines come from ``read_data(fn)`` and keep their file order.
    """
    return [row.strip() for row in read_data(fn) if row.startswith("AT")]
def get_genelist(fn):
    """Pair every gene line of ``fn`` with the trait announced before it.

    A ``trait:`` line sets the current trait (text after column 7,
    stripped). Each following line starting with ``AT`` is split on
    whitespace and stored together with that trait.

    Returns a list of ``[trait, genelist]`` entries in file order.
    """
    pairs = []
    for row in read_data(fn):
        if row.startswith("trait:"):
            current_trait = row[7:].strip()
        elif row.startswith("AT"):
            pairs.append([current_trait, row.split()])
    return pairs
def get_enriched(fn):
    """Map each trait in ``fn`` to the gene list that follows it.

    A ``trait:`` line sets the current trait (text after column 7,
    stripped). A line starting with ``AT`` is split on whitespace and
    stored under that trait, replacing any earlier list for the same trait.

    Returns a dict ``{trait: genelist}``.
    """
    enriched = {}
    for row in read_data(fn):
        if row.startswith("trait:"):
            current_trait = row[7:].strip()
        elif row.startswith("AT"):
            enriched[current_trait] = row.split()
    return enriched
def process_data(fn):
    """Return all non-zero eQTL sizes listed in ``fn``.

    Every line starting with ``eQTL_size:`` is read as an integer (text
    after column 11, stripped). Zero sizes are left out. The result keeps
    the file order.
    """
    nonzero_sizes = []
    for row in read_data(fn):
        if not row.startswith("eQTL_size:"):
            continue
        value = int(row[11:].strip())
        if value:
            nonzero_sizes.append(value)
    return nonzero_sizes
def get_info(fn):
    """Collect ``[trait, eqtl, genelist]`` records from file ``fn``.

    * ``trait:`` line - columns 7-15 become the current trait (not stripped).
    * ``eqtl:`` line  - text after column 6 becomes the current eQTL
      number, as an integer.
    * ``AT`` line     - split on whitespace and stored with the current
      trait and eQTL number.

    Returns the records as a list in file order.
    """
    records = []
    for row in read_data(fn):
        if row.startswith("trait:"):
            current_trait = row[7:16]
        elif row.startswith("eqtl:"):
            current_eqtl = int(row[6:].strip())
        elif row.startswith("AT"):
            records.append([current_trait, current_eqtl, row.split()])
    return records
def get_enriched(fn):
    """Read traits, eQTL numbers and enriched gene lists from ``fn``.

    * ``trait:`` line - text after column 7 (stripped) is the current trait.
    * ``eqtl:`` line  - text after column 6 is the current eQTL number,
      as an integer.
    * ``AT`` line     - adds ``[trait, eqtl]`` to the list and stores the
      whitespace-split line under the trait in the dict (a later line for
      the same trait replaces the earlier one).

    Returns ``(datalist_eqtl, datadict)``.
    """
    trait_eqtl_pairs = []
    genes_by_trait = {}
    for row in read_data(fn):
        if row.startswith("trait:"):
            current_trait = row[7:].strip()
        elif row.startswith("eqtl:"):
            current_eqtl = int(row[6:].strip())
        elif row.startswith("AT"):
            trait_eqtl_pairs.append([current_trait, current_eqtl])
            genes_by_trait[current_trait] = row.split()
    return trait_eqtl_pairs, genes_by_trait
def main():
    """Print, per run, how many true traits exceed the eQTL threshold.

    For every (eQTL threshold, dataset, cutoff) combination the stored
    confusion-matrix results are read. When a reference F1 is available,
    the genelist and enriched files are loaded and the number of traits
    that are both a true trait (``tt_genes``) and have more than
    ``eQTL_threshold`` eQTLs is printed.
    """
    # Start time; not read again inside this function.
    tic = time.clock()

    #Get test data from AtRegNet.txt
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
    # Swap every pair so the first element of TF_TG_ref's pairs comes second.
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
    TG_list_ref = [info[0] for info in TG_TF_ref]
    TG_set_ref = set(TG_list_ref)
    # De-duplicated list versions ("sh" presumably stands for short - confirm).
    sh_TG_list_ref = list(TG_set_ref)
    TF_list_ref = [info[1] for info in TG_TF_ref]
    TF_set_ref = set(TF_list_ref)
    sh_TF_list_ref = list(TF_set_ref)
    #-----------------------------------------------------------
    #exp_list = ['Snoek_2012']
    exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Snoek_2012','Keurentjes_2007']
    cutoff_list = [3,4.3,6.7]
    chromo = [1,2,3,4,5]
    #-----------------------------------------------------------
    eQTL_threshold_list = [0,1,2,3]
    #-----------------------------------------------------------
    #get the premade 1000 distinct seeds of 8 digits each
    seedfile = "%s/%s/random_seeds.txt"%(fa.mr_folder, fa.numfolder)
    #print "Retrieving randomized seeds from %s"%seedfile
    # The seeds are loaded here but not used further down in this function.
    seeds = read_seeds(seedfile)

    # None of the following five locals is read again in this function.
    data_dict = {}
    write_summary = False
    write_conf = False
    print_conf = True
    summary = []

    print "start:"
    print "----------------------------"
    for eQTL_threshold in eQTL_threshold_list:
        for dataset in exp_list:
            for cutoff in cutoff_list:
                #print "Initializing analysis for dataset %s with cutoff %s"%(dataset, cutoff)
                F1 = None
                ref_F1 = None
                ############################################################
                ####Retrieve original confusion matrix results
                subfolder_F1 = "/eqtl_%s/valnum_%s"%(eQTL_threshold, dataset)
                F1_fn = "%s/%s/%s/valnum_results_%s_co%s"%(
                    fa.mr_folder, fa.numfolder, subfolder_F1,
                    dataset, cutoff
                    )
                # Any failure while reading (including a result that cannot
                # be unpacked into three values) counts as "no reference".
                try:
                    ref_recall, ref_precision, ref_F1 = read_predicted_confusion_data(F1_fn)
                except:
                    ref_recall= ref_precision= ref_F1 = None

                # Only combinations with a reference F1 are analysed.
                if ref_F1 != None:
                    ############################################################
                    ####Retrieve genelist
                    subfolder_genelist = "genelist_%s"%dataset
                    genelist_fn = "%s/%s/%s/genelist_%s_co%s.txt"%(
                        fa.mr_folder, fa.gfolder, subfolder_genelist,
                        dataset, cutoff
                        )
                    trait_genelist_list = get_genelist(genelist_fn)
                    true_rel, total_rel = get_TGTF_from_genelist(
                        genelist_fn, TG_TF_ref, TG_list_ref, TF_list_ref
                        )
                    # Unique first elements of the true relations.
                    tt_genes = list(set([info[0] for info in true_rel]))
                    ############################################################
                    ####Retrieve enriched list
                    subfolder_enriched = "enriched_%s"%dataset
                    enriched_fn = "%s/%s/%s/enriched_%s_co%s.txt"%(
                        fa.mr_folder, fa.enriched_folder, subfolder_enriched,
                        dataset, cutoff
                        )
                    trait_eqtl_genelist, dict_trait_enriched = get_enriched(enriched_fn)
                    ############################################################
                    ####Retrieve True Traits
                    # Path is built but the file is not read in this function.
                    emr_traits_fn = "%s/%s/emr_traitlist_%s_co%s.txt"%(
                        fa.mr_folder, fa.trait_folder, dataset, cutoff
                        )
                    ############################################################
                    #get all traits that have more than X eQTLs, where X = eQTL_threshold
                    truetrait_eqtl_list = [[t[0], t[1]] for t in trait_eqtl_genelist if t[0] in tt_genes and t[1]>eQTL_threshold]
                    trait_with_eqtl = [info[0] for info in truetrait_eqtl_list]

                    print "dataset:", dataset
                    print "cutoff:", cutoff
                    print "eQTL_threshold:", eQTL_threshold
                    print "traits with eQTL:", len(trait_with_eqtl)
def main(): """ confusion_matrix.py: ########################################################### true_pred_rel, false_pred_rel = identify_true_false_positives( TG_TF_pred, TG_TF_ref ) ########################################################### unpredicted_rel = count_false_negatives( TG_TF_ref, true_pred_rel, tt_genes ) ########################################################### TP, FP, FN, TN, recall, specif, precis = calculate_confusion( total_rel, true_pred_rel, false_pred_rel, unpredicted_rel ) ########################################################### print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis) ########################################################### """ #Get test data from AtRegNet.txt AtReg_data = read_data(fa.filename_atreg) AtRegNet_parse = parse_AtReg_data(AtReg_data) TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all']) TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref] TG_list_ref = [info[0] for info in TG_TF_ref] TG_set_ref = set(TG_list_ref) sh_TG_list_ref = list(TG_set_ref) TF_list_ref = [info[1] for info in TG_TF_ref] TF_set_ref = set(TF_list_ref) sh_TF_list_ref = list(TF_set_ref) ########################################################### ########################################################### artificial_data = copy.deepcopy(TG_TF_ref) dataset = "artificial dataset" cutoff = 0 ########################################################### ########################################################### if true_P: tt_genes = [info[0] for info in artificial_data] total_rel = artificial_data ########################################################### true_pred_rel, false_pred_rel = identify_true_false_positives( artificial_data, TG_TF_ref ) ########################################################### unpredicted_rel = count_false_negatives( TG_TF_ref, true_pred_rel, tt_genes ) ########################################################### TP, FP, FN, TN, recall, specif, precis = calculate_confusion( total_rel, 
true_pred_rel, false_pred_rel, unpredicted_rel ) ########################################################### print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis) ########################################################### ########################################################### percentages = [20, 40, 50, 60] if not false_N and false_P: #Add some noise in the form of False Positives for perc in percentages: cutoff = perc noised_artificial_data = add_false_positives( artificial_data, sh_TG_list_ref, sh_TF_list_ref, perc ) ########################################################### tt_genes = [info[0] for info in artificial_data] total_rel = noised_artificial_data ########################################################### true_pred_rel, false_pred_rel = identify_true_false_positives( noised_artificial_data, TG_TF_ref ) ########################################################### unpredicted_rel = count_false_negatives( TG_TF_ref, true_pred_rel, tt_genes ) ########################################################### TP, FP, FN, TN, recall, specif, precis = calculate_confusion( total_rel, true_pred_rel, false_pred_rel, unpredicted_rel ) ########################################################### print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis) ########################################################### ########################################################### if false_N and not false_P: #Add some noise in the form of False Negatives for perc in percentages: cutoff = perc noised_artificial_data = add_false_negatives(artificial_data, TG_TF_ref, perc) ########################################################### tt_genes = [info[0] for info in artificial_data] total_rel = artificial_data ########################################################### true_pred_rel, false_pred_rel = identify_true_false_positives( noised_artificial_data, TG_TF_ref ) ########################################################### unpredicted_rel = 
count_false_negatives( TG_TF_ref, true_pred_rel, tt_genes ) ########################################################### TP, FP, FN, TN, recall, specif, precis = calculate_confusion( total_rel, true_pred_rel, false_pred_rel, unpredicted_rel ) ########################################################### print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis) ########################################################### ########################################################### if false_P and false_N: #Add some noise in the form of False Positives #And then add some False_Negatives for perc in percentages: cutoff = perc print "ad:",len(artificial_data) noised_artificial_data = add_false_positives( artificial_data, sh_TG_list_ref, sh_TF_list_ref, perc ) print "nad:",len(noised_artificial_data) more_noised_artificial_data = add_false_negatives( noised_artificial_data, TG_TF_ref, perc ) print "mnad:",len(more_noised_artificial_data) ########################################################### tt_genes = [info[0] for info in noised_artificial_data] total_rel = noised_artificial_data ########################################################### true_pred_rel, false_pred_rel = identify_true_false_positives( more_noised_artificial_data, TG_TF_ref ) ########################################################### unpredicted_rel = count_false_negatives( TG_TF_ref, true_pred_rel, tt_genes ) ########################################################### TP, FP, FN, TN, recall, specif, precis = calculate_confusion( total_rel, true_pred_rel, false_pred_rel, unpredicted_rel ) ########################################################### print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)
def main():
    """Compare stored scores with the scores of seeded random gene samples.

    For every (dataset, cutoff, eQTL threshold) combination that has a
    stored reference F1, one sample per trait is drawn through
    ``select_random_sample`` for each seed, the samples are scored with the
    confusion-matrix functions, and it is counted how often recall,
    precision and F1 are lower than, or at least as high as, the reference
    values. Depending on the local switches the counts are printed
    (``print_conf``), written per combination (``write_conf``) and/or
    written as one summary file (``write_summary``).
    """
    # Start time; not read again inside this function.
    tic = time.clock()

    #Get test data from AtRegNet.txt
    AtReg_data = read_data(fa.filename_atreg)
    AtRegNet_parse = parse_AtReg_data(AtReg_data)
    TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all'])
    # Swap every pair so the first element of TF_TG_ref's pairs comes second.
    TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
    TG_list_ref = [info[0] for info in TG_TF_ref]
    TG_set_ref = set(TG_list_ref)
    sh_TG_list_ref = list(TG_set_ref)
    TF_list_ref = [info[1] for info in TG_TF_ref]
    TF_set_ref = set(TF_list_ref)
    sh_TF_list_ref = list(TF_set_ref)
    #-----------------------------------------------------------
    exp_list = ['Snoek_2012']
    #exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Snoek_2012','Keurentjes_2007']
    cutoff_list = [3]#[4.3,6.7]
    chromo = [1,2,3,4,5]
    #-----------------------------------------------------------
    eQTL_threshold_list = [0,1,2,3]
    #-----------------------------------------------------------
    #get the premade 1000 distinct seeds of 8 digits each
    seedfile = "%s/%s/random_seeds.txt"%(fa.mr_folder, fa.numfolder)
    #print "Retrieving randomized seeds from %s"%seedfile
    seeds = read_seeds(seedfile)

    data_dict = {}
    # Output switches.
    write_summary = False
    write_conf = False
    print_conf = True
    # One entry per analysed combination; only read by the summary writer.
    summary = []

    for dataset in exp_list:
        for cutoff in cutoff_list:
            for eQTL_threshold in eQTL_threshold_list:
                print "Initializing analysis for dataset %s with cutoff %s"%(dataset, cutoff)
                F1 = None
                ref_F1 = None
                ############################################################
                ####Retrieve original confusion matrix results
                subfolder_F1 = "/eqtl_%s/valnum_%s"%(eQTL_threshold, dataset)
                F1_fn = "%s/%s/%s/valnum_results_%s_co%s"%(
                    fa.mr_folder, fa.numfolder, subfolder_F1,
                    dataset, cutoff
                    )
                # Any failure while reading (including a result that cannot
                # be unpacked into three values) counts as "no reference".
                try:
                    ref_recall, ref_precision, ref_F1 = read_predicted_confusion_data(F1_fn)
                except:
                    ref_recall= ref_precision= ref_F1 = None

                # Without a reference F1 the combination is skipped entirely.
                if ref_F1 != None:
                    ############################################################
                    ####Retrieve genelist
                    subfolder_genelist = "genelist_%s"%dataset
                    genelist_fn = "%s/%s/%s/genelist_%s_co%s.txt"%(
                        fa.mr_folder, fa.gfolder, subfolder_genelist,
                        dataset, cutoff
                        )
                    trait_genelist_list = get_genelist(genelist_fn)
                    true_rel, total_rel = get_TGTF_from_genelist(
                        genelist_fn, TG_TF_ref, TG_list_ref, TF_list_ref
                        )
                    # Unique first elements of the true relations.
                    tt_genes = list(set([info[0] for info in true_rel]))
                    ############################################################
                    ####Retrieve enriched list
                    subfolder_enriched = "enriched_%s"%dataset
                    enriched_fn = "%s/%s/%s/enriched_%s_co%s.txt"%(
                        fa.mr_folder, fa.enriched_folder, subfolder_enriched,
                        dataset, cutoff
                        )
                    trait_eqtl_genelist, dict_trait_enriched = get_enriched(enriched_fn)
                    ############################################################
                    ####Retrieve True Traits
                    # Path is built but the file is not read in this function.
                    emr_traits_fn = "%s/%s/emr_traitlist_%s_co%s.txt"%(
                        fa.mr_folder, fa.trait_folder, dataset, cutoff
                        )
                    ############################################################
                    #get all traits that have more than X eQTLs, where X = eQTL_threshold
                    truetrait_eqtl_list = [[t[0], t[1]] for t in trait_eqtl_genelist if t[0] in tt_genes and t[1]>eQTL_threshold]
                    trait_with_eqtl = [info[0] for info in truetrait_eqtl_list]
                    print "traits with eQTL", len(trait_with_eqtl)
                    ############################################################
                    # Counters over all seeds of this combination.
                    higher_recall=lower_recall=higher_precision=lower_precision=0
                    higher_F1=lower_F1=0
                    # One confusion result per seed, kept for write_conf.
                    permutated_confusion = []

                    #permutate!
                    #print "Commencing permutation of %s, standby..."%len(seeds)
                    #i = 0
                    for seedling in seeds:
                        #reset variables
                        TP=FP=FN=TN=recall=specif=precision=F1= 0
                        #print i
                        #i += 1
                        trait_randomsample = []
                        #create [trait - sample gene list]
                        for tr_ge in trait_genelist_list:
                            g_trait, g_genelist = tr_ge
                            #print g_trait
                            #print len(g_genelist)
                            if g_trait in trait_with_eqtl and g_trait in dict_trait_enriched:
                                # The sample gets the size of the trait's
                                # enriched gene list.
                                sample_size = len(dict_trait_enriched[g_trait])
                                rsamp = select_random_sample(g_genelist, sample_size, seedling)
                                # The middle element is a constant 0 placeholder.
                                trait_randomsample.append([g_trait,0, rsamp])
                                #q = len(g_genelist)
                                #print "take %s from %s"%(sample_size, q)

                        #TG_TF_pred is summed over all traits in a (dataset, cutoff) combination
                        #TG_TF_pred = get_randomized_predictions(trait_randomsample, TF_set_ref)
                        TG_TF_pred = process_enrichment(trait_randomsample, TF_set_ref)

                        #proceed with the random sample to the confusion matrix
                        ###########################################################
                        true_pred_rel, false_pred_rel = identify_true_false_positives(
                            TG_TF_pred, TG_TF_ref
                            )
                        ###########################################################
                        unpredicted_rel = count_false_negatives(
                            TG_TF_ref, true_pred_rel, tt_genes
                            )
                        ###########################################################
                        TP, FP, FN, TN, recall, specif, precision, F1 = calculate_confusion(
                            total_rel, true_pred_rel, false_pred_rel, unpredicted_rel
                            )
                        permutated_confusion.append([TP, FP, FN, TN, recall, specif, precision, F1])
                        ###########################################################
                        #print "true_traits: %s"%len(set(tt_genes))
                        #print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precision)
                        ###########################################################
                        # Seeds whose score is None are not counted at all;
                        # a score equal to the reference counts as "higher".
                        if recall != None:
                            if recall < ref_recall:
                                lower_recall += 1
                            if recall >= ref_recall:
                                higher_recall += 1
                        #else:
                            #print "recall is None"
                            #pass

                        if precision != None:
                            if precision < ref_precision:
                                lower_precision += 1
                            if precision >= ref_precision:
                                higher_precision += 1
                        #else:
                            #print "precision is None"
                            #pass

                        if ref_F1 != None and F1 != None:
                            if F1 < ref_F1:
                                lower_F1 += 1
                            if F1 >= ref_F1:
                                higher_F1 += 1

                    # NOTE(review): the eQTL threshold is not stored in the
                    # summary entry - confirm whether that is intended.
                    summary.append([dataset, cutoff, lower_recall, higher_recall, lower_precision, higher_precision, lower_F1, higher_F1])

                    ###########################################################
                    if write_conf:
                        substorage = "%s/%s/%s"%(fa.mr_folder, fa.numfolder, dataset)
                        if not os.path.exists(substorage):
                            os.mkdir(substorage)
                        resultsfolder_conf = "%s/permutate_eqtl_%s_%s_co%s.txt"%(
                            substorage, eQTL_threshold, dataset, cutoff
                            )
                        # Best effort: every error raised while writing is
                        # silently ignored.
                        try:
                            print "Writing to file %s"%resultsfolder_conf
                            with open(resultsfolder_conf, 'w') as fo:
                                # Header with the counters ...
                                fo.write("-------------------------")
                                fo.write("\n")
                                fo.write("dataset: %s"%dataset)
                                fo.write("\n")
                                fo.write("cutoff: %s"%cutoff)
                                fo.write("\n")
                                fo.write("-------------------------")
                                fo.write("\n")
                                fo.write("lower_F1: \t%s"%lower_F1)
                                fo.write("\n")
                                fo.write("higher_F1: \t%s"%higher_F1)
                                fo.write("\n")
                                fo.write("-------------------------")
                                fo.write("\n")
                                fo.write("lower_recall: %s"%lower_recall)
                                fo.write("\n")
                                fo.write("higher_recall: %s"%higher_recall)
                                fo.write("\n")
                                fo.write("lower_precision: %s"%lower_precision)
                                fo.write("\n")
                                fo.write("higher_precision: %s"%higher_precision)
                                fo.write("\n")
                                fo.write("-------------------------")
                                fo.write("\n")
                                # ... followed by one block per seed. The loop
                                # rebinds TP, FP, ... to the values of the
                                # last stored result.
                                for [TP, FP, FN, TN, recall, specif, precision, F1] in permutated_confusion:
                                    fo.write("-------------------------\n")
                                    fo.write("TP\t%s\tFN\t%s"%(TP, FN))
                                    fo.write("\n")
                                    fo.write("FP\t%s\tTN\t%s"%(FP, TN))
                                    fo.write("\n")
                                    fo.write("-------------------------\n")
                                    fo.write("recall\t%s"%recall)
                                    fo.write("\n")
                                    fo.write("specificity\t%s"%specif)
                                    fo.write("\n")
                                    fo.write("precision\t%s"%precision)
                                    fo.write("\n")
                                    fo.write("F1\t%s"%F1)
                                    fo.write("\n")
                                    fo.write("-------------------------\n")
                        except:
                            pass

                    if print_conf:
                        # TP/FN/FP/TN are those of the last processed seed.
                        # They are unassigned when the seed list is empty; the
                        # resulting error is swallowed by the except below.
                        try:
                            print "-------------------------"
                            print "TP\t%s\tFN\t%s"%(TP, FN)
                            print "FP\t%s\tTN\t%s"%(FP, TN)
                            print "-------------------------"
                            print "dataset: %s"%dataset
                            print "cutoff: %s"%cutoff
                            print "eQTL: %s"%eQTL_threshold
                            print "-------------------------"
                            print "lower_F1:\t%s"%lower_F1
                            print "higher_F1:\t%s"%higher_F1
                            print "-------------------------"
                            print "lower_recall: %s"%lower_recall
                            print "higher_recall: %s"%higher_recall
                            print "lower_precision: %s"%lower_precision
                            print "higher_precision: %s"%higher_precision
                            print "-------------------------"
                        except:
                            pass

    if write_summary:
        # The file name carries the eQTL threshold that was processed last.
        summfolder_conf = "%s/%s/permutate_summary_eqtl_%s.txt"%(
            fa.mr_folder, fa.numfolder, eQTL_threshold
            )
        # Best effort: every error raised while writing is silently ignored.
        try:
            with open(summfolder_conf, 'w') as fo:
                for dataset, cutoff, lower_recall, higher_recall, lower_precision, higher_precision, lower_F1, higher_F1 in summary:
                    fo.write("-------------------------")
                    fo.write("\n")
                    fo.write("dataset: %s"%dataset)
                    fo.write("\n")
                    fo.write("cutoff: %s"%cutoff)
                    fo.write("\n")
                    fo.write("-------------------------")
                    fo.write("\n")
                    fo.write("lower_F1: \t%s"%lower_F1)
                    fo.write("\n")
                    fo.write("higher_F1: \t%s"%higher_F1)
                    fo.write("\n")
                    fo.write("-------------------------")
                    fo.write("\n")
                    fo.write("recall:")
                    fo.write("\n")
                    fo.write("lower: %s"%lower_recall)
                    fo.write("\n")
                    fo.write("higher: %s"%higher_recall)
                    fo.write("\n")
                    fo.write("precision:")
                    fo.write("\n")
                    fo.write("lower: %s"%lower_precision)
                    fo.write("\n")
                    fo.write("higher: %s"%higher_precision)
                    fo.write("\n")
                    fo.write("-------------------------")
                    fo.write("\n")
        except:
            pass
def main(): """ """ # Get test data from AtRegNet.txt AtReg_data = read_data(fa.filename_atreg) AtRegNet_parse = parse_AtReg_data(AtReg_data) TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ["all"]) TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref] TG_list_ref = [info[0] for info in TG_TF_ref] TG_set_ref = set(TG_list_ref) TF_list_ref = [info[1] for info in TG_TF_ref] TF_set_ref = set(TF_list_ref) # ----------------------------------------------------------- # exp_list = ['Ligterink_2014','Ligterink_2014_gxe','Keurentjes_2007','Snoek_2012'] exp_list = ["Ligterink_2014"] cutoff_list = [3] # , 4.3, 6.7] chromo = [1, 2, 3, 4, 5] # ----------------------------------------------------------- for dataset in exp_list: for cutoff in cutoff_list: print "Analysing %s %s" % (dataset, cutoff) ########################################################### # Extract the true TG-TF relations and the total possible # relations from the stored datafiles filelocation = "%s/%s/genelist_%s/genelist_%s_co%s.txt" % ( fa.mr_folder, fa.gfolder, dataset, dataset, cutoff, ) true_rel, total_rel = get_TGTF_from_genelist(filelocation, TG_TF_ref, TG_list_ref, TF_list_ref) ########################################################### # The TG in the true TG-TF relations are the true_traits (tt) # in this case named tt_genes tt_genes = list(set([info[0] for info in true_rel])) ######################################################################## # Get for each true_trait the number of eQTLs enriched_fn = "%s/%s/enriched_%s/enriched_%s_co%s.txt" % ( fa.mr_folder, fa.enriched_folder, dataset, dataset, cutoff, ) trait_eqtl_genelist = get_info(enriched_fn) # Select true traits based on number of eQTLs tt_trait_eqtl_genelist = [[t[0], t[1], t[2]] for t in trait_eqtl_genelist if t[0] in tt_genes and t[1] > 0] trait_with_eqtl = [info[0] for info in tt_trait_eqtl_genelist] ######################################################################## 
########################################################### TG_TF_pred = process_enrichment(tt_trait_eqtl_genelist, TF_set_ref) ########################################################### true_pred_rel, false_pred_rel = identify_true_false_positives(TG_TF_pred, TG_TF_ref) ########################################################### unpredicted_rel = count_false_negatives(TG_TF_ref, true_pred_rel, trait_with_eqtl) ########################################################### TP, FP, FN, TN, recall, specif, precis = calculate_confusion( total_rel, true_pred_rel, false_pred_rel, unpredicted_rel ) ########################################################### print "true_traits: %s" % len(set(tt_genes)) print_results(dataset, cutoff, TP, FP, FN, TN, recall, specif, precis)
chromosome = [1,2,3,4,5] #cutoff_list = [3] #cutoff_list = [4.3] cutoff_list = [6.7] dataset = exp_list[0] cutoff = cutoff_list[0] #---------------------------------------------------------------------- #Set all data files and lists #---------------------------------------------------------------------- #read data from files, parse and process TF_family_data = read_data(fa.filename_fam) TF_fam_data = parse_family_data(TF_family_data) TF_fam_list = sorted([info[1] for info in TF_fam_data]) TF_fam_set = set(TF_fam_list) AtReg_data = read_data(fa.filename_atreg) AtRegNet_parse = parse_AtReg_data(AtReg_data) AtRegNet_list = [info[2] for info in AtRegNet_parse] AtRegNet_set = set(AtRegNet_list) AR_dict = make_AtReg_dict(AtRegNet_parse) fam_selection = ["all"]#use "all" for all regulators AtRegNet_pairs = TFloc_pairs_AtRegNet(AtRegNet_parse, fam_selection) TF_TG_ref = TFloc_pairs_AtRegNet(AtRegNet_parse, ['all']) TG_TF_ref = [[info[1], info[0]] for info in TF_TG_ref]
""" Read traits with a LOD score > 20 dataset is Ligterink_2014 """ import folder_assignments as fa from data_handlers import read_data filename = "%s/%s/lod_of_20.txt" % (fa.mr_folder, fa.raw_folder) data = read_data(filename) genelist = [] for line in data: if line.startswith("| AT"): gene = line[2:12] genelist.append(gene) genelist = sorted(list(set(genelist))) for g in genelist: print g
def main():
    """Plot, per dataset/cutoff, eQTL figures of the true traits.

    The true traits of a (dataset, cutoff) combination are taken from the
    sample-size data in ``tt_te_combi.txt``. Two plots can be switched on:

    * ``draw_trait_vs_eqtlsize`` - genelist length per true trait, sorted
      from large to small; the sorted data is also written as text and the
      average length is printed.
    * ``draw_trait_vs_nreqtls`` - number of eQTLs per true trait (only
      traits with more than 2 eQTLs), sorted from large to small.
    """
    #exp_list = ['Ligterink_2014', 'Ligterink_2014_gxe', 'Keurentjes_2007', 'Snoek_2012']
    #cutoff_list = [3, 4.3, 6.7]
    #-----------------------------------------------------------
    # Exactly one dataset and one cutoff are active at a time.
    #exp_list = ['Ligterink_2014']
    #exp_list = ['Ligterink_2014_gxe']
    #exp_list = ['Keurentjes_2007']
    exp_list = ['Snoek_2012']
    cutoff_list = [3]
    #cutoff_list = [4.3]
    #cutoff_list = [6.7]
    chromo = [1,2,3,4,5]
    #-----------------------------------------------------------
    fileloc = "%s/%s/tt_te_combi.txt"%(fa.mr_folder, fa.numfolder)
    #print "Retrieving random sample sizes from %s"%fileloc
    szdata = read_data(fileloc)
    # Looked up below with (dataset, cutoff) tuples as keys.
    sample_size_dict = get_trait_samplesize_data(szdata)

    # Plot switches.
    draw_trait_vs_eqtlsize = False
    draw_trait_vs_nreqtls = True

    for dataset in exp_list:
        for cutoff in cutoff_list:
            key = (dataset, cutoff)
            # Combinations without sample-size data are skipped.
            if key in sample_size_dict:
                #sample_size_list = [trait, sample_size]
                sample_size_list = sample_size_dict[key]
                tt_genes = [item[0] for item in sample_size_list]
                ########################################################################
                # Plot coordinates. NOTE(review): both plot sections append to
                # the same two lists, so with both switches on the second plot
                # would also contain the points of the first - confirm.
                x = []
                y = []
                ########################################################################
                #Print some specific traits based on size of genelist
                #for t, gl in trait_genelist_list:
                    #if len(gl) < 124:
                        #print t
                ########################################################################
                if draw_trait_vs_eqtlsize:
                    #trait_genelist_list = [trait, genelist]
                    trait_genelist_list = get_genelist(dataset, cutoff, chromo)
                    tt_trait_genelist_list = [[t[0], t[1]] for t in trait_genelist_list if t[0] in tt_genes]
                    # [genelist length, trait], largest genelist first.
                    t_gls_list = [[len(info[1]), info[0]] for info in tt_trait_genelist_list]
                    sort_t_gls_list = sorted(t_gls_list, reverse=True)
                    t = [info[1] for info in sort_t_gls_list]
                    gls = [info[0] for info in sort_t_gls_list]

                    # x: rank of the trait, y: its genelist length.
                    for ind_t, trait in enumerate(t):
                        x.append(ind_t)
                    for eQTLsize in gls:
                        y.append(eQTLsize)

                    filename_plot = "%s/plots/tt_eQTLvsTrait_%s_co_%s.png"%(fa.mr_folder, dataset, cutoff)
                    filename_text = "%s/plots/tt_eQTLvsTrait_%s_co_%s.txt"%(fa.mr_folder, dataset, cutoff)
                    title = "eQTL size vs Traits for %s with cutoff %s"%(dataset, cutoff)
                    write_plot(sort_t_gls_list, filename_text, title)
                    draw_plot(x, y, filename_plot, title)

                    tot = float(sum(gls))
                    # The average is only computed when there is data.
                    if len(gls) != 0:
                        avg = tot/float(len(gls))
                        print key
                        print "Average eQTL size expressed in nr of genes: %s"%avg

                if draw_trait_vs_nreqtls:
                    tr_eqtl_list = get_info(dataset, cutoff, chromo, tt_genes)
                    # Only true traits with more than 2 eQTLs are plotted.
                    tt_trait_eqtls_list = [[t[0], t[1]] for t in tr_eqtl_list if t[0] in tt_genes and t[1]>2]
                    # [number of eQTLs, trait], highest number first.
                    t_eqtls_list = [[info[1], info[0]] for info in tt_trait_eqtls_list]
                    sort_t_eqtls_list = sorted(t_eqtls_list, reverse=True)
                    t = [info[1] for info in sort_t_eqtls_list ]
                    eqtls = [info[0] for info in sort_t_eqtls_list ]

                    # x: rank of the trait, y: its number of eQTLs.
                    for ind_t, trait in enumerate(t):
                        x.append(ind_t)
                    for eQTLsize in eqtls:
                        y.append(eQTLsize)

                    filename_plot = "%s/plots/tt_nrofeQTLsvsTrait_%s_co_%s.png"%(fa.mr_folder, dataset, cutoff)
                    # Text file name is prepared, but writing is switched off.
                    filename_text = "%s/plots/tt_nrofeQTLsvsTrait_%s_co_%s.txt"%(fa.mr_folder, dataset, cutoff)
                    title = "nr of eQTLs vs Traits for %s with cutoff %s"%(dataset, cutoff)
                    #write_plot(sort_t_gls_list, filename_text, title)
                    draw_plot(x, y, filename_plot, title)