def run(args):
    # Load the data and the randomized (null) data.
    df = gtm.load_file_and_avg(args.data_file)
    genes = df['gene'].values
    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    dfr = gtm.load_file_and_avg(args.rand_data_file)
    genesr = dfr['gene'].values
    found_genesr, geneTSr = gtm.get_gene_TS(dfr, genesr)

    n = geneTSr.shape[0]

    args_dict = ct.load_kwargs_file(argsfile=args.args_file)
    print(args_dict)

    if args.rowlist_file is not None:
        # The row-list file holds a Python expression (e.g. a list of row indices)
        # on its first line. eval() is kept for backward compatibility; only pass
        # row-list files you trust.
        with open(args.rowlist_file, 'r') as f:
            rowlist = eval(f.readline())
    else:
        rowlist = list(range(n))

    if args.test == "e":
        # Elastic-net Granger causality with per-row cross-validation.
        beta_tuple, all_res_df, use_df = ct.enet_granger_causality_row_cv(
            geneTS, geneTS, rowlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(beta_tuple, outfile)
        all_res_df.to_csv(args.output_all_name, sep="\t", index=False)
        use_df.to_csv(args.output_use_name, sep="\t", index=False)

        # Re-fit the randomized data with the hyperparameters chosen on the real data.
        param_df = use_df[["alpha", "lambda.min", "Row"]]
        rand_beta_tuple, rand_all_res_df, rand_use_df = ct.enet_granger_causality_row_load(
            geneTSr, geneTS, rowlist, param_df, **args_dict)
        with open(args.output_rand_name, 'wb') as outfile:
            pickle.dump(rand_beta_tuple, outfile)
        rand_all_res_df.to_csv(args.output_rand_all_name, sep="\t", index=False)
        rand_use_df.to_csv(args.output_rand_use_name, sep="\t", index=False)

        print("Output written to", args.output_name)
        print("All results written to", args.output_all_name)
        print("Used params written to", args.output_use_name)
        print("Rand output written to", args.output_rand_name)
        print("All rand results written to", args.output_rand_all_name)
        print("Used rand params written to", args.output_rand_use_name)
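# Illustrative sketch (not part of the pipeline): run(args) above reads its inputs off
# an argparse-style namespace. The attribute (dest) names below are taken from the
# function body; the flag strings are placeholders chosen here for illustration and may
# differ from the real script's command line.
import argparse

def _example_run_parser():
    p = argparse.ArgumentParser(description="Elastic-net Granger causality, per-row CV")
    p.add_argument("--data-file", dest="data_file", required=True)
    p.add_argument("--rand-data-file", dest="rand_data_file", required=True)
    p.add_argument("--args-file", dest="args_file", required=True)
    p.add_argument("--rowlist-file", dest="rowlist_file", default=None)
    p.add_argument("--test", dest="test", default="e")
    p.add_argument("--output-name", dest="output_name", required=True)
    p.add_argument("--output-all-name", dest="output_all_name", required=True)
    p.add_argument("--output-use-name", dest="output_use_name", required=True)
    p.add_argument("--output-rand-name", dest="output_rand_name", required=True)
    p.add_argument("--output-rand-all-name", dest="output_rand_all_name", required=True)
    p.add_argument("--output-rand-use-name", dest="output_rand_use_name", required=True)
    return p

# Usage: run(_example_run_parser().parse_args())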
def load_and_run(args):
    data_file = args.data_file
    rand_data_file = args.rand_data_file
    save_prefix = args.out_prefix

    assert args.test in {'e', 'l', 'r'}
    fit_method = cp.test2fit_method[args.test]
    lag = args.lag

    best_hyper = pickle.load(open(args.best_hyper_file, 'rb'))

    if args.row_file is not None:
        rows = pickle.load(open(args.row_file, 'rb'))
    else:
        rows = None

    assert args.null in {"l", "g"}

    # Load data file
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        genesr, geneTSr = gtm.load_basic_rep_file_list(rand_data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)

    assert geneTS.shape == geneTSr.shape
    assert (genes == genesr).all()

    coefs, intercepts, fit_result_df, coefsr, fit_result_dfr = cp.run(
        geneTS, geneTSr, hyper=best_hyper, fit_method=fit_method, lag=lag, rows=rows,
        save_prefix=save_prefix, has_reps=args.load_reps, null=args.null,
        only_array=args.only_array)

    print("RESULTS of causal fit")
    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs, intercepts, fit_result_df)
def load_and_run(args):
    data_file = args.data_file
    lag = args.lag

    if args.row_file is not None:
        rows = pickle.load(open(args.row_file, 'rb'))
    else:
        rows = None

    # Load data file
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)

    coefs = fit_all_pairwise_conditional(geneTS=geneTS, lag=lag, rows=rows,
                                         coeflag_options=None,
                                         has_reps=args.load_reps)

    outfile = args.out_prefix + "_coefs.p"
    pickle.dump(coefs, open(outfile, 'wb'))
    print("Coefs saved to ", outfile)
def run(args):
    # Load the data.
    df = gtm.load_file_and_avg(args.data_file)
    genes = df['gene'].values
    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    args_dict = load_kwargs_file(argsfile=args.args_file)

    if args.pairlist_file is None:
        pairlist = None
    else:
        pairlist = np.load(args.pairlist_file)

    print(args_dict)

    if args.test == 'g':
        output = ct.pairwise_granger_causality_all(geneTS, pairlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(output, outfile)
        print("Output written to", args.output_name)
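# Illustrative sketch (not part of the pipeline): args.pairlist_file is read with
# np.load, i.e. a NumPy .npy file. Assuming each row is a (cause, effect) pair of gene
# indices -- an assumption; the exact layout is whatever
# ct.pairwise_granger_causality_all expects -- such a file could be produced like this:
import numpy as np

pairs = np.array([[0, 1],
                  [0, 2],
                  [5, 3]], dtype=int)  # hypothetical gene-index pairs
np.save("pairlist.npy", pairs)         # pass "pairlist.npy" as the pairlist file argument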
def main():
    # Note: `args` is expected to be parsed at module level (argparse) before main() runs.
    tstart = time.time()

    input_file = args.input_file
    out_file_prefix = args.out_file_prefix
    start_index = args.start_index
    end_index = args.end_index

    df = gtm.load_file_and_avg(input_file)
    genes = df['gene'][start_index:end_index].values
    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    cause_type = args.cause_type

    if cause_type == 'g':
        model_orders = range(args.model_order_min, args.model_order_max + 1)
        threshold = args.p_threshold

        p_matr_list = []
        sig_matr_list = []
        for model_order in model_orders:
            t_gc = time.time()
            p_matr = pairwise_granger_causality_all(geneTS,
                                                    model_order=model_order,
                                                    use_processes=args.use_processes,
                                                    procnum=args.procnum)
            print("Time for granger causality", time.time() - t_gc)

            sig_matr = p_matr < threshold
            p_matr_list.append(p_matr)
            sig_matr_list.append(sig_matr)

        # Compare the significance matrices across model orders.
        all_sig_matr, all_sig_num, not_sig_num = gtm.compare_sig_matr(sig_matr_list=sig_matr_list)
        print("Total number of significant pairs ", all_sig_num + not_sig_num)
        print("Pairs significant across all matrices ", all_sig_num,
              all_sig_num * 1.0 / (all_sig_num + not_sig_num))

        # Save the output p-value and significance matrices.
        out_file_name = out_file_prefix + "_GC.p"
        pickle.dump([model_orders, p_matr_list, sig_matr_list,
                     (all_sig_matr, all_sig_num, not_sig_num)],
                    open(out_file_name, "wb"))
        print("Results written to", out_file_name)

    print("Total time used ", time.time() - tstart)
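# Illustrative sketch (not part of the pipeline): gtm.compare_sig_matr is used above to
# compare the boolean significance matrices obtained at different model orders. One
# plausible reading of its outputs -- an assumption, not the library's actual code -- is
# "pairs significant at every model order" vs. "pairs significant at only some of them",
# so that all_sig_num + not_sig_num is the total number of pairs significant anywhere.
import numpy as np

def compare_sig_matr_sketch(sig_matr_list):
    any_sig = np.logical_or.reduce(sig_matr_list)   # significant at >= 1 model order
    all_sig = np.logical_and.reduce(sig_matr_list)  # significant at every model order
    all_sig_num = int(all_sig.sum())
    not_sig_num = int(any_sig.sum()) - all_sig_num  # significant somewhere, but not everywhere
    return all_sig, all_sig_num, not_sig_num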
def load_and_run(args):
    data_file = args.data_file
    output_names = args.output_names

    assert args.test in {'e', 'l', 'r'}
    fit_method = cp.test2fit_method[args.test]
    lag = args.lag

    hyperlist = pickle.load(open(args.hyper_file, 'rb'))

    if args.row_file is not None:
        rows = pickle.load(open(args.row_file, 'rb'))
    else:
        rows = None

    # Load data file
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)

    best_hyper, best, hyper_df, hyper_fit_dfs = cp.run_cross_validate(
        geneTS, fit_method=fit_method, hyperlist=hyperlist, lag=lag, rows=rows,
        has_reps=args.load_reps)

    print("Best hyper is : ", best_hyper)
    print("Best result : ", best)
    print("Hyper df: ")
    print(hyper_df)

    for output_name, hyper_fit_df, hyper in zip(output_names, hyper_fit_dfs, hyperlist):
        hyper_fit_df.to_csv(output_name, sep="\t", index=False)
        print("Result for ", hyper, " written to ", output_name)
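# Illustrative sketch (not part of the pipeline): args.hyper_file is unpickled into
# `hyperlist`, one candidate hyperparameter setting per entry, with one output name
# expected per entry. The element format shown here (a dict per setting) is an
# assumption; use whatever cp.run_cross_validate accepts as hyperlist elements.
import pickle

hyperlist = [{"alpha": a, "lambda": l} for a in (0.5, 0.9) for l in (0.01, 0.1, 1.0)]
with open("hyperlist.p", "wb") as f:
    pickle.dump(hyperlist, f)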
def load_and_run(args):
    lag = args.lag
    save_prefix = args.save_prefix

    assert args.stratify_by in {"e", "n"}
    stratify_by = cp.args2stratify_by[args.stratify_by]

    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        dfs, genes, geneTS, df, timekeys, num_per_keys = gtm.load_rep_file_list(args.data_file)
        dfsr, genesr, geneTSr, dfr, __, __ = gtm.load_rep_file_list(args.rand_data_file)
        print("Timekeys: ", timekeys)
        print("Num per key: ", num_per_keys)
    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)
        timekeys = df.columns.values[1:]
        print("Timekeys: ", timekeys)
        # Num. replicates per key
        num_per_keys = None

    assert geneTS.shape == geneTSr.shape
    assert (genes == genesr).all()

    coefs = pickle.load(open(args.coef_file, 'rb'))
    intercepts = pickle.load(open(args.intercept_file, 'rb'))
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")
    coefsr = pickle.load(open(args.coef_rand_file, 'rb'))
    fit_result_dfr = pd.read_csv(args.fit_result_rand_file, sep="\t")

    if args.best_hyper_file is not None:
        best_hyper = pickle.load(open(args.best_hyper_file, 'rb'))
    else:
        best_hyper = None

    print("RESULTS")
    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs, intercepts, fit_result_df, filename="fit_all_summary_normal.txt",
                     hyper=best_hyper, test_name=args.test_name, lag=lag)
    # TODO: also summarize the random fit (fit_result_dfr) once summarize_fit can be
    # run without the random intercepts.

    # Align the coefs
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)

    print("Removing alphas (gene-on-self effects) ")
    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    coef_nets = []
    coefr_nets = []

    # Save the gene matrices
    for i in range(acoefs.shape[0]):
        coef_matr_filename = save_prefix + "-" + str(i + 1) + "-matrix.txt"
        coefr_matr_filename = save_prefix + "-" + str(i + 1) + "-r-matrix.txt"
        coef_net_filename = save_prefix + "-" + str(i + 1) + "-network.txt"
        coefr_net_filename = save_prefix + "-" + str(i + 1) + "-r-network.txt"

        coef_matr = gtm.save_gene_matrix(filename=coef_matr_filename, matrix=acoefs[i], genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=coefr_matr_filename, matrix=acoefsr[i], genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = acoefs.shape[0]
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr, extra_dict=extra_dict, make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr, extra_dict=extra_dict, make_type=False)
        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)

        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        print("Coef ", i + 1)
        print("Networks written to ")
        print(coef_net_filename)
        print(coefr_net_filename)

    union_net_filename = save_prefix + "-union-network.txt"
    union_r_net_filename = save_prefix + "-union-r-network.txt"

    if acoefs.shape[0] > 1:
        m_net = cp.get_max_network(coef_nets, max_col="AbsWeight", index_col="Cause-Effect")
        union_net = cp.get_union_network(coef_nets + [m_net],
                                         suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = coef_nets[0]
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    if acoefsr.shape[0] > 1:
        m_net = cp.get_max_network(coefr_nets, max_col="AbsWeight", index_col="Cause-Effect")
        union_r_net = cp.get_union_network(coefr_nets + [m_net],
                                           suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
    else:
        union_r_net = coefr_nets[0]
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    print("Unioned networks written to ")
    print(union_net_filename)
    print(union_r_net_filename)

    if not os.path.exists("plots"):
        os.makedirs("plots")
    if not os.path.exists("plots" + os.sep + "betas"):
        os.makedirs("plots" + os.sep + "betas")

    # Plot the betas (real vs. randomized), skipping coefficient matrices that are all zero.
    for i in range(acoefs.shape[0]):
        if len(np.nonzero(acoefs[i])[0]) > 0 and len(np.nonzero(acoefsr[i])[0]) > 0:
            fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                          acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                          filename="plots" + os.sep + "betas" + os.sep + "beta_nonzero_coef-" + str(i + 1),
                          title="Causal coefs, Coef " + str(i + 1),
                          xlabel="Causal Coefficient")
            fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                          acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                          filename="plots" + os.sep + "betas" + os.sep + "beta_nonzero_coef-" + str(i + 1) + "_zoom-in-90",
                          zoom_in_top_percentile=95,
                          zoom_in_bottom_percentile=5,
                          title="Causal coefs, Coef " + str(i + 1),
                          xlabel="Causal Coefficient")
            fc.plot_betas(np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                          np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                          filename="plots" + os.sep + "betas" + os.sep + "beta_abs_coef-" + str(i + 1),
                          title="Absolute causal coefs, Coef " + str(i + 1),
                          xlabel="Absolute Causal Coefficient")
            fc.plot_betas(np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                          np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                          filename="plots" + os.sep + "betas" + os.sep + "beta_abs_coef-" + str(i + 1) + "_zoom-in-bottom-95",
                          zoom_in_top_percentile=95,
                          title="Absolute causal coefs, Coef " + str(i + 1),
                          xlabel="Absolute Causal Coefficient")
            fc.plot_betas(np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                          np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                          filename="plots" + os.sep + "betas" + os.sep + "beta_abs_coef-" + str(i + 1) + "_zoom-in-top-5",
                          zoom_in_bottom_percentile=95,
                          title="Absolute causal coefs, Coef " + str(i + 1),
                          xlabel="Absolute Causal Coefficient")
            print("Coef ", i + 1)
            print("Plots of betas written to: plots" + os.sep + "betas")

    # Get FDRs
    fdrs = [0.01, 0.05, 0.1, 0.2]
    acoefs_fdrs = []
    sf_dfs = []
    for fdr in fdrs:
        fdr_dir = "fdr-" + str(fdr) + "-" + stratify_by
        if not os.path.exists(fdr_dir):
            os.makedirs(fdr_dir)
        fdr_prefix = fdr_dir + os.sep + save_prefix

        acoefs_fdr = np.zeros(acoefs.shape)
        fdr_nets = []
        print("*************")
        for i in range(acoefs.shape[0]):
            print("-----")
            print("FDR = ", fdr)
            print("Lag ", lag)
            print("Coef ", i + 1)
            print("Stratify ", stratify_by)

            acoefs_fdr[i], threshes = fc.get_abs_thresh(acoefs[i], acoefsr[i], fdr,
                                                        stratify_by=stratify_by)

            fdr_matr_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" + stratify_by + "-matrix.txt"
            fdr_net_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" + stratify_by + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename, matrix=acoefs_fdr[i], genes=genes)
            pickle.dump(threshes,
                        open(fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" + stratify_by + "-threshes.p", 'wb'))

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = acoefs.shape[0]
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr, extra_dict=extra_dict, make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # Write summary README
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i], test=args.test_name, fdr=fdr, lag=lag,
                                     coef=i + 1, hyper=best_hyper, thresh=threshes,
                                     readme_name=fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" + stratify_by + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)
            sf_dfs.append(sf_df)

            print("Network edges: ", fdr_net.shape[0])

        if acoefs_fdr.shape[0] > 1:
            m_net = cp.get_max_network(fdr_nets, max_col="AbsWeight", index_col="Cause-Effect")
            union_net = cp.get_union_network(fdr_nets + [m_net],
                                             suffixes=[str(i) for i in range(1, acoefs_fdr.shape[0] + 1)] + [""])
        else:
            union_net = fdr_nets[0]

        union_net_filename = fdr_prefix + "-union-fdr-" + str(fdr) + "-" + stratify_by + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)
        print("Union network edges", union_net.shape[0])
        print("Written to ", union_net_filename)

        acoefs_fdrs.append(acoefs_fdr.copy())

    all_sf_dfs = pd.concat(sf_dfs)
    all_sf_dfs.to_csv("fit_all_summary_fdr-" + stratify_by + ".txt", sep="\t", index=False)
    print("********")
    print("Summaries of all fdrs written to fit_all_summary_fdr-" + stratify_by + ".txt")

    print("Matrices done.")
    with open("matrices_done.txt", 'w') as donefile:
        donefile.write("done\n")

    if args.plot_coef_fdr:
        print("*******")
        print("PLOTS")
        for i, fdr in enumerate(fdrs):
            acoefs_fdr = acoefs_fdrs[i]
            if not os.path.exists("plots" + os.sep + "fdr-" + str(fdr)):
                os.makedirs("plots" + os.sep + "fdr-" + str(fdr))

            # Only plot the error bars if replicates were loaded
            cp.plot_all_coef(acoefs_fdr, df, genes, lag,
                             file_prefix="plots" + os.sep + "fdr-" + str(fdr) + os.sep + save_prefix + "-",
                             plot_bar=args.load_reps, keys=timekeys, num_per_keys=num_per_keys,
                             linewidth=2, capsize=5, capwidth=2, verbose=True)

            # Plot them without error bars as a check
            if args.load_reps:
                cp.plot_all_coef(acoefs_fdr, df, genes, lag,
                                 file_prefix="plots" + os.sep + "fdr-" + str(fdr) + os.sep + save_prefix + "-nobar-",
                                 plot_bar=False, keys=timekeys, num_per_keys=num_per_keys,
                                 linewidth=2, capsize=5, capwidth=2)

            print("FDR plots written to: ", "plots" + os.sep + "fdr-" + str(fdr))

    # Plot all the coefs
    # NOTE: this will take a long time!
    if args.plot_all:
        raise ValueError("Fix all the below first before trying to do plot all")

        if not os.path.exists("plots" + os.sep + "original"):
            os.makedirs("plots" + os.sep + "original")
        cp.plot_all_coef(acoefs, df, genes, lag,
                         file_prefix="plots" + os.sep + "original" + os.sep + save_prefix + "-",
                         plot_bar=args.load_reps, keys=timekeys, num_per_keys=num_per_keys,
                         linewidth=2, capsize=5, capwidth=2)
        print("Original plots written to: ", "plots" + os.sep + "original")

        if not os.path.exists("plots" + os.sep + "randomized"):
            os.makedirs("plots" + os.sep + "randomized")
        cp.plot_all_coef(acoefsr, dfr, genes, lag,
                         file_prefix="plots" + os.sep + "randomized" + os.sep + save_prefix + "-",
                         plot_bar=args.load_reps, keys=timekeys, num_per_keys=num_per_keys,
                         linewidth=2, capsize=5, capwidth=2)
        print("Randomized plots written to: ", "plots" + os.sep + "randomized")
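# Illustrative sketch (not part of the pipeline): fc.get_abs_thresh above thresholds the
# real coefficients against the coefficients fit on randomized data at a target FDR.
# A common empirical-FDR rule -- whether get_abs_thresh implements exactly this, or how
# it stratifies by effect gene, is an assumption -- picks the smallest |coefficient|
# cutoff t such that the surviving random coefficients are at most fdr times the
# surviving real ones.
import numpy as np

def abs_thresh_sketch(coefs, coefs_rand, fdr):
    cands = np.sort(np.unique(np.abs(coefs[coefs != 0])))
    for t in cands:
        real_kept = np.sum(np.abs(coefs) >= t)
        rand_kept = np.sum(np.abs(coefs_rand) >= t)
        if real_kept > 0 and rand_kept / real_kept <= fdr:
            return np.where(np.abs(coefs) >= t, coefs, 0.0), t
    return np.zeros_like(coefs), np.inf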
def run(args): if args.test not in {"r", "l", "e"}: raise ValueError( "args.test must be r (ridge), l (lasso) or e (elastic net)") if args.null not in {"l", "g"}: raise ValueError("args.null must be l (local) or g (global)") # Load files data_file = args.data_file rand_data_file = args.rand_data_file if args.load_reps: genes, geneTS = gtm.load_basic_rep_file_list(data_file) #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file) else: df = pd.read_csv(data_file, sep="\t") genes, geneTS = gtm.get_gene_TS(df) n = len(genes) hyperlist = pickle.load(open(args.hyper_list_file, 'rb')) # hyper_names = cp.hyperlist_to_namelist(hyperlist) # Make hyper files for cross_validate loading. hyper_filenames = [] print("*************") print("HYPERS") print("*************") if not os.path.exists("hyper"): os.makedirs("hyper") # for hyper, hyper_name in zip(hyperlist, hyper_names): for hyper, h in zip(hyperlist, list(range(len(hyperlist)))): hyper_filename = "hyper" + os.sep + args.output_name + "-hyper-" + str( h) + ".p" hyper_filenames.append(hyper_filename) pickle.dump([hyper], open(hyper_filename, 'wb')) print("Hypers written in format: ", hyper_filename) # Make row files # Split up the rows according to number of input scripts partition_rows = pj.partition_inputs(list(range(n)), args.script_num) row_filenames = [] print("*************") print("ROWS") print("*************") if not os.path.exists("rows"): os.makedirs("rows") for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))): row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p") row_filenames.append(row_filename) pickle.dump(partition_row, open(row_filename, 'wb')) print("Row written in format: ", row_filename) if not os.path.exists("timing"): os.makedirs("timing") print("Folder timing created") resulttimefile = os.path.join("timing", "result_time.csv") if not os.path.exists(resulttimefile): with open(resulttimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) if args.cv != 0: print("*************") print("CV") print("*************") # Make CV scripts cv_scripts = [] hyper_output_dict = collections.OrderedDict() hyper_int_dict = collections.OrderedDict() if not os.path.exists("cv-scripts"): os.makedirs("cv-scripts") cvtimefile = os.path.join("timing", "hyper_time.csv") if not os.path.exists(cvtimefile): with open(cvtimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) for hyper, h, hyper_filename in zip(hyperlist, list(range(len(hyperlist))), hyper_filenames): hyper_output_group = [] for partition_row, i, row_filename in zip( partition_rows, list(range(len(partition_rows))), row_filenames): cv_prefix = args.output_name + "-cv-" + str(h) + "-row-" + str( i) cv_script = os.path.join("cv-scripts", cv_prefix + ".sh") cv_scripts.append(cv_script) cv_output = "hyper" + os.sep + cv_prefix + "-result.txt" hyper_output_group.append(cv_output) command_string = "time python cross_validate.py -d " + data_file + " -lr " + str(args.load_reps) + " -o " + cv_output + " -hl " + str(hyper_filename) \ + " -t " + args.test + " -l " + str(args.lag) + " -rl " + str(row_filename) with open(cv_script, 'w') as outputfile: outputfile.write("#!/bin/bash\n") outputfile.write("START=$(date)\n") #outputfile.write("module load python/2.7\n") # outputfile.write("module load python/2.7/scipy-mkl\n") # outputfile.write("module load python/2.7/numpy-mkl\n") #outputfile.write("module load anaconda\n") outputfile.write("module load 
anaconda3\n") outputfile.write(command_string) outputfile.write("\n") outputfile.write("END=$(date)\n") outputfile.write("echo " + cv_script + ",$START,$END,$SECONDS >> " + cvtimefile + "\n") os.chmod(cv_script, 0o777) # Set the output names, prepare for integration of all the hyper parameter fit results hyper_output_dict[str(hyper)] = hyper_output_group hyper_int_dict[str( hyper)] = "hyper" + os.sep + args.output_name + "-cv-" + str( h) + "-result.txt" hyper_output_df = pd.DataFrame(hyper_output_dict) hyper_int_df = pd.DataFrame(hyper_int_dict, index=[0]) print("Hyper output df is in form", hyper_output_df.head(n=5)) hyper_output_df.to_csv("cv_outputs.txt", sep="\t", index=0) hyper_int_df.to_csv("cv_integrated.txt", sep="\t", index=0) print("Partitioned CV fit_result_dfs in cv_outputs.txt", "Integrated CV fit_result_dfs in cv_integrated.txt") with open("cv_script_list.txt", 'w') as outfile: for cv_script in cv_scripts: outfile.write(cv_script + "\n") print("CV scripts written to cv_script_list.txt") if args.parallel_num > 0: print("Parallel Number (# processes per job): " + str(args.parallel_num)) script_groups = pj.partition_inputs( cv_scripts, number=int(math.ceil( len(cv_scripts) * 1.0 / args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = [ "./" + script_filename for script_filename in script_group ] parallel_script = " & ".join(appended_script_filenames) parallel_scripts.append(parallel_script) with open("cv_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print( "Parallel script list written to cv_parallel_script_list.txt" ) # Integrate hyperparameters # Begin whole normal fit hyper_script = "set_hyper.sh" with open(hyper_script, 'w') as outputfile: outputfile.write("#!/bin/bash\n") outputfile.write("START=$(date)\n") outputfile.write("set -e\n") outputfile.write( "time python integrate_hyper.py -hfd cv_outputs.txt -ind cv_integrated.txt -hl " + args.hyper_list_file + "\n") outputfile.write( "time python set_hyper.py -ind cv_integrated.txt -r " + "hyper" + os.sep + "hyper_df.txt -o " + "hyper" + os.sep + "best_hyper.p -hl " + args.hyper_list_file + " -tn " + args.test_name + " \n") outputfile.write("END=$(date)\n") outputfile.write("echo " + hyper_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(hyper_script, 0o777) print("set_hyper.sh written") print("*************") print("FITTING") print("*************") # Run the actual fit if not os.path.exists("fit"): os.makedirs("fit") if not os.path.exists("fit-scripts"): os.makedirs("fit-scripts") fittimefile = os.path.join("timing", "fit_time.csv") if not os.path.exists(fittimefile): with open(fittimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) fit_scripts = [] fit_output_prefixes = [] for partition_row, i, row_filename in zip(partition_rows, list(range(len(partition_rows))), row_filenames): fit_prefix = args.output_name + "-fit-row-" + str(i) fit_script = os.path.join("fit-scripts", fit_prefix + ".sh") fit_scripts.append(fit_script) fit_output_prefix = "fit" + os.sep + fit_prefix fit_output_prefixes.append(fit_output_prefix) command_string = "time python fit_all.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \ " -o " + fit_output_prefix + " -bh " + \ "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l 
" + str(args.lag) + " -rl " + \ str(row_filename) + " -n " + args.null + " -oa " + str(args.only_array) with open(fit_script, 'w') as outputfile: outputfile.write("#!/bin/bash\n") outputfile.write("START=$(date)\n") #outputfile.write("module load python/2.7\n") # outputfile.write("module load python/2.7/scipy-mkl\n") # outputfile.write("module load python/2.7/numpy-mkl\n") outputfile.write("module load anaconda3\n") outputfile.write(command_string) outputfile.write("\n") outputfile.write("END=$(date)\n") outputfile.write("echo " + fit_script + ",$START,$END,$SECONDS >> " + fittimefile + "\n") os.chmod(fit_script, 0o777) with open("fit_script_list.txt", 'w') as outfile: for fit_script in fit_scripts: outfile.write("./" + fit_script + "\n") print("Fit scripts written to fit_script_list.txt") if args.parallel_num > 0: print("Parallel Number (# processes per job): " + str(args.parallel_num)) script_groups = pj.partition_inputs( fit_scripts, number=int(math.ceil(len(fit_scripts) * 1.0 / args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = [ "./" + script_filename for script_filename in script_group ] parallel_script = " & ".join(appended_script_filenames) parallel_scripts.append(parallel_script) with open("fit_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print( "Parallel script list written to fit_parallel_script_list.txt") # Note the output files fit_coefs = [ fit_output_prefix + "_coefs.p" for fit_output_prefix in fit_output_prefixes ] fit_intercepts = [ fit_output_prefix + "_intercepts.p" for fit_output_prefix in fit_output_prefixes ] fit_results = [ fit_output_prefix + "_fit_result_df.txt" for fit_output_prefix in fit_output_prefixes ] fit_coefsr = [ fit_output_prefix + "_coefsr.p" for fit_output_prefix in fit_output_prefixes ] # fit_interceptsr = [fit_output_prefix + "_interceptsr.p" for fit_output_prefix in fit_output_prefixes] fit_resultsr = [ fit_output_prefix + "_fit_result_dfr.txt" for fit_output_prefix in fit_output_prefixes ] fit_output_dict = collections.OrderedDict() fit_output_dict["coef"] = fit_coefs fit_output_dict["coefr"] = fit_coefsr fit_output_dict["intercept"] = fit_intercepts # fit_output_dict["interceptr"] = fit_interceptsr output_matr_df = pd.DataFrame(fit_output_dict) output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False) print("Output matrices written to output_matr_list.txt") int_matr_dict = collections.OrderedDict() int_matr_dict["coef"] = "fit" + os.sep + args.output_name + "_coefs.p" int_matr_dict["coefr"] = "fit" + os.sep + args.output_name + "_coefsr.p" int_matr_dict[ "intercept"] = "fit" + os.sep + args.output_name + "_intercepts.p" # int_matr_dict["interceptr"] = "fit" + os.sep + args.output_name + "_interceptsr.p" int_matr_df = pd.DataFrame(int_matr_dict, index=[0]) int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False) print("integrated matrices written to int_matr_list.txt") fit_result_dict = collections.OrderedDict() fit_result_dict["fit_result"] = fit_results fit_result_dict["fit_resultr"] = fit_resultsr output_df_df = pd.DataFrame(fit_result_dict) output_df_df.to_csv("output_df_list.txt", sep="\t", index=False) print("output dfs written to output_df_list.txt") int_df_dict = collections.OrderedDict() int_df_dict[ "fit_result"] = "fit" + os.sep + args.output_name + "_fit_result_df.txt" int_df_dict[ 
"fit_resultr"] = "fit" + os.sep + args.output_name + "_fit_result_dfr.txt" int_df_df = pd.DataFrame(int_df_dict, index=[0]) int_df_df.to_csv("int_df_list.txt", sep="\t", index=False) print("Integrated dfs written to int_df_list.txt") with open("finish-none.sh", 'w') as ifile: ifile.write("#!/bin/bash\n") ifile.write("START=$(date)\n") ifile.write("set -e\n") ifile.write( "time python integrate_outputs_rand_row.py -i output_matr_list.txt -o int_matr_list.txt " + (" -t m -a 1 " if args.only_array else " -t a ")) ifile.write(" && " + \ "time python integrate_outputs_rand_row.py -i output_df_list.txt -o int_df_list.txt -t d " + "\n") ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ " -lr " + str(args.load_reps) + \ " -bh " + "hyper" + os.sep + "best_hyper.p" + \ " -o " + \ args.output_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \ " -cfr " + int_matr_dict["coefr"] + " -fr " + \ int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ " -sb " + "n" + " -tn " + args.test_name + "\n") ifile.write("END=$(date)\n") ifile.write("echo " + "finish-none.sh" + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") print("Finish script, stratby None, written to finish-none.sh") os.chmod("finish-none.sh", 0o777) with open("finish-effect.sh", 'w') as ifile: ifile.write("#!/bin/bash\n") ifile.write("START=$(date)\n") ifile.write("set -e\n") ifile.write( "time python integrate_outputs_rand_row.py -i output_matr_list.txt -o int_matr_list.txt " + (" -t m -a 1 " if args.only_array else " -t a ")) ifile.write(" && " + \ "time python integrate_outputs_rand_row.py -i output_df_list.txt -o int_df_list.txt -t d " + "\n") ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ " -lr " + str(args.load_reps) + \ " -bh " + "hyper" + os.sep + "best_hyper.p" + \ " -o " + \ args.output_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \ " -cfr " + int_matr_dict["coefr"] + " -fr " + \ int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ " -sb " + "e" + " -tn " + args.test_name + "\n") ifile.write("END=$(date)\n") ifile.write("echo " + "finish-effect.sh" + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") print("Finish script, stratby effect, written to finish-effect.sh") os.chmod("finish-effect.sh", 0o777) with open("plot_coef.sh", 'w') as ifile: ifile.write("#!/bin/bash\n") ifile.write("START=$(date)\n") ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ " -lr " + str(args.load_reps) + \ " -bh " + "hyper" + os.sep + "best_hyper.p" + \ " -o " + \ args.output_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \ " -cfr " + int_matr_dict["coefr"] + " -fr " + \ int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ " -sb " + "n" + " -tn " + args.test_name + " -pcf 1 " + "\n") ifile.write("END=$(date)\n") ifile.write("echo " + "plot_coef.sh" + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") print("Plot coef script written to plot_coef.sh") os.chmod("plot_coef.sh", 0o777) with open("cleanup_list.txt", 'w') as outfile: cleanup_list = row_filenames if args.cv: cleanup_list += cv_scripts + list( itertools.chain.from_iterable(list( hyper_output_dict.values()))) cleanup_list += fit_scripts + fit_coefs + fit_intercepts + fit_results + fit_coefsr + fit_resultsr for script in cleanup_list: 
outfile.write(script + "\n") print("Cleanup scripts written to cleanup_list.txt") with open("timing/timing_list.txt", 'w') as outfile: outfile.write(cvtimefile + "\n") outfile.write(fittimefile + "\n") outfile.write(resulttimefile + "\n") print("Timing files written to timing_list.txt") with open("summarize_time.sh", 'w') as outfile: outfile.write( "python summarize_time.py -i timing/timing_list.txt -o timing/summary_time.csv -oo timing/overall_time.csv\n" ) os.chmod("summarize_time.sh", 0o777) print("Summarize timing script written to summarize_time.sh")
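# Illustrative sketch (not part of the pipeline): pj.partition_inputs is used throughout
# these prep scripts to split the gene rows (and the generated job scripts) into roughly
# equal groups, one per job. A minimal stand-in with the same call pattern -- the real
# implementation in the prep_jobs helper (pj) may differ -- could look like:
def partition_inputs_sketch(inputs, number):
    groups = [[] for _ in range(number)]
    for idx, item in enumerate(inputs):
        groups[idx % number].append(item)
    return groups

# e.g. partition_inputs_sketch(list(range(10)), 3) -> [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]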
def load_and_run(args):
    lag = args.lag
    save_prefix = args.save_prefix
    full_save_prefix = os.path.join(args.result_save_folder, save_prefix)

    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        genes, _ = gtm.load_basic_rep_file_list(args.data_file)
    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, _ = gtm.get_gene_TS(df)

    with open(args.bootstrap_file_with_names, 'r') as f:
        filenames = [line.split("\n")[0] for line in f.readlines()]

    if args.do_lite:
        stats_matr_dict = cp.bootstrap_matrices_iter_free(filenames)
    else:
        if args.transpose_bootstrap_folder is None:
            raise ValueError("If doing bootstrap calculation, transpose is required")

        transpose_bootstrap_folder = os.path.join(args.outer_save_folder, args.transpose_bootstrap_folder)
        if not os.path.exists(transpose_bootstrap_folder):
            os.makedirs(transpose_bootstrap_folder)
        if not os.path.exists(os.path.join(transpose_bootstrap_folder, "dump-" + str(args.length_before_dump))):
            os.makedirs(os.path.join(transpose_bootstrap_folder, "dump-" + str(args.length_before_dump)))

        transpose_prefix = os.path.join(transpose_bootstrap_folder, save_prefix)
        dump_prefix = os.path.join(transpose_bootstrap_folder, "dump-" + str(args.length_before_dump), save_prefix)

        t = time.time()
        bootstrap_coef_file_matr = transpose_bootstrap_matrices(filenames,
                                                                length_before_dump=args.length_before_dump,
                                                                save_prefix=transpose_prefix,
                                                                dump_prefix=dump_prefix)
        print("Time to transpose: ", time.time() - t)

        bootstrap_coef_filename = dump_prefix + "-NAMES.p"
        pickle.dump(bootstrap_coef_file_matr, open(bootstrap_coef_filename, 'wb'))
        print("Bootstrap coef matrix dumped to ", bootstrap_coef_filename)

        t = time.time()
        stats_matr_dict = compute_bootstrap_stats_matr(bootstrap_coef_file_matr)
        print("Time to get stats: ", time.time() - t)

    # Optionally dump the raw (possibly unaligned) statistic matrices
    if args.dump_raw:
        dump_stats_matr_dict = stats_matr_dict.copy()
        if args.unalign_before_raw_dump:
            for k in dump_stats_matr_dict:
                dump_stats_matr_dict[k] = lc.unalign_coefs(dump_stats_matr_dict[k], lag)
        for k in dump_stats_matr_dict:
            outfile = full_save_prefix + "_raw_" + k + "_coefs.p"
            with open(outfile, 'wb') as f:
                pickle.dump(dump_stats_matr_dict[k], f)
            print("For ", k, "Saved to ", outfile)

    # Align results
    if args.do_align:
        for k in stats_matr_dict:
            stats_matr_dict[k] = lc.align_coefs(stats_matr_dict[k], lag)

    # Save the gene matrices.
    # Note each entry of stats_matr_dict is of form lag x n x n
    full_nets = []
    for i in range(1, lag + 1):
        print("Lag: ", i)
        print("Aggregating results")

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = lag
        extra_dict["Coef"] = i

        nets = []
        for k in stats_matr_dict:
            raw_matr = stats_matr_dict[k][i - 1]
            matr_filename = full_save_prefix + "-" + str(i) + "-bootstrap-" + k + "-matrix.txt"
            matr = gtm.save_gene_matrix(matr_filename, matrix=raw_matr, genes=genes)
            print("Saved ", k, " to ", matr_filename)

            if k == "mean":
                net = nh.matr_to_net(matr, make_type=False,
                                     edge_name="Bootstrap:" + k.capitalize(),
                                     abs_name="AbsBootstrap:" + k.capitalize(),
                                     do_sort=False, extra_dict=extra_dict)
            else:
                net = nh.matr_to_net(matr, make_type=False,
                                     edge_name="Bootstrap:" + k.capitalize(),
                                     no_abs=True, do_sort=False, extra_dict=extra_dict)
            nets.append(net)

        full_net = nets[0]
        for j in range(1, len(nets)):
            full_net = full_net.merge(nets[j], how='outer')
        print("Final net: ", full_net.shape[0])

        sortby = "Bootstrap:Freq"
        print("Sorting by :", sortby)
        full_net.sort_values(sortby, inplace=True, ascending=False)

        full_net_filename = full_save_prefix + "-" + str(i) + "-bootstrap-network.txt"
        full_net.to_csv(full_net_filename, sep="\t", index=False)
        print("Written to ", full_net_filename)

        full_nets.append(full_net)

    union_net_filename = full_save_prefix + "-union-bootstrap-network.txt"
    if lag > 1:
        m_net = cp.get_max_network(full_nets, max_col="AbsBootstrap:Mean", index_col="Cause-Effect")
        union_net = cp.get_union_network(full_nets + [m_net],
                                         suffixes=[str(i) for i in range(1, lag + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = full_nets[0]

    sortby = "Bootstrap:Freq"
    print("Sorting by :", sortby)
    union_net.sort_values(sortby, inplace=True, ascending=False)
    union_net.to_csv(union_net_filename, sep="\t", index=False)
    print("Unioned bootstrap network written to ", union_net_filename)
def run(args): if args.test not in {"r", "l", "e"}: raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)") if args.null not in {"l", "g"}: raise ValueError("args.null must be l (local) or g (global)") # Load files data_file = args.data_file rand_data_file = args.rand_data_file if args.load_reps: genes, geneTS = gtm.load_basic_rep_file_list(data_file) #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file) else: df = pd.read_csv(data_file, sep="\t") genes, geneTS = gtm.get_gene_TS(df) n = len(genes) # Make row files # Split up the rows according to number of input scripts partition_rows = pj.partition_inputs(list(range(n)), args.script_num) row_filenames = [] print("*************") print("ROWS") print("*************") for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))): row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p") row_filenames.append(row_filename) print("Reading rows from format: ", row_filename) print("*************") print("BOOTSTRAP") print("*************") # Run the actual fit # Need an integration if not os.path.exists("bootstrap"): os.makedirs("bootstrap") # For the bootstrap individual fit scripts if not os.path.exists("bootstrap-fit-scripts"): os.makedirs("bootstrap-fit-scripts") # For the bootstrap finish scripts if not os.path.exists("bootstrap-finish-scripts"): os.makedirs("bootstrap-finish-scripts") # Finish, aggregating all the coefficients (stratification = none) if not os.path.exists(os.path.join("bootstrap-finish-scripts", "none")): os.makedirs(os.path.join("bootstrap-finish-scripts", "none")) # Finish, stratifying each coefficient by the effect gene (stratification = effect) if not os.path.exists(os.path.join("bootstrap-finish-scripts", "effect")): os.makedirs(os.path.join("bootstrap-finish-scripts", "effect")) # if args.write_all_bootstrap_scripts_first: print("WRITING ALL THE SCRIPTS INITIALLY!!!!!! NOTE the list will be written before all the files are written!!!") for b in range(args.bootstrap_num): if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))): os.makedirs(os.path.join("bootstrap-fit-scripts", str(b))) all_bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), args.output_name + "-bootstrap-" + str(b) + "-row-" + str(i) + ".sh") for b in range(args.bootstrap_num) for i in range(len(row_filenames))] print("SCRIPTS") with open("bootstrap_script_list.txt", 'w') as outfile: for bootstrap_script in all_bootstrap_scripts: outfile.write("./" + bootstrap_script + "\n") print("bootstrap scripts written to bootstrap_script_list.txt") if args.parallel_num > 0: print("Parallel Number (# processes per job): " + str(args.parallel_num)) script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = ["./" + script_filename for script_filename in script_group] parallel_script = " & ".join(appended_script_filenames) parallel_scripts.append(parallel_script) with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print("Parallel script list written to bootstrap_parallel_script_list.txt") # make one script for each... 
# all_bootstrap_scripts = set([]) all_int_coefs = [] all_int_intercepts = [] finish_none_scripts = [] finish_effect_scripts = [] # record where the thresholded coefficients are written # For integrating these, later. fdrs = [0.01, 0.05, 0.1, 0.2] all_fdr_none_coefs_dict = dict([(x, []) for x in fdrs]) all_fdr_effect_coefs_dict = dict([(x, []) for x in fdrs]) all_fdr_none_intercepts_dict = dict([(x, []) for x in fdrs]) all_fdr_effect_intercepts_dict = dict([(x, []) for x in fdrs]) try: fittimefile = os.path.join("timing", "bootstrap_fit_time.csv") if not os.path.exists(fittimefile): with open(fittimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) finishtimefile = os.path.join("timing", "bootstrap_finish_time.csv") if not os.path.exists(finishtimefile): with open(finishtimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) resulttimefile = os.path.join("timing", "bootstrap_result_time.csv") if not os.path.exists(resulttimefile): with open(resulttimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) with open(os.path.join("timing/timing_list.txt"), 'a') as f: f.write(fittimefile + "\n") f.write(finishtimefile + "\n") f.write(resulttimefile + "\n") except IOError: raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.") for b in range(args.bootstrap_num): if b % 50 == 0: print("SEED/BOOTSTRAP NUM: ", b) bootstrap_outmost_name = args.output_name + "-bootstrap-" + str(b) bootstrap_folder = os.path.join("bootstrap", str(b)) if not os.path.exists(bootstrap_folder): os.makedirs(bootstrap_folder) # print "Created folder: ", bootstrap_folder bootstrap_outmost_prefix = os.path.join(bootstrap_folder, bootstrap_outmost_name) if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))): os.makedirs(os.path.join("bootstrap-fit-scripts", str(b))) # create scripts for bootstrap bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), bootstrap_outmost_name + "-row-" + str(i) + ".sh") for i in range(len(partition_rows))] bootstrap_row_prefixes = [bootstrap_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))] command_template = "time python fit_bootstrap.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \ " -o " + "bootstrap_row_prefixes[i]" + " -bh " + \ "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \ "row_filename" + " -n " + args.null + " -s " + str(b) + " -oa " + str(args.only_array) for i, row_filename in zip(list(range(len(partition_rows))), row_filenames): # writing results to the bootstrap prefix command_string = command_template.replace("bootstrap_row_prefixes[i]", bootstrap_row_prefixes[i]).replace("row_filename", row_filename) with open(bootstrap_scripts[i], 'w') as outputfile: outputfile.write("#!/bin/bash\n") outputfile.write("START=$(date)\n") #outputfile.write("module load python/2.7\n") # outputfile.write("module load python/2.7/scipy-mkl\n") # outputfile.write("module load python/2.7/numpy-mkl\n") #outputfile.write("module load anaconda\n") outputfile.write("module load anaconda3\n") outputfile.write(command_string) outputfile.write("\n") outputfile.write("END=$(date)\n") outputfile.write("echo " + bootstrap_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n") os.chmod(bootstrap_scripts[i], 0o777) # print "Scripts made" # all_bootstrap_scripts = all_bootstrap_scripts.union(set(bootstrap_scripts)) # Note 
the output files bootstrap_coefs = [bootstrap_row_prefix + "_coefs.p" for bootstrap_row_prefix in bootstrap_row_prefixes] bootstrap_intercepts = [bootstrap_row_prefix + "_intercepts.p" for bootstrap_row_prefix in bootstrap_row_prefixes] bootstrap_results = [bootstrap_row_prefix + "_fit_result_df.txt" for bootstrap_row_prefix in bootstrap_row_prefixes] bootstrap_coefsr = [bootstrap_row_prefix + "_coefsr.p" for bootstrap_row_prefix in bootstrap_row_prefixes] bootstrap_resultsr = [bootstrap_row_prefix + "_fit_result_dfr.txt" for bootstrap_row_prefix in bootstrap_row_prefixes] bootstrap_output_dict = collections.OrderedDict() bootstrap_output_dict["coef"] = bootstrap_coefs bootstrap_output_dict["coefr"] = bootstrap_coefsr bootstrap_output_dict["intercept"] = bootstrap_intercepts # bootstrap_output_dict["interceptr"] = bootstrap_interceptsr # rand intercepts aren't put above because if it's a local null fit, then too many possible intercepts for each effect gene output_matr_df = pd.DataFrame(bootstrap_output_dict) output_matr_file = os.path.join(bootstrap_folder, bootstrap_outmost_name + "_output_matr_list.txt") output_matr_df.to_csv(output_matr_file, sep="\t", index=False) # print "Raw parallelilized output matrices, before integration, written to", output_matr_file int_matr_dict = collections.OrderedDict() int_matr_dict["coef"] = bootstrap_outmost_prefix + "_coefs.p" int_matr_dict["coefr"] = bootstrap_outmost_prefix + "_coefsr.p" int_matr_dict["intercept"] = bootstrap_outmost_prefix + "_intercepts.p" # int_matr_dict["interceptr"] = "bootstrap" + os.sep + bootstrap_outmost_name + "_interceptsr.p" # append these to the list of final bootstrapped coefficients all_int_coefs.append(int_matr_dict["coef"]) all_int_intercepts.append(int_matr_dict["intercept"]) int_matr_file = bootstrap_outmost_prefix + "_int_matr_list.txt" int_matr_df = pd.DataFrame(int_matr_dict, index=[0]) int_matr_df.to_csv(int_matr_file, sep="\t", index=False) # print "integrated matrices written to " + int_matr_file bootstrap_result_dict = collections.OrderedDict() bootstrap_result_dict["fit_result"] = bootstrap_results bootstrap_result_dict["fit_resultr"] = bootstrap_resultsr output_df_file = bootstrap_outmost_prefix + "_output_df_list.txt" output_df_df = pd.DataFrame(bootstrap_result_dict) output_df_df.to_csv(output_df_file, sep="\t", index=False) # print "output dfs file written to ", output_df_file int_df_dict = collections.OrderedDict() int_df_dict["fit_result"] = bootstrap_outmost_prefix + "_fit_result_df.txt" int_df_dict["fit_resultr"] = bootstrap_outmost_prefix + "_fit_result_dfr.txt" int_df_file = bootstrap_outmost_prefix + "_int_df_list.txt" int_df_df = pd.DataFrame(int_df_dict, index=[0]) int_df_df.to_csv(int_df_file, sep="\t", index=False) # print "Integrated dfs file written to ", int_df_file # just need to put all of this into the outmost name finish_none_script = os.path.join("bootstrap-finish-scripts", "none", "finish-none-bootstrap-" + str(b) + ".sh") with open(finish_none_script, 'w') as ifile: ifile.write("set -e\n") ifile.write("START=$(date)\n") ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + (" -t m -a 1 " if args.only_array else " -t a ")) ifile.write(" && " + \ "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n" ) ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ " -lr " + str(args.load_reps) + \ " -bh " + "hyper" + os.sep + "best_hyper.p" + \ " -o " + \ 
bootstrap_outmost_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \ " -cfr " + int_matr_dict["coefr"] + " -fr " + \ int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ " -sb " + "n" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n") ifile.write("END=$(date)\n") ifile.write("echo " + finish_none_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n") # print "Finish script, stratby None, written to", finish_none_script os.chmod(finish_none_script, 0o777) finish_none_scripts.append(finish_none_script) finish_effect_script = os.path.join("bootstrap-finish-scripts", "effect", "finish-effect-bootstrap-" + str(b) + ".sh") with open(finish_effect_script, 'w') as ifile: ifile.write("set -e\n") ifile.write("START=$(date)\n") ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + (" -t m -a 1 " if args.only_array else " -t a ")) ifile.write(" && " + \ "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n" ) ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ " -lr " + str(args.load_reps) + \ " -bh " + "hyper" + os.sep + "best_hyper.p" + \ " -o " + \ bootstrap_outmost_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \ " -cfr " + int_matr_dict["coefr"] + " -fr " + \ int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ " -sb " + "e" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n") ifile.write("END=$(date)\n") ifile.write("echo " + finish_effect_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n") # print "Finish script, stratby effect, written to", finish_effect_script os.chmod(finish_effect_script, 0o777) finish_effect_scripts.append(finish_effect_script) # get all the fdr files immediately for fdr in fdrs: all_fdr_none_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none", bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" + "-coefs.p")) all_fdr_effect_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect", bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" + "-coefs.p")) all_fdr_none_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none", bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" + "-intercepts.p")) all_fdr_effect_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect", bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" + "-intercepts.p")) # print "-----------" int_coef_file = "all_bootstrap_coefs.txt" with open(int_coef_file, 'w') as f: for b_coef in all_int_coefs: f.write(b_coef + "\n") print("All integrated bootstrapped coef files written to ", int_coef_file) int_intercept_file = "all_bootstrap_intercepts.txt" with open(int_intercept_file, 'w') as f: for b_intercept in all_int_intercepts: f.write(b_intercept + "\n") print("All integrated bootstrapped intercept files written to ", int_intercept_file) all_finish_effect_script = "finish-effect-bootstrap-all.sh" with open(all_finish_effect_script, 'w') as f: f.write("set -e\n") for s in finish_effect_scripts: f.write("./" + s + "\n") os.chmod(all_finish_effect_script, 0o777) print("All bootstrap effects scripts written to ", all_finish_effect_script) if args.parallel_num > 0: print("Parallel Number (# processes per job): 
" + str(args.parallel_num)) script_groups = pj.partition_inputs(finish_effect_scripts, number=int(math.ceil(len(finish_effect_scripts) * 1.0/args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = ["./" + script_filename for script_filename in script_group] parallel_script = " & ".join(appended_script_filenames) parallel_scripts.append(parallel_script) with open("finish-effect-bootstrap_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print("Parallel script list written to finish-effect-bootstrap_parallel_script_list.txt") all_finish_none_script = "finish-none-bootstrap-all.sh" with open(all_finish_none_script, 'w') as f: f.write("set -e\n") for s in finish_none_scripts: f.write("./" + s + "\n") os.chmod(all_finish_none_script, 0o777) print("All bootstrap nones scripts written to ", all_finish_none_script) if args.parallel_num > 0: print("Parallel Number (# processes per job): " + str(args.parallel_num)) script_groups = pj.partition_inputs(finish_none_scripts, number=int(math.ceil(len(finish_none_scripts) * 1.0/args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = ["./" + script_filename for script_filename in script_group] parallel_script = " & ".join(appended_script_filenames) parallel_scripts.append(parallel_script) with open("finish-none-bootstrap_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print("Parallel script list written to finish-none-bootstrap_parallel_script_list.txt") # integrate all the bootrastrapped FDR bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results") if not os.path.exists(bootstrap_result_folder): os.makedirs(bootstrap_result_folder) bootstrap_summary_file = "get_result_bootstrap.sh" with open(bootstrap_summary_file, 'w') as f: f.write("START=$(date)\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + int_coef_file + " -da 1"+ " -tbf " + "bootstrap-transpose" + " -uabrd 0\n") f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n") f.write("END=$(date)\n") f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(bootstrap_summary_file, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file) # integrate in a lite version bootstrap_summary_file = "get_result_bootstrap_lite.sh" with open(bootstrap_summary_file, 'w') as f: f.write("START=$(date)\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + int_coef_file + " -da 1"+ " -dl 1 -uabrd 0\n") f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n") f.write("END=$(date)\n") f.write("echo " + 
bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(bootstrap_summary_file, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file) for fdr in fdrs: print("*************************") print("Integrating bootstrap files for FDR ", fdr) print("****EFFECT***") bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-effect") if not os.path.exists(bootstrap_result_folder): os.makedirs(bootstrap_result_folder) # write the fdr file out bootstrap_fdr_effect_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-effect.txt" with open(bootstrap_fdr_effect_list_file, 'w') as f: for b_coef in all_fdr_effect_coefs_dict[fdr]: f.write(b_coef + "\n") print("All fdr effect written to ", bootstrap_fdr_effect_list_file) bootstrap_fdr_effect_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-effect.txt" with open(bootstrap_fdr_effect_intercept_list_file, 'w') as f: for b_intercept in all_fdr_effect_intercepts_dict[fdr]: f.write(b_intercept + "\n") print("All fdr effect written to ", bootstrap_fdr_effect_intercept_list_file) bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect.sh" with open(bootstrap_fdr_effect_summary_script, 'w') as f: f.write("START=$(date)\n") f.write("set -e\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + bootstrap_fdr_effect_list_file + " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-effect -uabrd 1\n") # f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n") f.write("END=$(date)\n") f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(bootstrap_fdr_effect_summary_script, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script) bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect_lite.sh" with open(bootstrap_fdr_effect_summary_script, 'w') as f: f.write("START=$(date)\n") f.write("set -e\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + bootstrap_fdr_effect_list_file + " -da 0" + " -dl 1 -uabrd 1\n") f.write("END=$(date)\n") f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(bootstrap_fdr_effect_summary_script, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script) print("-----------------------") print("****NONE***") bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-none") if not os.path.exists(bootstrap_result_folder): os.makedirs(bootstrap_result_folder) bootstrap_fdr_none_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-none.txt" with open(bootstrap_fdr_none_list_file, 'w') as f: for b_coef in all_fdr_none_coefs_dict[fdr]: f.write(b_coef + "\n") print("All fdr none written to ", bootstrap_fdr_none_list_file) 
bootstrap_fdr_none_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-none.txt" with open(bootstrap_fdr_none_intercept_list_file, 'w') as f: for b_intercept in all_fdr_none_intercepts_dict[fdr]: f.write(b_intercept + "\n") print("All fdr none written to ", bootstrap_fdr_none_intercept_list_file) bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none.sh" with open(bootstrap_fdr_none_summary_script, 'w') as f: f.write("START=$(date)\n") f.write("set -e\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-none -uabrd 1\n") f.write("END=$(date)\n") f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(bootstrap_fdr_none_summary_script, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script) bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none_lite.sh" with open(bootstrap_fdr_none_summary_script, 'w') as f: f.write("START=$(date)\n") f.write("set -e\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -dl 1 -uabrd 1\n") f.write("END=$(date)\n") f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n") os.chmod(bootstrap_fdr_none_summary_script, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script) print() print("FDR DONE ") print(" *************************************") print("SCRIPTS") with open("bootstrap_script_list.txt", 'w') as outfile: for bootstrap_script in sorted(all_bootstrap_scripts): outfile.write("./" + bootstrap_script + "\n") print("bootstrap scripts written to bootstrap_script_list.txt") if args.parallel_num > 0: print("Parallel Number (# processes per job): " + str(args.parallel_num)) script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = ["./" + script_filename for script_filename in script_group] parallel_script = " & ".join(appended_script_filenames) parallel_scripts.append(parallel_script) with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print("Parallel script list written to bootstrap_parallel_script_list.txt") print("TIMING")
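# load_and_run (below): summarize an already-integrated causal fit. It loads the coefficient and
# intercept pickles and the fit-result tables (plus their randomized-data counterparts), aligns the
# coefficients and removes the gene-on-self (alpha) terms, writes per-lag matrices and edge-list
# networks along with their union, optionally plots the coefficient distributions (args.plot_coef),
# and finally thresholds the coefficients at several FDR levels against the randomized fit.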
def load_and_run(args): lag = args.lag save_prefix = args.save_prefix assert args.stratify_by in {"e", "n"} stratify_by = cp.args2stratify_by[args.stratify_by] if args.output_folder == None: args.output_folder = "." # Load data file and prepare a file to pass to plotters if args.load_reps: # load genes, geneTS = gtm.load_basic_rep_file_list(args.data_file) genesr, geneTSr = gtm.load_basic_rep_file_list(args.rand_data_file) # dfs, genes, geneTS, df, timekeys, num_per_keys = gtm.load_rep_file_list(args.data_file) # dfsr, genesr, geneTSr, dfr, __, __ = gtm.load_rep_file_list(args.rand_data_file) # get shared prefix timekeys # print "Timekeys: ", timekeys # print "Num per key: ", num_per_keys else: df = pd.read_csv(args.data_file, sep="\t") genes, geneTS = gtm.get_gene_TS(df) dfr = pd.read_csv(args.rand_data_file, sep="\t") genesr, geneTSr = gtm.get_gene_TS(dfr) timekeys = df.columns.values[1:] print("Timekeys: ", timekeys) # Num. replicates per key num_per_keys = None assert (geneTS.shape == geneTSr.shape) assert (genes == genesr).all() coefs = pickle.load(open(args.coef_file, 'rb')) intercepts = pickle.load(open(args.intercept_file, 'rb')) fit_result_df = pd.read_csv(args.fit_result_file, sep="\t") coefsr = pickle.load(open(args.coef_rand_file, 'rb')) # interceptsr = pickle.load(open(args.intercept_rand_file, 'rb')) fit_result_dfr = pd.read_csv(args.fit_result_rand_file, sep="\t") if args.best_hyper_file != None: best_hyper = pickle.load(open(args.best_hyper_file, 'rb')) else: best_hyper = None print("RESULTS") print("*************************") print("RESIDUALS: ") print("*************************") print("NORMAL: ") cp.summarize_fit(coefs, intercepts, fit_result_df, filename=os.path.join(args.output_folder, "fit_all_summary_normal.txt"), hyper=best_hyper, test_name=args.test_name, lag=lag) # Align the coefs # print "Aligning coefficients" acoefs = lc.align_coefs(coefs, lag) acoefsr = lc.align_coefs(coefsr, lag) print("Removing alphas (gene-on-self effects) ") acoefs = lc.remove_alphas(acoefs, lag) acoefsr = lc.remove_alphas(acoefsr, lag) coef_nets = [] coefr_nets = [] # Save the gene matrices for i in range(acoefs.shape[0]): coef_matr_filename = os.path.join( args.output_folder, save_prefix + "-" + str(i + 1) + "-matrix.txt") coefr_matr_filename = os.path.join( args.output_folder, save_prefix + "-" + str(i + 1) + "-r-matrix.txt") coef_net_filename = os.path.join( args.output_folder, save_prefix + "-" + str(i + 1) + "-network.txt") coefr_net_filename = os.path.join( args.output_folder, save_prefix + "-" + str(i + 1) + "-r-network.txt") coef_matr = gtm.save_gene_matrix(filename=coef_matr_filename, matrix=acoefs[i], genes=genes) coefr_matr = gtm.save_gene_matrix(filename=coefr_matr_filename, matrix=acoefsr[i], genes=genes) extra_dict = collections.OrderedDict() extra_dict["Test"] = args.test_name extra_dict["Lag"] = acoefs.shape[0] extra_dict["Coef"] = i + 1 coef_net = nh.matr_to_net(coef_matr, extra_dict=extra_dict, make_type=False) coefr_net = nh.matr_to_net(coefr_matr, extra_dict=extra_dict, make_type=False) coef_net.to_csv(coef_net_filename, sep="\t", index=False) coefr_net.to_csv(coefr_net_filename, sep="\t", index=False) coef_nets.append(coef_net) coefr_nets.append(coefr_net) print("Coef ", i + 1) print("Networks written to ") print(coef_net_filename) print(coefr_net_filename) # max_net_filename = save_prefix + "-max-network.txt" # max_r_net_filename = save_prefix + "-max-r-network.txt" union_net_filename = os.path.join(args.output_folder, save_prefix + "-union-network.txt") 
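# When more than one lag was fit, a max-|weight| network (one edge per Cause-Effect pair) is built
# and merged with the per-lag networks into a single union network; with a single lag the lone
# per-lag network is written out directly.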
union_r_net_filename = os.path.join(args.output_folder, save_prefix + "-union-r-network.txt") if acoefs.shape[0] > 1: m_net = cp.get_max_network(coef_nets, max_col="AbsWeight", index_col="Cause-Effect") union_net = cp.get_union_network( coef_nets + [m_net], suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""]) print("Max network edges: ", m_net.shape) print("Union network edges: ", union_net.shape) else: union_net = coef_nets[0] union_net.to_csv(union_net_filename, sep="\t", index=False) if acoefsr.shape[0] > 1: m_net = cp.get_max_network(coefr_nets, max_col="AbsWeight", index_col="Cause-Effect") union_r_net = cp.get_union_network( coefr_nets + [m_net], suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""]) else: union_r_net = coefr_nets[0] union_r_net.to_csv(union_r_net_filename, sep="\t", index=False) # print "Max networks written to " # print max_net_filename # print max_r_net_filename print("Unioned networks written to ") print(union_net_filename) print(union_r_net_filename) if not os.path.exists(os.path.join(args.output_folder, "plots")): os.makedirs(os.path.join(args.output_folder, "plots")) if args.plot_coef: if not os.path.exists( os.path.join(args.output_folder, "plots", "betas")): os.makedirs(os.path.join(args.output_folder, "plots", "betas")) # Plot the betas for i in range(acoefs.shape[0]): if len(np.nonzero(acoefs[i])[0]) > 0 and len( np.nonzero(acoefsr[i])[0]) > 0: fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(), acoefsr[i][np.nonzero(acoefsr[i])].flatten(), filename=os.path.join( args.output_folder, "plots", "betas", "beta_nonzero_coef-" + str(i + 1)), title="Causal coefs, Coef " + str(i + 1), xlabel="Causal Coefficient") fc.plot_betas( acoefs[i][np.nonzero(acoefs[i])].flatten(), acoefsr[i][np.nonzero(acoefsr[i])].flatten(), filename=os.path.join( args.output_folder, "plots", "betas", "beta_nonzero_coef-" + str(i + 1) + "_zoom-in-90"), zoom_in_top_percentile=95, zoom_in_bottom_percentile=5, title="Causal coefs, Coef " + str(i + 1), xlabel="Causal Coefficient") fc.plot_betas( np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()), np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()), filename=os.path.join(args.output_folder, "plots", "betas", "beta_abs_coef-" + str(i + 1)), title="Absolute causal coefs, Coef " + str(i + 1), xlabel="Absolute Causal Coefficient") fc.plot_betas( np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()), np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()), filename=os.path.join( args.output_folder, "plots", "betas", "beta_abs_coef-" + str(i + 1) + "_zoom-in-bottom-95"), zoom_in_top_percentile=95, title="Absolute causal coefs, Coef " + str(i + 1), xlabel="Absolute Causal Coefficient") fc.plot_betas( np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()), np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()), filename=os.path.join( args.output_folder, "plots", "betas", "beta_abs_coef-" + str(i + 1) + "_zoom-in-top-5"), zoom_in_bottom_percentile=95, title="Absolute causal coefs, Coef " + str(i + 1), xlabel="Absolute Causal Coefficient") print("Coef ", i + 1) print("Plots of betas written to: ", os.path.join(args.output_folder, "plots", "betas")) # get FDRS fdrs = [0.01, 0.05, 0.1, 0.2] acoefs_fdrs = [] sf_dfs = [] for fdr in fdrs: fdr_dir = os.path.join(args.output_folder, "fdr-" + str(fdr) + "-" + stratify_by) if not os.path.exists(fdr_dir): os.makedirs(fdr_dir) fdr_prefix = fdr_dir + os.sep + save_prefix # in case we want there to be an intermediate directory for fdr, like the bootstrap case. 
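# For each lag, fc.get_abs_thresh compares the real coefficients against the randomized-data
# coefficients and returns a thresholded copy of the matrix plus the threshold(s) used to hit the
# target FDR, computed per effect gene when stratify_by is "effect" (the exact thresholding rule
# lives in the fc module).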
# if not os.path.exists(os.path.dirname(fdr_prefix)): # os.makedirs(os.path.dirname(fdr_prefix)) acoefs_fdr = np.zeros(acoefs.shape) fdr_nets = [] print("*************") for i in range(acoefs.shape[0]): print("-----") print("FDR = ", fdr) print("Lag ", lag) print("Coef ", i + 1) print("Stratify ", stratify_by) acoefs_fdr[i], threshes = fc.get_abs_thresh( acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by) # print "Threshes", threshes fdr_matr_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str( fdr) + "-" + stratify_by + "-matrix.txt" fdr_net_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str( fdr) + "-" + stratify_by + "-network.txt" fdr_matr = gtm.save_gene_matrix(fdr_matr_filename, matrix=acoefs_fdr[i], genes=genes) pickle.dump( threshes, open( fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" + stratify_by + "-threshes.p", 'wb')) extra_dict = collections.OrderedDict() extra_dict["Test"] = args.test_name extra_dict["Lag"] = acoefs.shape[0] extra_dict["Coef"] = i + 1 fdr_net = nh.matr_to_net(fdr_matr, extra_dict=extra_dict, make_type=False) fdr_net.to_csv(fdr_net_filename, sep="\t", index=False) fdr_nets.append(fdr_net) # write summary readme sf_df = fc.summarize_fdr(matr=acoefs_fdr[i], test=args.test_name, fdr=fdr, lag=lag, coef=i + 1, hyper=best_hyper, thresh=threshes, readme_name=fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" + stratify_by + "-README.txt", matrixname=fdr_matr_filename, filename=fdr_net_filename) sf_dfs.append(sf_df) print("Network edges: ", fdr_net.shape[0]) if acoefs_fdr.shape[0] > 1: m_net = cp.get_max_network(fdr_nets, max_col="AbsWeight", index_col="Cause-Effect") union_net = cp.get_union_network( fdr_nets + [m_net], suffixes=[str(i) for i in range(1, acoefs_fdr.shape[0] + 1)] + [""]) else: union_net = fdr_nets[0] union_net_filename = fdr_prefix + "-union-fdr-" + str( fdr) + "-" + stratify_by + "-network.txt" union_net.to_csv(union_net_filename, sep="\t", index=False) print("Union network edges", union_net.shape[0]) print("Written to ", union_net_filename) fdr_agg_matr_filename = fdr_prefix + "-union-fdr-" + str( fdr) + "-" + stratify_by + "-coefs.p" pickle.dump(acoefs_fdr, open(fdr_agg_matr_filename, 'wb')) print("Thresholded matrix written as pickle file: ", fdr_agg_matr_filename) acoefs_fdrs.append(acoefs_fdr.copy()) all_sf_dfs = pd.concat(sf_dfs) # Keep the base summary name fit_all_summary_fdr-stratby.txt; the bootstrap case writes to its own file in its own corresponding folder, so we just send in the output folder save_file = os.path.join(args.output_folder, "fit_all_summary_fdr-" + stratify_by + ".txt") all_sf_dfs.to_csv(save_file, sep="\t", index=False) print("********") print("Summaries of all fdrs written to ", save_file) print("Matrices done.") with open(os.path.join(args.output_folder, "matrices_done.txt"), 'w') as donefile: donefile.write("done\n")
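# The run() below prepares the pairwise fit jobs: it partitions the gene row indices across
# args.script_num chunks, writes one fit_pairwise.py shell script per chunk (plus an optional
# parallel grouping of those scripts), records where each chunk's coefficient pickle will land,
# and writes a finish.sh that integrates the per-chunk outputs with integrate_outputs_rand_row.py.
# It assumes the timing/ folder (and, apparently, the rows/ pickles it references) were created by
# an earlier prep step such as prep_jobs_rand_cv.sh, per the IOError message below.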
def run(args): data_file = args.data_file if args.load_reps: genes, geneTS = gtm.load_basic_rep_file_list(data_file) #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file) else: df = pd.read_csv(data_file, sep="\t") genes, geneTS = gtm.get_gene_TS(df) n = len(genes) # Make row files # Split up the rows according to number of input scripts partition_rows = pj.partition_inputs(list(range(n)), args.script_num) row_filenames = [] print("*************") print("ROWS") print("*************") for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))): row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p") row_filenames.append(row_filename) print("Reading rows from format: ", row_filename) print("*************") print("PAIRWISE") print("*************") # Run the actual fit # Need an integration if not os.path.exists("pairwise"): os.makedirs("pairwise") # For the pairwise individual fit scripts if not os.path.exists("pairwise-fit-scripts"): os.makedirs("pairwise-fit-scripts") # For the pairwise finish scripts if not os.path.exists("pairwise-finish-scripts"): os.makedirs("pairwise-finish-scripts") pairwise_result_folder = os.path.join("pairwise", "pairwise-results") if not os.path.exists(pairwise_result_folder): os.makedirs(pairwise_result_folder) # make one script for each... # all_bootstrap_scripts = set([]) # all_int_coefs = [] # all_int_intercepts = [] # record where the thresholded coefficients are written # For integrating these, later. try: fittimefile = os.path.join("timing", "pairwise_fit_time.csv") if not os.path.exists(fittimefile): with open(fittimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) finishtimefile = os.path.join("timing", "pairwise_finish_time.csv") if not os.path.exists(finishtimefile): with open(finishtimefile, 'w') as csvfile: f = csv.writer(csvfile) f.writerow(["Name", "Start", "End", "Elapsed"]) # resulttimefile = os.path.join("timing", "bootstrap_result_time.csv") # if not os.path.exists(resulttimefile): # with open(resulttimefile, 'w') as csvfile: # f = csv.writer(csvfile) # f.writerow(["Name", "Start", "End", "Elapsed"]) with open(os.path.join("timing/timing_list.txt"), 'a') as f: f.write(fittimefile + "\n") f.write(finishtimefile + "\n") # f.write(resulttimefile + "\n") except IOError: raise IOError("the timing folder does not exist. 
Please run ./prep_jobs_rand_cv.sh first.") pairwise_outmost_name = args.output_name + "-pairwise" pairwise_outmost_prefix = os.path.join("pairwise", pairwise_outmost_name) # create scripts for pairwise pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh") for i in range(len(partition_rows))] pairwise_row_prefixes = [pairwise_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))] command_template = "time python fit_pairwise.py -d " + data_file + " -lr " + str(args.load_reps) + \ " -o " + "pairwise_row_prefixes[i]" + " -l " + str(args.lag) + " -rl " + \ "row_filename" for i, row_filename in zip(list(range(len(partition_rows))), row_filenames): # writing results to the pairwise prefix command_string = command_template.replace("pairwise_row_prefixes[i]", pairwise_row_prefixes[i]).replace("row_filename", row_filename) with open(pairwise_scripts[i], 'w') as outputfile: outputfile.write("#!/bin/bash\n") outputfile.write("START=$(date)\n") outputfile.write("module load python/2.7\n") # outputfile.write("module load python/2.7/scipy-mkl\n") # outputfile.write("module load python/2.7/numpy-mkl\n") outputfile.write("module load anaconda\n") outputfile.write(command_string) outputfile.write("\n") outputfile.write("END=$(date)\n") outputfile.write("echo " + pairwise_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n") os.chmod(pairwise_scripts[i], 0o777) print("Scripts made") # all_pairwise_scripts = all_pairwise_scripts.union(set(pairwise_scripts)) # Note the output files pairwise_coefs = [pairwise_row_prefix + "_coefs.p" for pairwise_row_prefix in pairwise_row_prefixes] pairwise_output_dict = collections.OrderedDict() pairwise_output_dict["coef"] = pairwise_coefs output_matr_df = pd.DataFrame(pairwise_output_dict) output_matr_file = os.path.join("pairwise", pairwise_outmost_name + "_output_matr_list.txt") output_matr_df.to_csv(output_matr_file, sep="\t", index=False) print("Raw parallelilized output matrices, before integration, written to", output_matr_file) int_matr_dict = collections.OrderedDict() int_matr_dict["coef"] = os.path.join(pairwise_result_folder, pairwise_outmost_name + "_coefs.p") # # append these to the list of final bootstrapped coefficients # all_int_coefs.append(int_matr_dict["coef"]) # all_int_intercepts.append(int_matr_dict["intercept"]) int_matr_file = pairwise_outmost_prefix + "_int_matr_list.txt" int_matr_df = pd.DataFrame(int_matr_dict, index=[0]) int_matr_df.to_csv(int_matr_file, sep="\t", index=False) print("integrated matrices written to " + int_matr_file) # just need to put all of this into the outmost name all_pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh") for i in range(len(partition_rows))] print("SCRIPTS") with open("pairwise_script_list.txt", 'w') as outfile: for pairwise_script in all_pairwise_scripts: outfile.write("./" + pairwise_script + "\n") print("pairwise scripts written to pairwise_script_list.txt") if args.parallel_num > 0: print("Parallel Number (# processes per job): " + str(args.parallel_num)) script_groups = pj.partition_inputs(all_pairwise_scripts, number=int(math.ceil(len(all_pairwise_scripts) * 1.0/args.parallel_num))) print("Number of script groups ", len(script_groups)) parallel_scripts = [] for i, script_group in zip(list(range(len(script_groups))), script_groups): appended_script_filenames = ["./" + script_filename for script_filename in script_group] parallel_script = " & ".join(appended_script_filenames) 
parallel_scripts.append(parallel_script) with open("pairwise_parallel_script_list.txt", 'w') as scriptfile: for parallel_script in parallel_scripts: scriptfile.write(parallel_script + "\n") print("Parallel script list written to pairwise_parallel_script_list.txt") finish_script = os.path.join("pairwise-finish-scripts", "finish.sh") with open(finish_script, 'w') as ifile: ifile.write("set -e\n") ifile.write("START=$(date)\n") ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + " -t a \n") ifile.write("END=$(date)\n") ifile.write("echo " + finish_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n") print("Finish script, written to", finish_script) os.chmod(finish_script, 0o777)
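# The parallel-script lists here and above all use the same idiom: pj.partition_inputs splits a
# script list into ceil(N / args.parallel_num) groups, and each group is joined with " & " so its
# scripts run concurrently within a single job. A minimal sketch of the assumed splitting behavior
# (illustrative only; the real implementation lives in the pj module):
#
#     def partition_inputs(items, number):
#         # split items into `number` nearly equal contiguous chunks
#         k, m = divmod(len(items), number)
#         return [items[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(number)]
#
#     partition_inputs(list(range(5)), 2)  # -> [[0, 1, 2], [3, 4]]
#
# The run() below appears intended to validate the arguments, re-create the bootstrap folder
# layout, and regenerate only the result-summary scripts (get_result_bootstrap*.sh); the
# per-bootstrap fit/finish script generation it once performed is retained as the commented-out
# block inside it.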
def run(args): if args.test not in {"r", "l", "e"}: raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)") if args.null not in {"l", "g"}: raise ValueError("args.null must be l (local) or g (global)") # Load files data_file = args.data_file rand_data_file = args.rand_data_file if args.load_reps: genes, geneTS = gtm.load_basic_rep_file_list(data_file) #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file) else: df = pd.read_csv(data_file, sep="\t") genes, geneTS = gtm.get_gene_TS(df) n = len(genes) # Make row files # Split up the rows according to number of input scripts partition_rows = pj.partition_inputs(list(range(n)), args.script_num) row_filenames = [] print("*************") print("ROWS") print("*************") for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))): row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p") row_filenames.append(row_filename) print("Reading rows from format: ", row_filename) print("*************") print("BOOTSTRAP") print("*************") # Run the actual fit # Need an integration if not os.path.exists("bootstrap"): os.makedirs("bootstrap") # For the bootstrap individual fit scripts if not os.path.exists("bootstrap-fit-scripts"): os.makedirs("bootstrap-fit-scripts") # For the bootstrap finish scripts if not os.path.exists("bootstrap-finish-scripts"): os.makedirs("bootstrap-finish-scripts") # Finish, aggregating all the coefficients (stratification = none) if not os.path.exists(os.path.join("bootstrap-finish-scripts", "none")): os.makedirs(os.path.join("bootstrap-finish-scripts", "none")) # Finish, stratifying each coefficient by the effect gene (stratification = effect) if not os.path.exists(os.path.join("bootstrap-finish-scripts", "effect")): os.makedirs(os.path.join("bootstrap-finish-scripts", "effect")) # if args.write_all_bootstrap_scripts_first: # # print "WRITING ALL THE SCRIPTS INITIALLY!!!!!! NOTE the list will be written before all the files are written!!!" # # for b in range(args.bootstrap_num): # if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))): # os.makedirs(os.path.join("bootstrap-fit-scripts", str(b))) # # all_bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), args.output_name + "-bootstrap-" + str(b) + "-row-" + str(i) + ".sh") # for b in range(args.bootstrap_num) for i in range(len(row_filenames))] # print "SCRIPTS" # # with open("bootstrap_script_list.txt", 'w') as outfile: # for bootstrap_script in all_bootstrap_scripts: # outfile.write("./" + bootstrap_script + "\n") # print "bootstrap scripts written to bootstrap_script_list.txt" # # if args.parallel_num > 0: # print "Parallel Number (# processes per job): " + str(args.parallel_num) # # script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num))) # # print "Number of script groups ", len(script_groups) # # parallel_scripts = [] # for i, script_group in zip(range(len(script_groups)), script_groups): # appended_script_filenames = ["./" + script_filename for script_filename in script_group] # parallel_script = " & ".join(appended_script_filenames) # parallel_scripts.append(parallel_script) # # with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile: # for parallel_script in parallel_scripts: # scriptfile.write(parallel_script + "\n") # print "Parallel script list written to bootstrap_parallel_script_list.txt" # make one script for each... 
# # all_bootstrap_scripts = set([]) # # all_int_coefs = [] # # finish_none_scripts = [] # finish_effect_scripts = [] # record where the thresholded coefficients are written # For integrating these, later. fdrs = [0.01, 0.05, 0.1, 0.2] # all_fdr_none_coefs_dict = dict([(x, []) for x in fdrs]) # all_fdr_effect_coefs_dict = dict([(x, []) for x in fdrs]) # for b in range(args.bootstrap_num): # print "SEED/BOOTSTRAP NUM: ", b # # bootstrap_outmost_name = args.output_name + "-bootstrap-" + str(b) # # bootstrap_folder = os.path.join("bootstrap", str(b)) # if not os.path.exists(bootstrap_folder): # os.makedirs(bootstrap_folder) # print "Created folder: ", bootstrap_folder # bootstrap_outmost_prefix = os.path.join(bootstrap_folder, bootstrap_outmost_name) # # if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))): # os.makedirs(os.path.join("bootstrap-fit-scripts", str(b))) # # # # create scripts for bootstrap # bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), bootstrap_outmost_name + "-row-" + str(i) + ".sh") # for i in range(len(partition_rows))] # bootstrap_row_prefixes = [bootstrap_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))] # # command_template = "time python fit_bootstrap.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \ # " -o " + "bootstrap_row_prefixes[i]" + " -bh " + \ # "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \ # "row_filename" + " -n " + args.null + " -s " + str(b) + " -oa " + str(args.only_array) # # for i, row_filename in zip(range(len(partition_rows)), row_filenames): # # # writing results to the bootstrap prefix # # command_string = command_template.replace("bootstrap_row_prefixes[i]", bootstrap_row_prefixes[i]).replace("row_filename", row_filename) # # with open(bootstrap_scripts[i], 'w') as outputfile: # outputfile.write("#!/bin/bash\nmodule load python/2.7\nmodule load python/2.7/scipy-mkl\nmodule load python/2.7/numpy-mkl\nmodule load anaconda\n") # outputfile.write(command_string + "\n") # os.chmod(bootstrap_scripts[i], 0777) # # # print "Scripts made" # # # all_bootstrap_scripts = all_bootstrap_scripts.union(set(bootstrap_scripts)) # # # Note the output files # # bootstrap_coefs = [bootstrap_row_prefix + "_coefs.p" for bootstrap_row_prefix in bootstrap_row_prefixes] # bootstrap_intercepts = [bootstrap_row_prefix + "_intercepts.p" for bootstrap_row_prefix in bootstrap_row_prefixes] # bootstrap_results = [bootstrap_row_prefix + "_fit_result_df.txt" for bootstrap_row_prefix in bootstrap_row_prefixes] # bootstrap_coefsr = [bootstrap_row_prefix + "_coefsr.p" for bootstrap_row_prefix in bootstrap_row_prefixes] # bootstrap_resultsr = [bootstrap_row_prefix + "_fit_result_dfr.txt" for bootstrap_row_prefix in bootstrap_row_prefixes] # # bootstrap_output_dict = collections.OrderedDict() # bootstrap_output_dict["coef"] = bootstrap_coefs # bootstrap_output_dict["coefr"] = bootstrap_coefsr # bootstrap_output_dict["intercept"] = bootstrap_intercepts # # bootstrap_output_dict["interceptr"] = bootstrap_interceptsr # # rand intercepts aren't put above because if it's a local null fit, then too many possible intercepts for each effect gene # # output_matr_df = pd.DataFrame(bootstrap_output_dict) # output_matr_file = os.path.join(bootstrap_folder, bootstrap_outmost_name + "_output_matr_list.txt") # output_matr_df.to_csv(output_matr_file, sep="\t", index=False) # print "Raw parallelilized output matrices, before integration, written to", output_matr_file # 
# # # # int_matr_dict = collections.OrderedDict() # int_matr_dict["coef"] = bootstrap_outmost_prefix + "_coefs.p" # int_matr_dict["coefr"] = bootstrap_outmost_prefix + "_coefsr.p" # int_matr_dict["intercept"] = bootstrap_outmost_prefix + "_intercepts.p" # # int_matr_dict["interceptr"] = "bootstrap" + os.sep + bootstrap_outmost_name + "_interceptsr.p" # # # append these to the list of final bootstrapped coefficients # all_int_coefs.append(int_matr_dict["coef"]) # # int_matr_file = bootstrap_outmost_prefix + "_int_matr_list.txt" # int_matr_df = pd.DataFrame(int_matr_dict, index=[0]) # int_matr_df.to_csv(int_matr_file, sep="\t", index=False) # print "integrated matrices written to " + int_matr_file # # # bootstrap_result_dict = collections.OrderedDict() # bootstrap_result_dict["fit_result"] = bootstrap_results # bootstrap_result_dict["fit_resultr"] = bootstrap_resultsr # # # # output_df_file = bootstrap_outmost_prefix + "_output_df_list.txt" # output_df_df = pd.DataFrame(bootstrap_result_dict) # output_df_df.to_csv(output_df_file, sep="\t", index=False) # print "output dfs file written to ", output_df_file # # int_df_dict = collections.OrderedDict() # int_df_dict["fit_result"] = bootstrap_outmost_prefix + "_fit_result_df.txt" # int_df_dict["fit_resultr"] = bootstrap_outmost_prefix + "_fit_result_dfr.txt" # # int_df_file = bootstrap_outmost_prefix + "_int_df_list.txt" # int_df_df = pd.DataFrame(int_df_dict, index=[0]) # int_df_df.to_csv(int_df_file, sep="\t", index=False) # print "Integrated dfs file written to ", int_df_file # # # # # just need to put all of this into the outmost name # # # finish_none_script = os.path.join("bootstrap-finish-scripts", "none", "finish-none-bootstrap-" + str(b) + ".sh") # with open(finish_none_script, 'w') as ifile: # ifile.write("set -e\n") # ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + (" -t m -a 1 " if args.only_array else " -t a ")) # ifile.write(" && " + \ # "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n" # ) # ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ # " -lr " + str(args.load_reps) + \ # " -bh " + "hyper" + os.sep + "best_hyper.p" + \ # " -o " + \ # bootstrap_outmost_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \ # " -cfr " + int_matr_dict["coefr"] + " -fr " + \ # int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ # " -sb " + "n" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n") # print "Finish script, stratby None, written to", finish_none_script # os.chmod(finish_none_script, 0777) # # finish_none_scripts.append(finish_none_script) # # # finish_effect_script = os.path.join("bootstrap-finish-scripts", "effect", "finish-effect-bootstrap-" + str(b) + ".sh") # with open(finish_effect_script, 'w') as ifile: # ifile.write("set -e\n") # ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + (" -t m -a 1 " if args.only_array else " -t a ")) # ifile.write(" && " + \ # "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n" # ) # ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\ # " -lr " + str(args.load_reps) + \ # " -bh " + "hyper" + os.sep + "best_hyper.p" + \ # " -o " + \ # bootstrap_outmost_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + 
\ # " -cfr " + int_matr_dict["coefr"] + " -fr " + \ # int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \ # " -sb " + "e" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n") # print "Finish script, stratby effect, written to", finish_effect_script # os.chmod(finish_effect_script, 0777) # # finish_effect_scripts.append(finish_effect_script) # get all the fdr files immediately # for fdr in fdrs: # all_fdr_none_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none", # bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" + "_coefs.p")) # all_fdr_effect_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect", # bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" + "_coefs.p")) # print "-----------" int_coef_file = "all_bootstrap_coefs.txt" # integrate all the bootrastrapped FDR bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results") if not os.path.exists(bootstrap_result_folder): os.makedirs(bootstrap_result_folder) bootstrap_summary_file = "get_result_bootstrap.sh" with open(bootstrap_summary_file, 'w') as f: f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -o " + os.path.join(bootstrap_result_folder, args.output_name) + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + int_coef_file + " -da 1") os.chmod(bootstrap_summary_file, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file) for fdr in fdrs: print("*************************") print("Integrating bootstrap files for FDR ", fdr) print("****EFFECT***") bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-effect") if not os.path.exists(bootstrap_result_folder): os.makedirs(bootstrap_result_folder) # write the fdr file out bootstrap_fdr_effect_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-effect.txt" # with open(bootstrap_fdr_effect_list_file, 'w') as f: # for b_coef in all_fdr_effect_coefs_dict[fdr]: # f.write(b_coef + "\n") # # print "All fdr effect written to ", bootstrap_fdr_effect_list_file bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect.sh" with open(bootstrap_fdr_effect_summary_script, 'w') as f: f.write("set -e\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \ " -o " + os.path.join(bootstrap_result_folder, args.output_name) + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + bootstrap_fdr_effect_list_file + " -da 0") os.chmod(bootstrap_fdr_effect_summary_script, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script) print("-----------------------") print("****NONE***") bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-none") if not os.path.exists(bootstrap_result_folder): os.makedirs(bootstrap_result_folder) bootstrap_fdr_none_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-none.txt" # with open(bootstrap_fdr_none_list_file, 'w') as f: # for b_coef in all_fdr_none_coefs_dict[fdr]: # f.write(b_coef + "\n") # # print "All fdr none written to ", bootstrap_fdr_none_list_file bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none.sh" with open(bootstrap_fdr_none_summary_script, 'w') as f: f.write("set -e\n") f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + 
str(args.load_reps) + \ " -o " + os.path.join(bootstrap_result_folder, args.output_name) + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \ " -b " + bootstrap_fdr_none_list_file + " -da 0") os.chmod(bootstrap_fdr_none_summary_script, 0o777) print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script) print() print("FDR DONE ") print(" *************************************")
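# Note on intended usage (inferred from the file dependencies above, not stated explicitly): the
# per-row fit scripts (bootstrap_script_list.txt / *_parallel_script_list.txt) run first, the
# finish/integration scripts run once those outputs exist, and the get_result_bootstrap*.sh
# summaries run last over the integrated coefficient lists.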