def get_cause_plot_triples(cause2effects, sort_dict=None):
    """Group each cause with small batches of its effects for plotting.

    :param cause2effects: dict mapping each cause to an iterable of effects
    :param sort_dict: optional dict giving a sort key per effect; when
        provided, effects are sorted by it in descending order
    :return: list of lists, each of the form [cause, effect, ...], where the
        effects of each cause are split into roughly 2-element batches
    """
    plot_triples_list = []
    for cause, raw_effects in cause2effects.items():
        # BUG FIX: the original indexed sort_dict unconditionally, raising
        # TypeError whenever the documented default (sort_dict=None) was used.
        if sort_dict is not None:
            effects = sorted(raw_effects, key=lambda entry: sort_dict[entry],
                             reverse=True)
        else:
            effects = list(raw_effects)
        # Split effects into ~half as many groups as effects (i.e. groups of
        # ~2) so each plot stays readable.
        # NOTE(review): round() on *.5 differs between Python 2 and 3; verify
        # the intended group count for odd-length inputs.
        effect_list = pj.partition_inputs(list(effects),
                                          int(round(len(effects) / 2.0)))
        plot_triples_list.extend([[cause] + e for e in effect_list])
    print("Plot triples: ")
    print(plot_triples_list[0:20])
    return plot_triples_list
def get_cause_plot_triples(cause2effects, sort_dict=None):
    """Build plot groups, one [cause, effect, ...] list per batch of effects.

    :param cause2effects: dict mapping each cause to an iterable of effects
    :param sort_dict: optional dict of sort keys for the effects; effects are
        sorted by it in descending order when it is given
    :return: list of plot groups; each starts with the cause, followed by a
        batch (~2) of its effects
    """
    plot_triples_list = []
    for cause in cause2effects:
        candidates = cause2effects[cause]
        # BUG FIX: sort_dict defaults to None but was indexed unconditionally,
        # which raised TypeError for the default call get_cause_plot_triples(d).
        if sort_dict is None:
            effects = list(candidates)
        else:
            effects = sorted(candidates, key=lambda entry: sort_dict[entry],
                             reverse=True)
        # Partition into len(effects)/2 groups -> batches of about two effects.
        effect_list = pj.partition_inputs(list(effects),
                                          int(round(len(effects) / 2.0)))
        plot_triples_list.extend([[cause] + e for e in effect_list])
    print("Plot triples: ")
    print(plot_triples_list[0:20])
    return plot_triples_list
def run(args):
    """Generate one shell script per row partition that runs
    run_causal_rand_row.py, plus the bookkeeping files a later
    integration / FDR-control step consumes.

    Files written in the current directory:
      * <output_name>-script-<i>.sh and <output_name>-row-<i>.txt per partition
      * script_list.txt, output_matr_list.txt, int_matr_list.txt
      * output_df_list.txt / int_df_list.txt (elastic-net test only)
      * integrate_outputs.sh and fdr_control.sh
      * parallel_script_list.txt when args.parallel_num > 0

    :param args: argparse namespace; uses data_file, rand_data_file,
        args_file, test, job_num, output_name, fdr, coef_num, parallel_num.

    NOTE(review): Python 2 syntax (print statements, 0777 octal literal).
    A Python 3 run() also exists in this file and would shadow this one
    if both live in the same module — confirm which is intended.
    """
    # Scripts reference the data files by basename only: the generated jobs
    # are expected to run from the directory holding the data.
    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]
    df = gtm.load_file_and_avg(data_file)
    genes = df['gene'].values
    n = len(genes)
    script_filenames = []
    output_filenames = []
    output_rand_filenames = []
    # Only the elastic-net test ("e") produces the extra parameter files;
    # the None sentinels mark them as absent for the other tests.
    if args.test == "e":
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None
    # Split the n gene rows into args.job_num chunks: one job per chunk.
    partition_rows = pj.partition_inputs(range(n), args.job_num)
    for partition_row, i in zip(partition_rows, range(len(partition_rows))):
        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)
        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)
        output_rand_filename = args.output_name + "-randomized-" + str(i) + ".p"
        output_rand_filenames.append(output_rand_filename)
        # prepare the job associated with this
        row_filename = args.output_name + "-row-" + str(i) + ".txt"
        command_string = "python run_causal_rand_row.py -d " + data_file + " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename
        if args.test == "e":
            # Elastic net also records all / used hyperparameter files, both
            # for the real and the randomized data.
            all_res_filename = args.output_name + "-all-params-" + str(i) + ".txt"
            all_res_filenames.append(all_res_filename)
            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)
            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)
            use_rand_filename = args.output_name + "-used-params-randomized-" + str(i) + ".txt"
            use_rand_filenames.append(use_rand_filename)
            command_string += " -oa " + all_res_filename + " -ou " + use_filename + " -ora " + all_res_rand_filename + " -oru " + use_rand_filename
        # Record which gene rows this job is responsible for.
        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")
        print "Partition row written to ", row_filename
        # The job script itself: module loads, then the command.
        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)
        print "Script written to ", script_filename
    # Names of the merged files the integration step will produce.
    integrated_name_dict = {}
    integrated_name_dict["Output"] = args.output_name + ".p"
    integrated_name_dict["Rand-Output"] = args.output_name + "-randomized.p"
    integrated_name_dict["All-Params"] = args.output_name + "-all-params.txt"
    integrated_name_dict["Use-Params"] = args.output_name + "-use-params.txt"
    integrated_name_dict["All-Rand-Params"] = args.output_name + "-all-params-randomized.txt"
    integrated_name_dict["Use-Rand-Params"] = args.output_name + "-use-params-randomized.txt"
    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
    print "Script list written to script_list.txt"
    # list of matrices to integrate
    output_matr_dict = {"Output": output_filenames, "Rand-Output": output_rand_filenames}
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print "Output matrices written to output_matr_list.txt"
    int_matr_dict = dict([(x, integrated_name_dict[x]) for x in ["Output", "Rand-Output"]])
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print "integrated matrices written to int_matr_list.txt"
    if args.test == "e":
        # lists of dataframes (param files) to integrate
        # These will only be integrated if
        output_df_dict = {}
        output_df_lists = [all_res_filenames, use_filenames, all_res_rand_filenames, use_rand_filenames]
        output_df_names = ["All-Params", "Use-Params",
                           "All-Rand-Params", "Use-Rand-Params"]
        # Keep only the lists that were actually populated (non-None).
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list != None:
                output_df_dict[out_name] = out_list
        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print "output dfs written to output_df_list.txt"
        int_df_dict = dict([(x, integrated_name_dict[x]) for x in set(output_df_names).intersection(output_df_dict.keys())])
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print "Integrated dfs written to int_df_list.txt"
    # One-shot script that merges the per-partition outputs.
    with open("integrate_outputs.sh", 'w') as ifile:
        if args.test == "e":
            # here , "a" means the axis to integrate by
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1 && " + \
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")
        else:
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1\n")
    print "Integration script written to integrate_outputs.sh"
    os.chmod("integrate_outputs.sh", 0777)
    # FDR-control script: the same command is run with both stratification
    # modes ("-sb e" and "-sb n").
    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                     " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                     " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
    print "FDR CONTROL script written to fdr_control.sh"
    os.chmod("fdr_control.sh", 0777)
    if args.parallel_num > 0:
        # Batch the job scripts into groups; each line of the parallel list
        # joins one group with " & " so its members run concurrently.
        print "Parallel Number (# processes per job): " + str(args.parallel_num)
        script_groups = pj.partition_inputs(script_filenames, number=int(math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))
        print "Number of script groups ", len(script_groups)
        parallel_scripts = []
        for i, script_group in zip(range(len(script_groups)), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)
        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print "Parallel script list written to parallel_script_list.txt"
def run(args):
    """Generate the cross-validation / fitting pipeline scripts.

    Writes, under the current working directory:
      * pickled hyperparameter files ("hyper/") and row-partition files
        ("rows/"),
      * per-(hyperparameter, row) cross-validation scripts ("cv-scripts/")
        plus "set_hyper.sh", when args.cv is nonzero,
      * per-row fit scripts ("fit-scripts/") and the list files a later
        integration step consumes,
      * "finish-none.sh", "finish-effect.sh", "plot_coef.sh",
        "cleanup_list.txt", "summarize_time.sh" and timing CSVs.

    :param args: argparse namespace; uses data_file, rand_data_file,
        load_reps, hyper_list_file, output_name, script_num, cv,
        parallel_num, test, test_name, null, lag, only_array.
    :raises ValueError: if args.test / args.null hold unsupported codes.
    """
    if args.test not in {"r", "l", "e"}:
        raise ValueError(
            "args.test must be r (ridge), l (lasso) or e (elastic net)")
    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    data_file = args.data_file
    rand_data_file = args.rand_data_file

    # Load the expression data; two on-disk layouts are supported.
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)

    def _ensure_time_csv(path):
        # Create a timing CSV with its header once; generated scripts append.
        if not os.path.exists(path):
            with open(path, 'w') as csvfile:
                csv.writer(csvfile).writerow(["Name", "Start", "End",
                                              "Elapsed"])

    def _write_timed_script(script_path, command, timefile):
        # Wrap `command` in an executable bash script that appends
        # "name,start,end,seconds" to `timefile`.
        with open(script_path, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            outputfile.write("module load anaconda3\n")
            outputfile.write(command)
            outputfile.write("\n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + script_path +
                             ",$START,$END,$SECONDS >> " + timefile + "\n")
        os.chmod(script_path, 0o777)

    def _write_parallel_list(scripts, list_filename):
        # Batch `scripts` into ceil(len/parallel_num) groups and write one
        # " & "-joined line per group, so each line runs one parallel batch.
        print("Parallel Number (# processes per job): " +
              str(args.parallel_num))
        script_groups = pj.partition_inputs(
            scripts,
            number=int(math.ceil(len(scripts) * 1.0 / args.parallel_num)))
        print("Number of script groups ", len(script_groups))
        parallel_scripts = [
            " & ".join("./" + s for s in group) for group in script_groups
        ]
        with open(list_filename, 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print("Parallel script list written to " + list_filename)

    # --- Hyperparameter files, one pickle per candidate setting ---
    # FIX: open files via `with` so handles are closed deterministically
    # (the original used pickle.load(open(...)) / pickle.dump(..., open(...))).
    with open(args.hyper_list_file, 'rb') as hyperfile:
        hyperlist = pickle.load(hyperfile)

    hyper_filenames = []
    print("*************")
    print("HYPERS")
    print("*************")
    if not os.path.exists("hyper"):
        os.makedirs("hyper")
    for h, hyper in enumerate(hyperlist):
        hyper_filename = ("hyper" + os.sep + args.output_name + "-hyper-" +
                          str(h) + ".p")
        hyper_filenames.append(hyper_filename)
        with open(hyper_filename, 'wb') as hf:
            pickle.dump([hyper], hf)
    if hyper_filenames:  # FIX: avoid NameError on an empty hyperlist
        print("Hypers written in format: ", hyper_filenames[-1])

    # --- Row partition files: split n genes across args.script_num scripts ---
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)
    row_filenames = []
    print("*************")
    print("ROWS")
    print("*************")
    if not os.path.exists("rows"):
        os.makedirs("rows")
    for i, partition_row in enumerate(partition_rows):
        row_filename = os.path.join("rows",
                                    args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)
        with open(row_filename, 'wb') as rf:
            pickle.dump(partition_row, rf)
    if row_filenames:
        print("Row written in format: ", row_filenames[-1])

    if not os.path.exists("timing"):
        os.makedirs("timing")
        print("Folder timing created")
    resulttimefile = os.path.join("timing", "result_time.csv")
    _ensure_time_csv(resulttimefile)
    # FIX: define cvtimefile unconditionally -- its name is written into
    # timing/timing_list.txt below even when args.cv == 0, which previously
    # raised NameError. The CSV itself is still only created in the CV branch.
    cvtimefile = os.path.join("timing", "hyper_time.csv")

    if args.cv != 0:
        print("*************")
        print("CV")
        print("*************")
        # One cross-validation script per (hyperparameter, row-partition).
        cv_scripts = []
        hyper_output_dict = collections.OrderedDict()
        hyper_int_dict = collections.OrderedDict()
        if not os.path.exists("cv-scripts"):
            os.makedirs("cv-scripts")
        _ensure_time_csv(cvtimefile)
        for h, (hyper, hyper_filename) in enumerate(zip(hyperlist,
                                                        hyper_filenames)):
            hyper_output_group = []
            for i, row_filename in enumerate(row_filenames):
                cv_prefix = (args.output_name + "-cv-" + str(h) + "-row-" +
                             str(i))
                cv_script = os.path.join("cv-scripts", cv_prefix + ".sh")
                cv_scripts.append(cv_script)
                cv_output = "hyper" + os.sep + cv_prefix + "-result.txt"
                hyper_output_group.append(cv_output)
                command_string = ("time python cross_validate.py -d " +
                                  data_file +
                                  " -lr " + str(args.load_reps) +
                                  " -o " + cv_output +
                                  " -hl " + str(hyper_filename) +
                                  " -t " + args.test +
                                  " -l " + str(args.lag) +
                                  " -rl " + str(row_filename))
                _write_timed_script(cv_script, command_string, cvtimefile)
            # Set the output names, prepare for integration of all the
            # hyperparameter fit results.
            hyper_output_dict[str(hyper)] = hyper_output_group
            hyper_int_dict[str(hyper)] = ("hyper" + os.sep +
                                          args.output_name + "-cv-" +
                                          str(h) + "-result.txt")

        hyper_output_df = pd.DataFrame(hyper_output_dict)
        hyper_int_df = pd.DataFrame(hyper_int_dict, index=[0])
        print("Hyper output df is in form", hyper_output_df.head(n=5))
        hyper_output_df.to_csv("cv_outputs.txt", sep="\t", index=False)
        hyper_int_df.to_csv("cv_integrated.txt", sep="\t", index=False)
        print("Partitioned CV fit_result_dfs in cv_outputs.txt",
              "Integrated CV fit_result_dfs in cv_integrated.txt")
        with open("cv_script_list.txt", 'w') as outfile:
            for cv_script in cv_scripts:
                outfile.write(cv_script + "\n")
        print("CV scripts written to cv_script_list.txt")
        if args.parallel_num > 0:
            _write_parallel_list(cv_scripts, "cv_parallel_script_list.txt")

        # Integrate hyperparameters; begin whole normal fit.
        hyper_script = "set_hyper.sh"
        with open(hyper_script, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            outputfile.write("set -e\n")
            outputfile.write(
                "time python integrate_hyper.py -hfd cv_outputs.txt -ind cv_integrated.txt -hl "
                + args.hyper_list_file + "\n")
            outputfile.write(
                "time python set_hyper.py -ind cv_integrated.txt -r " +
                "hyper" + os.sep + "hyper_df.txt -o " + "hyper" + os.sep +
                "best_hyper.p -hl " + args.hyper_list_file + " -tn " +
                args.test_name + " \n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + hyper_script +
                             ",$START,$END,$SECONDS >> " + resulttimefile +
                             "\n")
        os.chmod(hyper_script, 0o777)
        print("set_hyper.sh written")

    print("*************")
    print("FITTING")
    print("*************")
    # One fit script per row partition; all read hyper/best_hyper.p.
    if not os.path.exists("fit"):
        os.makedirs("fit")
    if not os.path.exists("fit-scripts"):
        os.makedirs("fit-scripts")
    fittimefile = os.path.join("timing", "fit_time.csv")
    _ensure_time_csv(fittimefile)

    fit_scripts = []
    fit_output_prefixes = []
    for i, row_filename in enumerate(row_filenames):
        fit_prefix = args.output_name + "-fit-row-" + str(i)
        fit_script = os.path.join("fit-scripts", fit_prefix + ".sh")
        fit_scripts.append(fit_script)
        fit_output_prefix = "fit" + os.sep + fit_prefix
        fit_output_prefixes.append(fit_output_prefix)
        command_string = ("time python fit_all.py -d " + data_file +
                          " -rd " + rand_data_file +
                          " -lr " + str(args.load_reps) +
                          " -o " + fit_output_prefix +
                          " -bh " + "hyper" + os.sep + "best_hyper.p" +
                          " -t " + args.test +
                          " -l " + str(args.lag) +
                          " -rl " + str(row_filename) +
                          " -n " + args.null +
                          " -oa " + str(args.only_array))
        _write_timed_script(fit_script, command_string, fittimefile)

    with open("fit_script_list.txt", 'w') as outfile:
        for fit_script in fit_scripts:
            outfile.write("./" + fit_script + "\n")
    print("Fit scripts written to fit_script_list.txt")
    if args.parallel_num > 0:
        _write_parallel_list(fit_scripts, "fit_parallel_script_list.txt")

    # Expected per-partition output files of fit_all.py ("r" = randomized).
    fit_coefs = [p + "_coefs.p" for p in fit_output_prefixes]
    fit_intercepts = [p + "_intercepts.p" for p in fit_output_prefixes]
    fit_results = [p + "_fit_result_df.txt" for p in fit_output_prefixes]
    fit_coefsr = [p + "_coefsr.p" for p in fit_output_prefixes]
    fit_resultsr = [p + "_fit_result_dfr.txt" for p in fit_output_prefixes]

    # Matrices to integrate (per-partition -> merged).
    fit_output_dict = collections.OrderedDict()
    fit_output_dict["coef"] = fit_coefs
    fit_output_dict["coefr"] = fit_coefsr
    fit_output_dict["intercept"] = fit_intercepts
    output_matr_df = pd.DataFrame(fit_output_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print("Output matrices written to output_matr_list.txt")

    int_matr_dict = collections.OrderedDict()
    int_matr_dict["coef"] = "fit" + os.sep + args.output_name + "_coefs.p"
    int_matr_dict["coefr"] = "fit" + os.sep + args.output_name + "_coefsr.p"
    int_matr_dict["intercept"] = ("fit" + os.sep + args.output_name +
                                  "_intercepts.p")
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print("integrated matrices written to int_matr_list.txt")

    fit_result_dict = collections.OrderedDict()
    fit_result_dict["fit_result"] = fit_results
    fit_result_dict["fit_resultr"] = fit_resultsr
    output_df_df = pd.DataFrame(fit_result_dict)
    output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
    print("output dfs written to output_df_list.txt")

    int_df_dict = collections.OrderedDict()
    int_df_dict["fit_result"] = ("fit" + os.sep + args.output_name +
                                 "_fit_result_df.txt")
    int_df_dict["fit_resultr"] = ("fit" + os.sep + args.output_name +
                                  "_fit_result_dfr.txt")
    int_df_df = pd.DataFrame(int_df_dict, index=[0])
    int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
    print("Integrated dfs written to int_df_list.txt")

    def _result_coef_cmd(strat, extra=""):
        # Shared get_result_coef.py invocation; the three generated scripts
        # differ only in the -sb stratification value and trailing flags.
        return ("time python get_result_coef.py -df " + data_file +
                " -rdf " + rand_data_file +
                " -lr " + str(args.load_reps) +
                " -bh " + "hyper" + os.sep + "best_hyper.p" +
                " -o " + args.output_name +
                " -cf " + int_matr_dict["coef"] +
                " -if " + int_matr_dict["intercept"] +
                " -cfr " + int_matr_dict["coefr"] +
                " -fr " + int_df_dict["fit_result"] +
                " -frr " + int_df_dict["fit_resultr"] +
                " -l " + str(args.lag) +
                " -sb " + strat +
                " -tn " + args.test_name + extra + "\n")

    def _write_finish_script(script_name, strat):
        # Finish script: integrate all partitioned outputs, then compute
        # results with the given stratification ("n" = none, "e" = effect).
        with open(script_name, 'w') as ifile:
            ifile.write("#!/bin/bash\n")
            ifile.write("START=$(date)\n")
            ifile.write("set -e\n")
            ifile.write(
                "time python integrate_outputs_rand_row.py -i output_matr_list.txt -o int_matr_list.txt "
                + (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(
                " && " +
                "time python integrate_outputs_rand_row.py -i output_df_list.txt -o int_df_list.txt -t d "
                + "\n")
            ifile.write(_result_coef_cmd(strat))
            ifile.write("END=$(date)\n")
            ifile.write("echo " + script_name + ",$START,$END,$SECONDS >> " +
                        resulttimefile + "\n")
        os.chmod(script_name, 0o777)

    _write_finish_script("finish-none.sh", "n")
    print("Finish script, stratby None, written to finish-none.sh")
    _write_finish_script("finish-effect.sh", "e")
    print("Finish script, stratby effect, written to finish-effect.sh")

    # Plot-only variant: no integration, no "set -e", adds -pcf 1.
    with open("plot_coef.sh", 'w') as ifile:
        ifile.write("#!/bin/bash\n")
        ifile.write("START=$(date)\n")
        ifile.write(_result_coef_cmd("n", " -pcf 1 "))
        ifile.write("END=$(date)\n")
        ifile.write("echo " + "plot_coef.sh" + ",$START,$END,$SECONDS >> " +
                    resulttimefile + "\n")
    print("Plot coef script written to plot_coef.sh")
    os.chmod("plot_coef.sh", 0o777)

    # Everything that can be deleted once the merged results exist.
    with open("cleanup_list.txt", 'w') as outfile:
        # FIX: copy row_filenames; the original aliased it, so += silently
        # mutated row_filenames as well.
        cleanup_list = list(row_filenames)
        if args.cv:
            cleanup_list += cv_scripts + list(
                itertools.chain.from_iterable(list(
                    hyper_output_dict.values())))
        cleanup_list += (fit_scripts + fit_coefs + fit_intercepts +
                         fit_results + fit_coefsr + fit_resultsr)
        for script in cleanup_list:
            outfile.write(script + "\n")
    print("Cleanup scripts written to cleanup_list.txt")

    with open("timing/timing_list.txt", 'w') as outfile:
        outfile.write(cvtimefile + "\n")
        outfile.write(fittimefile + "\n")
        outfile.write(resulttimefile + "\n")
    print("Timing files written to timing_list.txt")

    with open("summarize_time.sh", 'w') as outfile:
        outfile.write(
            "python summarize_time.py -i timing/timing_list.txt -o timing/summary_time.csv -oo timing/overall_time.csv\n"
        )
    os.chmod("summarize_time.sh", 0o777)
    print("Summarize timing script written to summarize_time.sh")
def run(args):
    """Write per-partition job scripts for run_causal_rand_row.py along with
    the list files used later for integration and FDR control.

    Outputs (current directory):
      * <output_name>-script-<i>.sh / <output_name>-row-<i>.txt per partition
      * script_list.txt, output_matr_list.txt, int_matr_list.txt
      * output_df_list.txt / int_df_list.txt when args.test == "e"
      * integrate_outputs.sh, fdr_control.sh
      * parallel_script_list.txt if args.parallel_num > 0

    :param args: argparse namespace; uses data_file, rand_data_file,
        args_file, test, job_num, output_name, fdr, coef_num, parallel_num.

    NOTE(review): Python 2 code (print statements, 0777) and token-identical
    to another run() in this file — confirm this duplicate is intentional.
    """
    # Use basenames: generated jobs run from the data directory.
    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]
    df = gtm.load_file_and_avg(data_file)
    genes = df['gene'].values
    n = len(genes)
    script_filenames = []
    output_filenames = []
    output_rand_filenames = []
    # The elastic-net test ("e") is the only one producing parameter files;
    # None marks them as not applicable otherwise.
    if args.test == "e":
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None
    # One chunk of gene rows per job.
    partition_rows = pj.partition_inputs(range(n), args.job_num)
    for partition_row, i in zip(partition_rows, range(len(partition_rows))):
        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)
        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)
        output_rand_filename = args.output_name + "-randomized-" + str(i) + ".p"
        output_rand_filenames.append(output_rand_filename)
        # prepare the job associated with this
        row_filename = args.output_name + "-row-" + str(i) + ".txt"
        command_string = "python run_causal_rand_row.py -d " + data_file + " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename
        if args.test == "e":
            # Extra all/used parameter outputs, real and randomized data.
            all_res_filename = args.output_name + "-all-params-" + str(i) + ".txt"
            all_res_filenames.append(all_res_filename)
            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)
            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)
            use_rand_filename = args.output_name + "-used-params-randomized-" + str(i) + ".txt"
            use_rand_filenames.append(use_rand_filename)
            command_string += " -oa " + all_res_filename + " -ou " + use_filename + " -ora " + all_res_rand_filename + " -oru " + use_rand_filename
        # Persist this job's row assignment.
        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")
        print "Partition row written to ", row_filename
        # Job script: module loads followed by the command line.
        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        os.chmod(script_filename, 0777)
        print "Script written to ", script_filename
    # Merged file names produced by the integration step.
    integrated_name_dict = {}
    integrated_name_dict["Output"] = args.output_name + ".p"
    integrated_name_dict["Rand-Output"] = args.output_name + "-randomized.p"
    integrated_name_dict["All-Params"] = args.output_name + "-all-params.txt"
    integrated_name_dict["Use-Params"] = args.output_name + "-use-params.txt"
    integrated_name_dict["All-Rand-Params"] = args.output_name + "-all-params-randomized.txt"
    integrated_name_dict["Use-Rand-Params"] = args.output_name + "-use-params-randomized.txt"
    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
    print "Script list written to script_list.txt"
    # list of matrices to integrate
    output_matr_dict = {
        "Output": output_filenames,
        "Rand-Output": output_rand_filenames
    }
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print "Output matrices written to output_matr_list.txt"
    int_matr_dict = dict([(x, integrated_name_dict[x])
                          for x in ["Output", "Rand-Output"]])
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print "integrated matrices written to int_matr_list.txt"
    if args.test == "e":
        # lists of dataframes (param files) to integrate
        # These will only be integrated if
        output_df_dict = {}
        output_df_lists = [
            all_res_filenames, use_filenames, all_res_rand_filenames,
            use_rand_filenames
        ]
        output_df_names = [
            "All-Params", "Use-Params",
            "All-Rand-Params", "Use-Rand-Params"
        ]
        # Keep only the populated (non-None) lists.
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list != None:
                output_df_dict[out_name] = out_list
        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print "output dfs written to output_df_list.txt"
        int_df_dict = dict([
            (x, integrated_name_dict[x])
            for x in set(output_df_names).intersection(output_df_dict.keys())
        ])
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print "Integrated dfs written to int_df_list.txt"
    # One-shot integration script over the per-partition outputs.
    with open("integrate_outputs.sh", 'w') as ifile:
        if args.test == "e":
            # here , "a" means the axis to integrate by
            ifile.write("python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1 && " + \
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")
        else:
            ifile.write(
                "python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1\n"
            )
    print "Integration script written to integrate_outputs.sh"
    os.chmod("integrate_outputs.sh", 0777)
    # FDR control runs the same command under both stratifications.
    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                     " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                     " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
    print "FDR CONTROL script written to fdr_control.sh"
    os.chmod("fdr_control.sh", 0777)
    if args.parallel_num > 0:
        # Each parallel-list line joins one batch of job scripts with " & ".
        print "Parallel Number (# processes per job): " + str(
            args.parallel_num)
        script_groups = pj.partition_inputs(
            script_filenames,
            number=int(
                math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))
        print "Number of script groups ", len(script_groups)
        parallel_scripts = []
        for i, script_group in zip(range(len(script_groups)), script_groups):
            appended_script_filenames = [
                "./" + script_filename for script_filename in script_group
            ]
            parallel_script = " & ".join(appended_script_filenames)
            print "Parallel Script ", i, ":", parallel_script
            parallel_scripts.append(parallel_script)
        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print "Parallel script list written to parallel_script_list.txt"
def run(args):
    """Prepare all job scripts for the bootstrapped network fit.

    Loads the gene expression data, re-derives the row partition written by
    the earlier prep step, and then, for every bootstrap sample ``b`` in
    ``range(args.bootstrap_num)``:

    * writes one ``fit_bootstrap.py`` shell script per row partition under
      ``bootstrap-fit-scripts/<b>/``,
    * writes the file lists consumed by ``integrate_outputs_rand_row.py`` to
      stitch the per-row outputs back together,
    * writes "finish" scripts (stratification = none / effect) that run the
      integration and ``get_result_coef.py``.

    Afterwards it writes the master script lists, the per-FDR result
    integration scripts, and (when ``args.parallel_num > 0``) parallelized
    script groups.

    :param args: parsed command-line namespace; fields used here:
        data_file, rand_data_file, load_reps, test, null, lag, only_array,
        output_name, test_name, script_num, bootstrap_num, parallel_num,
        write_all_bootstrap_scripts_first.
    :raises ValueError: if ``args.test`` / ``args.null`` is not a recognized code.
    :raises IOError: if the ``timing`` folder has not been created yet.
    """
    if args.test not in {"r", "l", "e"}:
        raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)")
    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)

    n = len(genes)

    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []

    print("*************")
    print("ROWS")
    print("*************")
    for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))):
        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)
        print("Reading rows from format: ", row_filename)

    print("*************")
    print("BOOTSTRAP")
    print("*************")

    # Run the actual fit
    # Need an integration
    if not os.path.exists("bootstrap"):
        os.makedirs("bootstrap")

    # For the bootstrap individual fit scripts
    if not os.path.exists("bootstrap-fit-scripts"):
        os.makedirs("bootstrap-fit-scripts")

    # For the bootstrap finish scripts
    if not os.path.exists("bootstrap-finish-scripts"):
        os.makedirs("bootstrap-finish-scripts")

    # Finish, aggregating all the coefficients (stratification = none)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "none")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "none"))

    # Finish, stratifying each coefficient by the effect gene (stratification = effect)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "effect")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "effect"))

    # BUGFIX: build the complete fit-script list unconditionally.  Previously
    # it was assigned only inside the `write_all_bootstrap_scripts_first`
    # branch below, so the "SCRIPTS" section at the bottom of this function
    # raised NameError whenever that flag was off.  The per-bootstrap loop
    # always writes exactly these scripts, so listing them is always valid.
    all_bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b),
                                          args.output_name + "-bootstrap-" + str(b) + "-row-" + str(i) + ".sh")
                             for b in range(args.bootstrap_num) for i in range(len(row_filenames))]

    if args.write_all_bootstrap_scripts_first:
        print("WRITING ALL THE SCRIPTS INITIALLY!!!!!! NOTE the list will be written before all the files are written!!!")

        for b in range(args.bootstrap_num):
            if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
                os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))

        print("SCRIPTS")
        with open("bootstrap_script_list.txt", 'w') as outfile:
            for bootstrap_script in all_bootstrap_scripts:
                outfile.write("./" + bootstrap_script + "\n")
        print("bootstrap scripts written to bootstrap_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))
            script_groups = pj.partition_inputs(all_bootstrap_scripts,
                                                number=int(math.ceil(len(all_bootstrap_scripts) * 1.0 / args.parallel_num)))
            print("Number of script groups ", len(script_groups))
            parallel_scripts = []
            for i, script_group in zip(list(range(len(script_groups))), script_groups):
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)
            with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to bootstrap_parallel_script_list.txt")

    # make one script for each...
    # all_bootstrap_scripts = set([])
    all_int_coefs = []          # integrated coef pickles, one per bootstrap
    all_int_intercepts = []     # integrated intercept pickles, one per bootstrap
    finish_none_scripts = []
    finish_effect_scripts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.
    fdrs = [0.01, 0.05, 0.1, 0.2]
    all_fdr_none_coefs_dict = dict([(x, []) for x in fdrs])
    all_fdr_effect_coefs_dict = dict([(x, []) for x in fdrs])
    all_fdr_none_intercepts_dict = dict([(x, []) for x in fdrs])
    all_fdr_effect_intercepts_dict = dict([(x, []) for x in fdrs])

    # Timing CSVs: create each with a header row if missing, then register
    # them in timing/timing_list.txt.  An IOError here means the "timing"
    # folder was never made by the earlier prep step.
    try:
        fittimefile = os.path.join("timing", "bootstrap_fit_time.csv")
        if not os.path.exists(fittimefile):
            with open(fittimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        finishtimefile = os.path.join("timing", "bootstrap_finish_time.csv")
        if not os.path.exists(finishtimefile):
            with open(finishtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        resulttimefile = os.path.join("timing", "bootstrap_result_time.csv")
        if not os.path.exists(resulttimefile):
            with open(resulttimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        with open(os.path.join("timing/timing_list.txt"), 'a') as f:
            f.write(fittimefile + "\n")
            f.write(finishtimefile + "\n")
            f.write(resulttimefile + "\n")
    except IOError:
        raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.")

    for b in range(args.bootstrap_num):
        if b % 50 == 0:
            print("SEED/BOOTSTRAP NUM: ", b)

        bootstrap_outmost_name = args.output_name + "-bootstrap-" + str(b)
        bootstrap_folder = os.path.join("bootstrap", str(b))
        if not os.path.exists(bootstrap_folder):
            os.makedirs(bootstrap_folder)
        # print "Created folder: ", bootstrap_folder
        bootstrap_outmost_prefix = os.path.join(bootstrap_folder, bootstrap_outmost_name)

        if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
            os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))

        # create scripts for bootstrap
        bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), bootstrap_outmost_name + "-row-" + str(i) + ".sh")
                             for i in range(len(partition_rows))]
        bootstrap_row_prefixes = [bootstrap_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]

        # "bootstrap_row_prefixes[i]" and "row_filename" are literal
        # placeholder strings, substituted per row via str.replace below.
        command_template = "time python fit_bootstrap.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \
                           " -o " + "bootstrap_row_prefixes[i]" + " -bh " + \
                           "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \
                           "row_filename" + " -n " + args.null + " -s " + str(b) + " -oa " + str(args.only_array)

        for i, row_filename in zip(list(range(len(partition_rows))), row_filenames):
            # writing results to the bootstrap prefix
            command_string = command_template.replace("bootstrap_row_prefixes[i]", bootstrap_row_prefixes[i]).replace("row_filename", row_filename)

            with open(bootstrap_scripts[i], 'w') as outputfile:
                outputfile.write("#!/bin/bash\n")
                outputfile.write("START=$(date)\n")
                #outputfile.write("module load python/2.7\n")
                # outputfile.write("module load python/2.7/scipy-mkl\n")
                # outputfile.write("module load python/2.7/numpy-mkl\n")
                #outputfile.write("module load anaconda\n")
                outputfile.write("module load anaconda3\n")
                outputfile.write(command_string)
                outputfile.write("\n")
                outputfile.write("END=$(date)\n")
                outputfile.write("echo " + bootstrap_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n")
            os.chmod(bootstrap_scripts[i], 0o777)
        # print "Scripts made"
        # all_bootstrap_scripts = all_bootstrap_scripts.union(set(bootstrap_scripts))

        # Note the output files
        bootstrap_coefs = [bootstrap_row_prefix + "_coefs.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_intercepts = [bootstrap_row_prefix + "_intercepts.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_results = [bootstrap_row_prefix + "_fit_result_df.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_coefsr = [bootstrap_row_prefix + "_coefsr.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_resultsr = [bootstrap_row_prefix + "_fit_result_dfr.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]

        bootstrap_output_dict = collections.OrderedDict()
        bootstrap_output_dict["coef"] = bootstrap_coefs
        bootstrap_output_dict["coefr"] = bootstrap_coefsr
        bootstrap_output_dict["intercept"] = bootstrap_intercepts
        # bootstrap_output_dict["interceptr"] = bootstrap_interceptsr
        # rand intercepts aren't put above because if it's a local null fit, then too many possible intercepts for each effect gene

        output_matr_df = pd.DataFrame(bootstrap_output_dict)
        output_matr_file = os.path.join(bootstrap_folder, bootstrap_outmost_name + "_output_matr_list.txt")
        output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
        # print "Raw parallelilized output matrices, before integration, written to", output_matr_file

        int_matr_dict = collections.OrderedDict()
        int_matr_dict["coef"] = bootstrap_outmost_prefix + "_coefs.p"
        int_matr_dict["coefr"] = bootstrap_outmost_prefix + "_coefsr.p"
        int_matr_dict["intercept"] = bootstrap_outmost_prefix + "_intercepts.p"
        # int_matr_dict["interceptr"] = "bootstrap" + os.sep + bootstrap_outmost_name + "_interceptsr.p"

        # append these to the list of final bootstrapped coefficients
        all_int_coefs.append(int_matr_dict["coef"])
        all_int_intercepts.append(int_matr_dict["intercept"])

        int_matr_file = bootstrap_outmost_prefix + "_int_matr_list.txt"
        int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
        int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
        # print "integrated matrices written to " + int_matr_file

        bootstrap_result_dict = collections.OrderedDict()
        bootstrap_result_dict["fit_result"] = bootstrap_results
        bootstrap_result_dict["fit_resultr"] = bootstrap_resultsr

        output_df_file = bootstrap_outmost_prefix + "_output_df_list.txt"
        output_df_df = pd.DataFrame(bootstrap_result_dict)
        output_df_df.to_csv(output_df_file, sep="\t", index=False)
        # print "output dfs file written to ", output_df_file

        int_df_dict = collections.OrderedDict()
        int_df_dict["fit_result"] = bootstrap_outmost_prefix + "_fit_result_df.txt"
        int_df_dict["fit_resultr"] = bootstrap_outmost_prefix + "_fit_result_dfr.txt"

        int_df_file = bootstrap_outmost_prefix + "_int_df_list.txt"
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv(int_df_file, sep="\t", index=False)
        # print "Integrated dfs file written to ", int_df_file

        # just need to put all of this into the outmost name
        finish_none_script = os.path.join("bootstrap-finish-scripts", "none", "finish-none-bootstrap-" + str(b) + ".sh")
        with open(finish_none_script, 'w') as ifile:
            ifile.write("set -e\n")
            ifile.write("START=$(date)\n")
            ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(" && " + \
                        "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n")
            ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file + \
                        " -lr " + str(args.load_reps) + \
                        " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                        " -o " + \
                        bootstrap_outmost_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                        " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                        int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                        " -sb " + "n" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_none_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
        # print "Finish script, stratby None, written to", finish_none_script
        os.chmod(finish_none_script, 0o777)
        finish_none_scripts.append(finish_none_script)

        finish_effect_script = os.path.join("bootstrap-finish-scripts", "effect", "finish-effect-bootstrap-" + str(b) + ".sh")
        with open(finish_effect_script, 'w') as ifile:
            ifile.write("set -e\n")
            ifile.write("START=$(date)\n")
            ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(" && " + \
                        "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n")
            ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file + \
                        " -lr " + str(args.load_reps) + \
                        " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                        " -o " + \
                        bootstrap_outmost_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                        " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                        int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                        " -sb " + "e" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_effect_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
        # print "Finish script, stratby effect, written to", finish_effect_script
        os.chmod(finish_effect_script, 0o777)
        finish_effect_scripts.append(finish_effect_script)

        # get all the fdr files immediately
        for fdr in fdrs:
            all_fdr_none_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
                                                             bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" + "-coefs.p"))
            all_fdr_effect_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
                                                               bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" + "-coefs.p"))
            all_fdr_none_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
                                                                  bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" + "-intercepts.p"))
            all_fdr_effect_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
                                                                    bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" + "-intercepts.p"))
        # print "-----------"

    int_coef_file = "all_bootstrap_coefs.txt"
    with open(int_coef_file, 'w') as f:
        for b_coef in all_int_coefs:
            f.write(b_coef + "\n")
    print("All integrated bootstrapped coef files written to ", int_coef_file)

    int_intercept_file = "all_bootstrap_intercepts.txt"
    with open(int_intercept_file, 'w') as f:
        for b_intercept in all_int_intercepts:
            f.write(b_intercept + "\n")
    print("All integrated bootstrapped intercept files written to ", int_intercept_file)

    all_finish_effect_script = "finish-effect-bootstrap-all.sh"
    with open(all_finish_effect_script, 'w') as f:
        f.write("set -e\n")
        for s in finish_effect_scripts:
            f.write("./" + s + "\n")
    os.chmod(all_finish_effect_script, 0o777)
    print("All bootstrap effects scripts written to ", all_finish_effect_script)

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))
        script_groups = pj.partition_inputs(finish_effect_scripts,
                                            number=int(math.ceil(len(finish_effect_scripts) * 1.0 / args.parallel_num)))
        print("Number of script groups ", len(script_groups))
        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)
        with open("finish-effect-bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print("Parallel script list written to finish-effect-bootstrap_parallel_script_list.txt")

    all_finish_none_script = "finish-none-bootstrap-all.sh"
    with open(all_finish_none_script, 'w') as f:
        f.write("set -e\n")
        for s in finish_none_scripts:
            f.write("./" + s + "\n")
    os.chmod(all_finish_none_script, 0o777)
    print("All bootstrap nones scripts written to ", all_finish_none_script)

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))
        script_groups = pj.partition_inputs(finish_none_scripts,
                                            number=int(math.ceil(len(finish_none_scripts) * 1.0 / args.parallel_num)))
        print("Number of script groups ", len(script_groups))
        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)
        with open("finish-none-bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print("Parallel script list written to finish-none-bootstrap_parallel_script_list.txt")

    # integrate all the bootrastrapped FDR
    bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results")
    if not os.path.exists(bootstrap_result_folder):
        os.makedirs(bootstrap_result_folder)

    bootstrap_summary_file = "get_result_bootstrap.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("START=$(date)\n")
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1" + " -tbf " + "bootstrap-transpose" + " -uabrd 0\n")
        f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
        f.write("END=$(date)\n")
        f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)

    # integrate in a lite version
    bootstrap_summary_file = "get_result_bootstrap_lite.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("START=$(date)\n")
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1" + " -dl 1 -uabrd 0\n")
        f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
        f.write("END=$(date)\n")
        f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)

    for fdr in fdrs:
        print("*************************")
        print("Integrating bootstrap files for FDR ", fdr)
        print("****EFFECT***")
        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-effect")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)

        # write the fdr file out
        bootstrap_fdr_effect_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-effect.txt"
        with open(bootstrap_fdr_effect_list_file, 'w') as f:
            for b_coef in all_fdr_effect_coefs_dict[fdr]:
                f.write(b_coef + "\n")
        print("All fdr effect written to ", bootstrap_fdr_effect_list_file)

        bootstrap_fdr_effect_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-effect.txt"
        with open(bootstrap_fdr_effect_intercept_list_file, 'w') as f:
            for b_intercept in all_fdr_effect_intercepts_dict[fdr]:
                f.write(b_intercept + "\n")
        print("All fdr effect written to ", bootstrap_fdr_effect_intercept_list_file)

        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect.sh"
        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file + " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-effect -uabrd 1\n")
            # f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)

        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect_lite.sh"
        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file + " -da 0" + " -dl 1 -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)

        print("-----------------------")
        print("****NONE***")
        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-none")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)

        bootstrap_fdr_none_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-none.txt"
        with open(bootstrap_fdr_none_list_file, 'w') as f:
            for b_coef in all_fdr_none_coefs_dict[fdr]:
                f.write(b_coef + "\n")
        print("All fdr none written to ", bootstrap_fdr_none_list_file)

        bootstrap_fdr_none_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-none.txt"
        with open(bootstrap_fdr_none_intercept_list_file, 'w') as f:
            for b_intercept in all_fdr_none_intercepts_dict[fdr]:
                f.write(b_intercept + "\n")
        print("All fdr none written to ", bootstrap_fdr_none_intercept_list_file)

        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none.sh"
        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-none -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)

        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none_lite.sh"
        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -dl 1 -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)

        print()
        print("FDR DONE ")
        print(" *************************************")

    print("SCRIPTS")
    with open("bootstrap_script_list.txt", 'w') as outfile:
        # lEFT OFF HERE
        for bootstrap_script in sorted(all_bootstrap_scripts):
            outfile.write("./" + bootstrap_script + "\n")
    print("bootstrap scripts written to bootstrap_script_list.txt")

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))
        script_groups = pj.partition_inputs(all_bootstrap_scripts,
                                            number=int(math.ceil(len(all_bootstrap_scripts) * 1.0 / args.parallel_num)))
        print("Number of script groups ", len(script_groups))
        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)
        with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print("Parallel script list written to bootstrap_parallel_script_list.txt")

    print("TIMING")
def run(args):
    """Prepare job scripts for the pairwise fit.

    Loads the gene expression data, re-derives the row partition written by
    the earlier prep step, writes one ``fit_pairwise.py`` shell script per
    row partition under ``pairwise-fit-scripts/``, writes the file lists
    used by ``integrate_outputs_rand_row.py``, and finally writes a single
    finish script that performs the integration.

    :param args: parsed command-line namespace; fields used here:
        data_file, load_reps, lag, output_name, script_num, parallel_num.
    :raises IOError: if the ``timing`` folder has not been created yet.
    """
    data_file = args.data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)

    n = len(genes)

    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []

    print("*************")
    print("ROWS")
    print("*************")
    # Rebuild the row-file names the earlier prep step produced; only the
    # names are needed here (the files themselves are not read).
    for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))):
        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)
        print("Reading rows from format: ", row_filename)

    print("*************")
    print("PAIRWISE")
    print("*************")

    # Run the actual fit
    # Need an integration
    if not os.path.exists("pairwise"):
        os.makedirs("pairwise")

    # For the pairwise individual fit scripts
    if not os.path.exists("pairwise-fit-scripts"):
        os.makedirs("pairwise-fit-scripts")

    # For the pairwise finish scripts
    if not os.path.exists("pairwise-finish-scripts"):
        os.makedirs("pairwise-finish-scripts")

    pairwise_result_folder = os.path.join("pairwise", "pairwise-results")
    if not os.path.exists(pairwise_result_folder):
        os.makedirs(pairwise_result_folder)

    # make one script for each...
    # all_bootstrap_scripts = set([])
    # all_int_coefs = []
    # all_int_intercepts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.
    # Timing CSVs: create each with a header row if missing, then register
    # them in timing/timing_list.txt.  An IOError here means the "timing"
    # folder was never made by the earlier prep step.
    try:
        fittimefile = os.path.join("timing", "pairwise_fit_time.csv")
        if not os.path.exists(fittimefile):
            with open(fittimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        finishtimefile = os.path.join("timing", "pairwise_finish_time.csv")
        if not os.path.exists(finishtimefile):
            with open(finishtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        # resulttimefile = os.path.join("timing", "bootstrap_result_time.csv")
        # if not os.path.exists(resulttimefile):
        #     with open(resulttimefile, 'w') as csvfile:
        #         f = csv.writer(csvfile)
        #         f.writerow(["Name", "Start", "End", "Elapsed"])

        with open(os.path.join("timing/timing_list.txt"), 'a') as f:
            f.write(fittimefile + "\n")
            f.write(finishtimefile + "\n")
            # f.write(resulttimefile + "\n")
    except IOError:
        raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.")

    pairwise_outmost_name = args.output_name + "-pairwise"
    pairwise_outmost_prefix = os.path.join("pairwise", pairwise_outmost_name)

    # create scripts for pairwise
    pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh") for i in range(len(partition_rows))]
    pairwise_row_prefixes = [pairwise_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]

    # "pairwise_row_prefixes[i]" and "row_filename" are literal placeholder
    # strings, substituted per row via str.replace below.
    command_template = "time python fit_pairwise.py -d " + data_file + " -lr " + str(args.load_reps) + \
                       " -o " + "pairwise_row_prefixes[i]" + " -l " + str(args.lag) + " -rl " + \
                       "row_filename"

    for i, row_filename in zip(list(range(len(partition_rows))), row_filenames):
        # writing results to the pairwise prefix
        command_string = command_template.replace("pairwise_row_prefixes[i]", pairwise_row_prefixes[i]).replace("row_filename", row_filename)

        with open(pairwise_scripts[i], 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            # NOTE(review): the bootstrap fit scripts load anaconda3, while
            # this loads python/2.7 + anaconda — confirm which environment
            # fit_pairwise.py actually needs.
            outputfile.write("module load python/2.7\n")
            # outputfile.write("module load python/2.7/scipy-mkl\n")
            # outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + pairwise_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n")
        os.chmod(pairwise_scripts[i], 0o777)
    print("Scripts made")
    # all_pairwise_scripts = all_pairwise_scripts.union(set(pairwise_scripts))

    # Note the output files
    pairwise_coefs = [pairwise_row_prefix + "_coefs.p" for pairwise_row_prefix in pairwise_row_prefixes]

    pairwise_output_dict = collections.OrderedDict()
    pairwise_output_dict["coef"] = pairwise_coefs

    output_matr_df = pd.DataFrame(pairwise_output_dict)
    output_matr_file = os.path.join("pairwise", pairwise_outmost_name + "_output_matr_list.txt")
    output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
    print("Raw parallelilized output matrices, before integration, written to", output_matr_file)

    int_matr_dict = collections.OrderedDict()
    int_matr_dict["coef"] = os.path.join(pairwise_result_folder, pairwise_outmost_name + "_coefs.p")

    # # append these to the list of final bootstrapped coefficients
    # all_int_coefs.append(int_matr_dict["coef"])
    # all_int_intercepts.append(int_matr_dict["intercept"])

    int_matr_file = pairwise_outmost_prefix + "_int_matr_list.txt"
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
    print("integrated matrices written to " + int_matr_file)

    # just need to put all of this into the outmost name
    all_pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh") for i in range(len(partition_rows))]

    print("SCRIPTS")
    with open("pairwise_script_list.txt", 'w') as outfile:
        for pairwise_script in all_pairwise_scripts:
            outfile.write("./" + pairwise_script + "\n")
    print("pairwise scripts written to pairwise_script_list.txt")

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))
        script_groups = pj.partition_inputs(all_pairwise_scripts, number=int(math.ceil(len(all_pairwise_scripts) * 1.0/args.parallel_num)))
        print("Number of script groups ", len(script_groups))
        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)
        with open("pairwise_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
        print("Parallel script list written to pairwise_parallel_script_list.txt")

    # Single finish script: integrate the per-row coef pickles into one file.
    finish_script = os.path.join("pairwise-finish-scripts", "finish.sh")
    with open(finish_script, 'w') as ifile:
        ifile.write("set -e\n")
        ifile.write("START=$(date)\n")
        ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file + " -t a \n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + finish_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
    print("Finish script, written to", finish_script)
    os.chmod(finish_script, 0o777)
def _ensure_dir(path):
    """Create directory *path* (and parents) if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)


def _write_fdr_summary_script(args, data_file, fdr, strat):
    """Write the shell script that summarizes FDR-thresholded bootstrap coefs.

    :param args: parsed argument namespace (uses load_reps, output_name, lag, test)
    :param data_file: path to the expression data file
    :param fdr: FDR level (e.g. 0.05)
    :param strat: stratification name, "effect" or "none"
    :return: the written script's filename

    The script invokes get_result_bootstrap.py over the coefficient list file
    for this (fdr, strat) combination and is made executable for all users.
    """
    result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-" + strat)
    _ensure_dir(result_folder)

    # List of per-bootstrap thresholded coefficient files; written elsewhere.
    coef_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-" + strat + ".txt"

    script_file = "get_result_bootstrap-fdr-" + str(fdr) + "-" + strat + ".sh"
    with open(script_file, 'w') as f:
        f.write("set -e\n")
        # NOTE(review): passes args.test as -tn here, while older (removed,
        # commented-out) finish scripts passed args.test_name — confirm intended.
        f.write("time python get_result_bootstrap.py -df " + data_file +
                " -lr " + str(args.load_reps) +
                " -o " + os.path.join(result_folder, args.output_name) +
                "-fdr-" + str(fdr) + "-" + strat +
                " -l " + str(args.lag) + " -tn " + args.test +
                " -b " + coef_list_file + " -da 0")
    os.chmod(script_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", script_file)
    return script_file


def run(args):
    """Set up the bootstrap directory layout and write its summary scripts.

    Validates the fit/null options, loads the expression data only to learn
    the number of genes, partitions the gene rows across args.script_num
    jobs, creates the bootstrap output/script directories, and writes one
    executable get_result_bootstrap.sh summary script plus one per
    (FDR level, stratification) pair.

    :param args: parsed argument namespace; reads data_file, rand_data_file,
        load_reps, script_num, output_name, lag, test, null
    :raises ValueError: if args.test or args.null is not a recognized option
    """
    if args.test not in {"r", "l", "e"}:
        raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)")
    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file  # kept for parity with sibling run() variants

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)

    n = len(genes)

    # Split up the rows according to number of input scripts.
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    print("*************")
    print("ROWS")
    print("*************")
    row_filenames = []
    for i, _partition_row in enumerate(partition_rows):
        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)
    print("Reading rows from format: ", row_filename)

    print("*************")
    print("BOOTSTRAP")
    print("*************")

    # Working directories: fit outputs, fit scripts, and finish (integration)
    # scripts stratified either not at all ("none") or by effect gene ("effect").
    for folder in ("bootstrap",
                   "bootstrap-fit-scripts",
                   "bootstrap-finish-scripts",
                   os.path.join("bootstrap-finish-scripts", "none"),
                   os.path.join("bootstrap-finish-scripts", "effect")):
        _ensure_dir(folder)

    # NOTE(review): a large commented-out section that generated per-bootstrap
    # fit and finish scripts used to live here; removed as dead code (see VCS
    # history to resurrect it).

    fdrs = [0.01, 0.05, 0.1, 0.2]

    # Summary script over all (unthresholded) integrated bootstrap coefficients.
    int_coef_file = "all_bootstrap_coefs.txt"

    bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results")
    _ensure_dir(bootstrap_result_folder)

    bootstrap_summary_file = "get_result_bootstrap.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("time python get_result_bootstrap.py -df " + data_file +
                " -lr " + str(args.load_reps) +
                " -o " + os.path.join(bootstrap_result_folder, args.output_name) +
                " -l " + str(args.lag) + " -tn " + args.test +
                " -b " + int_coef_file + " -da 1")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)

    # One summary script per FDR level for each stratification.
    for fdr in fdrs:
        print("*************************")
        print("Integrating bootstrap files for FDR ", fdr)

        print("****EFFECT***")
        _write_fdr_summary_script(args, data_file, fdr, "effect")
        print("-----------------------")

        print("****NONE***")
        _write_fdr_summary_script(args, data_file, fdr, "none")

        print()
        print("FDR DONE ")
        print(" *************************************")