Ejemplo n.º 1
0
def get_cause_plot_triples(cause2effects, sort_dict=None):
    """
    Build the plot triples for every cause: each entry is the cause followed
    by one group of its effects.

    :param cause2effects: Dictionary of the causes and effects
    :param sort_dict: Dictionary returning a key to sort the effects by
        (largest key first). When None, the effects keep the order in which
        cause2effects provides them.
    :return: a list of plot_triples, cause at beginning
    """
    plot_triples_list = []
    for cause in cause2effects:
        if sort_dict is None:
            # No sort key available: indexing None would raise a TypeError,
            # so fall back to the effects' existing order.
            effects = list(cause2effects[cause])
        else:
            effects = sorted(cause2effects[cause],
                             key=lambda entry: sort_dict[entry],
                             reverse=True)

        # Ask for len/2 (rounded) groups; presumably this yields groups of
        # about two effects each -- confirm against pj.partition_inputs.
        effect_list = pj.partition_inputs(effects,
                                          int(round(len(effects) / 2.0)))

        # Each group is expected to be a list so it can be appended to [cause].
        plot_triples_list.extend([cause] + e for e in effect_list)

    # Single-argument print calls behave the same as the Python 2 print
    # statement and as the Python 3 print function.
    print("Plot triples: ")
    print(plot_triples_list[0:20])

    return plot_triples_list
Ejemplo n.º 2
0
def get_cause_plot_triples(cause2effects, sort_dict=None):
    """
    Build the plot triples for every cause: each entry is the cause followed
    by one group of its effects.

    :param cause2effects: Dictionary of the causes and effects
    :param sort_dict: Dictionary returning a key to sort the effects by
        (largest key first). When None, the effects keep the order in which
        cause2effects provides them.
    :return: a list of plot_triples, cause at beginning
    """
    plot_triples_list = []
    for cause in cause2effects:
        if sort_dict is None:
            # No sort key available: indexing None would raise a TypeError,
            # so fall back to the effects' existing order.
            effects = list(cause2effects[cause])
        else:
            effects = sorted(cause2effects[cause],
                             key=lambda entry: sort_dict[entry],
                             reverse=True)

        # Ask for len/2 (rounded) groups; presumably this yields groups of
        # about two effects each -- confirm against pj.partition_inputs.
        effect_list = pj.partition_inputs(effects,
                                          int(round(len(effects) / 2.0)))

        # Each group is expected to be a list so it can be appended to [cause].
        plot_triples_list.extend([cause] + e for e in effect_list)

    # Single-argument print calls behave the same as the Python 2 print
    # statement and as the Python 3 print function.
    print("Plot triples: ")
    print(plot_triples_list[0:20])

    return plot_triples_list
Ejemplo n.º 3
0
def run(args):
    """
    Write the job scripts and file lists for a row-partitioned causal run.

    The gene rows of the data file are split into partitions with
    pj.partition_inputs. For every partition this writes a row file and an
    executable bash script that calls run_causal_rand_row.py. It then writes
    script_list.txt, the output_matr_list.txt / int_matr_list.txt tables,
    (only for test "e") the output_df_list.txt / int_df_list.txt tables, the
    integrate_outputs.sh and fdr_control.sh scripts and, when
    args.parallel_num > 0, parallel_script_list.txt. Every file is written
    relative to the current working directory.

    All progress messages are printed as one pre-built string so the text is
    the same under the Python 2 print statement and the Python 3 print
    function.

    :param args: Parsed arguments. Reads data_file, rand_data_file, args_file,
        output_name, test, job_num, fdr, coef_num and parallel_num.
    :return: None
    """
    # Only the base names are used, both for loading and in the generated
    # commands, so the files are looked up in the working directory.
    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]

    df = gtm.load_file_and_avg(data_file)
    genes = df['gene'].values
    n = len(genes)

    script_filenames = []
    output_filenames = []
    output_rand_filenames = []

    # The per-partition parameter files are only produced for test "e".
    param_test = args.test == "e"
    if param_test:
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None

    # A real list (not a lazy range) is passed so that str(partition_row)
    # written to the row file below is a list of indices on Python 3 too.
    partition_rows = pj.partition_inputs(list(range(n)), args.job_num)

    for i, partition_row in enumerate(partition_rows):

        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)

        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)

        output_rand_filename = args.output_name + "-randomized-" + str(i) + ".p"
        output_rand_filenames.append(output_rand_filename)

        # prepare the job associated with this partition
        row_filename = args.output_name + "-row-" + str(i) + ".txt"

        command_string = "python run_causal_rand_row.py -d " + data_file + " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename

        if param_test:
            all_res_filename = args.output_name + "-all-params-" + str(i) + ".txt"
            all_res_filenames.append(all_res_filename)

            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)

            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)

            use_rand_filename = args.output_name + "-used-params-randomized-" + str(i) + ".txt"
            use_rand_filenames.append(use_rand_filename)

            command_string += " -oa " + all_res_filename + " -ou " + use_filename + \
                              " -ora " + all_res_rand_filename + " -oru " + use_rand_filename

        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")

        print("Partition row written to  " + row_filename)

        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        # 0o777 is the octal spelling accepted by both Python 2.6+ and 3.
        os.chmod(script_filename, 0o777)

        print("Script written to  " + script_filename)

    # Names of the files that hold the results integrated over all partitions.
    integrated_name_dict = {
        "Output": args.output_name + ".p",
        "Rand-Output": args.output_name + "-randomized.p",
        "All-Params": args.output_name + "-all-params.txt",
        "Use-Params": args.output_name + "-use-params.txt",
        "All-Rand-Params": args.output_name + "-all-params-randomized.txt",
        "Use-Rand-Params": args.output_name + "-use-params-randomized.txt",
    }

    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
        print("Script list written to script_list.txt")

    # list of matrices to integrate
    output_matr_dict = {"Output": output_filenames, "Rand-Output": output_rand_filenames}
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print("Output matrices written to output_matr_list.txt")

    int_matr_dict = {x: integrated_name_dict[x] for x in ["Output", "Rand-Output"]}
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print("integrated matrices written to int_matr_list.txt")

    if param_test:
        # lists of dataframes (param files) to integrate
        output_df_dict = {}
        output_df_lists = [all_res_filenames, use_filenames, all_res_rand_filenames, use_rand_filenames]
        output_df_names = ["All-Params", "Use-Params", "All-Rand-Params", "Use-Rand-Params"]
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list is not None:
                output_df_dict[out_name] = out_list

        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print("output dfs written to output_df_list.txt")

        # Walk the names in their declared order instead of iterating a set,
        # whose order is not fixed.
        int_df_dict = {x: integrated_name_dict[x]
                       for x in output_df_names if x in output_df_dict}
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print("Integrated dfs written to int_df_list.txt")

    # here, "-a" means the axis to integrate by
    integrate_matr_command = "python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1"
    with open("integrate_outputs.sh", 'w') as ifile:
        if param_test:
            # The parameter dataframes are integrated only if the matrix
            # integration succeeded.
            ifile.write(integrate_matr_command + " && " +
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")
        else:
            ifile.write(integrate_matr_command + "\n")

        print("Integration script written to integrate_outputs.sh")
    os.chmod("integrate_outputs.sh", 0o777)

    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                    " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                    " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        # The same command is run twice, once with "-sb e" and once with "-sb n".
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
        print("FDR CONTROL script written to fdr_control.sh")
    os.chmod("fdr_control.sh", 0o777)

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))

        # ceil(len / parallel_num) groups are requested from partition_inputs.
        script_groups = pj.partition_inputs(
            script_filenames,
            number=int(math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))

        print("Number of script groups  " + str(len(script_groups)))

        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            # One line per group: the group's scripts joined with " & ".
            parallel_script = " & ".join(
                "./" + script_filename for script_filename in script_group)
            print("Parallel Script  " + str(i) + " : " + parallel_script)
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to parallel_script_list.txt")
Ejemplo n.º 4
0
def _write_timing_header(timing_path):
    """Create the timing CSV at timing_path with its header row if it does not exist yet."""
    if not os.path.exists(timing_path):
        with open(timing_path, 'w') as csvfile:
            csv.writer(csvfile).writerow(["Name", "Start", "End", "Elapsed"])


def _write_parallel_script_list(scripts, parallel_num, list_filename):
    """Group scripts via pj.partition_inputs and write one ' & '-joined line per group to list_filename."""
    print("Parallel Number (# processes per job): " + str(parallel_num))

    # ceil(len / parallel_num) groups are requested from partition_inputs.
    script_groups = pj.partition_inputs(
        scripts,
        number=int(math.ceil(len(scripts) * 1.0 / parallel_num)))

    print("Number of script groups ", len(script_groups))

    parallel_scripts = [
        " & ".join("./" + script_filename for script_filename in script_group)
        for script_group in script_groups
    ]

    with open(list_filename, 'w') as scriptfile:
        for parallel_script in parallel_scripts:
            scriptfile.write(parallel_script + "\n")
        print("Parallel script list written to " + list_filename)


def run(args):
    """
    Write the scripts and file lists for a cross-validated, row-partitioned fit.

    In order, this:
      * pickles each entry of the hyperparameter list into hyper/,
      * partitions the gene rows with pj.partition_inputs and pickles each
        partition into rows/,
      * when args.cv != 0, writes one cross_validate.py script per
        (hyperparameter, row partition) into cv-scripts/, the cv_outputs.txt /
        cv_integrated.txt tables, cv_script_list.txt and set_hyper.sh,
      * writes one fit_all.py script per row partition into fit-scripts/ and
        fit_script_list.txt,
      * writes the output_matr_list.txt, int_matr_list.txt,
        output_df_list.txt and int_df_list.txt tables,
      * writes finish-none.sh, finish-effect.sh, plot_coef.sh,
        cleanup_list.txt, timing/timing_list.txt and summarize_time.sh.

    When args.parallel_num > 0, parallel script lists are written as well.
    Every path is relative to the current working directory.

    :param args: Parsed arguments. Reads test, null, data_file,
        rand_data_file, load_reps, hyper_list_file, output_name, script_num,
        cv, lag, parallel_num, test_name and only_array.
    :raises ValueError: if args.test is not one of r/l/e or args.null is not
        one of l/g.
    :return: None
    """
    if args.test not in {"r", "l", "e"}:
        raise ValueError(
            "args.test must be r (ridge), l (lasso) or e (elastic net)")

    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file

    # Only the gene list is needed here, to know how many rows to partition.
    if args.load_reps:
        genes, _ = gtm.load_basic_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, _ = gtm.get_gene_TS(df)
    n = len(genes)

    # NOTE: unpickling can execute arbitrary code; the hyper list file must
    # come from a trusted source.
    with open(args.hyper_list_file, 'rb') as hyper_list_handle:
        hyperlist = pickle.load(hyper_list_handle)

    # Make hyper files for cross_validate loading.

    hyper_filenames = []

    print("*************")
    print("HYPERS")
    print("*************")

    if not os.path.exists("hyper"):
        os.makedirs("hyper")

    for h, hyper in enumerate(hyperlist):
        hyper_filename = "hyper" + os.sep + args.output_name + "-hyper-" + str(
            h) + ".p"
        hyper_filenames.append(hyper_filename)

        # Each file holds a one-element list containing the hyperparameter.
        with open(hyper_filename, 'wb') as hyper_handle:
            pickle.dump([hyper], hyper_handle)

    # Show the last file name as an example; nothing to show for an empty list.
    if hyper_filenames:
        print("Hypers written in format: ", hyper_filenames[-1])

    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []

    print("*************")
    print("ROWS")
    print("*************")

    if not os.path.exists("rows"):
        os.makedirs("rows")

    for i, partition_row in enumerate(partition_rows):
        row_filename = os.path.join("rows",
                                    args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

        with open(row_filename, 'wb') as row_handle:
            pickle.dump(partition_row, row_handle)

    if row_filenames:
        print("Row written in format: ", row_filenames[-1])

    if not os.path.exists("timing"):
        os.makedirs("timing")
        print("Folder timing created")
    resulttimefile = os.path.join("timing", "result_time.csv")
    _write_timing_header(resulttimefile)

    best_hyper_file = "hyper" + os.sep + "best_hyper.p"

    # One flag decides every cross-validation-dependent step below.
    run_cv = args.cv != 0
    cvtimefile = os.path.join("timing", "hyper_time.csv")

    if run_cv:
        print("*************")
        print("CV")
        print("*************")

        # Make CV scripts

        cv_scripts = []

        hyper_output_dict = collections.OrderedDict()
        hyper_int_dict = collections.OrderedDict()

        if not os.path.exists("cv-scripts"):
            os.makedirs("cv-scripts")

        _write_timing_header(cvtimefile)

        for h, (hyper, hyper_filename) in enumerate(
                zip(hyperlist, hyper_filenames)):

            hyper_output_group = []

            for i, row_filename in enumerate(row_filenames):

                cv_prefix = args.output_name + "-cv-" + str(h) + "-row-" + str(
                    i)

                cv_script = os.path.join("cv-scripts", cv_prefix + ".sh")
                cv_scripts.append(cv_script)

                cv_output = "hyper" + os.sep + cv_prefix + "-result.txt"
                hyper_output_group.append(cv_output)

                command_string = "time python cross_validate.py -d " + data_file + " -lr " + str(args.load_reps) + " -o " + cv_output + " -hl " + str(hyper_filename) \
                                 + " -t " + args.test + " -l " + str(args.lag) + " -rl " + str(row_filename)

                with open(cv_script, 'w') as outputfile:
                    outputfile.write("#!/bin/bash\n")
                    outputfile.write("START=$(date)\n")
                    outputfile.write("module load anaconda3\n")
                    outputfile.write(command_string)
                    outputfile.write("\n")
                    outputfile.write("END=$(date)\n")
                    # The script appends its own timing row when it finishes.
                    outputfile.write("echo " + cv_script +
                                     ",$START,$END,$SECONDS >> " + cvtimefile +
                                     "\n")
                os.chmod(cv_script, 0o777)

            # Set the output names, prepare for integration of all the hyper parameter fit results
            hyper_output_dict[str(hyper)] = hyper_output_group
            hyper_int_dict[str(
                hyper)] = "hyper" + os.sep + args.output_name + "-cv-" + str(
                    h) + "-result.txt"

        hyper_output_df = pd.DataFrame(hyper_output_dict)
        hyper_int_df = pd.DataFrame(hyper_int_dict, index=[0])

        print("Hyper output df is in form", hyper_output_df.head(n=5))

        hyper_output_df.to_csv("cv_outputs.txt", sep="\t", index=False)
        hyper_int_df.to_csv("cv_integrated.txt", sep="\t", index=False)

        print("Partitioned CV fit_result_dfs in cv_outputs.txt",
              "Integrated CV fit_result_dfs in cv_integrated.txt")

        with open("cv_script_list.txt", 'w') as outfile:
            for cv_script in cv_scripts:
                outfile.write(cv_script + "\n")
            print("CV scripts written to cv_script_list.txt")

        if args.parallel_num > 0:
            _write_parallel_script_list(cv_scripts, args.parallel_num,
                                        "cv_parallel_script_list.txt")

        # Integrate hyperparameters
        # Begin whole normal fit

        hyper_script = "set_hyper.sh"

        with open(hyper_script, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            outputfile.write("set -e\n")
            outputfile.write(
                "time python integrate_hyper.py -hfd cv_outputs.txt -ind cv_integrated.txt -hl "
                + args.hyper_list_file + "\n")
            outputfile.write(
                "time python set_hyper.py -ind cv_integrated.txt -r " +
                "hyper" + os.sep + "hyper_df.txt -o " + "hyper" + os.sep +
                "best_hyper.p -hl " + args.hyper_list_file + " -tn " +
                args.test_name + " \n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + hyper_script +
                             ",$START,$END,$SECONDS >> " + resulttimefile +
                             "\n")
        os.chmod(hyper_script, 0o777)

        print("set_hyper.sh written")

    print("*************")
    print("FITTING")
    print("*************")

    # Run the actual fit
    if not os.path.exists("fit"):
        os.makedirs("fit")

    if not os.path.exists("fit-scripts"):
        os.makedirs("fit-scripts")

    fittimefile = os.path.join("timing", "fit_time.csv")
    _write_timing_header(fittimefile)

    fit_scripts = []
    fit_output_prefixes = []
    for i, row_filename in enumerate(row_filenames):

        fit_prefix = args.output_name + "-fit-row-" + str(i)

        fit_script = os.path.join("fit-scripts", fit_prefix + ".sh")
        fit_scripts.append(fit_script)

        fit_output_prefix = "fit" + os.sep + fit_prefix
        fit_output_prefixes.append(fit_output_prefix)

        command_string = "time python fit_all.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \
                         " -o " + fit_output_prefix + " -bh " + \
                         best_hyper_file + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \
                         str(row_filename) + " -n " + args.null + " -oa " + str(args.only_array)

        with open(fit_script, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            outputfile.write("module load anaconda3\n")
            outputfile.write(command_string)
            outputfile.write("\n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + fit_script +
                             ",$START,$END,$SECONDS >> " + fittimefile + "\n")
        os.chmod(fit_script, 0o777)

    with open("fit_script_list.txt", 'w') as outfile:
        for fit_script in fit_scripts:
            outfile.write("./" + fit_script + "\n")
        print("Fit scripts written to fit_script_list.txt")

    if args.parallel_num > 0:
        _write_parallel_script_list(fit_scripts, args.parallel_num,
                                    "fit_parallel_script_list.txt")

    # Note the output files

    fit_coefs = [prefix + "_coefs.p" for prefix in fit_output_prefixes]
    fit_intercepts = [
        prefix + "_intercepts.p" for prefix in fit_output_prefixes
    ]
    fit_results = [
        prefix + "_fit_result_df.txt" for prefix in fit_output_prefixes
    ]
    fit_coefsr = [prefix + "_coefsr.p" for prefix in fit_output_prefixes]
    fit_resultsr = [
        prefix + "_fit_result_dfr.txt" for prefix in fit_output_prefixes
    ]

    fit_output_dict = collections.OrderedDict()
    fit_output_dict["coef"] = fit_coefs
    fit_output_dict["coefr"] = fit_coefsr
    fit_output_dict["intercept"] = fit_intercepts

    output_matr_df = pd.DataFrame(fit_output_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print("Output matrices written to output_matr_list.txt")

    int_matr_dict = collections.OrderedDict()
    int_matr_dict["coef"] = "fit" + os.sep + args.output_name + "_coefs.p"
    int_matr_dict["coefr"] = "fit" + os.sep + args.output_name + "_coefsr.p"
    int_matr_dict[
        "intercept"] = "fit" + os.sep + args.output_name + "_intercepts.p"

    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print("integrated matrices written to int_matr_list.txt")

    fit_result_dict = collections.OrderedDict()
    fit_result_dict["fit_result"] = fit_results
    fit_result_dict["fit_resultr"] = fit_resultsr

    output_df_df = pd.DataFrame(fit_result_dict)
    output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
    print("output dfs written to output_df_list.txt")

    int_df_dict = collections.OrderedDict()
    int_df_dict[
        "fit_result"] = "fit" + os.sep + args.output_name + "_fit_result_df.txt"
    int_df_dict[
        "fit_resultr"] = "fit" + os.sep + args.output_name + "_fit_result_dfr.txt"

    int_df_df = pd.DataFrame(int_df_dict, index=[0])
    int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
    print("Integrated dfs written to int_df_list.txt")

    # Both finish scripts start with the same two chained integration commands.
    integrate_command = (
        "time python integrate_outputs_rand_row.py -i output_matr_list.txt -o int_matr_list.txt "
        + (" -t m -a 1 " if args.only_array else " -t a ") + " && " +
        "time python integrate_outputs_rand_row.py -i output_df_list.txt -o int_df_list.txt -t d "
        + "\n")

    # Shared start of the get_result_coef.py command; the value of -sb and
    # the trailing options are appended per script.
    result_coef_prefix = "time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file + \
                         " -lr " + str(args.load_reps) + \
                         " -bh " + best_hyper_file + \
                         " -o " + \
                         args.output_name + " -cf " + int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                         " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                         int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                         " -sb "

    # (script name, value passed to -sb, label used in the progress message)
    finish_variants = [("finish-none.sh", "n", "None"),
                       ("finish-effect.sh", "e", "effect")]
    for finish_script, stratify_by, label in finish_variants:
        with open(finish_script, 'w') as ifile:
            ifile.write("#!/bin/bash\n")
            ifile.write("START=$(date)\n")
            ifile.write("set -e\n")
            ifile.write(integrate_command)
            ifile.write(result_coef_prefix + stratify_by + " -tn " +
                        args.test_name + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_script +
                        ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
            print("Finish script, stratby " + label + ", written to " +
                  finish_script)
        os.chmod(finish_script, 0o777)

    with open("plot_coef.sh", 'w') as ifile:
        ifile.write("#!/bin/bash\n")
        ifile.write("START=$(date)\n")
        ifile.write(result_coef_prefix + "n" + " -tn " + args.test_name +
                    " -pcf 1 " + "\n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + "plot_coef.sh" + ",$START,$END,$SECONDS >> " +
                    resulttimefile + "\n")

        print("Plot coef script written to plot_coef.sh")
    os.chmod("plot_coef.sh", 0o777)

    with open("cleanup_list.txt", 'w') as outfile:
        # Copy so that extending the cleanup list leaves row_filenames intact.
        cleanup_list = list(row_filenames)
        if run_cv:
            cleanup_list += cv_scripts + list(
                itertools.chain.from_iterable(hyper_output_dict.values()))

        cleanup_list += fit_scripts + fit_coefs + fit_intercepts + fit_results + fit_coefsr + fit_resultsr
        for script in cleanup_list:
            outfile.write(script + "\n")
        print("Cleanup scripts written to cleanup_list.txt")

    with open("timing/timing_list.txt", 'w') as outfile:
        # The CV timing file is only set up when CV scripts were generated,
        # so it is only listed in that case.
        if run_cv:
            outfile.write(cvtimefile + "\n")
        outfile.write(fittimefile + "\n")
        outfile.write(resulttimefile + "\n")
    print("Timing files written to timing_list.txt")

    with open("summarize_time.sh", 'w') as outfile:
        outfile.write(
            "python summarize_time.py -i timing/timing_list.txt -o timing/summary_time.csv -oo timing/overall_time.csv\n"
        )
    os.chmod("summarize_time.sh", 0o777)
    print("Summarize timing script written to summarize_time.sh")
Ejemplo n.º 5
0
def run(args):
    """
    Write the job scripts and file lists for a row-partitioned causal run.

    The gene rows of the data file are split into partitions with
    pj.partition_inputs. For every partition this writes a row file and an
    executable bash script that calls run_causal_rand_row.py. It then writes
    script_list.txt, the output_matr_list.txt / int_matr_list.txt tables,
    (only for test "e") the output_df_list.txt / int_df_list.txt tables, the
    integrate_outputs.sh and fdr_control.sh scripts and, when
    args.parallel_num > 0, parallel_script_list.txt. Every file is written
    relative to the current working directory.

    All progress messages are printed as one pre-built string so the text is
    the same under the Python 2 print statement and the Python 3 print
    function.

    :param args: Parsed arguments. Reads data_file, rand_data_file, args_file,
        output_name, test, job_num, fdr, coef_num and parallel_num.
    :return: None
    """
    # Only the base names are used, both for loading and in the generated
    # commands, so the files are looked up in the working directory.
    data_file = args.data_file.split('/')[-1]
    rand_data_file = args.rand_data_file.split('/')[-1]

    df = gtm.load_file_and_avg(data_file)
    genes = df['gene'].values
    n = len(genes)

    script_filenames = []
    output_filenames = []
    output_rand_filenames = []

    # The per-partition parameter files are only produced for test "e".
    param_test = args.test == "e"
    if param_test:
        all_res_filenames = []
        use_filenames = []
        all_res_rand_filenames = []
        use_rand_filenames = []
    else:
        all_res_filenames = None
        use_filenames = None
        all_res_rand_filenames = None
        use_rand_filenames = None

    # A real list (not a lazy range) is passed so that str(partition_row)
    # written to the row file below is a list of indices on Python 3 too.
    partition_rows = pj.partition_inputs(list(range(n)), args.job_num)

    for i, partition_row in enumerate(partition_rows):

        script_filename = args.output_name + "-script-" + str(i) + ".sh"
        script_filenames.append(script_filename)

        output_filename = args.output_name + "-" + str(i) + ".p"
        output_filenames.append(output_filename)

        output_rand_filename = args.output_name + "-randomized-" + str(
            i) + ".p"
        output_rand_filenames.append(output_rand_filename)

        # prepare the job associated with this partition
        row_filename = args.output_name + "-row-" + str(i) + ".txt"

        command_string = "python run_causal_rand_row.py -d " + data_file + " -rd " + rand_data_file + \
                         " -a " + args.args_file.split('/')[-1] + " -t " + args.test + " -rl " + \
                         str(row_filename) + " -o " + output_filename + " -or " + output_rand_filename

        if param_test:
            all_res_filename = args.output_name + "-all-params-" + str(
                i) + ".txt"
            all_res_filenames.append(all_res_filename)

            use_filename = args.output_name + "-used-params-" + str(i) + ".txt"
            use_filenames.append(use_filename)

            all_res_rand_filename = args.output_name + "-all-params-randomized-" + str(
                i) + ".txt"
            all_res_rand_filenames.append(all_res_rand_filename)

            use_rand_filename = args.output_name + "-used-params-randomized-" + str(
                i) + ".txt"
            use_rand_filenames.append(use_rand_filename)

            command_string += " -oa " + all_res_filename + " -ou " + use_filename + \
                              " -ora " + all_res_rand_filename + " -oru " + use_rand_filename

        with open(row_filename, 'w') as rowfile:
            rowfile.write(str(partition_row) + "\n")

        print("Partition row written to  " + row_filename)

        with open(script_filename, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("module load python/2.7\n")
            outputfile.write("module load python/2.7/scipy-mkl\n")
            outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
        # 0o777 is the octal spelling accepted by both Python 2.6+ and 3.
        os.chmod(script_filename, 0o777)

        print("Script written to  " + script_filename)

    # Names of the files that hold the results integrated over all partitions.
    integrated_name_dict = {
        "Output": args.output_name + ".p",
        "Rand-Output": args.output_name + "-randomized.p",
        "All-Params": args.output_name + "-all-params.txt",
        "Use-Params": args.output_name + "-use-params.txt",
        "All-Rand-Params": args.output_name + "-all-params-randomized.txt",
        "Use-Rand-Params": args.output_name + "-use-params-randomized.txt",
    }

    with open("script_list.txt", 'w') as scriptfile:
        for script_filename in script_filenames:
            scriptfile.write(script_filename + "\n")
        print("Script list written to script_list.txt")

    # list of matrices to integrate
    output_matr_dict = {
        "Output": output_filenames,
        "Rand-Output": output_rand_filenames
    }
    output_matr_df = pd.DataFrame(output_matr_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print("Output matrices written to output_matr_list.txt")

    int_matr_dict = {
        x: integrated_name_dict[x]
        for x in ["Output", "Rand-Output"]
    }
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print("integrated matrices written to int_matr_list.txt")

    if param_test:
        # lists of dataframes (param files) to integrate
        output_df_dict = {}
        output_df_lists = [
            all_res_filenames, use_filenames, all_res_rand_filenames,
            use_rand_filenames
        ]
        output_df_names = [
            "All-Params", "Use-Params", "All-Rand-Params", "Use-Rand-Params"
        ]
        for out_list, out_name in zip(output_df_lists, output_df_names):
            if out_list is not None:
                output_df_dict[out_name] = out_list

        output_df_df = pd.DataFrame(output_df_dict)
        output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
        print("output dfs written to output_df_list.txt")

        # Walk the names in their declared order instead of iterating a set,
        # whose order is not fixed.
        int_df_dict = {
            x: integrated_name_dict[x]
            for x in output_df_names if x in output_df_dict
        }
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
        print("Integrated dfs written to int_df_list.txt")

    # here, "-a" means the axis to integrate by
    integrate_matr_command = "python integrate_outputs_rand_row.py -i output_matr_list.txt -t m -o int_matr_list.txt -a 1"
    with open("integrate_outputs.sh", 'w') as ifile:
        if param_test:
            # The parameter dataframes are integrated only if the matrix
            # integration succeeded.
            ifile.write(integrate_matr_command + " && " +
                        "python integrate_outputs_rand_row.py -i output_df_list.txt -t d -o int_df_list.txt\n")
        else:
            ifile.write(integrate_matr_command + "\n")

        print("Integration script written to integrate_outputs.sh")
    os.chmod("integrate_outputs.sh", 0o777)

    with open("fdr_control.sh", 'w') as ffile:
        fdr_string = "python fdr_control.py -m " + integrated_name_dict["Output"] + " -rm " + integrated_name_dict["Rand-Output"] + \
                    " -d " + data_file + " -rd " + rand_data_file + " -n " + args.output_name + " -f \"" + str(args.fdr) + "\" " + \
                    " -c " + str(args.coef_num) + " -mn " + str(1) + " -pp " + args.output_name + "-all-beta-histogram "
        # The same command is run twice, once with "-sb e" and once with "-sb n".
        ffile.write(fdr_string + " -sb e && " + fdr_string + " -sb n\n")
        print("FDR CONTROL script written to fdr_control.sh")
    os.chmod("fdr_control.sh", 0o777)

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(
            args.parallel_num))

        # ceil(len / parallel_num) groups are requested from partition_inputs.
        script_groups = pj.partition_inputs(
            script_filenames,
            number=int(
                math.ceil(len(script_filenames) * 1.0 / args.parallel_num)))

        print("Number of script groups  " + str(len(script_groups)))

        parallel_scripts = []
        for i, script_group in enumerate(script_groups):
            # One line per group: the group's scripts joined with " & ".
            parallel_script = " & ".join(
                "./" + script_filename for script_filename in script_group)
            print("Parallel Script  " + str(i) + " : " + parallel_script)
            parallel_scripts.append(parallel_script)

        with open("parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to parallel_script_list.txt")
Ejemplo n.º 6
0
def run(args):
    """Write the shell scripts and list files for a bootstrapped fit run.

    Nothing is fitted here. The function creates output folders and writes
    scripts that call fit_bootstrap.py, integrate_outputs_rand_row.py,
    get_result_coef.py, get_result_bootstrap.py and
    get_intercept_bootstrap.py, plus text files listing the paths those
    programs are told to read and write.

    Attributes read from ``args``: test, null, data_file, rand_data_file,
    load_reps, script_num, output_name, bootstrap_num, parallel_num, lag,
    only_array and test_name.

    Files and folders written, all relative to the current directory:

    * ``bootstrap-fit-scripts/<b>/<output_name>-bootstrap-<b>-row-<i>.sh``,
      one fit script per bootstrap ``b`` and row partition ``i``
    * ``bootstrap/<b>/``: the ``*_output_matr_list.txt``,
      ``*_int_matr_list.txt``, ``*_output_df_list.txt`` and
      ``*_int_df_list.txt`` tables for that bootstrap
    * ``bootstrap-finish-scripts/{none,effect}/finish-*-bootstrap-<b>.sh``
    * ``finish-effect-bootstrap-all.sh`` and ``finish-none-bootstrap-all.sh``
    * ``get_result_bootstrap.sh``, ``get_result_bootstrap_lite.sh`` and the
      per-FDR ``get_result_bootstrap-fdr-<fdr>-{effect,none}[_lite].sh``
    * ``all_bootstrap_coefs.txt``, ``all_bootstrap_intercepts.txt`` and the
      per-FDR ``all_bootstrap_{coefs,intercepts}_fdr-<fdr>-*.txt`` lists
    * ``bootstrap_script_list.txt`` and, when ``args.parallel_num > 0``,
      the ``*_parallel_script_list.txt`` files
    * header rows in ``timing/bootstrap_*_time.csv`` (only if missing) and
      three lines appended to ``timing/timing_list.txt``

    :param args: namespace carrying the attributes listed above
    :raises ValueError: if args.test is not r/l/e or args.null is not l/g
    :raises IOError: if one of the timing files cannot be opened
    :return: None
    """
    # Reject unknown test / null codes before anything is written to disk.
    if args.test not in {"r", "l", "e"}:
        raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)")

    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file

    # Only the gene count n is used further down; df and geneTS are not
    # referenced again in this function.
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)




    # Make row files
    # Split up the rows according to number of input scripts
    # (presumably pj.partition_inputs returns args.script_num groups of row
    # indices -- TODO confirm against parallel_jobs)
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []


    print("*************")
    print("ROWS")
    print("*************")

    # Only the row-file NAMES are built here; nothing is written under
    # "rows" by this function, and partition_row itself is unused.
    for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))):

        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

    # Prints the last name built above as an example of the naming format.
    # NOTE(review): row_filename is unbound here if partition_rows is empty.
    print("Reading rows from format: ", row_filename)

    print("*************")
    print("BOOTSTRAP")
    print("*************")


    # Root folder for the per-bootstrap outputs. The fits themselves are run
    # later by the generated scripts, not by this function.
    if not os.path.exists("bootstrap"):
        os.makedirs("bootstrap")

    # For the bootstrap individual fit scripts
    if not os.path.exists("bootstrap-fit-scripts"):
        os.makedirs("bootstrap-fit-scripts")


    # For the bootstrap finish scripts
    if not os.path.exists("bootstrap-finish-scripts"):
        os.makedirs("bootstrap-finish-scripts")

    # Finish, aggregating all the coefficients (stratification = none)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "none")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "none"))

    # Finish, stratifying each coefficient by the effect gene (stratification = effect)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "effect")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "effect"))








    # if args.write_all_bootstrap_scripts_first:

    print("WRITING ALL THE SCRIPTS INITIALLY!!!!!! NOTE the list will be written before all the files are written!!!")

    # One fit-script folder per bootstrap index.
    for b in range(args.bootstrap_num):
        if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
            os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))

    # Names of every fit script (bootstrap b x row partition i). The script
    # files themselves are written further down, in the main bootstrap loop.
    all_bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), args.output_name + "-bootstrap-" + str(b) + "-row-" + str(i) + ".sh")
                             for b in range(args.bootstrap_num) for i in range(len(row_filenames))]


    print("SCRIPTS")

    # First write of the script list; the same file is written again (sorted)
    # at the end of the function.
    with open("bootstrap_script_list.txt", 'w') as outfile:
        for bootstrap_script in all_bootstrap_scripts:
            outfile.write("./" + bootstrap_script + "\n")
        print("bootstrap scripts written to bootstrap_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))

            # ceil(len / parallel_num) is passed as `number`; presumably that
            # is the group count, giving about parallel_num scripts per group
            # -- TODO confirm against pj.partition_inputs.
            script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            # Each group becomes one line of "./a.sh & ./b.sh & ...".
            parallel_scripts = []
            for i, script_group in zip(list(range(len(script_groups))), script_groups):
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print("Parallel script list written to bootstrap_parallel_script_list.txt")









    # make one script for each...

    # all_bootstrap_scripts = set([])

    # Paths of the integrated (per-bootstrap) coefficient / intercept files,
    # collected so they can be listed in all_bootstrap_*.txt afterwards.
    all_int_coefs = []
    all_int_intercepts = []

    finish_none_scripts = []
    finish_effect_scripts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.
    fdrs = [0.01, 0.05, 0.1, 0.2]
    all_fdr_none_coefs_dict = dict([(x, []) for x in fdrs])
    all_fdr_effect_coefs_dict = dict([(x, []) for x in fdrs])

    all_fdr_none_intercepts_dict = dict([(x, []) for x in fdrs])
    all_fdr_effect_intercepts_dict = dict([(x, []) for x in fdrs])



    # Timing CSVs: the header row is written only when the file does not yet
    # exist. Any IOError from these opens is re-raised with a hint to run the
    # preparation script.
    try:
        fittimefile = os.path.join("timing", "bootstrap_fit_time.csv")
        if not os.path.exists(fittimefile):
            with open(fittimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])


        finishtimefile = os.path.join("timing", "bootstrap_finish_time.csv")
        if not os.path.exists(finishtimefile):
            with open(finishtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        resulttimefile = os.path.join("timing", "bootstrap_result_time.csv")
        if not os.path.exists(resulttimefile):
            with open(resulttimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        # Opened in append mode, so every call adds these three lines again.
        with open(os.path.join("timing/timing_list.txt"), 'a') as f:
            f.write(fittimefile + "\n")
            f.write(finishtimefile + "\n")
            f.write(resulttimefile + "\n")


    except IOError:
        raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.")


    # Main loop: everything below is generated once per bootstrap index b,
    # which is also passed to fit_bootstrap.py as the -s argument.
    for b in range(args.bootstrap_num):
        if b % 50 == 0:
            print("SEED/BOOTSTRAP NUM: ", b)

        bootstrap_outmost_name = args.output_name + "-bootstrap-" + str(b)

        bootstrap_folder = os.path.join("bootstrap", str(b))
        if not os.path.exists(bootstrap_folder):
            os.makedirs(bootstrap_folder)
        # print "Created folder: ", bootstrap_folder

        # Path prefix shared by every output of this bootstrap.
        bootstrap_outmost_prefix = os.path.join(bootstrap_folder, bootstrap_outmost_name)



        # Same folder the earlier pre-creation loop already made; kept as a
        # harmless re-check.
        if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
            os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))


        # create scripts for bootstrap
        bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), bootstrap_outmost_name + "-row-" + str(i) + ".sh")
                             for i in range(len(partition_rows))]
        bootstrap_row_prefixes = [bootstrap_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]

        # The literal texts "bootstrap_row_prefixes[i]" and "row_filename" are
        # placeholders that get swapped for real values per row below.
        # NOTE(review): .replace() acts on the whole command, so a data file
        # path or output name containing either placeholder text would be
        # substituted as well.
        command_template = "time python fit_bootstrap.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \
                             " -o " + "bootstrap_row_prefixes[i]" + " -bh " + \
                            "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \
                             "row_filename" + " -n " + args.null + " -s " + str(b) + " -oa " + str(args.only_array)

        for i, row_filename in zip(list(range(len(partition_rows))), row_filenames):

            # writing results to the bootstrap prefix

            command_string = command_template.replace("bootstrap_row_prefixes[i]", bootstrap_row_prefixes[i]).replace("row_filename", row_filename)

            # Fit script: record the start date, load the environment module,
            # run the fit, then append name/start/end/$SECONDS to the fit
            # timing CSV.
            with open(bootstrap_scripts[i], 'w') as outputfile:
                    outputfile.write("#!/bin/bash\n")
                    outputfile.write("START=$(date)\n")
                    #outputfile.write("module load python/2.7\n")
                    # outputfile.write("module load python/2.7/scipy-mkl\n")
                    # outputfile.write("module load python/2.7/numpy-mkl\n")
                    #outputfile.write("module load anaconda\n")
                    outputfile.write("module load anaconda3\n")
                    outputfile.write(command_string)
                    outputfile.write("\n")
                    outputfile.write("END=$(date)\n")
                    outputfile.write("echo " + bootstrap_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n")
            # Make the script executable (mode 0o777: rwx for everyone).
            os.chmod(bootstrap_scripts[i], 0o777)


        # print "Scripts made"

        # all_bootstrap_scripts = all_bootstrap_scripts.union(set(bootstrap_scripts))

        # Note the output files
        # (paths derived from each row prefix; this function does not create
        # these files, it only lists them)

        bootstrap_coefs = [bootstrap_row_prefix + "_coefs.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_intercepts = [bootstrap_row_prefix + "_intercepts.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_results = [bootstrap_row_prefix + "_fit_result_df.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_coefsr = [bootstrap_row_prefix + "_coefsr.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_resultsr = [bootstrap_row_prefix + "_fit_result_dfr.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]

        # Table with one line per row partition: the per-row matrix files
        # that integrate_outputs_rand_row.py is given via -i.
        bootstrap_output_dict = collections.OrderedDict()
        bootstrap_output_dict["coef"] = bootstrap_coefs
        bootstrap_output_dict["coefr"] = bootstrap_coefsr
        bootstrap_output_dict["intercept"] = bootstrap_intercepts
        # bootstrap_output_dict["interceptr"] = bootstrap_interceptsr
        # rand intercepts aren't put above because if it's a local null fit, then too many possible intercepts for each effect gene

        output_matr_df = pd.DataFrame(bootstrap_output_dict)
        output_matr_file = os.path.join(bootstrap_folder, bootstrap_outmost_name + "_output_matr_list.txt")
        output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
        # print "Raw parallelilized output matrices, before integration, written to", output_matr_file




        # Single-line table naming the integrated matrix files for this
        # bootstrap (passed to integrate_outputs_rand_row.py via -o).
        int_matr_dict = collections.OrderedDict()
        int_matr_dict["coef"] = bootstrap_outmost_prefix + "_coefs.p"
        int_matr_dict["coefr"] = bootstrap_outmost_prefix +  "_coefsr.p"
        int_matr_dict["intercept"] = bootstrap_outmost_prefix + "_intercepts.p"
        # int_matr_dict["interceptr"] = "bootstrap" + os.sep + bootstrap_outmost_name + "_interceptsr.p"

        # append these to the list of final bootstrapped coefficients
        all_int_coefs.append(int_matr_dict["coef"])
        all_int_intercepts.append(int_matr_dict["intercept"])

        int_matr_file = bootstrap_outmost_prefix +  "_int_matr_list.txt"
        # index=[0] is needed because every value in the dict is a scalar.
        int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
        int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
        # print "integrated matrices written to " + int_matr_file


        # Same pair of tables for the fit-result data frames: per-row inputs
        # first, then the single integrated output.
        bootstrap_result_dict = collections.OrderedDict()
        bootstrap_result_dict["fit_result"] = bootstrap_results
        bootstrap_result_dict["fit_resultr"] = bootstrap_resultsr



        output_df_file = bootstrap_outmost_prefix + "_output_df_list.txt"
        output_df_df = pd.DataFrame(bootstrap_result_dict)
        output_df_df.to_csv(output_df_file, sep="\t", index=False)
        # print "output dfs file written to ", output_df_file

        int_df_dict = collections.OrderedDict()
        int_df_dict["fit_result"] = bootstrap_outmost_prefix + "_fit_result_df.txt"
        int_df_dict["fit_resultr"] = bootstrap_outmost_prefix + "_fit_result_dfr.txt"

        int_df_file = bootstrap_outmost_prefix + "_int_df_list.txt"
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv(int_df_file, sep="\t", index=False)
        # print "Integrated dfs file written to ", int_df_file



        # just need to put all of this into the outmost name


        # Finish script, stratification "none" (-sb n): integrate the per-row
        # matrices and data frames, then run get_result_coef.py. The script
        # is written without a #! line and starts with "set -e".
        finish_none_script = os.path.join("bootstrap-finish-scripts", "none", "finish-none-bootstrap-" + str(b) + ".sh")
        with open(finish_none_script, 'w') as ifile:
            ifile.write("set -e\n")
            ifile.write("START=$(date)\n")
            # No newline after this write: the " && ..." written next
            # continues the same shell line.
            ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(" && " + \
                        "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n"
                        )
            ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                        " -lr " + str(args.load_reps) + \
                        " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                        " -o " + \
                         bootstrap_outmost_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                        " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                        int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                        " -sb " + "n" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_none_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
            # print "Finish script, stratby None, written to", finish_none_script
            os.chmod(finish_none_script, 0o777)

        finish_none_scripts.append(finish_none_script)


        # Finish script, stratification "effect": same commands as the
        # "none" script except that get_result_coef.py receives -sb e.
        finish_effect_script = os.path.join("bootstrap-finish-scripts", "effect", "finish-effect-bootstrap-" + str(b) + ".sh")
        with open(finish_effect_script, 'w') as ifile:
            ifile.write("set -e\n")
            ifile.write("START=$(date)\n")
            ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(" && " + \
                        "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n"
                        )
            ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                        " -lr " + str(args.load_reps) + \
                        " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                        " -o " + \
                        bootstrap_outmost_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                        " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                        int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                        " -sb " + "e" + " -tn " + args.test_name  + " -of " + bootstrap_folder + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_effect_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")

            # print "Finish script, stratby effect, written to", finish_effect_script
            os.chmod(finish_effect_script, 0o777)

        finish_effect_scripts.append(finish_effect_script)


        # get all the fdr files immediately
        # (only the expected paths bootstrap/<b>/fdr-<fdr>-<strat>/... are
        # recorded here; presumably the finish scripts produce the files --
        # TODO confirm in get_result_coef.py)

        for fdr in fdrs:
            all_fdr_none_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
                               bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" +  "-coefs.p"))
            all_fdr_effect_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
                                bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" +  "-coefs.p"))

            all_fdr_none_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
                               bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" +  "-intercepts.p"))
            all_fdr_effect_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
                                bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" +  "-intercepts.p"))




        # print "-----------"


    # One integrated coefficient path per bootstrap, one per line.
    int_coef_file = "all_bootstrap_coefs.txt"
    with open(int_coef_file, 'w') as f:
        for b_coef in all_int_coefs:
            f.write(b_coef + "\n")
    print("All integrated bootstrapped coef files written to ", int_coef_file)

    # Same for the integrated intercept paths.
    int_intercept_file = "all_bootstrap_intercepts.txt"
    with open(int_intercept_file, 'w') as f:
        for b_intercept in all_int_intercepts:
            f.write(b_intercept + "\n")
    print("All integrated bootstrapped intercept files written to ", int_intercept_file)



    # Driver that runs every "effect" finish script one after another;
    # "set -e" is written first so the driver stops at the first failure.
    all_finish_effect_script = "finish-effect-bootstrap-all.sh"
    with open(all_finish_effect_script, 'w') as f:
        f.write("set -e\n")
        for s in finish_effect_scripts:
            f.write("./" + s + "\n")
    os.chmod(all_finish_effect_script, 0o777)

    print("All bootstrap effects scripts written to ", all_finish_effect_script)


    # Optional grouped list of the "effect" finish scripts, built the same
    # way as the fit-script parallel list.
    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))

        script_groups = pj.partition_inputs(finish_effect_scripts, number=int(math.ceil(len(finish_effect_scripts) * 1.0/args.parallel_num)))

        print("Number of script groups ", len(script_groups))

        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)

        with open("finish-effect-bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to finish-effect-bootstrap_parallel_script_list.txt")



    # Driver for the "none" finish scripts, mirroring the "effect" driver.
    all_finish_none_script = "finish-none-bootstrap-all.sh"
    with open(all_finish_none_script, 'w') as f:
        f.write("set -e\n")
        for s in finish_none_scripts:
            f.write("./" + s + "\n")
    os.chmod(all_finish_none_script, 0o777)

    print("All bootstrap nones scripts written to ", all_finish_none_script)


    # Optional grouped list of the "none" finish scripts.
    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))

        script_groups = pj.partition_inputs(finish_none_scripts, number=int(math.ceil(len(finish_none_scripts) * 1.0/args.parallel_num)))

        print("Number of script groups ", len(script_groups))

        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))), script_groups):
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)

        with open("finish-none-bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to finish-none-bootstrap_parallel_script_list.txt")





    # Summary over all bootstraps (no FDR thresholding): results go to
    # bootstrap/bootstrap-results.

    bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results")
    if not os.path.exists(bootstrap_result_folder):
        os.makedirs(bootstrap_result_folder)


    # NOTE(review): -tn receives args.test here, whereas the finish scripts
    # pass args.test_name to get_result_coef.py -- confirm this is intended.
    bootstrap_summary_file = "get_result_bootstrap.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("START=$(date)\n")
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                             " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1"+ " -tbf " + "bootstrap-transpose" + " -uabrd 0\n")
        f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
        f.write("END=$(date)\n")
        f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)


    # integrate in a lite version
    # (output name gets a "_lite" suffix, and "-dl 1" is passed in place of
    # the "-tbf" option; the get_intercept_bootstrap.py line still uses the
    # plain output name)

    bootstrap_summary_file = "get_result_bootstrap_lite.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("START=$(date)\n")
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                             " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1"+ " -dl 1 -uabrd 0\n")
        f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
        f.write("END=$(date)\n")
        f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)



    # Per-FDR summaries: for each threshold and each stratification, write
    # the coefficient/intercept path lists plus a full and a lite summary
    # script. bootstrap_result_folder is rebound for each combination.
    for fdr in fdrs:
        print("*************************")
        print("Integrating bootstrap files for FDR ", fdr)

        print("****EFFECT***")

        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-effect")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)


        # write the fdr file out
        bootstrap_fdr_effect_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-effect.txt"
        with open(bootstrap_fdr_effect_list_file, 'w') as f:
            for b_coef in all_fdr_effect_coefs_dict[fdr]:
                f.write(b_coef + "\n")

            print("All fdr effect written to ", bootstrap_fdr_effect_list_file)


        # The intercept list is written, but no script generated in this
        # function refers to it.
        bootstrap_fdr_effect_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-effect.txt"
        with open(bootstrap_fdr_effect_intercept_list_file, 'w') as f:
            for b_intercept in all_fdr_effect_intercepts_dict[fdr]:
                f.write(b_intercept + "\n")

            print("All fdr effect written to ", bootstrap_fdr_effect_intercept_list_file)


        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect.sh"

        # Unlike the un-thresholded summary, these scripts pass "-da 0" and
        # "-uabrd 1" and add "set -e".
        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file +  " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-effect  -uabrd 1\n")
            # f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)


        # Lite variant; the script-name variable is reused for it.
        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect_lite.sh"

        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-effect"  + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file +  " -da 0" + " -dl 1 -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)




        print("-----------------------")


        print("****NONE***")

        # Same four outputs for the "none" stratification.
        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-none")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)


        bootstrap_fdr_none_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-none.txt"
        with open(bootstrap_fdr_none_list_file, 'w') as f:
            for b_coef in all_fdr_none_coefs_dict[fdr]:
                f.write(b_coef + "\n")

            print("All fdr none written to ", bootstrap_fdr_none_list_file)


        # As with "effect", this intercept list is not referenced by any
        # script generated here.
        bootstrap_fdr_none_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-none.txt"
        with open(bootstrap_fdr_none_intercept_list_file, 'w') as f:
            for b_intercept in all_fdr_none_intercepts_dict[fdr]:
                f.write(b_intercept + "\n")

            print("All fdr none written to ", bootstrap_fdr_none_intercept_list_file)


        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none.sh"

        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-none -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)



        # Lite variant for "none".
        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none_lite.sh"

        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -dl 1 -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)



        print()
    print("FDR DONE ")
    print(" *************************************")


    print("SCRIPTS")

    # Second write of bootstrap_script_list.txt, now in sorted order. The
    # parallel list below is rebuilt from the unsorted all_bootstrap_scripts
    # and overwrites the one written near the top of the function.
    with open("bootstrap_script_list.txt", 'w') as outfile:
        # LEFT OFF HERE (original author's marker)
        for bootstrap_script in sorted(all_bootstrap_scripts):
            outfile.write("./" + bootstrap_script + "\n")
        print("bootstrap scripts written to bootstrap_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))

            script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            parallel_scripts = []
            for i, script_group in zip(list(range(len(script_groups))), script_groups):
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print("Parallel script list written to bootstrap_parallel_script_list.txt")


    print("TIMING")
Ejemplo n.º 7
0
def run(args):

    data_file = args.data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)




    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []


    print("*************")
    print("ROWS")
    print("*************")

    for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))):

        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

    print("Reading rows from format: ", row_filename)

    print("*************")
    print("PAIRWISE")
    print("*************")


    # Run the actual fit
    # Need an integration
    if not os.path.exists("pairwise"):
        os.makedirs("pairwise")

    # For the pairwise individual fit scripts
    if not os.path.exists("pairwise-fit-scripts"):
        os.makedirs("pairwise-fit-scripts")


    # For the pairwise finish scripts
    if not os.path.exists("pairwise-finish-scripts"):
        os.makedirs("pairwise-finish-scripts")


    pairwise_result_folder = os.path.join("pairwise", "pairwise-results")
    if not os.path.exists(pairwise_result_folder):
        os.makedirs(pairwise_result_folder)





    # make one script for each...

    # all_bootstrap_scripts = set([])

    # all_int_coefs = []
    # all_int_intercepts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.



    try:
        fittimefile = os.path.join("timing", "pairwise_fit_time.csv")
        if not os.path.exists(fittimefile):
            with open(fittimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])


        finishtimefile = os.path.join("timing", "pairwise_finish_time.csv")
        if not os.path.exists(finishtimefile):
            with open(finishtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        # resulttimefile = os.path.join("timing", "bootstrap_result_time.csv")
        # if not os.path.exists(resulttimefile):
        #     with open(resulttimefile, 'w') as csvfile:
        #         f = csv.writer(csvfile)
        #         f.writerow(["Name", "Start", "End", "Elapsed"])

        with open(os.path.join("timing/timing_list.txt"), 'a') as f:
            f.write(fittimefile + "\n")
            f.write(finishtimefile + "\n")
            # f.write(resulttimefile + "\n")


    except IOError:
        raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.")

    pairwise_outmost_name = args.output_name + "-pairwise"
    pairwise_outmost_prefix = os.path.join("pairwise", pairwise_outmost_name)


    # create scripts for pairwise
    pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh")
                         for i in range(len(partition_rows))]
    pairwise_row_prefixes = [pairwise_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]

    command_template = "time python fit_pairwise.py -d " + data_file + " -lr " + str(args.load_reps) + \
                         " -o " + "pairwise_row_prefixes[i]" +  " -l " + str(args.lag) + " -rl " + \
                         "row_filename"

    for i, row_filename in zip(list(range(len(partition_rows))), row_filenames):

        # writing results to the pairwise prefix

        command_string = command_template.replace("pairwise_row_prefixes[i]", pairwise_row_prefixes[i]).replace("row_filename", row_filename)

        with open(pairwise_scripts[i], 'w') as outputfile:
                outputfile.write("#!/bin/bash\n")
                outputfile.write("START=$(date)\n")
                outputfile.write("module load python/2.7\n")
                # outputfile.write("module load python/2.7/scipy-mkl\n")
                # outputfile.write("module load python/2.7/numpy-mkl\n")
                outputfile.write("module load anaconda\n")
                outputfile.write(command_string)
                outputfile.write("\n")
                outputfile.write("END=$(date)\n")
                outputfile.write("echo " + pairwise_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n")
        os.chmod(pairwise_scripts[i], 0o777)


        print("Scripts made")

        # all_pairwise_scripts = all_pairwise_scripts.union(set(pairwise_scripts))

        # Note the output files

    pairwise_coefs = [pairwise_row_prefix + "_coefs.p" for pairwise_row_prefix in pairwise_row_prefixes]

    pairwise_output_dict = collections.OrderedDict()
    pairwise_output_dict["coef"] = pairwise_coefs

    output_matr_df = pd.DataFrame(pairwise_output_dict)
    output_matr_file = os.path.join("pairwise", pairwise_outmost_name + "_output_matr_list.txt")
    output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
    print("Raw parallelilized output matrices, before integration, written to", output_matr_file)




    int_matr_dict = collections.OrderedDict()
    int_matr_dict["coef"] = os.path.join(pairwise_result_folder, pairwise_outmost_name + "_coefs.p")

    # # append these to the list of final bootstrapped coefficients
    # all_int_coefs.append(int_matr_dict["coef"])
    # all_int_intercepts.append(int_matr_dict["intercept"])

    int_matr_file = pairwise_outmost_prefix +  "_int_matr_list.txt"
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
    print("integrated matrices written to " + int_matr_file)



    # just need to put all of this into the outmost name

    all_pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh")
                             for i in range(len(partition_rows))]


    print("SCRIPTS")

    with open("pairwise_script_list.txt", 'w') as outfile:
        for pairwise_script in all_pairwise_scripts:
            outfile.write("./" + pairwise_script + "\n")
        print("pairwise scripts written to pairwise_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))

            script_groups = pj.partition_inputs(all_pairwise_scripts, number=int(math.ceil(len(all_pairwise_scripts) * 1.0/args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            parallel_scripts = []
            for i, script_group in zip(list(range(len(script_groups))), script_groups):
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("pairwise_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print("Parallel script list written to pairwise_parallel_script_list.txt")


    finish_script = os.path.join("pairwise-finish-scripts", "finish.sh")
    with open(finish_script, 'w') as ifile:
        ifile.write("set -e\n")
        ifile.write("START=$(date)\n")
        ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  " -t a \n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + finish_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
        print("Finish script, written to", finish_script)
        os.chmod(finish_script, 0o777)
def _write_executable_script(script_path, contents):
    """Write ``contents`` to ``script_path`` and mark the file executable.

    The file is closed before its mode is changed to 0o777.
    """
    with open(script_path, 'w') as script:
        script.write(contents)
    os.chmod(script_path, 0o777)


def _bootstrap_summary_command(args, data_file, output_prefix, coef_list_file, da_flag):
    """Return the ``get_result_bootstrap.py`` command line for one summary script.

    :param args: parsed arguments; ``load_reps``, ``lag`` and ``test`` are read
    :param data_file: value passed to ``-df``
    :param output_prefix: value passed to ``-o``
    :param coef_list_file: value passed to ``-b``
    :param da_flag: value passed to ``-da`` (a string, e.g. "0" or "1")
    """
    return ("time python get_result_bootstrap.py -df " + data_file +
            " -lr " + str(args.load_reps) +
            " -o " + output_prefix +
            " -l " + str(args.lag) +
            " -tn " + args.test +
            " -b " + coef_list_file +
            " -da " + da_flag)


def _write_fdr_summary_script(args, data_file, fdr, stratification):
    """Create the result folder and summary script for one FDR/stratification pair.

    :param args: parsed arguments (see ``_bootstrap_summary_command``), plus ``output_name``
    :param data_file: value passed to ``-df``
    :param fdr: FDR threshold used in the folder, list-file and script names
    :param stratification: "effect" or "none"; used verbatim in all names
    :return: the name of the script that was written
    """
    suffix = "-fdr-" + str(fdr) + "-" + stratification

    result_folder = os.path.join("bootstrap", "bootstrap-results" + suffix)
    os.makedirs(result_folder, exist_ok=True)

    # The list file is only referenced by name; it is not created here.
    coef_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-" + stratification + ".txt"
    summary_script = "get_result_bootstrap" + suffix + ".sh"

    command = _bootstrap_summary_command(
        args,
        data_file,
        os.path.join(result_folder, args.output_name) + suffix,
        coef_list_file,
        "0")
    _write_executable_script(summary_script, "set -e\n" + command)
    return summary_script


def run(args):
    """Prepare the folders and summary scripts for the bootstrap analysis.

    Reads the gene list from ``args.data_file`` to work out how many row
    partitions there are, creates the bootstrap output/script folders, and
    writes ``get_result_bootstrap.sh`` plus one ``effect`` and one ``none``
    summary script for each FDR threshold in ``[0.01, 0.05, 0.1, 0.2]``.

    :param args: parsed arguments. Attributes read: ``test``, ``null``,
        ``data_file``, ``rand_data_file``, ``load_reps``, ``script_num``,
        ``output_name`` and ``lag``.
    :raises ValueError: if ``args.test`` is not one of "r", "l", "e", or
        ``args.null`` is not one of "l", "g".
    """
    if args.test not in {"r", "l", "e"}:
        raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)")

    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    # NOTE(review): rand_data_file is not used by any command generated in
    # this block -- confirm whether it can be dropped.
    rand_data_file = args.rand_data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)

    # Split up the rows according to number of input scripts; one row file
    # name per partition.
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)
    row_filenames = [os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
                     for i in range(len(partition_rows))]

    print("*************")
    print("ROWS")
    print("*************")

    # Only report an example name when there is one: with no partitions there
    # is no row file to show.
    if row_filenames:
        print("Reading rows from format: ", row_filenames[-1])

    print("*************")
    print("BOOTSTRAP")
    print("*************")

    # Folders for the fit outputs, the individual fit scripts, and the finish
    # scripts (aggregated with no stratification, or stratified by effect gene).
    for folder in ("bootstrap",
                   "bootstrap-fit-scripts",
                   "bootstrap-finish-scripts",
                   os.path.join("bootstrap-finish-scripts", "none"),
                   os.path.join("bootstrap-finish-scripts", "effect")):
        os.makedirs(folder, exist_ok=True)

    # NOTE: per-bootstrap fit and finish scripts are not generated here. The
    # coefficient list files passed to the summary scripts via -b are referenced
    # by name only; nothing in this function writes them.
    fdrs = [0.01, 0.05, 0.1, 0.2]

    int_coef_file = "all_bootstrap_coefs.txt"

    # Summary over all integrated bootstrapped coefficients
    bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results")
    os.makedirs(bootstrap_result_folder, exist_ok=True)

    bootstrap_summary_file = "get_result_bootstrap.sh"
    _write_executable_script(
        bootstrap_summary_file,
        _bootstrap_summary_command(
            args,
            data_file,
            os.path.join(bootstrap_result_folder, args.output_name),
            int_coef_file,
            "1"))
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)

    # One effect-stratified and one unstratified summary script per FDR level
    for fdr in fdrs:
        print("*************************")
        print("Integrating bootstrap files for FDR ", fdr)

        print("****EFFECT***")
        effect_script = _write_fdr_summary_script(args, data_file, fdr, "effect")
        print("Script to analyze integrated bootstrapped coefs in", effect_script)

        print("-----------------------")

        print("****NONE***")
        none_script = _write_fdr_summary_script(args, data_file, fdr, "none")
        print("Script to analyze integrated bootstrapped coefs in", none_script)

        print()
    print("FDR DONE ")
    print(" *************************************")