Example #1
def run(args):
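    # Fit elastic-net Granger causality on the real data with per-row CV,
    # then refit the randomized (null) data using the CV-selected per-row
    # (alpha, lambda.min) parameters, and save both sets of results.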

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)


    dfr = gtm.load_file_and_avg(args.rand_data_file)

    genesr = dfr['gene'].values

    found_genesr, geneTSr = gtm.get_gene_TS(dfr, genesr)

    n = geneTSr.shape[0]

    args_dict = ct.load_kwargs_file(argsfile=args.args_file)

    print(args_dict)




    if args.rowlist_file is not None:
        # the rowlist file holds a Python literal, e.g. a list of row indices
        with open(args.rowlist_file, 'r') as f:
            rowlist = eval(f.readline())
    else:
        rowlist = list(range(n))


    if args.test == "e":
        beta_tuple, all_res_df, use_df = ct.enet_granger_causality_row_cv(geneTS, geneTS, rowlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(beta_tuple, outfile)
        all_res_df.to_csv(args.output_all_name, sep="\t", index=False)
        use_df.to_csv(args.output_use_name, sep="\t", index=False)


        param_df = use_df[["alpha", "lambda.min", "Row"]]

        rand_beta_tuple, rand_all_res_df, rand_use_df = ct.enet_granger_causality_row_load(geneTSr, geneTS, rowlist, param_df, **args_dict)

        with open(args.output_rand_name, 'wb') as outfile:
            pickle.dump(rand_beta_tuple, outfile)

        rand_all_res_df.to_csv(args.output_rand_all_name, sep="\t", index=False)
        rand_use_df.to_csv(args.output_rand_use_name, sep="\t", index=False)

        print "HIIIIIII"
        print "Output written to ", args.output_name
        print "All results written to ", args.output_all_name
        print "Used params written to ", args.output_use_name

        print "Rand output written to ", args.output_rand_name
        print "All rand results written to ", args.output_rand_all_name
        print "Used rand params written to ", args.output_rand_use_name
Example #2
def run(args):

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    dfr = gtm.load_file_and_avg(args.rand_data_file)

    genesr = dfr['gene'].values

    found_genesr, geneTSr = gtm.get_gene_TS(dfr, genesr)

    n = geneTSr.shape[0]

    args_dict = ct.load_kwargs_file(argsfile=args.args_file)

    print(args_dict)

    if args.rowlist_file is not None:
        with open(args.rowlist_file, 'r') as f:
            rowlist = eval(f.readline())
    else:
        rowlist = list(range(n))

    if args.test == "e":
        beta_tuple, all_res_df, use_df = ct.enet_granger_causality_row_cv(
            geneTS, geneTS, rowlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(beta_tuple, outfile)
        all_res_df.to_csv(args.output_all_name, sep="\t", index=False)
        use_df.to_csv(args.output_use_name, sep="\t", index=False)

        param_df = use_df[["alpha", "lambda.min", "Row"]]

        rand_beta_tuple, rand_all_res_df, rand_use_df = ct.enet_granger_causality_row_load(
            geneTSr, geneTS, rowlist, param_df, **args_dict)

        with open(args.output_rand_name, 'wb') as outfile:
            pickle.dump(rand_beta_tuple, outfile)

        rand_all_res_df.to_csv(args.output_rand_all_name,
                               sep="\t",
                               index=False)
        rand_use_df.to_csv(args.output_rand_use_name, sep="\t", index=False)

        print "HIIIIIII"
        print "Output written to ", args.output_name
        print "All results written to ", args.output_all_name
        print "Used params written to ", args.output_use_name

        print "Rand output written to ", args.output_rand_name
        print "All rand results written to ", args.output_rand_all_name
        print "Used rand params written to ", args.output_rand_use_name
Example #3
def load_and_run(args):
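    # Load the real and randomized datasets, check that they are aligned,
    # and run the causal fit (ridge/lasso/elastic net per args.test) with
    # the previously selected best hyperparameter.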

    data_file = args.data_file
    rand_data_file = args.rand_data_file
    save_prefix = args.out_prefix

    assert args.test in {'e', 'l', 'r'}
    fit_method = cp.test2fit_method[args.test]

    lag = args.lag
    best_hyper = pickle.load(open(args.best_hyper_file, 'rb'))

    if args.row_file is not None:
        rows = pickle.load(open(args.row_file, 'rb'))
    else:
        rows = None

    assert args.null in {"l", "g"}

    # Load data file
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        genesr, geneTSr = gtm.load_basic_rep_file_list(rand_data_file)

        # dfs, genes, geneTS, df, __, __  = gtm.load_rep_file_list(data_file)
        # dfsr, genesr, geneTSr, dfr, __, __  = gtm.load_rep_file_list(rand_data_file)

    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)

    assert (geneTS.shape == geneTSr.shape)
    assert (genes == genesr).all()

    coefs, intercepts, fit_result_df, coefsr, fit_result_dfr = cp.run(
        geneTS,
        geneTSr,
        hyper=best_hyper,
        fit_method=fit_method,
        lag=lag,
        rows=rows,
        save_prefix=save_prefix,
        has_reps=args.load_reps,
        null=args.null,
        only_array=args.only_array)

    print("RESULTS of causal fit")
    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs, intercepts, fit_result_df)
Example #4
def load_and_run(args):
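    # Fit a pairwise-conditional model for every gene pair at the given
    # lag and pickle the resulting coefficients.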

    data_file = args.data_file

    lag = args.lag

    if args.row_file is not None:
        rows = pickle.load(open(args.row_file, 'rb'))
    else:
        rows = None

    # Load data file
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)

    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)

    coefs = fit_all_pairwise_conditional(geneTS=geneTS,
                                         lag=lag,
                                         rows=rows,
                                         coeflag_options=None,
                                         has_reps=args.load_reps)

    outfile = args.out_prefix + "_coefs.p"
    pickle.dump(coefs, open(outfile, 'wb'))
    print("Coefs saved to ", outfile)
Example #5
def run(args):

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    args_dict = load_kwargs_file(argsfile=args.args_file)

    if args.pairlist_file is None:
        pairlist = None
    else:
        pairlist = np.load(args.pairlist_file)

    print(args_dict)

    if args.test == 'g':
        output = ct.pairwise_granger_causality_all(geneTS, pairlist, **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(output, outfile)

    print "HELLOOOOOOOO"
    print "Output written to ", args.output_name
Example #6
def run(args):

    # load the data

    df = gtm.load_file_and_avg(args.data_file)

    genes = df['gene'].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    args_dict = load_kwargs_file(argsfile=args.args_file)

    if args.pairlist_file is None:
        pairlist = None
    else:
        pairlist = np.load(args.pairlist_file)

    print(args_dict)

    if args.test == 'g':
        output = ct.pairwise_granger_causality_all(geneTS, pairlist,
                                                   **args_dict)
        with open(args.output_name, 'wb') as outfile:
            pickle.dump(output, outfile)

    print "HELLOOOOOOOO"
    print "Output written to ", args.output_name
Example #7
def main():
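    # Sweep Granger-causality model orders, collecting a p-value matrix
    # per order; threshold each at p_threshold and compare which pairs
    # stay significant across all orders.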
    tstart = time.time()

    input_file = args.input_file
    out_file_prefix = args.out_file_prefix

    start_index = args.start_index
    end_index = args.end_index

    df = gtm.load_file_and_avg(input_file)

    genes = df['gene'][start_index:end_index].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    cause_type = args.cause_type

    if cause_type == 'g':
        model_orders = range(args.model_order_min, args.model_order_max + 1)

        threshold = args.p_threshold

        p_matr_list = []
        sig_matr_list = []

        for model_order in model_orders:
            t_gc = time.time()
            p_matr = pairwise_granger_causality_all(
                geneTS,
                model_order=model_order,
                use_processes=args.use_processes,
                procnum=args.procnum)
            print "Time for granger causality", time.time() - t_gc

            sig_matr = p_matr < threshold

            p_matr_list.append(p_matr)
            sig_matr_list.append(sig_matr)

        all_sig_matr, all_sig_num, not_sig_num = gtm.compare_sig_matr(
            sig_matr_list=sig_matr_list)

        print "Total number of significant pairs ", all_sig_num + not_sig_num
        print "Pairs significant across all matrices ", all_sig_num, all_sig_num * 1.0 / (
            all_sig_num + not_sig_num)

        out_file_name = out_file_prefix + "_GC.p"
        pickle.dump([
            model_orders, p_matr_list, sig_matr_list,
            (all_sig_matr, all_sig_num, not_sig_num)
        ], open(out_file_name, "wb"))

        print "Results written  to", out_file_name

    # compare the significant matrices

    # save the output p matrices

    print "Total time used ", time.time() - tstart
Example #8
def main():
    tstart = time.time()


    input_file = args.input_file
    out_file_prefix = args.out_file_prefix


    start_index = args.start_index
    end_index = args.end_index


    df = gtm.load_file_and_avg(input_file)

    genes = df['gene'][start_index:end_index].values

    found_genes, geneTS = gtm.get_gene_TS(df, genes)

    cause_type = args.cause_type

    if cause_type == 'g':
        model_orders = range(args.model_order_min, args.model_order_max + 1)

        threshold = args.p_threshold

        p_matr_list = []
        sig_matr_list = []

        for model_order in model_orders:
            t_gc = time.time()
            p_matr = pairwise_granger_causality_all(geneTS, model_order=model_order, use_processes=args.use_processes, procnum=args.procnum)
            print "Time for granger causality", time.time() - t_gc


            sig_matr = p_matr < threshold

            p_matr_list.append(p_matr)
            sig_matr_list.append(sig_matr)



        all_sig_matr, all_sig_num, not_sig_num = gtm.compare_sig_matr(sig_matr_list=sig_matr_list)

        print "Total number of significant pairs ", all_sig_num + not_sig_num
        print "Pairs significant across all matrices ", all_sig_num, all_sig_num * 1.0 / (all_sig_num + not_sig_num)


        out_file_name = out_file_prefix + "_GC.p"
        pickle.dump([model_orders, p_matr_list, sig_matr_list, (all_sig_matr, all_sig_num, not_sig_num)], open(out_file_name, "wb"))

        print "Results written  to", out_file_name



    # compare the significant matrices

    # save the output p matrices

    print "Total time used ", time.time() - tstart
Example #9
def load_and_run(args):
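    # Cross-validate a list of candidate hyperparameters for the chosen
    # fit method and write one fit-result table per hyperparameter setting.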

    data_file = args.data_file
    output_names = args.output_names


    assert args.test in {'e', 'l', 'r'}
    fit_method = cp.test2fit_method[args.test]

    lag = args.lag
    hyperlist = pickle.load(open(args.hyper_file, 'rb'))

    if args.row_file is not None:
        rows = pickle.load(open(args.row_file, 'rb'))
    else:
        rows = None


    # Load data file
    if args.load_reps:

        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __  = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)





    best_hyper, best, hyper_df, hyper_fit_dfs = cp.run_cross_validate(geneTS, fit_method=fit_method,
                                                                      hyperlist=hyperlist, lag=lag, rows=rows,
                                                                      has_reps=args.load_reps)



    print("Best hyper is : ", best_hyper)
    print("Best result : ", best)

    print("Hyper df: ")
    print(hyper_df)

    for output_name, hyper_fit_df, hyper in zip(output_names, hyper_fit_dfs, hyperlist):
        hyper_fit_df.to_csv(output_name, sep="\t", index=False)

        print("Result for ", hyper, " written to ", output_name)
Example #10
def load_and_run(args):
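    # Post-processing: summarize the fitted coefficients, write per-lag
    # coefficient matrices and networks for the real and randomized fits,
    # plot coefficient distributions, and threshold the networks at
    # several empirical FDR levels.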

    lag = args.lag
    save_prefix = args.save_prefix

    assert args.stratify_by in {"e", "n"}

    stratify_by = cp.args2stratify_by[args.stratify_by]

    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        # load
        dfs, genes, geneTS, df, timekeys, num_per_keys = gtm.load_rep_file_list(
            args.data_file)
        dfsr, genesr, geneTSr, dfr, __, __ = gtm.load_rep_file_list(
            args.rand_data_file)

        # get shared prefix timekeys

        print "Timekeys: ", timekeys
        print "Num per key: ", num_per_keys

    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)

        timekeys = df.columns.values[1:]
        print "Timekeys: ", timekeys

        # Num. replicates per key
        num_per_keys = None

    assert (geneTS.shape == geneTSr.shape)
    assert (genes == genesr).all()

    coefs = pickle.load(open(args.coef_file, 'rb'))
    intercepts = pickle.load(open(args.intercept_file, 'rb'))
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")

    coefsr = pickle.load(open(args.coef_rand_file, 'rb'))
    # interceptsr = pickle.load(open(args.intercept_rand_file, 'rb'))
    fit_result_dfr = pd.read_csv(args.fit_result_rand_file, sep="\t")

    if args.best_hyper_file is not None:
        best_hyper = pickle.load(open(args.best_hyper_file, 'rb'))
    else:
        best_hyper = None

    print "RESULTS"
    print "*************************"
    print "NORMAL: "
    cp.summarize_fit(coefs,
                     intercepts,
                     fit_result_df,
                     filename="fit_all_summary_normal.txt",
                     hyper=best_hyper,
                     test_name=args.test_name,
                     lag=lag)

    # print "*************************"
    # print "RANDOM:"
    # cp.summarize_fit(coefsr, interceptsr, fit_result_dfr, filename="fit_all_summary_random.txt", hyper=best_hyper,
    #                  test_name=args.test_name, lag=lag)

    # TODO: check whether the fit-result summary can still be computed
    # without the intercepts (see the commented-out RANDOM block above).

    # Align the coefs

    # print "Aligning coefficients"
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)

    print "Removing alphas (gene-on-self effects) "

    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    coef_nets = []
    coefr_nets = []

    # Save the gene matrices
    for i in range(acoefs.shape[0]):
        coef_matr_filename = save_prefix + "-" + str(i + 1) + "-matrix.txt"
        coefr_matr_filename = save_prefix + "-" + str(i + 1) + "-r-matrix.txt"

        coef_net_filename = save_prefix + "-" + str(i + 1) + "-network.txt"
        coefr_net_filename = save_prefix + "-" + str(i + 1) + "-r-network.txt"

        coef_matr = gtm.save_gene_matrix(filename=coef_matr_filename,
                                         matrix=acoefs[i],
                                         genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=coefr_matr_filename,
                                          matrix=acoefsr[i],
                                          genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = acoefs.shape[0]
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr,
                                  extra_dict=extra_dict,
                                  make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr,
                                   extra_dict=extra_dict,
                                   make_type=False)

        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)

        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        print "Coef ", i + 1
        print "Networks written to "
        print coef_net_filename
        print coefr_net_filename

    # max_net_filename = save_prefix + "-max-network.txt"
    # max_r_net_filename = save_prefix + "-max-r-network.txt"
    union_net_filename = save_prefix + "-union-network.txt"
    union_r_net_filename = save_prefix + "-union-r-network.txt"

    if acoefs.shape[0] > 1:
        m_net = cp.get_max_network(coef_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_net = cp.get_union_network(
            coef_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
        print "Max network edges: ", m_net.shape
        print "Union network edges: ", union_net.shape
    else:
        union_net = coef_nets[0]
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    if acoefsr.shape[0] > 1:
        m_net = cp.get_max_network(coefr_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_r_net = cp.get_union_network(
            coefr_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
    else:
        union_r_net = coefr_nets[0]
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    # print "Max networks written to "
    # print max_net_filename
    # print max_r_net_filename
    print "Unioned networks written to "
    print union_net_filename
    print union_r_net_filename

    if not os.path.exists("plots"):
        os.makedirs("plots")
    if not os.path.exists("plots" + os.sep + "betas"):
        os.makedirs("plots" + os.sep + "betas")

    # Plot the betas
    for i in range(acoefs.shape[0]):

        if len(np.nonzero(acoefs[i])[0]) > 0 and len(
                np.nonzero(acoefsr[i])[0]) > 0:

            fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                          acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                          filename="plots" + os.sep + "betas" + os.sep +
                          "beta_nonzero_coef-" + str(i + 1),
                          title="Causal coefs, Coef " + str(i + 1),
                          xlabel="Causal Coefficient")
            fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                          acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                          filename="plots" + os.sep + "betas" + os.sep +
                          "beta_nonzero_coef-" + str(i + 1) + "_zoom-in-90",
                          zoom_in_top_percentile=95,
                          zoom_in_bottom_percentile=5,
                          title="Causal coefs, Coef " + str(i + 1),
                          xlabel="Causal Coefficient")

            fc.plot_betas(
                np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                filename="plots" + os.sep + "betas" + os.sep +
                "beta_abs_coef-" + str(i + 1),
                title="Absolute causal coefs, Coef " + str(i + 1),
                xlabel="Absolute Causal Coefficient")
            fc.plot_betas(
                np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                filename="plots" + os.sep + "betas" + os.sep +
                "beta_abs_coef-" + str(i + 1) + "_zoom-in-bottom-95",
                zoom_in_top_percentile=95,
                title="Absolute causal coefs, Coef " + str(i + 1),
                xlabel="Absolute Causal Coefficient")
            fc.plot_betas(
                np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                filename="plots" + os.sep + "betas" + os.sep +
                "beta_abs_coef-" + str(i + 1) + "_zoom-in-top-5",
                zoom_in_bottom_percentile=95,
                title="Absolute causal coefs, Coef " + str(i + 1),
                xlabel="Absolute Causal Coefficient")

        print "Coef ", i + 1
        print "Plots of betas written to: plots" + os.sep + "betas"

    # get FDRS
    fdrs = [0.01, 0.05, 0.1, 0.2]
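    # fc.get_abs_thresh (used below) presumably picks, per FDR level, an
    # absolute-coefficient threshold by comparing the real coefficients to
    # the randomized null so that the empirical FDR is at most fdr.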

    acoefs_fdrs = []
    sf_dfs = []

    for fdr in fdrs:

        fdr_dir = "fdr-" + str(fdr) + "-" + stratify_by
        if not os.path.exists(fdr_dir):
            os.makedirs(fdr_dir)

        fdr_prefix = fdr_dir + os.sep + save_prefix

        acoefs_fdr = np.zeros(acoefs.shape)

        fdr_nets = []

        print "*************"
        for i in range(acoefs.shape[0]):
            print "-----"
            print "FDR = ", fdr
            print "Lag ", lag
            print "Coef ", i + 1
            print "Stratify ", stratify_by
            acoefs_fdr[i], threshes = fc.get_abs_thresh(
                acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by)
            # print "Threshes", threshes

            fdr_matr_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-matrix.txt"
            fdr_net_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename,
                                            matrix=acoefs_fdr[i],
                                            genes=genes)
            pickle.dump(
                threshes,
                open(
                    fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" +
                    stratify_by + "-threshes.p", 'wB'))

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = acoefs.shape[0]
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr,
                                     extra_dict=extra_dict,
                                     make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # write summary readme
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i],
                                     test=args.test_name,
                                     fdr=fdr,
                                     lag=lag,
                                     coef=i + 1,
                                     hyper=best_hyper,
                                     thresh=threshes,
                                     readme_name=fdr_prefix + "-" +
                                     str(i + 1) + "-fdr-" + str(fdr) + "-" +
                                     stratify_by + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)

            sf_dfs.append(sf_df)

            print "Network edges: ", fdr_net.shape[0]

        if acoefs_fdr.shape[0] > 1:
            m_net = cp.get_max_network(fdr_nets,
                                       max_col="AbsWeight",
                                       index_col="Cause-Effect")
            union_net = cp.get_union_network(
                fdr_nets + [m_net],
                suffixes=[str(i)
                          for i in range(1, acoefs_fdr.shape[0] + 1)] + [""])

        else:
            union_net = fdr_nets[0]

        union_net_filename = fdr_prefix + "-union-fdr-" + str(
            fdr) + "-" + stratify_by + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)

        print "Union network edges", union_net.shape[0]
        print "Written to ", union_net_filename

        acoefs_fdrs.append(acoefs_fdr.copy())

    all_sf_dfs = pd.concat(sf_dfs)

    all_sf_dfs.to_csv("fit_all_summary_fdr-" + stratify_by + ".txt",
                      sep="\t",
                      index=False)
    print "********"
    print "Summaries of all fdrs written to fit_all_summary_fdr-" + stratify_by + ".txt"
    print "Matrices done."

    with open("matrices_done.txt", 'w') as donefile:
        donefile.write("done\n")

    if args.plot_coef_fdr:
        print "*******"
        print "PLOTS"
        for i, fdr in zip(range(len(fdrs)), fdrs):
            acoefs_fdr = acoefs_fdrs[i]

            if not os.path.exists("plots" + os.sep + "fdr-" + str(fdr)):
                os.makedirs("plots" + os.sep + "fdr-" + str(fdr))

            # Only plot the bar if replicates were loaded
            cp.plot_all_coef(acoefs_fdr,
                             df,
                             genes,
                             lag,
                             file_prefix="plots" + os.sep + "fdr-" + str(fdr) +
                             os.sep + save_prefix + "-",
                             plot_bar=args.load_reps,
                             keys=timekeys,
                             num_per_keys=num_per_keys,
                             linewidth=2,
                             capsize=5,
                             capwidth=2,
                             verbose=True)

            # Plot them without error bars just to check
            if args.load_reps:
                cp.plot_all_coef(acoefs_fdr,
                                 df,
                                 genes,
                                 lag,
                                 file_prefix="plots" + os.sep + "fdr-" +
                                 str(fdr) + os.sep + save_prefix + "-nobar-",
                                 plot_bar=False,
                                 keys=timekeys,
                                 num_per_keys=num_per_keys,
                                 linewidth=2,
                                 capsize=5,
                                 capwidth=2)

            print "FDR plots written to: ", "plots" + os.sep + "fdr-" + str(
                fdr)

    # Plot all the coefs
    # NOTE: this will take a long time!
    if args.plot_all:

        raise ValueError(
            "Fix all the below first before trying to do plot all")

        if not os.path.exists("plots" + os.sep + "original"):
            os.makedirs("plots" + os.sep + "original")
        cp.plot_all_coef(acoefs,
                         df,
                         genes,
                         lag,
                         file_prefix="plots" + os.sep + "original" + os.sep +
                         save_prefix + "-",
                         plot_bar=args.load_reps,
                         keys=timekeys,
                         num_per_keys=num_per_keys,
                         linewidth=2,
                         capsize=5,
                         capwidth=2)
        print "Original plots written to: ", "plots" + os.sep + "original"

        if not os.path.exists("plots" + os.sep + "randomized"):
            os.makedirs("plots" + os.sep + "randomized")
        cp.plot_all_coef(acoefsr,
                         dfr,
                         genes,
                         lag,
                         file_prefix="plots" + os.sep + "randomized" + os.sep +
                         save_prefix + "-",
                         plot_bar=args.load_reps,
                         keys=timekeys,
                         num_per_keys=num_per_keys,
                         linewidth=2,
                         capsize=5,
                         capwidth=2)

        print "Randomized plots written to: ", "plots" + os.sep + "randomized"
Example #11
def run(args):
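    # Job-preparation driver: writes hyperparameter and row-partition
    # files, then emits shell scripts for cross-validation, the final
    # fits, the finish/integration steps, and timing/cleanup bookkeeping.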
    if args.test not in {"r", "l", "e"}:
        raise ValueError(
            "args.test must be r (ridge), l (lasso) or e (elastic net)")

    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)

    hyperlist = pickle.load(open(args.hyper_list_file, 'rb'))
    # hyper_names = cp.hyperlist_to_namelist(hyperlist)

    # Make hyper files for cross_validate loading.

    hyper_filenames = []

    print("*************")
    print("HYPERS")
    print("*************")

    if not os.path.exists("hyper"):
        os.makedirs("hyper")

    # for hyper, hyper_name in zip(hyperlist, hyper_names):
    for hyper, h in zip(hyperlist, list(range(len(hyperlist)))):
        hyper_filename = "hyper" + os.sep + args.output_name + "-hyper-" + str(
            h) + ".p"

        hyper_filenames.append(hyper_filename)

        pickle.dump([hyper], open(hyper_filename, 'wb'))

    print("Hypers written in format: ", hyper_filename)

    # Make row files
    # Split up the rows according to number of input scripts
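    # partition_inputs presumably yields args.script_num roughly equal
    # chunks of the row indices, one chunk per generated job script.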
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []

    print("*************")
    print("ROWS")
    print("*************")

    if not os.path.exists("rows"):
        os.makedirs("rows")

    for partition_row, i in zip(partition_rows,
                                list(range(len(partition_rows)))):

        row_filename = os.path.join("rows",
                                    args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

        pickle.dump(partition_row, open(row_filename, 'wb'))

    print("Row written in format: ", row_filename)

    if not os.path.exists("timing"):
        os.makedirs("timing")
        print("Folder timing created")
    resulttimefile = os.path.join("timing", "result_time.csv")
    if not os.path.exists(resulttimefile):
        with open(resulttimefile, 'w') as csvfile:
            f = csv.writer(csvfile)
            f.writerow(["Name", "Start", "End", "Elapsed"])

    if args.cv != 0:
        print("*************")
        print("CV")
        print("*************")

        # Make CV scripts

        cv_scripts = []

        hyper_output_dict = collections.OrderedDict()
        hyper_int_dict = collections.OrderedDict()

        if not os.path.exists("cv-scripts"):
            os.makedirs("cv-scripts")

        cvtimefile = os.path.join("timing", "hyper_time.csv")
        if not os.path.exists(cvtimefile):
            with open(cvtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        for hyper, h, hyper_filename in zip(hyperlist,
                                            list(range(len(hyperlist))),
                                            hyper_filenames):

            hyper_output_group = []

            for partition_row, i, row_filename in zip(
                    partition_rows, list(range(len(partition_rows))),
                    row_filenames):

                cv_prefix = args.output_name + "-cv-" + str(h) + "-row-" + str(
                    i)

                cv_script = os.path.join("cv-scripts", cv_prefix + ".sh")
                cv_scripts.append(cv_script)

                cv_output = "hyper" + os.sep + cv_prefix + "-result.txt"
                hyper_output_group.append(cv_output)

                command_string = "time python cross_validate.py -d " + data_file + " -lr " + str(args.load_reps) +  " -o " + cv_output + " -hl " + str(hyper_filename) \
                                 + " -t " + args.test + " -l " + str(args.lag) + " -rl " + str(row_filename)

                with open(cv_script, 'w') as outputfile:
                    outputfile.write("#!/bin/bash\n")
                    outputfile.write("START=$(date)\n")
                    #outputfile.write("module load python/2.7\n")
                    # outputfile.write("module load python/2.7/scipy-mkl\n")
                    # outputfile.write("module load python/2.7/numpy-mkl\n")
                    #outputfile.write("module load anaconda\n")
                    outputfile.write("module load anaconda3\n")
                    outputfile.write(command_string)
                    outputfile.write("\n")
                    outputfile.write("END=$(date)\n")
                    outputfile.write("echo " + cv_script +
                                     ",$START,$END,$SECONDS >> " + cvtimefile +
                                     "\n")
                os.chmod(cv_script, 0o777)

            # Set the output names, prepare for integration of all the hyper parameter fit results
            hyper_output_dict[str(hyper)] = hyper_output_group
            hyper_int_dict[str(
                hyper)] = "hyper" + os.sep + args.output_name + "-cv-" + str(
                    h) + "-result.txt"

        hyper_output_df = pd.DataFrame(hyper_output_dict)
        hyper_int_df = pd.DataFrame(hyper_int_dict, index=[0])

        print("Hyper output df is in form", hyper_output_df.head(n=5))

        hyper_output_df.to_csv("cv_outputs.txt", sep="\t", index=False)
        hyper_int_df.to_csv("cv_integrated.txt", sep="\t", index=False)

        print("Partitioned CV fit_result_dfs in cv_outputs.txt",
              "Integrated CV fit_result_dfs in cv_integrated.txt")

        with open("cv_script_list.txt", 'w') as outfile:
            for cv_script in cv_scripts:
                outfile.write(cv_script + "\n")
            print("CV scripts written to cv_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " +
                  str(args.parallel_num))

            script_groups = pj.partition_inputs(
                cv_scripts,
                number=int(math.ceil(
                    len(cv_scripts) * 1.0 / args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            parallel_scripts = []
            for i, script_group in zip(list(range(len(script_groups))),
                                       script_groups):
                appended_script_filenames = [
                    "./" + script_filename for script_filename in script_group
                ]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("cv_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print(
                    "Parallel script list written to cv_parallel_script_list.txt"
                )

        # Integrate hyperparameters
        # Begin whole normal fit

        hyper_script = "set_hyper.sh"

        with open(hyper_script, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            outputfile.write("set -e\n")
            outputfile.write(
                "time python integrate_hyper.py -hfd cv_outputs.txt -ind cv_integrated.txt -hl "
                + args.hyper_list_file + "\n")
            outputfile.write(
                "time python set_hyper.py -ind cv_integrated.txt -r " +
                "hyper" + os.sep + "hyper_df.txt -o " + "hyper" + os.sep +
                "best_hyper.p -hl " + args.hyper_list_file + " -tn " +
                args.test_name + " \n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + hyper_script +
                             ",$START,$END,$SECONDS >> " + resulttimefile +
                             "\n")
        os.chmod(hyper_script, 0o777)

        print("set_hyper.sh written")

    print("*************")
    print("FITTING")
    print("*************")

    # Run the actual fit
    if not os.path.exists("fit"):
        os.makedirs("fit")

    if not os.path.exists("fit-scripts"):
        os.makedirs("fit-scripts")

    fittimefile = os.path.join("timing", "fit_time.csv")
    if not os.path.exists(fittimefile):
        with open(fittimefile, 'w') as csvfile:
            f = csv.writer(csvfile)
            f.writerow(["Name", "Start", "End", "Elapsed"])

    fit_scripts = []
    fit_output_prefixes = []
    for partition_row, i, row_filename in zip(partition_rows,
                                              list(range(len(partition_rows))),
                                              row_filenames):

        fit_prefix = args.output_name + "-fit-row-" + str(i)

        fit_script = os.path.join("fit-scripts", fit_prefix + ".sh")
        fit_scripts.append(fit_script)

        fit_output_prefix = "fit" + os.sep + fit_prefix
        fit_output_prefixes.append(fit_output_prefix)


        command_string = "time python fit_all.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \
                         " -o " + fit_output_prefix + " -bh " + \
                        "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \
                         str(row_filename) + " -n " + args.null + " -oa " + str(args.only_array)

        with open(fit_script, 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            #outputfile.write("module load python/2.7\n")
            # outputfile.write("module load python/2.7/scipy-mkl\n")
            # outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda3\n")
            outputfile.write(command_string)
            outputfile.write("\n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + fit_script +
                             ",$START,$END,$SECONDS >> " + fittimefile + "\n")
        os.chmod(fit_script, 0o777)

    with open("fit_script_list.txt", 'w') as outfile:
        for fit_script in fit_scripts:
            outfile.write("./" + fit_script + "\n")
        print("Fit scripts written to fit_script_list.txt")

    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " +
              str(args.parallel_num))

        script_groups = pj.partition_inputs(
            fit_scripts,
            number=int(math.ceil(len(fit_scripts) * 1.0 / args.parallel_num)))

        print("Number of script groups ", len(script_groups))

        parallel_scripts = []
        for i, script_group in zip(list(range(len(script_groups))),
                                   script_groups):
            appended_script_filenames = [
                "./" + script_filename for script_filename in script_group
            ]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)

        with open("fit_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print(
                "Parallel script list written to fit_parallel_script_list.txt")

    # Note the output files

    fit_coefs = [
        fit_output_prefix + "_coefs.p"
        for fit_output_prefix in fit_output_prefixes
    ]
    fit_intercepts = [
        fit_output_prefix + "_intercepts.p"
        for fit_output_prefix in fit_output_prefixes
    ]
    fit_results = [
        fit_output_prefix + "_fit_result_df.txt"
        for fit_output_prefix in fit_output_prefixes
    ]
    fit_coefsr = [
        fit_output_prefix + "_coefsr.p"
        for fit_output_prefix in fit_output_prefixes
    ]
    # fit_interceptsr = [fit_output_prefix + "_interceptsr.p" for fit_output_prefix in fit_output_prefixes]
    fit_resultsr = [
        fit_output_prefix + "_fit_result_dfr.txt"
        for fit_output_prefix in fit_output_prefixes
    ]

    fit_output_dict = collections.OrderedDict()
    fit_output_dict["coef"] = fit_coefs
    fit_output_dict["coefr"] = fit_coefsr
    fit_output_dict["intercept"] = fit_intercepts
    # fit_output_dict["interceptr"] = fit_interceptsr

    output_matr_df = pd.DataFrame(fit_output_dict)
    output_matr_df.to_csv("output_matr_list.txt", sep="\t", index=False)
    print("Output matrices written to output_matr_list.txt")

    int_matr_dict = collections.OrderedDict()
    int_matr_dict["coef"] = "fit" + os.sep + args.output_name + "_coefs.p"
    int_matr_dict["coefr"] = "fit" + os.sep + args.output_name + "_coefsr.p"
    int_matr_dict[
        "intercept"] = "fit" + os.sep + args.output_name + "_intercepts.p"
    # int_matr_dict["interceptr"] = "fit" + os.sep + args.output_name + "_interceptsr.p"

    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv("int_matr_list.txt", sep="\t", index=False)
    print("integrated matrices written to int_matr_list.txt")

    fit_result_dict = collections.OrderedDict()
    fit_result_dict["fit_result"] = fit_results
    fit_result_dict["fit_resultr"] = fit_resultsr

    output_df_df = pd.DataFrame(fit_result_dict)
    output_df_df.to_csv("output_df_list.txt", sep="\t", index=False)
    print("output dfs written to output_df_list.txt")

    int_df_dict = collections.OrderedDict()
    int_df_dict[
        "fit_result"] = "fit" + os.sep + args.output_name + "_fit_result_df.txt"
    int_df_dict[
        "fit_resultr"] = "fit" + os.sep + args.output_name + "_fit_result_dfr.txt"

    int_df_df = pd.DataFrame(int_df_dict, index=[0])
    int_df_df.to_csv("int_df_list.txt", sep="\t", index=False)
    print("Integrated dfs written to int_df_list.txt")

    with open("finish-none.sh", 'w') as ifile:
        ifile.write("#!/bin/bash\n")
        ifile.write("START=$(date)\n")
        ifile.write("set -e\n")
        ifile.write(
            "time python integrate_outputs_rand_row.py -i output_matr_list.txt -o int_matr_list.txt "
            + (" -t m -a 1 " if args.only_array else " -t a "))
        ifile.write(" && " + \
                    "time python integrate_outputs_rand_row.py -i output_df_list.txt -o int_df_list.txt -t d " + "\n")
        ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                    " -lr " + str(args.load_reps) + \
                    " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                    " -o " + \
                    args.output_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                    " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                    int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                    " -sb " + "n" + " -tn " + args.test_name + "\n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + "finish-none.sh" + ",$START,$END,$SECONDS >> " +
                    resulttimefile + "\n")
        print("Finish script, stratby None, written to finish-none.sh")
        os.chmod("finish-none.sh", 0o777)

    with open("finish-effect.sh", 'w') as ifile:
        ifile.write("#!/bin/bash\n")
        ifile.write("START=$(date)\n")
        ifile.write("set -e\n")
        ifile.write(
            "time python integrate_outputs_rand_row.py -i output_matr_list.txt -o int_matr_list.txt "
            + (" -t m -a 1 " if args.only_array else " -t a "))
        ifile.write(" && " + \
                    "time python integrate_outputs_rand_row.py -i output_df_list.txt -o int_df_list.txt -t d " + "\n")
        ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                    " -lr " + str(args.load_reps) + \
                    " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                    " -o " + \
                    args.output_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                    " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                    int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                    " -sb " + "e" + " -tn " + args.test_name + "\n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + "finish-effect.sh" +
                    ",$START,$END,$SECONDS >> " + resulttimefile + "\n")

        print("Finish script, stratby effect, written to finish-effect.sh")
        os.chmod("finish-effect.sh", 0o777)

    with open("plot_coef.sh", 'w') as ifile:
        ifile.write("#!/bin/bash\n")
        ifile.write("START=$(date)\n")
        ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                    " -lr " + str(args.load_reps) + \
                    " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                    " -o " + \
                    args.output_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                    " -cfr " + int_matr_dict["coefr"]  + " -fr " + \
                    int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                    " -sb " + "n" + " -tn " + args.test_name +  " -pcf 1 " + "\n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + "plot_coef.sh" + ",$START,$END,$SECONDS >> " +
                    resulttimefile + "\n")

        print("Plot coef script written to plot_coef.sh")
        os.chmod("plot_coef.sh", 0o777)

    with open("cleanup_list.txt", 'w') as outfile:
        cleanup_list = row_filenames
        if args.cv:
            cleanup_list += cv_scripts + list(
                itertools.chain.from_iterable(list(
                    hyper_output_dict.values())))

        cleanup_list += fit_scripts + fit_coefs + fit_intercepts + fit_results + fit_coefsr + fit_resultsr
        for script in cleanup_list:
            outfile.write(script + "\n")
        print("Cleanup scripts written to cleanup_list.txt")

    with open("timing/timing_list.txt", 'w') as outfile:
        outfile.write(cvtimefile + "\n")
        outfile.write(fittimefile + "\n")
        outfile.write(resulttimefile + "\n")
    print("Timing files written to timing_list.txt")

    with open("summarize_time.sh", 'w') as outfile:
        outfile.write(
            "python summarize_time.py -i timing/timing_list.txt -o timing/summary_time.csv -oo timing/overall_time.csv\n"
        )
    os.chmod("summarize_time.sh", 0o777)
    print("Summarize timing script written to summarize_time.sh")
Example #12
def load_and_run(args):
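    # Aggregate per-bootstrap coefficient matrices into summary statistics
    # (e.g. mean and selection frequency) and write them out as gene
    # networks, one per lag, plus a union network across lags.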


    lag = args.lag
    save_prefix = args.save_prefix
    full_save_prefix = os.path.join(args.result_save_folder, save_prefix)



    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        # load
        genes, _ = gtm.load_basic_rep_file_list(args.data_file)
        # _, genes, _, _, _, _  = gtm.load_rep_file_list(args.data_file)

        # dfs, genes, geneTS, df, timekeys, num_per_keys  = gtm.load_rep_file_list(args.data_file)

        # print "Timekeys: ", timekeys
        # print "Num per key: ", num_per_keys


    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, _ = gtm.get_gene_TS(df)
        # dfr = pd.read_csv(args.rand_data_file, sep="\t")
        # genesr, geneTSr = gtm.get_gene_TS(dfr)
        #
        # timekeys = df.columns.values[1:]
        # print "Timekeys: ", timekeys
        #
        # # Num. replicates per key
        # num_per_keys = None



    with open(args.bootstrap_file_with_names, 'r') as f:
        filenames = [line.split("\n")[0] for line in f.readlines()]
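
        # Two modes: with args.do_lite the summary statistics are computed
        # directly from the per-bootstrap matrix files; otherwise the
        # matrices are first transposed into per-entry dump files
        # (presumably to bound memory) before computing the statistics.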
        if args.do_lite:

            stats_matr_dict = cp.bootstrap_matrices_iter_free(filenames)


        else:

            if args.transpose_bootstrap_folder is None:
                raise ValueError("If doing the bootstrap calculation, a transpose folder is required")

            transpose_bootstrap_folder = os.path.join(args.outer_save_folder, args.transpose_bootstrap_folder)

            if not os.path.exists(transpose_bootstrap_folder):
                os.makedirs(transpose_bootstrap_folder)
            if not os.path.exists(os.path.join(transpose_bootstrap_folder, "dump-" + str(args.length_before_dump))):
                os.makedirs(os.path.join(transpose_bootstrap_folder, "dump-" + str(args.length_before_dump)))


            transpose_prefix = os.path.join(transpose_bootstrap_folder,
                                            save_prefix)
            dump_prefix = os.path.join(transpose_bootstrap_folder, "dump-" + str(args.length_before_dump), save_prefix)


            t = time.time()
            bootstrap_coef_file_matr = transpose_bootstrap_matrices(filenames,
                                                                    length_before_dump=args.length_before_dump,
                                                                    save_prefix=transpose_prefix,
                                                                    dump_prefix=dump_prefix
                                                                    )
            print("Time to transpose: ", time.time() - t)

            bootstrap_coef_filename = dump_prefix + "-NAMES.p"

            pickle.dump(bootstrap_coef_file_matr, open(bootstrap_coef_filename, 'wb'))

            print("Bootstrap coef matrix dumped to ", bootstrap_coef_filename)

            t = time.time()
            stats_matr_dict = compute_bootstrap_stats_matr(bootstrap_coef_file_matr)
            print("Time to get stats: ", time.time() - t)



        # align results

    if args.dump_raw:
        dump_stats_matr_dict = stats_matr_dict.copy()

        if args.unalign_before_raw_dump:
            for k in dump_stats_matr_dict:
                dump_stats_matr_dict[k] = lc.unalign_coefs(dump_stats_matr_dict[k],
                                                           lag)


        for k in dump_stats_matr_dict:
            outfile = full_save_prefix + "_raw_" + k + "_coefs.p"
            with open(outfile, 'wb') as f:
                pickle.dump(dump_stats_matr_dict[k], f)

            print("For ", k , "Saved to ", outfile)





    if args.do_align:
        for k in stats_matr_dict:
            stats_matr_dict[k] = lc.align_coefs(stats_matr_dict[k], lag)




    # Save the gene matrices

    # Note bootstrap_matr is of form lag x n x n

    full_nets = []
    for i in range(1, lag + 1):
        print("Lag: ", i)

        print("Aggregating results")
        #bootstrap_mean, bootstrap_std, bootstrap_freq = cp.get_bootstrap_results(bootstrap_lag_to_matrs[i])


        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = lag
        extra_dict["Coef"] = i



        nets = []

        for k in stats_matr_dict:
            raw_matr = stats_matr_dict[k][i-1]
            matr_filename = full_save_prefix + "-" + str(i) + "-bootstrap-" + k + "-matrix.txt"

            matr = gtm.save_gene_matrix(matr_filename, matrix=raw_matr, genes=genes)

            print("Saved ", k, " to ", matr_filename)

            if k == "mean":
                net = nh.matr_to_net(matr, make_type=False, edge_name="Bootstrap:" + k.capitalize(),
                                      abs_name="AbsBootstrap:" + k.capitalize(),
                                     do_sort=False, extra_dict=extra_dict)
            else:
                net = nh.matr_to_net(matr, make_type=False, edge_name="Bootstrap:" + k.capitalize(),
                                      no_abs=True,
                                     do_sort=False, extra_dict=extra_dict)

            nets.append(net)

        full_net = nets[0]

        for j in range(1, len(nets)):
            full_net = full_net.merge(nets[j], how='outer')



        print("Final net: ", full_net.shape[0])

        sortby = "Bootstrap:Freq"
        print("Sorting by :", sortby)
        full_net.sort_values(sortby, inplace=True, ascending=False)

        full_net_filename = full_save_prefix +"-" + str(i) + "-bootstrap-network.txt"
        full_net.to_csv(full_net_filename, sep="\t", index=False)
        print("Written to ", full_net_filename)

        full_nets.append(full_net)

    union_net_filename = full_save_prefix + "-union-bootstrap-network.txt"

    if lag > 1:

        m_net = cp.get_max_network(full_nets, max_col="AbsBootstrap:Mean", index_col="Cause-Effect")
        union_net = cp.get_union_network(full_nets + [m_net], suffixes=[str(i) for i in range(1, lag + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = full_nets[0]

    sortby = "Bootstrap:Freq"
    print("Sorting by :", sortby)
    union_net.sort_values(sortby, inplace=True, ascending=False)

    union_net.to_csv(union_net_filename, sep="\t", index=False)
    print("Unioned bootstrap network written to ", union_net_filename)
Example #13
def run(args):
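    # Bootstrap job-preparation driver: reuses the row partitions from the
    # CV/fit stage and writes one fit script per (bootstrap sample, row
    # chunk), plus parallel batching lists and timing bookkeeping.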
    if args.test not in {"r", "l", "e"}:
        raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)")

    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)




    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []


    print("*************")
    print("ROWS")
    print("*************")

    for partition_row, i in zip(partition_rows, list(range(len(partition_rows)))):

        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

    print("Reading rows from format: ", row_filename)

    print("*************")
    print("BOOTSTRAP")
    print("*************")


    # Run the actual fit
    # Need an integration
    if not os.path.exists("bootstrap"):
        os.makedirs("bootstrap")

    # For the bootstrap individual fit scripts
    if not os.path.exists("bootstrap-fit-scripts"):
        os.makedirs("bootstrap-fit-scripts")


    # For the bootstrap finish scripts
    if not os.path.exists("bootstrap-finish-scripts"):
        os.makedirs("bootstrap-finish-scripts")

    # Finish, aggregating all the coefficients (stratification = none)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "none")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "none"))

    # Finish, stratifying each coefficient by the effect gene (stratification = effect)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "effect")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "effect"))

    # if args.write_all_bootstrap_scripts_first:

    print("WRITING ALL THE SCRIPTS INITIALLY!!!!!! NOTE the list will be written before all the files are written!!!")

    for b in range(args.bootstrap_num):
        if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
            os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))

    all_bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), args.output_name + "-bootstrap-" + str(b) + "-row-" + str(i) + ".sh")
                             for b in range(args.bootstrap_num) for i in range(len(row_filenames))]


    print("SCRIPTS")

    with open("bootstrap_script_list.txt", 'w') as outfile:
        for bootstrap_script in all_bootstrap_scripts:
            outfile.write("./" + bootstrap_script + "\n")
        print("bootstrap scripts written to bootstrap_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))

            script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            parallel_scripts = []
            for i, script_group in zip(list(range(len(script_groups))), script_groups):
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print("Parallel script list written to bootstrap_parallel_script_list.txt")









    # make one script for each...

    # all_bootstrap_scripts = set([])

    all_int_coefs = []
    all_int_intercepts = []

    finish_none_scripts = []
    finish_effect_scripts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.
    fdrs = [0.01, 0.05, 0.1, 0.2]
    all_fdr_none_coefs_dict = {x: [] for x in fdrs}
    all_fdr_effect_coefs_dict = {x: [] for x in fdrs}

    all_fdr_none_intercepts_dict = {x: [] for x in fdrs}
    all_fdr_effect_intercepts_dict = {x: [] for x in fdrs}



    try:
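        # Timing CSVs: every generated script appends a "Name,Start,End,Elapsed" row when it
        # runs (via the echo ...,$START,$END,$SECONDS lines below), so per-stage wall-clock
        # costs can be audited afterwards.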
        fittimefile = os.path.join("timing", "bootstrap_fit_time.csv")
        if not os.path.exists(fittimefile):
            with open(fittimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])


        finishtimefile = os.path.join("timing", "bootstrap_finish_time.csv")
        if not os.path.exists(finishtimefile):
            with open(finishtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        resulttimefile = os.path.join("timing", "bootstrap_result_time.csv")
        if not os.path.exists(resulttimefile):
            with open(resulttimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        with open(os.path.join("timing/timing_list.txt"), 'a') as f:
            f.write(fittimefile + "\n")
            f.write(finishtimefile + "\n")
            f.write(resulttimefile + "\n")


    except IOError:
        raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.")


    for b in range(args.bootstrap_num):
        if b % 50 == 0:
            print("SEED/BOOTSTRAP NUM: ", b)

        bootstrap_outmost_name = args.output_name + "-bootstrap-" + str(b)

        bootstrap_folder = os.path.join("bootstrap", str(b))
        if not os.path.exists(bootstrap_folder):
            os.makedirs(bootstrap_folder)
        # print "Created folder: ", bootstrap_folder

        bootstrap_outmost_prefix = os.path.join(bootstrap_folder, bootstrap_outmost_name)



        if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
            os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))


        # create scripts for bootstrap
        bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), bootstrap_outmost_name + "-row-" + str(i) + ".sh")
                             for i in range(len(partition_rows))]
        bootstrap_row_prefixes = [bootstrap_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]

        command_template = "time python fit_bootstrap.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \
                             " -o " + "bootstrap_row_prefixes[i]" + " -bh " + \
                            "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \
                             "row_filename" + " -n " + args.null + " -s " + str(b) + " -oa " + str(args.only_array)

        for i, row_filename in enumerate(row_filenames):

            # writing results to the bootstrap prefix

            command_string = command_template.replace("bootstrap_row_prefixes[i]", bootstrap_row_prefixes[i]).replace("row_filename", row_filename)

            with open(bootstrap_scripts[i], 'w') as outputfile:
                outputfile.write("#!/bin/bash\n")
                outputfile.write("START=$(date)\n")
                #outputfile.write("module load python/2.7\n")
                # outputfile.write("module load python/2.7/scipy-mkl\n")
                # outputfile.write("module load python/2.7/numpy-mkl\n")
                #outputfile.write("module load anaconda\n")
                outputfile.write("module load anaconda3\n")
                outputfile.write(command_string)
                outputfile.write("\n")
                outputfile.write("END=$(date)\n")
                outputfile.write("echo " + bootstrap_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n")
            os.chmod(bootstrap_scripts[i], 0o777)


        # print "Scripts made"

        # all_bootstrap_scripts = all_bootstrap_scripts.union(set(bootstrap_scripts))

        # Note the output files

        bootstrap_coefs = [bootstrap_row_prefix + "_coefs.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_intercepts = [bootstrap_row_prefix + "_intercepts.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_results = [bootstrap_row_prefix + "_fit_result_df.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_coefsr = [bootstrap_row_prefix + "_coefsr.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        bootstrap_resultsr = [bootstrap_row_prefix + "_fit_result_dfr.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]
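        # The "r"-suffixed outputs come from fitting the permuted (rand) data; they supply
        # the null distribution used later for FDR thresholding.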

        bootstrap_output_dict = collections.OrderedDict()
        bootstrap_output_dict["coef"] = bootstrap_coefs
        bootstrap_output_dict["coefr"] = bootstrap_coefsr
        bootstrap_output_dict["intercept"] = bootstrap_intercepts
        # bootstrap_output_dict["interceptr"] = bootstrap_interceptsr
        # rand intercepts are omitted above: under a local null fit there are too many candidate intercepts per effect gene

        output_matr_df = pd.DataFrame(bootstrap_output_dict)
        output_matr_file = os.path.join(bootstrap_folder, bootstrap_outmost_name + "_output_matr_list.txt")
        output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
        # print "Raw parallelilized output matrices, before integration, written to", output_matr_file




        int_matr_dict = collections.OrderedDict()
        int_matr_dict["coef"] = bootstrap_outmost_prefix + "_coefs.p"
        int_matr_dict["coefr"] = bootstrap_outmost_prefix +  "_coefsr.p"
        int_matr_dict["intercept"] = bootstrap_outmost_prefix + "_intercepts.p"
        # int_matr_dict["interceptr"] = "bootstrap" + os.sep + bootstrap_outmost_name + "_interceptsr.p"

        # append these to the list of final bootstrapped coefficients
        all_int_coefs.append(int_matr_dict["coef"])
        all_int_intercepts.append(int_matr_dict["intercept"])

        int_matr_file = bootstrap_outmost_prefix +  "_int_matr_list.txt"
        int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
        int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
        # print "integrated matrices written to " + int_matr_file


        bootstrap_result_dict = collections.OrderedDict()
        bootstrap_result_dict["fit_result"] = bootstrap_results
        bootstrap_result_dict["fit_resultr"] = bootstrap_resultsr



        output_df_file = bootstrap_outmost_prefix + "_output_df_list.txt"
        output_df_df = pd.DataFrame(bootstrap_result_dict)
        output_df_df.to_csv(output_df_file, sep="\t", index=False)
        # print "output dfs file written to ", output_df_file

        int_df_dict = collections.OrderedDict()
        int_df_dict["fit_result"] = bootstrap_outmost_prefix + "_fit_result_df.txt"
        int_df_dict["fit_resultr"] = bootstrap_outmost_prefix + "_fit_result_dfr.txt"

        int_df_file = bootstrap_outmost_prefix + "_int_df_list.txt"
        int_df_df = pd.DataFrame(int_df_dict, index=[0])
        int_df_df.to_csv(int_df_file, sep="\t", index=False)
        # print "Integrated dfs file written to ", int_df_file



        # Integrate all the per-row outputs into files under the outmost bootstrap name


        finish_none_script = os.path.join("bootstrap-finish-scripts", "none", "finish-none-bootstrap-" + str(b) + ".sh")
        with open(finish_none_script, 'w') as ifile:
            ifile.write("set -e\n")
            ifile.write("START=$(date)\n")
            ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(" && " + \
                        "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n"
                        )
            ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                        " -lr " + str(args.load_reps) + \
                        " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                        " -o " + \
                         bootstrap_outmost_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                        " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                        int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                        " -sb " + "n" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_none_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
            # print "Finish script, stratby None, written to", finish_none_script
            os.chmod(finish_none_script, 0o777)

        finish_none_scripts.append(finish_none_script)


        finish_effect_script = os.path.join("bootstrap-finish-scripts", "effect", "finish-effect-bootstrap-" + str(b) + ".sh")
        with open(finish_effect_script, 'w') as ifile:
            ifile.write("set -e\n")
            ifile.write("START=$(date)\n")
            ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  (" -t m -a 1 " if args.only_array else " -t a "))
            ifile.write(" && " + \
                        "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n"
                        )
            ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
                        " -lr " + str(args.load_reps) + \
                        " -bh " + "hyper" + os.sep + "best_hyper.p" + \
                        " -o " + \
                        bootstrap_outmost_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
                        " -cfr " + int_matr_dict["coefr"] + " -fr " + \
                        int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
                        " -sb " + "e" + " -tn " + args.test_name  + " -of " + bootstrap_folder + "\n")
            ifile.write("END=$(date)\n")
            ifile.write("echo " + finish_effect_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")

            # print "Finish script, stratby effect, written to", finish_effect_script
            os.chmod(finish_effect_script, 0o777)

        finish_effect_scripts.append(finish_effect_script)


        # Record the FDR-thresholded output paths now; the finish scripts will create these files later

        for fdr in fdrs:
            all_fdr_none_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
                               bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" +  "-coefs.p"))
            all_fdr_effect_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
                                bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" +  "-coefs.p"))

            all_fdr_none_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
                               bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" +  "-intercepts.p"))
            all_fdr_effect_intercepts_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
                                bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" +  "-intercepts.p"))




        # print "-----------"


    int_coef_file = "all_bootstrap_coefs.txt"
    with open(int_coef_file, 'w') as f:
        for b_coef in all_int_coefs:
            f.write(b_coef + "\n")
    print("All integrated bootstrapped coef files written to ", int_coef_file)

    int_intercept_file = "all_bootstrap_intercepts.txt"
    with open(int_intercept_file, 'w') as f:
        for b_intercept in all_int_intercepts:
            f.write(b_intercept + "\n")
    print("All integrated bootstrapped intercept files written to ", int_intercept_file)



    all_finish_effect_script = "finish-effect-bootstrap-all.sh"
    with open(all_finish_effect_script, 'w') as f:
        f.write("set -e\n")
        for s in finish_effect_scripts:
            f.write("./" + s + "\n")
    os.chmod(all_finish_effect_script, 0o777)

    print("All bootstrap effects scripts written to ", all_finish_effect_script)


    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))

        script_groups = pj.partition_inputs(finish_effect_scripts, number=int(math.ceil(len(finish_effect_scripts) * 1.0/args.parallel_num)))

        print("Number of script groups ", len(script_groups))

        parallel_scripts = []
        for script_group in script_groups:
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)

        with open("finish-effect-bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to finish-effect-bootstrap_parallel_script_list.txt")



    all_finish_none_script = "finish-none-bootstrap-all.sh"
    with open(all_finish_none_script, 'w') as f:
        f.write("set -e\n")
        for s in finish_none_scripts:
            f.write("./" + s + "\n")
    os.chmod(all_finish_none_script, 0o777)

    print("All bootstrap nones scripts written to ", all_finish_none_script)


    if args.parallel_num > 0:
        print("Parallel Number (# processes per job): " + str(args.parallel_num))

        script_groups = pj.partition_inputs(finish_none_scripts, number=int(math.ceil(len(finish_none_scripts) * 1.0/args.parallel_num)))

        print("Number of script groups ", len(script_groups))

        parallel_scripts = []
        for script_group in script_groups:
            appended_script_filenames = ["./" + script_filename for script_filename in script_group]
            parallel_script = " & ".join(appended_script_filenames)
            parallel_scripts.append(parallel_script)

        with open("finish-none-bootstrap_parallel_script_list.txt", 'w') as scriptfile:
            for parallel_script in parallel_scripts:
                scriptfile.write(parallel_script + "\n")
            print("Parallel script list written to finish-none-bootstrap_parallel_script_list.txt")





    # integrate all the bootstrapped FDR results

    bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results")
    if not os.path.exists(bootstrap_result_folder):
        os.makedirs(bootstrap_result_folder)


    bootstrap_summary_file = "get_result_bootstrap.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("START=$(date)\n")
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                             " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1"+ " -tbf " + "bootstrap-transpose" + " -uabrd 0\n")
        f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
        f.write("END=$(date)\n")
        f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)


    # integrate in a lite version

    bootstrap_summary_file = "get_result_bootstrap_lite.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("START=$(date)\n")
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                             " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1"+ " -dl 1 -uabrd 0\n")
        f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
        f.write("END=$(date)\n")
        f.write("echo " + bootstrap_summary_file + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)



    for fdr in fdrs:
        print("*************************")
        print("Integrating bootstrap files for FDR ", fdr)

        print("****EFFECT***")

        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-effect")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)


        # write the fdr file out
        bootstrap_fdr_effect_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-effect.txt"
        with open(bootstrap_fdr_effect_list_file, 'w') as f:
            for b_coef in all_fdr_effect_coefs_dict[fdr]:
                f.write(b_coef + "\n")

            print("All fdr effect written to ", bootstrap_fdr_effect_list_file)


        bootstrap_fdr_effect_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-effect.txt"
        with open(bootstrap_fdr_effect_intercept_list_file, 'w') as f:
            for b_intercept in all_fdr_effect_intercepts_dict[fdr]:
                f.write(b_intercept + "\n")

            print("All fdr effect written to ", bootstrap_fdr_effect_intercept_list_file)


        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect.sh"

        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file +  " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-effect  -uabrd 1\n")
            # f.write("time python get_intercept_bootstrap.py -b " + int_intercept_file + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)


        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect_lite.sh"

        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-effect"  + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file +  " -da 0" + " -dl 1 -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_effect_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)




        print("-----------------------")


        print("****NONE***")

        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-none")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)


        bootstrap_fdr_none_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-none.txt"
        with open(bootstrap_fdr_none_list_file, 'w') as f:
            for b_coef in all_fdr_none_coefs_dict[fdr]:
                f.write(b_coef + "\n")

            print("All fdr none written to ", bootstrap_fdr_none_list_file)


        bootstrap_fdr_none_intercept_list_file = "all_bootstrap_intercepts_fdr-" + str(fdr) + "-none.txt"
        with open(bootstrap_fdr_none_intercept_list_file, 'w') as f:
            for b_intercept in all_fdr_none_intercepts_dict[fdr]:
                f.write(b_intercept + "\n")

            print("All fdr none written to ", bootstrap_fdr_none_intercept_list_file)


        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none.sh"

        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -tbf " + "bootstrap-transpose" + "-fdr-" + str(fdr) + "-none -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)



        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none_lite.sh"

        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("START=$(date)\n")
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -osf " + "bootstrap" + " -rsf " + bootstrap_result_folder + " -o " + args.output_name + "_lite" + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0" + " -dl 1 -uabrd 1\n")
            f.write("END=$(date)\n")
            f.write("echo " + bootstrap_fdr_none_summary_script + ",$START,$END,$SECONDS >> " + resulttimefile + "\n")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)



        print()
    print("FDR DONE ")
    print(" *************************************")


    print("SCRIPTS")

    with open("bootstrap_script_list.txt", 'w') as outfile:
        for bootstrap_script in sorted(all_bootstrap_scripts):
            outfile.write("./" + bootstrap_script + "\n")
        print("bootstrap scripts written to bootstrap_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))

            script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            parallel_scripts = []
            for script_group in script_groups:
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print("Parallel script list written to bootstrap_parallel_script_list.txt")


    print("TIMING")
Beispiel #14
0
def load_and_run(args):

    lag = args.lag
    save_prefix = args.save_prefix

    assert args.stratify_by in {"e", "n"}

    stratify_by = cp.args2stratify_by[args.stratify_by]
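    # args2stratify_by maps the one-letter flag to its label -- judging from the output
    # paths below, "e" -> "effect" and "n" -> "none".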

    if args.output_folder is None:
        args.output_folder = "."

    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        # load

        genes, geneTS = gtm.load_basic_rep_file_list(args.data_file)
        genesr, geneTSr = gtm.load_basic_rep_file_list(args.rand_data_file)

        # dfs, genes, geneTS, df, timekeys, num_per_keys  = gtm.load_rep_file_list(args.data_file)
        # dfsr, genesr, geneTSr, dfr, __, __  = gtm.load_rep_file_list(args.rand_data_file)

        # get shared prefix timekeys

        # print "Timekeys: ", timekeys
        # print "Num per key: ", num_per_keys

    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)

        timekeys = df.columns.values[1:]
        print("Timekeys: ", timekeys)

        # Num. replicates per key
        num_per_keys = None

    assert (geneTS.shape == geneTSr.shape)
    assert (genes == genesr).all()

    coefs = pickle.load(open(args.coef_file, 'rb'))
    intercepts = pickle.load(open(args.intercept_file, 'rb'))
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")

    coefsr = pickle.load(open(args.coef_rand_file, 'rb'))
    # interceptsr = pickle.load(open(args.intercept_rand_file, 'rb'))
    fit_result_dfr = pd.read_csv(args.fit_result_rand_file, sep="\t")

    if args.best_hyper_file is not None:
        best_hyper = pickle.load(open(args.best_hyper_file, 'rb'))
    else:
        best_hyper = None

    print("RESULTS")

    print("*************************")
    print("RESIDUALS: ")

    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs,
                     intercepts,
                     fit_result_df,
                     filename=os.path.join(args.output_folder,
                                           "fit_all_summary_normal.txt"),
                     hyper=best_hyper,
                     test_name=args.test_name,
                     lag=lag)

    # Align the coefs

    # print "Aligning coefficients"
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)
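    # After alignment, acoefs holds one (gene x gene) coefficient matrix per lag,
    # so acoefs.shape[0] equals the number of lags.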

    print("Removing alphas (gene-on-self effects) ")

    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    coef_nets = []
    coefr_nets = []

    # Save the gene matrices
    for i in range(acoefs.shape[0]):
        coef_matr_filename = os.path.join(
            args.output_folder, save_prefix + "-" + str(i + 1) + "-matrix.txt")
        coefr_matr_filename = os.path.join(
            args.output_folder,
            save_prefix + "-" + str(i + 1) + "-r-matrix.txt")

        coef_net_filename = os.path.join(
            args.output_folder,
            save_prefix + "-" + str(i + 1) + "-network.txt")
        coefr_net_filename = os.path.join(
            args.output_folder,
            save_prefix + "-" + str(i + 1) + "-r-network.txt")

        coef_matr = gtm.save_gene_matrix(filename=coef_matr_filename,
                                         matrix=acoefs[i],
                                         genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=coefr_matr_filename,
                                          matrix=acoefsr[i],
                                          genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = acoefs.shape[0]
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr,
                                  extra_dict=extra_dict,
                                  make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr,
                                   extra_dict=extra_dict,
                                   make_type=False)

        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)

        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        print("Coef ", i + 1)
        print("Networks written to ")
        print(coef_net_filename)
        print(coefr_net_filename)

    # max_net_filename = save_prefix + "-max-network.txt"
    # max_r_net_filename = save_prefix + "-max-r-network.txt"
    union_net_filename = os.path.join(args.output_folder,
                                      save_prefix + "-union-network.txt")
    union_r_net_filename = os.path.join(args.output_folder,
                                        save_prefix + "-union-r-network.txt")

    if acoefs.shape[0] > 1:
        m_net = cp.get_max_network(coef_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_net = cp.get_union_network(
            coef_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = coef_nets[0]
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    if acoefsr.shape[0] > 1:
        m_net = cp.get_max_network(coefr_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_r_net = cp.get_union_network(
            coefr_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
    else:
        union_r_net = coefr_nets[0]
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    # print "Max networks written to "
    # print max_net_filename
    # print max_r_net_filename
    print("Unioned networks written to ")
    print(union_net_filename)
    print(union_r_net_filename)

    if not os.path.exists(os.path.join(args.output_folder, "plots")):
        os.makedirs(os.path.join(args.output_folder, "plots"))

    if args.plot_coef:
        if not os.path.exists(
                os.path.join(args.output_folder, "plots", "betas")):
            os.makedirs(os.path.join(args.output_folder, "plots", "betas"))

        # Plot the betas
        for i in range(acoefs.shape[0]):

            if len(np.nonzero(acoefs[i])[0]) > 0 and len(
                    np.nonzero(acoefsr[i])[0]) > 0:

                fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                              acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                              filename=os.path.join(
                                  args.output_folder, "plots", "betas",
                                  "beta_nonzero_coef-" + str(i + 1)),
                              title="Causal coefs, Coef " + str(i + 1),
                              xlabel="Causal Coefficient")
                fc.plot_betas(
                    acoefs[i][np.nonzero(acoefs[i])].flatten(),
                    acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                    filename=os.path.join(
                        args.output_folder, "plots", "betas",
                        "beta_nonzero_coef-" + str(i + 1) + "_zoom-in-90"),
                    zoom_in_top_percentile=95,
                    zoom_in_bottom_percentile=5,
                    title="Causal coefs, Coef " + str(i + 1),
                    xlabel="Causal Coefficient")

                fc.plot_betas(
                    np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                    np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                    filename=os.path.join(args.output_folder, "plots", "betas",
                                          "beta_abs_coef-" + str(i + 1)),
                    title="Absolute causal coefs, Coef " + str(i + 1),
                    xlabel="Absolute Causal Coefficient")
                fc.plot_betas(
                    np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                    np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                    filename=os.path.join(
                        args.output_folder, "plots", "betas",
                        "beta_abs_coef-" + str(i + 1) + "_zoom-in-bottom-95"),
                    zoom_in_top_percentile=95,
                    title="Absolute causal coefs, Coef " + str(i + 1),
                    xlabel="Absolute Causal Coefficient")
                fc.plot_betas(
                    np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                    np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                    filename=os.path.join(
                        args.output_folder, "plots", "betas",
                        "beta_abs_coef-" + str(i + 1) + "_zoom-in-top-5"),
                    zoom_in_bottom_percentile=95,
                    title="Absolute causal coefs, Coef " + str(i + 1),
                    xlabel="Absolute Causal Coefficient")

            print("Coef ", i + 1)
            print("Plots of betas written to: ",
                  os.path.join(args.output_folder, "plots", "betas"))

    # get FDRS
    fdrs = [0.01, 0.05, 0.1, 0.2]

    acoefs_fdrs = []
    sf_dfs = []

    for fdr in fdrs:

        fdr_dir = os.path.join(args.output_folder,
                               "fdr-" + str(fdr) + "-" + stratify_by)
        if not os.path.exists(fdr_dir):
            os.makedirs(fdr_dir)

        fdr_prefix = fdr_dir + os.sep + save_prefix

        # in case we want there to be an intermediate directory for fdr, like the bootstrap case.
        # if not os.path.exists(os.path.dirname(fdr_prefix)):
        #     os.makedirs(os.path.dirname(fdr_prefix))

        acoefs_fdr = np.zeros(acoefs.shape)

        fdr_nets = []

        print("*************")
        for i in range(acoefs.shape[0]):
            print("-----")
            print("FDR = ", fdr)
            print("Lag ", lag)
            print("Coef ", i + 1)
            print("Stratify ", stratify_by)
            acoefs_fdr[i], threshes = fc.get_abs_thresh(
                acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by)
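            # The permuted-data coefficients serve as the empirical null: the threshold is
            # chosen so that the estimated false discovery rate among retained edges is at
            # most fdr (computed per effect gene when stratifying by "effect").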
            # print "Threshes", threshes

            fdr_matr_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-matrix.txt"
            fdr_net_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename,
                                            matrix=acoefs_fdr[i],
                                            genes=genes)
            pickle.dump(
                threshes,
                open(
                    fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" +
                    stratify_by + "-threshes.p", 'wb'))

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = acoefs.shape[0]
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr,
                                     extra_dict=extra_dict,
                                     make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # write summary readme
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i],
                                     test=args.test_name,
                                     fdr=fdr,
                                     lag=lag,
                                     coef=i + 1,
                                     hyper=best_hyper,
                                     thresh=threshes,
                                     readme_name=fdr_prefix + "-" +
                                     str(i + 1) + "-fdr-" + str(fdr) + "-" +
                                     stratify_by + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)

            sf_dfs.append(sf_df)

            print("Network edges: ", fdr_net.shape[0])

        if acoefs_fdr.shape[0] > 1:
            m_net = cp.get_max_network(fdr_nets,
                                       max_col="AbsWeight",
                                       index_col="Cause-Effect")
            union_net = cp.get_union_network(
                fdr_nets + [m_net],
                suffixes=[str(i)
                          for i in range(1, acoefs_fdr.shape[0] + 1)] + [""])

        else:
            union_net = fdr_nets[0]

        union_net_filename = fdr_prefix + "-union-fdr-" + str(
            fdr) + "-" + stratify_by + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)

        print("Union network edges", union_net.shape[0])
        print("Written to ", union_net_filename)

        fdr_agg_matr_filename = fdr_prefix + "-union-fdr-" + str(
            fdr) + "-" + stratify_by + "-coefs.p"
        pickle.dump(acoefs_fdr, open(fdr_agg_matr_filename, 'wb'))

        print("Thresholded matrix written as pickle file: ",
              fdr_agg_matr_filename)

        acoefs_fdrs.append(acoefs_fdr.copy())

    all_sf_dfs = pd.concat(sf_dfs)

    # Keep the base filename fit_all_summary_fdr-<stratify_by>.txt here;
    # the bootstrap run writes its own copy into its own folder via output_folder.

    save_file = os.path.join(args.output_folder,
                             "fit_all_summary_fdr-" + stratify_by + ".txt")
    all_sf_dfs.to_csv(save_file, sep="\t", index=False)
    print("********")
    print("Summaries of all fdrs written to ", save_file)
    print("Matrices done.")

    with open(os.path.join(args.output_folder, "matrices_done.txt"),
              'w') as donefile:
        donefile.write("done\n")
def run(args):

    data_file = args.data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)




    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []


    print("*************")
    print("ROWS")
    print("*************")

    for i in range(len(partition_rows)):
        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

    print("Reading rows from files of the form: ", row_filename)

    print("*************")
    print("PAIRWISE")
    print("*************")


    # Run the actual fit
    # Need an integration
    if not os.path.exists("pairwise"):
        os.makedirs("pairwise")

    # For the pairwise individual fit scripts
    if not os.path.exists("pairwise-fit-scripts"):
        os.makedirs("pairwise-fit-scripts")


    # For the pairwise finish scripts
    if not os.path.exists("pairwise-finish-scripts"):
        os.makedirs("pairwise-finish-scripts")


    pairwise_result_folder = os.path.join("pairwise", "pairwise-results")
    if not os.path.exists(pairwise_result_folder):
        os.makedirs(pairwise_result_folder)





    # make one script for each...

    # all_bootstrap_scripts = set([])

    # all_int_coefs = []
    # all_int_intercepts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.



    try:
        fittimefile = os.path.join("timing", "pairwise_fit_time.csv")
        if not os.path.exists(fittimefile):
            with open(fittimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])


        finishtimefile = os.path.join("timing", "pairwise_finish_time.csv")
        if not os.path.exists(finishtimefile):
            with open(finishtimefile, 'w') as csvfile:
                f = csv.writer(csvfile)
                f.writerow(["Name", "Start", "End", "Elapsed"])

        # resulttimefile = os.path.join("timing", "bootstrap_result_time.csv")
        # if not os.path.exists(resulttimefile):
        #     with open(resulttimefile, 'w') as csvfile:
        #         f = csv.writer(csvfile)
        #         f.writerow(["Name", "Start", "End", "Elapsed"])

        with open(os.path.join("timing/timing_list.txt"), 'a') as f:
            f.write(fittimefile + "\n")
            f.write(finishtimefile + "\n")
            # f.write(resulttimefile + "\n")


    except IOError:
        raise IOError("the timing folder does not exist. Please run ./prep_jobs_rand_cv.sh first.")

    pairwise_outmost_name = args.output_name + "-pairwise"
    pairwise_outmost_prefix = os.path.join("pairwise", pairwise_outmost_name)


    # create scripts for pairwise
    pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh")
                         for i in range(len(partition_rows))]
    pairwise_row_prefixes = [pairwise_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]

    command_template = "time python fit_pairwise.py -d " + data_file + " -lr " + str(args.load_reps) + \
                         " -o " + "pairwise_row_prefixes[i]" +  " -l " + str(args.lag) + " -rl " + \
                         "row_filename"

    for i, row_filename in enumerate(row_filenames):

        # writing results to the pairwise prefix

        command_string = command_template.replace("pairwise_row_prefixes[i]", pairwise_row_prefixes[i]).replace("row_filename", row_filename)

        with open(pairwise_scripts[i], 'w') as outputfile:
            outputfile.write("#!/bin/bash\n")
            outputfile.write("START=$(date)\n")
            outputfile.write("module load python/2.7\n")
            # outputfile.write("module load python/2.7/scipy-mkl\n")
            # outputfile.write("module load python/2.7/numpy-mkl\n")
            outputfile.write("module load anaconda\n")
            outputfile.write(command_string)
            outputfile.write("\n")
            outputfile.write("END=$(date)\n")
            outputfile.write("echo " + pairwise_scripts[i] + ",$START,$END,$SECONDS >> " + fittimefile + "\n")
        os.chmod(pairwise_scripts[i], 0o777)


        print("Scripts made")

    # all_pairwise_scripts = all_pairwise_scripts.union(set(pairwise_scripts))

    # Note the output files

    pairwise_coefs = [pairwise_row_prefix + "_coefs.p" for pairwise_row_prefix in pairwise_row_prefixes]

    pairwise_output_dict = collections.OrderedDict()
    pairwise_output_dict["coef"] = pairwise_coefs

    output_matr_df = pd.DataFrame(pairwise_output_dict)
    output_matr_file = os.path.join("pairwise", pairwise_outmost_name + "_output_matr_list.txt")
    output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
    print("Raw parallelilized output matrices, before integration, written to", output_matr_file)




    int_matr_dict = collections.OrderedDict()
    int_matr_dict["coef"] = os.path.join(pairwise_result_folder, pairwise_outmost_name + "_coefs.p")

    # # append these to the list of final bootstrapped coefficients
    # all_int_coefs.append(int_matr_dict["coef"])
    # all_int_intercepts.append(int_matr_dict["intercept"])

    int_matr_file = pairwise_outmost_prefix +  "_int_matr_list.txt"
    int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
    int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
    print("integrated matrices written to " + int_matr_file)



    # just need to put all of this into the outmost name

    all_pairwise_scripts = [os.path.join("pairwise-fit-scripts", pairwise_outmost_name + "-row-" + str(i) + ".sh")
                             for i in range(len(partition_rows))]


    print("SCRIPTS")

    with open("pairwise_script_list.txt", 'w') as outfile:
        for pairwise_script in all_pairwise_scripts:
            outfile.write("./" + pairwise_script + "\n")
        print("pairwise scripts written to pairwise_script_list.txt")

        if args.parallel_num > 0:
            print("Parallel Number (# processes per job): " + str(args.parallel_num))

            script_groups = pj.partition_inputs(all_pairwise_scripts, number=int(math.ceil(len(all_pairwise_scripts) * 1.0/args.parallel_num)))

            print("Number of script groups ", len(script_groups))

            parallel_scripts = []
            for script_group in script_groups:
                appended_script_filenames = ["./" + script_filename for script_filename in script_group]
                parallel_script = " & ".join(appended_script_filenames)
                parallel_scripts.append(parallel_script)

            with open("pairwise_parallel_script_list.txt", 'w') as scriptfile:
                for parallel_script in parallel_scripts:
                    scriptfile.write(parallel_script + "\n")
                print("Parallel script list written to pairwise_parallel_script_list.txt")


    finish_script = os.path.join("pairwise-finish-scripts", "finish.sh")
    with open(finish_script, 'w') as ifile:
        ifile.write("set -e\n")
        ifile.write("START=$(date)\n")
        ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  " -t a \n")
        ifile.write("END=$(date)\n")
        ifile.write("echo " + finish_script + ",$START,$END,$SECONDS >> " + finishtimefile + "\n")
        print("Finish script, written to", finish_script)
        os.chmod(finish_script, 0o777)
def run(args):
    if args.test not in {"r", "l", "e"}:
        raise ValueError("args.test must be r (ridge), l (lasso) or e (elastic net)")

    if args.null not in {"l", "g"}:
        raise ValueError("args.null must be l (local) or g (global)")

    # Load files
    data_file = args.data_file
    rand_data_file = args.rand_data_file

    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(data_file)
        #dfs, genes, geneTS, df, __, __ = gtm.load_rep_file_list(data_file)
    else:
        df = pd.read_csv(data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
    n = len(genes)




    # Make row files
    # Split up the rows according to number of input scripts
    partition_rows = pj.partition_inputs(list(range(n)), args.script_num)

    row_filenames = []


    print("*************")
    print("ROWS")
    print("*************")

    for i in range(len(partition_rows)):
        row_filename = os.path.join("rows", args.output_name + "-row-" + str(i) + ".p")
        row_filenames.append(row_filename)

    print("Reading rows from files of the form: ", row_filename)

    print("*************")
    print("BOOTSTRAP")
    print("*************")


    # Run the actual fit
    # Need an integration
    if not os.path.exists("bootstrap"):
        os.makedirs("bootstrap")

    # For the bootstrap individual fit scripts
    if not os.path.exists("bootstrap-fit-scripts"):
        os.makedirs("bootstrap-fit-scripts")


    # For the bootstrap finish scripts
    if not os.path.exists("bootstrap-finish-scripts"):
        os.makedirs("bootstrap-finish-scripts")

    # Finish, aggregating all the coefficients (stratification = none)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "none")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "none"))

    # Finish, stratifying each coefficient by the effect gene (stratification = effect)
    if not os.path.exists(os.path.join("bootstrap-finish-scripts", "effect")):
        os.makedirs(os.path.join("bootstrap-finish-scripts", "effect"))








    # if args.write_all_bootstrap_scripts_first:
    #
    # print "WRITING ALL THE SCRIPTS INITIALLY!!!!!! NOTE the list will be written before all the files are written!!!"
    #
    # for b in range(args.bootstrap_num):
    #     if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
    #         os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))
    #
    # all_bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), args.output_name + "-bootstrap-" + str(b) + "-row-" + str(i) + ".sh")
    #                          for b in range(args.bootstrap_num) for i in range(len(row_filenames))]


    # print "SCRIPTS"
    #
    # with open("bootstrap_script_list.txt", 'w') as outfile:
    #     for bootstrap_script in all_bootstrap_scripts:
    #         outfile.write("./" + bootstrap_script + "\n")
    #     print "bootstrap scripts written to bootstrap_script_list.txt"
    #
    #     if args.parallel_num > 0:
    #         print "Parallel Number (# processes per job): " + str(args.parallel_num)
    #
    #         script_groups = pj.partition_inputs(all_bootstrap_scripts, number=int(math.ceil(len(all_bootstrap_scripts) * 1.0/args.parallel_num)))
    #
    #         print "Number of script groups ", len(script_groups)
    #
    #         parallel_scripts = []
    #         for i, script_group in zip(range(len(script_groups)), script_groups):
    #             appended_script_filenames = ["./" + script_filename for script_filename in script_group]
    #             parallel_script = " & ".join(appended_script_filenames)
    #             parallel_scripts.append(parallel_script)
    #
    #         with open("bootstrap_parallel_script_list.txt", 'w') as scriptfile:
    #             for parallel_script in parallel_scripts:
    #                 scriptfile.write(parallel_script + "\n")
    #             print "Parallel script list written to bootstrap_parallel_script_list.txt"









    # make one script for each...

    # # all_bootstrap_scripts = set([])
    #
    # all_int_coefs = []
    #
    # finish_none_scripts = []
    # finish_effect_scripts = []

    # record where the thresholded coefficients are written
    # For integrating these, later.
    fdrs = [0.01, 0.05, 0.1, 0.2]
    # all_fdr_none_coefs_dict = dict([(x, []) for x in fdrs])
    # all_fdr_effect_coefs_dict = dict([(x, []) for x in fdrs])




    # for b in range(args.bootstrap_num):
    #     print "SEED/BOOTSTRAP NUM: ", b
    #
    #     bootstrap_outmost_name = args.output_name + "-bootstrap-" + str(b)
    #
    #     bootstrap_folder = os.path.join("bootstrap", str(b))
    #     if not os.path.exists(bootstrap_folder):
    #         os.makedirs(bootstrap_folder)
    #     print "Created folder: ", bootstrap_folder

        # bootstrap_outmost_prefix = os.path.join(bootstrap_folder, bootstrap_outmost_name)


        #
        # if not os.path.exists(os.path.join("bootstrap-fit-scripts", str(b))):
        #     os.makedirs(os.path.join("bootstrap-fit-scripts", str(b)))
        #
        #
        # # create scripts for bootstrap
        # bootstrap_scripts = [os.path.join("bootstrap-fit-scripts", str(b), bootstrap_outmost_name + "-row-" + str(i) + ".sh")
        #                      for i in range(len(partition_rows))]
        # bootstrap_row_prefixes = [bootstrap_outmost_prefix + "-row-" + str(i) for i in range(len(partition_rows))]
        #
        # command_template = "time python fit_bootstrap.py -d " + data_file + " -rd " + rand_data_file + " -lr " + str(args.load_reps) + \
        #                      " -o " + "bootstrap_row_prefixes[i]" + " -bh " + \
        #                     "hyper" + os.sep + "best_hyper.p" + " -t " + args.test + " -l " + str(args.lag) + " -rl " + \
        #                      "row_filename" + " -n " + args.null + " -s " + str(b) + " -oa " + str(args.only_array)
        #
        # for i, row_filename in zip(range(len(partition_rows)), row_filenames):
        #
        #     # writing results to the bootstrap prefix
        #
        #     command_string = command_template.replace("bootstrap_row_prefixes[i]", bootstrap_row_prefixes[i]).replace("row_filename", row_filename)
        #
        #     with open(bootstrap_scripts[i], 'w') as outputfile:
        #         outputfile.write("#!/bin/bash\nmodule load python/2.7\nmodule load python/2.7/scipy-mkl\nmodule load python/2.7/numpy-mkl\nmodule load anaconda\n")
        #         outputfile.write(command_string + "\n")
        #     os.chmod(bootstrap_scripts[i], 0777)
        #
        #
        # print "Scripts made"
        #
        # # all_bootstrap_scripts = all_bootstrap_scripts.union(set(bootstrap_scripts))
        #
        # # Note the output files
        #
        # bootstrap_coefs = [bootstrap_row_prefix + "_coefs.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        # bootstrap_intercepts = [bootstrap_row_prefix + "_intercepts.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        # bootstrap_results = [bootstrap_row_prefix + "_fit_result_df.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]
        # bootstrap_coefsr = [bootstrap_row_prefix + "_coefsr.p" for bootstrap_row_prefix in bootstrap_row_prefixes]
        # bootstrap_resultsr = [bootstrap_row_prefix + "_fit_result_dfr.txt" for bootstrap_row_prefix in bootstrap_row_prefixes]
        #
        # bootstrap_output_dict = collections.OrderedDict()
        # bootstrap_output_dict["coef"] = bootstrap_coefs
        # bootstrap_output_dict["coefr"] = bootstrap_coefsr
        # bootstrap_output_dict["intercept"] = bootstrap_intercepts
        # # bootstrap_output_dict["interceptr"] = bootstrap_interceptsr
        # # rand intercepts aren't put above because if it's a local null fit, then too many possible intercepts for each effect gene
        #
        # output_matr_df = pd.DataFrame(bootstrap_output_dict)
        # output_matr_file = os.path.join(bootstrap_folder, bootstrap_outmost_name + "_output_matr_list.txt")
        # output_matr_df.to_csv(output_matr_file, sep="\t", index=False)
        # print "Raw parallelilized output matrices, before integration, written to", output_matr_file
        #
        #
        #
        #
        # int_matr_dict = collections.OrderedDict()
        # int_matr_dict["coef"] = bootstrap_outmost_prefix + "_coefs.p"
        # int_matr_dict["coefr"] = bootstrap_outmost_prefix +  "_coefsr.p"
        # int_matr_dict["intercept"] = bootstrap_outmost_prefix + "_intercepts.p"
        # # int_matr_dict["interceptr"] = "bootstrap" + os.sep + bootstrap_outmost_name + "_interceptsr.p"
        #
        # # append these to the list of final bootstrapped coefficients
        # all_int_coefs.append(int_matr_dict["coef"])
        #
        # int_matr_file = bootstrap_outmost_prefix +  "_int_matr_list.txt"
        # int_matr_df = pd.DataFrame(int_matr_dict, index=[0])
        # int_matr_df.to_csv(int_matr_file, sep="\t", index=False)
        # print "integrated matrices written to " + int_matr_file
        #
        #
        # bootstrap_result_dict = collections.OrderedDict()
        # bootstrap_result_dict["fit_result"] = bootstrap_results
        # bootstrap_result_dict["fit_resultr"] = bootstrap_resultsr
        #
        #
        #
        # output_df_file = bootstrap_outmost_prefix + "_output_df_list.txt"
        # output_df_df = pd.DataFrame(bootstrap_result_dict)
        # output_df_df.to_csv(output_df_file, sep="\t", index=False)
        # print "output dfs file written to ", output_df_file
        #
        # int_df_dict = collections.OrderedDict()
        # int_df_dict["fit_result"] = bootstrap_outmost_prefix + "_fit_result_df.txt"
        # int_df_dict["fit_resultr"] = bootstrap_outmost_prefix + "_fit_result_dfr.txt"
        #
        # int_df_file = bootstrap_outmost_prefix + "_int_df_list.txt"
        # int_df_df = pd.DataFrame(int_df_dict, index=[0])
        # int_df_df.to_csv(int_df_file, sep="\t", index=False)
        # print "Integrated dfs file written to ", int_df_file
        #
        #
        #
        # # just need to collect all of this under the outermost output name
        #
        #
        # finish_none_script = os.path.join("bootstrap-finish-scripts", "none", "finish-none-bootstrap-" + str(b) + ".sh")
        # with open(finish_none_script, 'w') as ifile:
        #     ifile.write("set -e\n")
        #     ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  (" -t m -a 1 " if args.only_array else " -t a "))
        #     ifile.write(" && " + \
        #                 "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n"
        #                 )
        #     ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
        #                 " -lr " + str(args.load_reps) + \
        #                 " -bh " + "hyper" + os.sep + "best_hyper.p" + \
        #                 " -o " + \
        #                  bootstrap_outmost_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
        #                 " -cfr " + int_matr_dict["coefr"] + " -fr " + \
        #                 int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
        #                 " -sb " + "n" + " -tn " + args.test_name + " -of " + bootstrap_folder + "\n")
        #     print "Finish script, stratby None, written to", finish_none_script
        #     os.chmod(finish_none_script, 0777)
        #
        # finish_none_scripts.append(finish_none_script)
        #
        #
        # finish_effect_script = os.path.join("bootstrap-finish-scripts", "effect", "finish-effect-bootstrap-" + str(b) + ".sh")
        # with open(finish_effect_script, 'w') as ifile:
        #     ifile.write("set -e\n")
        #     ifile.write("time python integrate_outputs_rand_row.py -i " + output_matr_file + " -o " + int_matr_file +  (" -t m -a 1 " if args.only_array else " -t a "))
        #     ifile.write(" && " + \
        #                 "time python integrate_outputs_rand_row.py -i " + output_df_file + " -t d -o " + int_df_file + "\n"
        #                 )
        #     ifile.write("time python get_result_coef.py -df " + data_file + " -rdf " + rand_data_file +\
        #                 " -lr " + str(args.load_reps) + \
        #                 " -bh " + "hyper" + os.sep + "best_hyper.p" + \
        #                 " -o " + \
        #                 bootstrap_outmost_name + " -cf " +  int_matr_dict["coef"] + " -if " + int_matr_dict["intercept"] + \
        #                 " -cfr " + int_matr_dict["coefr"] + " -fr " + \
        #                 int_df_dict["fit_result"] + " -frr " + int_df_dict["fit_resultr"] + " -l " + str(args.lag) + \
        #                 " -sb " + "e" + " -tn " + args.test_name  + " -of " + bootstrap_folder + "\n")
        #     print "Finish script, stratby effect, written to", finish_effect_script
        #     os.chmod(finish_effect_script, 0777)
        #
        # finish_effect_scripts.append(finish_effect_script)
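        # (End of the disabled per-bootstrap script-generation block.)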


        # get all the FDR files immediately

        # for fdr in fdrs:
        #     all_fdr_none_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "none",
        #                        bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "none" +  "_coefs.p"))
        #     all_fdr_effect_coefs_dict[fdr].append(os.path.join(bootstrap_folder, "fdr-" + str(fdr) + "-" + "effect",
        #                         bootstrap_outmost_name + "-union-fdr-" + str(fdr) + "-" + "effect" +  "_coefs.p"))




        # print "-----------"


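    # Summary stage: `int_coef_file` lists one integrated coef-matrix path per
    # bootstrap sample. It is assumed to be written elsewhere, since the
    # per-bootstrap block above (which appended to all_int_coefs) is disabled.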
    int_coef_file = "all_bootstrap_coefs.txt"



    # integrate all the bootstrapped FDR results

    bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results")
    if not os.path.exists(bootstrap_result_folder):
        os.makedirs(bootstrap_result_folder)


    bootstrap_summary_file = "get_result_bootstrap.sh"
    with open(bootstrap_summary_file, 'w') as f:
        f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                             " -o " + os.path.join(bootstrap_result_folder, args.output_name) + " -l " + str(args.lag) + " -tn " + args.test + \
                " -b " + int_coef_file + " -da 1")
    os.chmod(bootstrap_summary_file, 0o777)
    print("Script to analyze integrated bootstrapped coefs in", bootstrap_summary_file)


    for fdr in fdrs:
        print("*************************")
        print("Integrating bootstrap files for FDR ", fdr)

        print("****EFFECT***")

        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-effect")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)


        # write the fdr file out
        bootstrap_fdr_effect_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-effect.txt"
        # with open(bootstrap_fdr_effect_list_file, 'w') as f:
        #     for b_coef in all_fdr_effect_coefs_dict[fdr]:
        #         f.write(b_coef + "\n")
        #
        #     print "All fdr effect written to ", bootstrap_fdr_effect_list_file


        bootstrap_fdr_effect_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-effect.sh"

        with open(bootstrap_fdr_effect_summary_script, 'w') as f:
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -o " + os.path.join(bootstrap_result_folder, args.output_name) + "-fdr-" + str(fdr) + "-effect" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_effect_list_file +  " -da 0")
        os.chmod(bootstrap_fdr_effect_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_effect_summary_script)


        print("-----------------------")


        print("****NONE***")

        bootstrap_result_folder = os.path.join("bootstrap", "bootstrap-results-fdr-" + str(fdr) + "-none")
        if not os.path.exists(bootstrap_result_folder):
            os.makedirs(bootstrap_result_folder)


        bootstrap_fdr_none_list_file = "all_bootstrap_coefs_fdr-" + str(fdr) + "-none.txt"
        # with open(bootstrap_fdr_none_list_file, 'w') as f:
        #     for b_coef in all_fdr_none_coefs_dict[fdr]:
        #         f.write(b_coef + "\n")
        #
        #     print "All fdr none written to ", bootstrap_fdr_none_list_file


        bootstrap_fdr_none_summary_script = "get_result_bootstrap-fdr-" + str(fdr) + "-none.sh"

        with open(bootstrap_fdr_none_summary_script, 'w') as f:
            f.write("set -e\n")
            f.write("time python get_result_bootstrap.py -df " + data_file + " -lr " + str(args.load_reps) + \
                    " -o " + os.path.join(bootstrap_result_folder, args.output_name) + "-fdr-" + str(fdr) + "-none" + " -l " + str(args.lag) + " -tn " + args.test + \
                    " -b " + bootstrap_fdr_none_list_file + " -da 0")
        os.chmod(bootstrap_fdr_none_summary_script, 0o777)
        print("Script to analyze integrated bootstrapped coefs in", bootstrap_fdr_none_summary_script)


        print()
    print("FDR DONE ")
    print(" *************************************")