Exemple #1
0
def load_and_run(args):

    lag = args.lag
    save_prefix = args.save_prefix

    assert args.stratify_by in {"e", "n"}

    stratify_by = cp.args2stratify_by[args.stratify_by]

    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        # load
        dfs, genes, geneTS, df, timekeys, num_per_keys = gtm.load_rep_file_list(
            args.data_file)
        dfsr, genesr, geneTSr, dfr, __, __ = gtm.load_rep_file_list(
            args.rand_data_file)

        # get shared prefix timekeys

        print "Timekeys: ", timekeys
        print "Num per key: ", num_per_keys

    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)

        timekeys = df.columns.values[1:]
        print "Timekeys: ", timekeys

        # Num. replicates per key
        num_per_keys = None

    assert (geneTS.shape == geneTSr.shape)
    assert (genes == genesr).all()

    coefs = pickle.load(open(args.coef_file, 'rB'))
    intercepts = pickle.load(open(args.intercept_file, 'rB'))
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")

    coefsr = pickle.load(open(args.coef_rand_file, 'rB'))
    # interceptsr = pickle.load(open(args.intercept_rand_file, 'rB'))
    fit_result_dfr = pd.read_csv(args.fit_result_rand_file, sep="\t")

    if args.best_hyper_file != None:
        best_hyper = pickle.load(open(args.best_hyper_file, 'rB'))
    else:
        best_hyper = None

    print "RESULTS"
    print "*************************"
    print "NORMAL: "
    cp.summarize_fit(coefs,
                     intercepts,
                     fit_result_df,
                     filename="fit_all_summary_normal.txt",
                     hyper=best_hyper,
                     test_name=args.test_name,
                     lag=lag)

    # print "*************************"
    # print "RANDOM:"
    # cp.summarize_fit(coefsr, interceptsr, fit_result_dfr, filename="fit_all_summary_random.txt", hyper=best_hyper,
    #                  test_name=args.test_name, lag=lag)

    # LEFT OFF HERE: SEE IF YOU CAN STILL DO FIT_RESULT_SUMMARY W/O INTERCEPT
    # -Jlu 1/25/17 10:14 AM

    # Align the coefs

    # print "Aligning coefficients"
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)

    print "Removing alphas (gene-on-self effects) "

    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    coef_nets = []
    coefr_nets = []

    # Save the gene matrices
    for i in range(acoefs.shape[0]):
        coef_matr_filename = save_prefix + "-" + str(i + 1) + "-matrix.txt"
        coefr_matr_filename = save_prefix + "-" + str(i + 1) + "-r-matrix.txt"

        coef_net_filename = save_prefix + "-" + str(i + 1) + "-network.txt"
        coefr_net_filename = save_prefix + "-" + str(i + 1) + "-r-network.txt"

        coef_matr = gtm.save_gene_matrix(filename=coef_matr_filename,
                                         matrix=acoefs[i],
                                         genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=coefr_matr_filename,
                                          matrix=acoefsr[i],
                                          genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = acoefs.shape[0]
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr,
                                  extra_dict=extra_dict,
                                  make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr,
                                   extra_dict=extra_dict,
                                   make_type=False)

        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)

        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        print "Coef ", i + 1
        print "Networks written to "
        print coef_net_filename
        print coefr_net_filename

    # max_net_filename = save_prefix + "-max-network.txt"
    # max_r_net_filename = save_prefix + "-max-r-network.txt"
    union_net_filename = save_prefix + "-union-network.txt"
    union_r_net_filename = save_prefix + "-union-r-network.txt"

    if acoefs.shape[0] > 1:
        m_net = cp.get_max_network(coef_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_net = cp.get_union_network(
            coef_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
        print "Max network edges: ", m_net.shape
        print "Union network edges: ", union_net.shape
    else:
        union_net = coef_nets[0]
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    if acoefsr.shape[0] > 1:
        m_net = cp.get_max_network(coefr_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_r_net = cp.get_union_network(
            coefr_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
    else:
        union_r_net = coefr_nets[0]
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    # print "Max networks written to "
    # print max_net_filename
    # print max_r_net_filename
    print "Unioned networks written to "
    print union_net_filename
    print union_r_net_filename

    if not os.path.exists("plots"):
        os.makedirs("plots")
    if not os.path.exists("plots" + os.sep + "betas"):
        os.makedirs("plots" + os.sep + "betas")

    # Plot the betas
    for i in range(acoefs.shape[0]):

        if len(np.nonzero(acoefs[i])[0]) > 0 and len(
                np.nonzero(acoefsr[i])[0]) > 0:

            fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                          acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                          filename="plots" + os.sep + "betas" + os.sep +
                          "beta_nonzero_coef-" + str(i + 1),
                          title="Causal coefs, Coef " + str(i + 1),
                          xlabel="Causal Coefficient")
            fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                          acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                          filename="plots" + os.sep + "betas" + os.sep +
                          "beta_nonzero_coef-" + str(i + 1) + "_zoom-in-90",
                          zoom_in_top_percentile=95,
                          zoom_in_bottom_percentile=5,
                          title="Causal coefs, Coef " + str(i + 1),
                          xlabel="Causal Coefficient")

            fc.plot_betas(
                np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                filename="plots" + os.sep + "betas" + os.sep +
                "beta_abs_coef-" + str(i + 1),
                title="Absolute causal coefs, Coef " + str(i + 1),
                xlabel="Absolute Causal Coefficient")
            fc.plot_betas(
                np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                filename="plots" + os.sep + "betas" + os.sep +
                "beta_abs_coef-" + str(i + 1) + "_zoom-in-bottom-95",
                zoom_in_top_percentile=95,
                title="Absolute causal coefs, Coef " + str(i + 1),
                xlabel="Absolute Causal Coefficient")
            fc.plot_betas(
                np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                filename="plots" + os.sep + "betas" + os.sep +
                "beta_abs_coef-" + str(i + 1) + "_zoom-in-top-5",
                zoom_in_bottom_percentile=95,
                title="Absolute causal coefs, Coef " + str(i + 1),
                xlabel="Absolute Causal Coefficient")

        print "Coef ", i + 1
        print "Plots of betas written to: plots" + os.sep + "betas"

    # get FDRS
    fdrs = [0.01, 0.05, 0.1, 0.2]

    acoefs_fdrs = []
    sf_dfs = []

    for fdr in fdrs:

        fdr_dir = "fdr-" + str(fdr) + "-" + stratify_by
        if not os.path.exists(fdr_dir):
            os.makedirs(fdr_dir)

        fdr_prefix = fdr_dir + os.sep + save_prefix

        acoefs_fdr = np.zeros(acoefs.shape)

        fdr_nets = []

        print "*************"
        for i in range(acoefs.shape[0]):
            print "-----"
            print "FDR = ", fdr
            print "Lag ", lag
            print "Coef ", i + 1
            print "Stratify ", stratify_by
            acoefs_fdr[i], threshes = fc.get_abs_thresh(
                acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by)
            # print "Threshes", threshes

            fdr_matr_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-matrix.txt"
            fdr_net_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename,
                                            matrix=acoefs_fdr[i],
                                            genes=genes)
            pickle.dump(
                threshes,
                open(
                    fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" +
                    stratify_by + "-threshes.p", 'wB'))

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = acoefs.shape[0]
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr,
                                     extra_dict=extra_dict,
                                     make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # write summary readme
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i],
                                     test=args.test_name,
                                     fdr=fdr,
                                     lag=lag,
                                     coef=i + 1,
                                     hyper=best_hyper,
                                     thresh=threshes,
                                     readme_name=fdr_prefix + "-" +
                                     str(i + 1) + "-fdr-" + str(fdr) + "-" +
                                     stratify_by + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)

            sf_dfs.append(sf_df)

            print "Network edges: ", fdr_net.shape[0]

        if acoefs_fdr.shape[0] > 1:
            m_net = cp.get_max_network(fdr_nets,
                                       max_col="AbsWeight",
                                       index_col="Cause-Effect")
            union_net = cp.get_union_network(
                fdr_nets + [m_net],
                suffixes=[str(i)
                          for i in range(1, acoefs_fdr.shape[0] + 1)] + [""])

        else:
            union_net = fdr_nets[0]

        union_net_filename = fdr_prefix + "-union-fdr-" + str(
            fdr) + "-" + stratify_by + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)

        print "Union network edges", union_net.shape[0]
        print "Written to ", union_net_filename

        acoefs_fdrs.append(acoefs_fdr.copy())

    all_sf_dfs = pd.concat(sf_dfs)

    all_sf_dfs.to_csv("fit_all_summary_fdr-" + stratify_by + ".txt",
                      sep="\t",
                      index=False)
    print "********"
    print "Summaries of all fdrs written to fit_all_summary_fdr-" + stratify_by + ".txt"
    print "Matrices done."

    with open("matrices_done.txt", 'w') as donefile:
        donefile.write("done\n")

    if args.plot_coef_fdr:
        print "*******"
        print "PLOTS"
        for i, fdr in zip(range(len(fdrs)), fdrs):
            acoefs_fdr = acoefs_fdrs[i]

            if not os.path.exists("plots" + os.sep + "fdr-" + str(fdr)):
                os.makedirs("plots" + os.sep + "fdr-" + str(fdr))

            # Only plot the bar if replicates were loaded
            cp.plot_all_coef(acoefs_fdr,
                             df,
                             genes,
                             lag,
                             file_prefix="plots" + os.sep + "fdr-" + str(fdr) +
                             os.sep + save_prefix + "-",
                             plot_bar=args.load_reps,
                             keys=timekeys,
                             num_per_keys=num_per_keys,
                             linewidth=2,
                             capsize=5,
                             capwidth=2,
                             verbose=True)

            # Plot them without error bars just to check
            if args.load_reps:
                cp.plot_all_coef(acoefs_fdr,
                                 df,
                                 genes,
                                 lag,
                                 file_prefix="plots" + os.sep + "fdr-" +
                                 str(fdr) + os.sep + save_prefix + "-nobar-",
                                 plot_bar=False,
                                 keys=timekeys,
                                 num_per_keys=num_per_keys,
                                 linewidth=2,
                                 capsize=5,
                                 capwidth=2)

            print "FDR plots written to: ", "plots" + os.sep + "fdr-" + str(
                fdr)

    # Plot all the coefs
    # NOTE: this will take a long time!
    if args.plot_all:

        raise ValueError(
            "Fix all the below first before trying to do plot all")

        if not os.path.exists("plots" + os.sep + "original"):
            os.makedirs("plots" + os.sep + "original")
        cp.plot_all_coef(acoefs,
                         df,
                         genes,
                         lag,
                         file_prefix="plots" + os.sep + "original" + os.sep +
                         save_prefix + "-",
                         plot_bar=args.load_reps,
                         keys=timekeys,
                         num_per_keys=num_per_keys,
                         linewidth=2,
                         capsize=5,
                         capwidth=2)
        print "Original plots written to: ", "plots" + os.sep + "original"

        if not os.path.exists("plots" + os.sep + "randomized"):
            os.makedirs("plots" + os.sep + "randomized")
        cp.plot_all_coef(acoefsr,
                         dfr,
                         genes,
                         lag,
                         file_prefix="plots" + os.sep + "randomized" + os.sep +
                         save_prefix + "-",
                         plot_bar=args.load_reps,
                         keys=timekeys,
                         num_per_keys=num_per_keys,
                         linewidth=2,
                         capsize=5,
                         capwidth=2)

        print "Randomized plots written to: ", "plots" + os.sep + "randomized"
def load_and_run(args):
    """Aggregate bootstrap coefficient fits into per-lag and union networks.

    Steps, in order:

    1. Load the gene list from ``args.data_file``.
    2. Read the bootstrap coefficient file names (one per line) from
       ``args.bootstrap_file_with_names``.
    3. Build a dict of statistic name -> matrix, either directly with
       ``cp.bootstrap_matrices_iter_free`` (``args.do_lite``) or by first
       transposing the bootstrap matrices to disk and then calling
       ``compute_bootstrap_stats_matr``.
    4. Optionally pickle the raw statistic matrices (``args.dump_raw``),
       un-aligning them first if ``args.unalign_before_raw_dump``.
    5. Optionally align the matrices by lag (``args.do_align``).
    6. For every lag, write one gene matrix per statistic, merge the
       per-statistic networks into one table sorted by ``Bootstrap:Freq`` and
       write it; finally write the union network across lags.

    The statistic named ``"mean"`` also gets an absolute-value column; the
    sort column implies the dict also carries a ``"freq"`` statistic
    (presumably -- confirm against the two stats helpers).

    Args:
        args: Namespace-like object. Attributes read here: ``lag``,
            ``save_prefix``, ``result_save_folder``, ``load_reps``,
            ``data_file``, ``bootstrap_file_with_names``, ``do_lite``,
            ``transpose_bootstrap_folder``, ``outer_save_folder``,
            ``length_before_dump``, ``dump_raw``,
            ``unalign_before_raw_dump``, ``do_align`` and ``test_name``.

    Raises:
        ValueError: if ``args.do_lite`` is false and
            ``args.transpose_bootstrap_folder`` is ``None``.
    """
    lag = args.lag
    save_prefix = args.save_prefix
    full_save_prefix = os.path.join(args.result_save_folder, save_prefix)

    # Only the gene names are needed from the data file.
    if args.load_reps:
        genes, _ = gtm.load_basic_rep_file_list(args.data_file)
    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, _ = gtm.get_gene_TS(df)

    # Read the list of bootstrap result files and release the handle before
    # the long-running work starts. Blank lines are skipped so that a
    # trailing newline does not turn into an empty file name.
    with open(args.bootstrap_file_with_names, 'r') as f:
        filenames = [line.rstrip("\r\n") for line in f if line.strip()]

    if args.do_lite:
        stats_matr_dict = cp.bootstrap_matrices_iter_free(filenames)

    else:
        if args.transpose_bootstrap_folder is None:
            raise ValueError("If doing bootstrap calculation, transpose is required")

        transpose_bootstrap_folder = os.path.join(args.outer_save_folder,
                                                  args.transpose_bootstrap_folder)
        dump_folder = os.path.join(transpose_bootstrap_folder,
                                   "dump-" + str(args.length_before_dump))

        if not os.path.exists(transpose_bootstrap_folder):
            os.makedirs(transpose_bootstrap_folder)
        if not os.path.exists(dump_folder):
            os.makedirs(dump_folder)

        transpose_prefix = os.path.join(transpose_bootstrap_folder,
                                        save_prefix)
        dump_prefix = os.path.join(dump_folder, save_prefix)

        t = time.time()
        bootstrap_coef_file_matr = transpose_bootstrap_matrices(
            filenames,
            length_before_dump=args.length_before_dump,
            save_prefix=transpose_prefix,
            dump_prefix=dump_prefix)
        print("Time to transpose: ", time.time() - t)

        bootstrap_coef_filename = dump_prefix + "-NAMES.p"

        with open(bootstrap_coef_filename, 'wb') as names_handle:
            pickle.dump(bootstrap_coef_file_matr, names_handle)

        print("Bootstrap coef matrix dumped to ", bootstrap_coef_filename)

        t = time.time()
        stats_matr_dict = compute_bootstrap_stats_matr(bootstrap_coef_file_matr)
        print("Time to get stats: ", time.time() - t)

    if args.dump_raw:
        # Shallow copy: entries are replaced, never mutated, so the matrices
        # used for the networks below are left untouched.
        dump_stats_matr_dict = stats_matr_dict.copy()

        if args.unalign_before_raw_dump:
            for k in dump_stats_matr_dict:
                dump_stats_matr_dict[k] = lc.unalign_coefs(dump_stats_matr_dict[k],
                                                           lag)

        for k in dump_stats_matr_dict:
            outfile = full_save_prefix + "_raw_" + k + "_coefs.p"
            with open(outfile, 'wb') as f:
                pickle.dump(dump_stats_matr_dict[k], f)

            print("For ", k , "Saved to ", outfile)

    # align results
    if args.do_align:
        for k in stats_matr_dict:
            stats_matr_dict[k] = lc.align_coefs(stats_matr_dict[k], lag)

    # Save the gene matrices
    # Note: each statistic matrix is of form lag x n x n

    sortby = "Bootstrap:Freq"

    full_nets = []
    for i in range(1, lag + 1):
        print("Lag: ", i)

        print("Aggregating results")

        # Extra columns attached to every edge of the network tables.
        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = lag
        extra_dict["Coef"] = i

        nets = []

        for k in stats_matr_dict:
            raw_matr = stats_matr_dict[k][i-1]
            matr_filename = full_save_prefix + "-" + str(i) + "-bootstrap-" + k + "-matrix.txt"

            matr = gtm.save_gene_matrix(matr_filename, matrix=raw_matr, genes=genes)

            print("Saved ", k, " to ", matr_filename)

            edge_name = "Bootstrap:" + k.capitalize()
            if k == "mean":
                # Only the mean gets an absolute-value column; the max
                # network across lags is selected on it.
                net = nh.matr_to_net(matr, make_type=False, edge_name=edge_name,
                                     abs_name="AbsBootstrap:" + k.capitalize(),
                                     do_sort=False, extra_dict=extra_dict)
            else:
                net = nh.matr_to_net(matr, make_type=False, edge_name=edge_name,
                                     no_abs=True,
                                     do_sort=False, extra_dict=extra_dict)

            nets.append(net)

        # Combine the per-statistic tables into one table per lag.
        full_net = nets[0]
        for net in nets[1:]:
            full_net = full_net.merge(net, how='outer')

        print("Final net: ", full_net.shape[0])

        print("Sorting by :", sortby)
        full_net.sort_values(sortby, inplace=True, ascending=False)

        full_net_filename = full_save_prefix +"-" + str(i) + "-bootstrap-network.txt"
        full_net.to_csv(full_net_filename, sep="\t", index=False)
        print("Written to ", full_net_filename)

        full_nets.append(full_net)

    union_net_filename = full_save_prefix + "-union-bootstrap-network.txt"

    if lag > 1:
        # The max network gets the empty suffix in the union.
        m_net = cp.get_max_network(full_nets, max_col="AbsBootstrap:Mean", index_col="Cause-Effect")
        union_net = cp.get_union_network(full_nets + [m_net], suffixes=[str(i) for i in range(1, lag + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = full_nets[0]

    print("Sorting by :", sortby)
    union_net.sort_values(sortby, inplace=True, ascending=False)

    union_net.to_csv(union_net_filename, sep="\t", index=False)
    print("Unioned bootstrap network written to ", union_net_filename)
def load_and_run(args):

    lag = args.lag
    save_prefix = args.save_prefix

    assert args.stratify_by in {"e", "n"}

    stratify_by = cp.args2stratify_by[args.stratify_by]

    if args.output_folder == None:
        args.output_folder = "."

    # Load data file and prepare a file to pass to plotters
    if args.load_reps:
        # load

        genes, geneTS = gtm.load_basic_rep_file_list(args.data_file)
        genesr, geneTSr = gtm.load_basic_rep_file_list(args.rand_data_file)

        # dfs, genes, geneTS, df, timekeys, num_per_keys  = gtm.load_rep_file_list(args.data_file)
        # dfsr, genesr, geneTSr, dfr, __, __  = gtm.load_rep_file_list(args.rand_data_file)

        # get shared prefix timekeys

        # print "Timekeys: ", timekeys
        # print "Num per key: ", num_per_keys

    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)

        timekeys = df.columns.values[1:]
        print("Timekeys: ", timekeys)

        # Num. replicates per key
        num_per_keys = None

    assert (geneTS.shape == geneTSr.shape)
    assert (genes == genesr).all()

    coefs = pickle.load(open(args.coef_file, 'rb'))
    intercepts = pickle.load(open(args.intercept_file, 'rb'))
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")

    coefsr = pickle.load(open(args.coef_rand_file, 'rb'))
    # interceptsr = pickle.load(open(args.intercept_rand_file, 'rb'))
    fit_result_dfr = pd.read_csv(args.fit_result_rand_file, sep="\t")

    if args.best_hyper_file != None:
        best_hyper = pickle.load(open(args.best_hyper_file, 'rb'))
    else:
        best_hyper = None

    print("RESULTS")

    print("*************************")
    print("RESIDUALS: ")

    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs,
                     intercepts,
                     fit_result_df,
                     filename=os.path.join(args.output_folder,
                                           "fit_all_summary_normal.txt"),
                     hyper=best_hyper,
                     test_name=args.test_name,
                     lag=lag)

    # Align the coefs

    # print "Aligning coefficients"
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)

    print("Removing alphas (gene-on-self effects) ")

    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    coef_nets = []
    coefr_nets = []

    # Save the gene matrices
    for i in range(acoefs.shape[0]):
        coef_matr_filename = os.path.join(
            args.output_folder, save_prefix + "-" + str(i + 1) + "-matrix.txt")
        coefr_matr_filename = os.path.join(
            args.output_folder,
            save_prefix + "-" + str(i + 1) + "-r-matrix.txt")

        coef_net_filename = os.path.join(
            args.output_folder,
            save_prefix + "-" + str(i + 1) + "-network.txt")
        coefr_net_filename = os.path.join(
            args.output_folder,
            save_prefix + "-" + str(i + 1) + "-r-network.txt")

        coef_matr = gtm.save_gene_matrix(filename=coef_matr_filename,
                                         matrix=acoefs[i],
                                         genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=coefr_matr_filename,
                                          matrix=acoefsr[i],
                                          genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = acoefs.shape[0]
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr,
                                  extra_dict=extra_dict,
                                  make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr,
                                   extra_dict=extra_dict,
                                   make_type=False)

        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)

        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        print("Coef ", i + 1)
        print("Networks written to ")
        print(coef_net_filename)
        print(coefr_net_filename)

    # max_net_filename = save_prefix + "-max-network.txt"
    # max_r_net_filename = save_prefix + "-max-r-network.txt"
    union_net_filename = os.path.join(args.output_folder,
                                      save_prefix + "-union-network.txt")
    union_r_net_filename = os.path.join(args.output_folder,
                                        save_prefix + "-union-r-network.txt")

    if acoefs.shape[0] > 1:
        m_net = cp.get_max_network(coef_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_net = cp.get_union_network(
            coef_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = coef_nets[0]
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    if acoefsr.shape[0] > 1:
        m_net = cp.get_max_network(coefr_nets,
                                   max_col="AbsWeight",
                                   index_col="Cause-Effect")
        union_r_net = cp.get_union_network(
            coefr_nets + [m_net],
            suffixes=[str(i) for i in range(1, acoefs.shape[0] + 1)] + [""])
    else:
        union_r_net = coefr_nets[0]
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    # print "Max networks written to "
    # print max_net_filename
    # print max_r_net_filename
    print("Unioned networks written to ")
    print(union_net_filename)
    print(union_r_net_filename)

    if not os.path.exists(os.path.join(args.output_folder, "plots")):
        os.makedirs(os.path.join(args.output_folder, "plots"))

    if args.plot_coef:
        if not os.path.exists(
                os.path.join(args.output_folder, "plots", "betas")):
            os.makedirs(os.path.join(args.output_folder, "plots", "betas"))

        # Plot the betas
        for i in range(acoefs.shape[0]):

            if len(np.nonzero(acoefs[i])[0]) > 0 and len(
                    np.nonzero(acoefsr[i])[0]) > 0:

                fc.plot_betas(acoefs[i][np.nonzero(acoefs[i])].flatten(),
                              acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                              filename=os.path.join(
                                  args.output_folder, "plots", "betas",
                                  "beta_nonzero_coef-" + str(i + 1)),
                              title="Causal coefs, Coef " + str(i + 1),
                              xlabel="Causal Coefficient")
                fc.plot_betas(
                    acoefs[i][np.nonzero(acoefs[i])].flatten(),
                    acoefsr[i][np.nonzero(acoefsr[i])].flatten(),
                    filename=os.path.join(
                        args.output_folder, "plots", "betas",
                        "beta_nonzero_coef-" + str(i + 1) + "_zoom-in-90"),
                    zoom_in_top_percentile=95,
                    zoom_in_bottom_percentile=5,
                    title="Causal coefs, Coef " + str(i + 1),
                    xlabel="Causal Coefficient")

                fc.plot_betas(
                    np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                    np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                    filename=os.path.join(args.output_folder, "plots", "betas",
                                          "beta_abs_coef-" + str(i + 1)),
                    title="Absolute causal coefs, Coef " + str(i + 1),
                    xlabel="Absolute Causal Coefficient")
                fc.plot_betas(
                    np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                    np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                    filename=os.path.join(
                        args.output_folder, "plots", "betas",
                        "beta_abs_coef-" + str(i + 1) + "_zoom-in-bottom-95"),
                    zoom_in_top_percentile=95,
                    title="Absolute causal coefs, Coef " + str(i + 1),
                    xlabel="Absolute Causal Coefficient")
                fc.plot_betas(
                    np.absolute(acoefs[i][np.nonzero(acoefs[i])].flatten()),
                    np.absolute(acoefsr[i][np.nonzero(acoefsr[i])].flatten()),
                    filename=os.path.join(
                        args.output_folder, "plots", "betas",
                        "beta_abs_coef-" + str(i + 1) + "_zoom-in-top-5"),
                    zoom_in_bottom_percentile=95,
                    title="Absolute causal coefs, Coef " + str(i + 1),
                    xlabel="Absolute Causal Coefficient")

            print("Coef ", i + 1)
            print("Plots of betas written to: ",
                  os.path.join(args.output_folder, "plots", "betas"))

    # get FDRS
    fdrs = [0.01, 0.05, 0.1, 0.2]

    acoefs_fdrs = []
    sf_dfs = []

    for fdr in fdrs:

        fdr_dir = os.path.join(args.output_folder,
                               "fdr-" + str(fdr) + "-" + stratify_by)
        if not os.path.exists(fdr_dir):
            os.makedirs(fdr_dir)

        fdr_prefix = fdr_dir + os.sep + save_prefix

        # in case we want there to be an intermediate directory for fdr, like the bootstrap case.
        # if not os.path.exists(os.path.dirname(fdr_prefix)):
        #     os.makedirs(os.path.dirname(fdr_prefix))

        acoefs_fdr = np.zeros(acoefs.shape)

        fdr_nets = []

        print("*************")
        for i in range(acoefs.shape[0]):
            print("-----")
            print("FDR = ", fdr)
            print("Lag ", lag)
            print("Coef ", i + 1)
            print("Stratify ", stratify_by)
            acoefs_fdr[i], threshes = fc.get_abs_thresh(
                acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by)
            # print "Threshes", threshes

            fdr_matr_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-matrix.txt"
            fdr_net_filename = fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(
                fdr) + "-" + stratify_by + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename,
                                            matrix=acoefs_fdr[i],
                                            genes=genes)
            pickle.dump(
                threshes,
                open(
                    fdr_prefix + "-" + str(i + 1) + "-fdr-" + str(fdr) + "-" +
                    stratify_by + "-threshes.p", 'wb'))

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = acoefs.shape[0]
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr,
                                     extra_dict=extra_dict,
                                     make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # write summary readme
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i],
                                     test=args.test_name,
                                     fdr=fdr,
                                     lag=lag,
                                     coef=i + 1,
                                     hyper=best_hyper,
                                     thresh=threshes,
                                     readme_name=fdr_prefix + "-" +
                                     str(i + 1) + "-fdr-" + str(fdr) + "-" +
                                     stratify_by + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)

            sf_dfs.append(sf_df)

            print("Network edges: ", fdr_net.shape[0])

        if acoefs_fdr.shape[0] > 1:
            m_net = cp.get_max_network(fdr_nets,
                                       max_col="AbsWeight",
                                       index_col="Cause-Effect")
            union_net = cp.get_union_network(
                fdr_nets + [m_net],
                suffixes=[str(i)
                          for i in range(1, acoefs_fdr.shape[0] + 1)] + [""])

        else:
            union_net = fdr_nets[0]

        union_net_filename = fdr_prefix + "-union-fdr-" + str(
            fdr) + "-" + stratify_by + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)

        print("Union network edges", union_net.shape[0])
        print("Written to ", union_net_filename)

        fdr_agg_matr_filename = fdr_prefix + "-union-fdr-" + str(
            fdr) + "-" + stratify_by + "-coefs.p"
        pickle.dump(acoefs_fdr, open(fdr_agg_matr_filename, 'wb'))

        print("Thresholded matrix written as pickle file: ",
              fdr_agg_matr_filename)

        acoefs_fdrs.append(acoefs_fdr.copy())

    all_sf_dfs = pd.concat(sf_dfs)

    # Hack so the base run still writes fit_all_summary_fdr-<stratify_by>.txt,
    # while a bootstrap run writes to its own file in its own folder.
    # Simplified: just write into the output folder that was passed in.

    save_file = os.path.join(args.output_folder,
                             "fit_all_summary_fdr-" + stratify_by + ".txt")
    all_sf_dfs.to_csv(save_file, sep="\t", index=False)
    print("********")
    print("Summaries of all fdrs written to ", save_file)
    print("Matrices done.")

    with open(os.path.join(args.output_folder, "matrices_done.txt"),
              'w') as donefile:
        donefile.write("done\n")
Exemple #4
0
def load_and_run(args):
    """Pick the best hyperparameter from per-hyperparameter fit results.

    Loads the pickled hyperparameter list and a tab-separated table whose
    columns each hold (in their first row) the path of one fit-result file.
    Hyperparameters whose result file is missing or empty are dropped, the
    remaining results are summarized through ``cp``, the best hyperparameter
    is written out, and correlation matrices and box plots comparing the
    hyperparameters are saved.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``test_name`` (str, may be empty; used as a title prefix),
            ``hyper_file`` (pickle of the hyperparameter sequence),
            ``int_name_dfname`` (TSV of fit-result file paths),
            ``sort_by`` (forwarded to ``cp.get_best_hyper``),
            ``output_name`` (pickle path for the best hyperparameter),
            ``result_dfname`` (TSV path for the hyperparameter summary).

    Side effects:
        Writes ``args.output_name`` and ``args.result_dfname``, creates the
        ``hyper`` and ``plots/hyper`` directories under the current working
        directory, and writes correlation matrices and plots into them.
    """
    name = (args.test_name.capitalize() + " ") if args.test_name != "" else ""

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point this at trusted input.
    with open(args.hyper_file, 'rb') as hyper_handle:
        hyperlist = pickle.load(hyper_handle)

    int_name_df = pd.read_csv(args.int_name_dfname, sep="\t")

    print("Loading integrated")
    print(int_name_df.head())

    # One entry per column of the table; None marks a result file that does
    # not exist on disk.
    hyper_fit_dfs = []
    for column in int_name_df:
        result_path = int_name_df[column].values[0]
        if os.path.exists(result_path):
            hyper_fit_dfs.append(pd.read_csv(result_path, sep="\t"))
        else:
            hyper_fit_dfs.append(None)

    # Positions with no usable information (missing file or empty table).
    # The same positions are removed from hyperlist, which presumably is
    # ordered like the table's columns -- TODO confirm against the caller.
    dropped = {
        i for i, fit_df in enumerate(hyper_fit_dfs)
        if fit_df is None or fit_df.empty
    }
    hyper_fit_dfs = [
        fit_df for i, fit_df in enumerate(hyper_fit_dfs) if i not in dropped
    ]
    hyperlist = [h for i, h in enumerate(hyperlist) if i not in dropped]

    # Get the best hyper
    hyper_df = cp.summarize_hyper_fit_dfs(hyper_fit_dfs, hyperlist)
    best_hyper, best, hyper_df = cp.get_best_hyper(hyper_df,
                                                   sort_by=args.sort_by)

    # Write the hypers out; the context manager guarantees the pickle is
    # flushed and closed before we report success.
    with open(args.output_name, 'wb') as out_handle:
        pickle.dump(best_hyper, out_handle)
    hyper_df.to_csv(args.result_dfname, sep="\t", index=False)

    print("Test is ", name)
    print("Best hyper is ", best_hyper)
    print("Best hyper result is ", best)

    print("Best hyper written to ", args.output_name)
    print("Hyper result written to ", args.result_dfname)

    hyper_dir = "hyper"
    if not os.path.exists(hyper_dir):
        os.makedirs(hyper_dir)

    # Correlation of per-row MSE between hyperparameters
    # (one row of the stacked array per hyperparameter).
    mse_vec = np.array(
        [np.array(fit_df["mse"].values) for fit_df in hyper_fit_dfs])
    print(mse_vec.shape)

    mse_corr_file = os.path.join(hyper_dir, "mse_corr.txt")
    mse_corr = np.corrcoef(mse_vec)
    gtm.save_gene_matrix(mse_corr_file, mse_corr, hyperlist)
    print("MSE Correlation:")
    print(mse_corr)
    print("MSE corr. matrix saved to ", mse_corr_file)

    # Same for r^2.
    r2_vec = np.array([fit_df["r2"].values for fit_df in hyper_fit_dfs])
    r2_corr_file = os.path.join(hyper_dir, "r2_corr.txt")
    r2_corr = np.corrcoef(r2_vec)
    gtm.save_gene_matrix(r2_corr_file, r2_corr, hyperlist)
    print("R2 Correlation")
    print(r2_corr)
    print("R^2 corr. matrix saved to ", r2_corr_file)

    # Plot the hyperparameters. makedirs creates the parent "plots"
    # directory as well, so a single check is enough.
    plot_dir = os.path.join("plots", "hyper")
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    labels = cp.hyperlist_to_labellist(hyperlist)
    best_label = cp.hyper_to_label(best_hyper)

    cp.plot_corr_matrix(mse_corr,
                        labels,
                        title="MSE correlation among " + name + "hyperparams",
                        filename=os.path.join(plot_dir, "mse_corr"))
    cp.plot_corr_matrix(r2_corr,
                        labels,
                        title="$r^2$ correlation among " + name +
                        "hyperparams",
                        filename=os.path.join(plot_dir, "r2_corr"))

    # (fit-result column, column of `best`, y-axis label, short name used in
    #  the title and legend, output file stem, formatter for the best value)
    boxplot_specs = [
        ("r2", "r2_avg", "$r^2$", "$r^2$", "hyperVSr2",
         lambda value: str(np.round(value, 1))),
        ("mse", "mse_avg", "Mean-Squared Error", "MSE", "hyperVSmse",
         lambda value: str(np.round(value, 1))),
        ("avg_df", "df_avg", "Degrees of Freedom", "df", "hyperVSdof",
         lambda value: str(int(np.round(value)))),
    ]
    for fit_col, best_col, ylabel, short, stem, fmt in boxplot_specs:
        best_value = best[best_col].values[0]
        cp.plot_hyper_boxplot(
            labels,
            hyper_fit_dfs,
            fit_col,
            xlabel=name + "Hyperparameter",
            ylabel=ylabel,
            title=name + "Hyperparameter VS " + short,
            filename=os.path.join(plot_dir, stem),
            hyper_color_labels=[
                (best_label, "k",
                 "Best: " + best_label + ", " + short + " = " +
                 fmt(best_value))
            ],
            horizontal_line_color_labels=[(best_value, 'k', None)])

    print("Correlation between hyperparameter results", plot_dir)
    print("Hyper box plots of r^2, mse, avg d.o.f. written to  ", plot_dir)
Exemple #5
0
def run(args):
    """Threshold a coefficient matrix against its randomized twin at a target FDR.

    Loads the original and randomized data and pickled coefficient arrays,
    selects one coefficient slice (``[:, :, args.coef_num - 1]``) from each,
    saves both slices as gene matrices, optionally plots and caps them, and
    then runs every thresholding function in turn, writing the thresholded
    matrix, the thresholds, and optionally a network for each.

    All messages are emitted through single-argument ``print(...)`` calls so
    the output is the same under Python 2 and Python 3.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``original_data``, ``randomized_data`` (data files),
            ``original_matrix``, ``randomized_matrix`` (pickled arrays),
            ``coef_num`` (1-based coefficient index),
            ``stratify_by`` ("e" for effect, "n" for none),
            ``name`` (prefix of every output file),
            ``plot_prefix`` (or None to skip plots),
            ``cap_by`` (or None to skip capping),
            ``fdr`` (target false discovery rate),
            ``make_network`` (truthy to also write network files).

    Raises:
        ValueError: If ``args.stratify_by`` is not "e" or "n", or if the gene
            columns of the original and randomized data differ.
    """
    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    # 'rb' (not 'rB'): Python 3 rejects the upper-case flag. The handles are
    # closed as soon as the slice has been taken.
    # NOTE(review): pickle.load executes arbitrary code; trusted files only.
    coef_index = args.coef_num - 1
    with open(args.original_matrix, 'rb') as matrix_handle:
        matr = pickle.load(matrix_handle)[:, :, coef_index]
    with open(args.randomized_matrix, 'rb') as matrix_handle:
        rand_matr = pickle.load(matrix_handle)[:, :, coef_index]

    stratify_names = {"e": "effect", "n": "none"}
    if args.stratify_by not in stratify_names:
        raise ValueError(
            "Stratify_by must be either 'e' for effect or 'n' for none")
    stratify_by = stratify_names[args.stratify_by]

    print("")
    print("Beginning FDR control, stratifying the matrix by  " + stratify_by)

    genes = data["gene"]
    rand_genes = rand_data["gene"]

    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    unshuffled_file = args.name + "-unshuffled-matrix.txt"
    print("Original matrix for  " + args.name + " saved to " +
          unshuffled_file)
    gtm.save_gene_matrix(matrix=matr, filename=unshuffled_file, genes=genes)

    shuffled_file = args.name + "-shuffled-matrix.txt"
    print("Randomized matrix for  " + args.name + " saved to " +
          shuffled_file)
    gtm.save_gene_matrix(matrix=rand_matr,
                         filename=shuffled_file,
                         genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(),
                   rand_matr.flatten(),
                   filename=args.plot_prefix)
        plot_betas(matr.flatten(),
                   rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95",
                   zoom_in_percentile=95)

    if args.cap_by is not None:
        print("First capping original and randomized matrix")
        matr = cap_matr(matr, args.cap_by, name="Original")
        rand_matr = cap_matr(rand_matr, args.cap_by, name="Randomized")

    print("Using original")
    print("Trying to have an FDR of  " + str(args.fdr))
    print(args.name)

    functions = [get_abs_thresh, get_pos_neg_thresh]
    types = ["abs-thresh", "pos-neg-thresh"]
    # whether to also write an absolute-valued copy of each result
    absoluted = [True, True]

    network_name = args.name + "-sb-" + args.stratify_by

    for thresh_fn, kind, take_abs in zip(functions, types, absoluted):

        print("")
        print("*******************")
        print(kind)
        print("*******************")

        print("making matrix")

        out_prefix = (args.name + "-unshuffled-" + kind + "-FDR-" +
                      str(args.fdr) + "-stratby-" + stratify_by)
        matrix_file = out_prefix + "-matrix.txt"
        threshes_file = out_prefix + "-threshes.p"
        network_file = out_prefix + "-network.txt"

        thresh_matr, threshes = thresh_fn(matr,
                                          rand_matr,
                                          args.fdr,
                                          stratify_by=stratify_by)

        matr_df = gtm.save_gene_matrix(matrix_file, thresh_matr, genes)
        # Pickles are binary data: write through a 'wb' handle and close it.
        with open(threshes_file, 'wb') as threshes_handle:
            pickle.dump(threshes, threshes_handle)

        print("Matrix written to  " + matrix_file)
        print("Threshes written to  " + threshes_file)

        # README generation (write_readme) is disabled in this variant.

        if args.make_network:
            print("making network")
            net_df = nh.matr_to_net(matr_df, network_name, make_pair=False)
            net_df.to_csv(network_file, sep="\t", index=False)

            print("Network written to  " + network_file)

        # Use this iteration's flag; testing the `absoluted` list itself
        # would always be true for a non-empty list.
        if take_abs:
            print("Making absoluted matrix ")
            abs_matr = np.absolute(thresh_matr)

            abs_prefix = (args.name + "-unshuffled-" + kind +
                          "-absoluted-FDR-" + str(args.fdr) + "-stratby-" +
                          stratify_by)
            abs_network_file = abs_prefix + "-network.txt"

            # NOTE(review): this file name has no ".txt" suffix, unlike the
            # non-absoluted matrix above; kept as-is for existing consumers.
            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr,
                                          genes)

            if args.make_network:
                print("Making absoluted network")
                abs_net_df = nh.matr_to_net(abs_df,
                                            network_name,
                                            make_pair=False)
                abs_net_df.to_csv(abs_network_file, sep="\t", index=False)

                print("Network written to  " + abs_network_file)

    print("FINISHED")
    print("#################################################")
    print("")
Exemple #6
0
def run(args):
    """Threshold a coefficient matrix against its randomized twin at a target FDR.

    Loads the original and randomized data and pickled coefficient arrays,
    selects one coefficient slice (``[:, :, args.coef_num - 1]``) from each,
    saves both slices as gene matrices, optionally plots them, and then runs
    four thresholding functions (absolute, positive, negative,
    positive-and-negative). For each one the thresholded matrix, the
    thresholds, a README, and optionally a network are written; for the
    entries flagged in ``absoluted`` an absolute-valued copy is written too.

    All messages are emitted through single-argument ``print(...)`` calls so
    the output is the same under Python 2 and Python 3.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``original_data``, ``randomized_data`` (data files),
            ``original_matrix``, ``randomized_matrix`` (pickled arrays),
            ``coef_num`` (1-based coefficient index),
            ``stratify_by`` ("e" for effect, "n" for none),
            ``name`` (prefix of every output file),
            ``plot_prefix`` (or None to skip plots),
            ``fdr`` (target false discovery rate),
            ``make_network`` (truthy to also write network files).

    Raises:
        ValueError: If ``args.stratify_by`` is not "e" or "n", or if the gene
            columns of the original and randomized data differ.
    """
    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    # 'rb' (not 'rB'): Python 3 rejects the upper-case flag. The handles are
    # closed as soon as the slice has been taken.
    # NOTE(review): pickle.load executes arbitrary code; trusted files only.
    coef_index = args.coef_num - 1
    with open(args.original_matrix, 'rb') as matrix_handle:
        matr = pickle.load(matrix_handle)[:, :, coef_index]
    with open(args.randomized_matrix, 'rb') as matrix_handle:
        rand_matr = pickle.load(matrix_handle)[:, :, coef_index]

    stratify_names = {"e": "effect", "n": "none"}
    if args.stratify_by not in stratify_names:
        raise ValueError("Stratify_by must be either 'e' for effect or 'n' for none")
    stratify_by = stratify_names[args.stratify_by]

    genes = data["gene"]
    rand_genes = rand_data["gene"]

    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    unshuffled_file = args.name + "-unshuffled-matrix.txt"
    print("Original matrix for  " + args.name + " saved to " + unshuffled_file)
    gtm.save_gene_matrix(matrix=matr, filename=unshuffled_file, genes=genes)

    shuffled_file = args.name + "-shuffled-matrix.txt"
    print("Randomized matrix for  " + args.name + " saved to " + shuffled_file)
    gtm.save_gene_matrix(matrix=rand_matr, filename=shuffled_file, genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(), rand_matr.flatten(), filename=args.plot_prefix)
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95", zoom_in_percentile=95)

    print("Using original")
    print("Trying to have an FDR of  " + str(args.fdr))
    print(args.name)

    functions = [get_abs_thresh, get_pos_thresh, get_neg_thresh, get_pos_neg_thresh]
    types = ["abs-thresh", "pos-thresh", "neg-thresh", "pos-neg-thresh"]
    # whether to also write an absolute-valued copy of each result
    absoluted = [True, False, False, True]

    for thresh_fn, kind, take_abs in zip(functions, types, absoluted):
        out_prefix = (args.name + "-unshuffled-" + kind + "-FDR-" + str(args.fdr) +
                      "-stratby-" + stratify_by)
        matrix_file = out_prefix + "-matrix.txt"
        threshes_file = out_prefix + "-threshes.p"
        network_file = out_prefix + "-network.txt"

        thresh_matr, threshes = thresh_fn(matr, rand_matr, args.fdr, stratify_by=stratify_by)

        matr_df = gtm.save_gene_matrix(matrix_file, thresh_matr, genes)
        # Pickles are binary data: write through a 'wb' handle and close it.
        with open(threshes_file, 'wb') as threshes_handle:
            pickle.dump(threshes, threshes_handle)

        print("Matrix written to  " + matrix_file)
        print("Threshes written to  " + threshes_file)

        write_readme(thresh_matr, out_prefix, args.fdr, out_prefix + '-README.txt',
                     out_prefix + "-matrix")

        if args.make_network:
            net_df = nh.matr_to_net(matr_df, args.name, make_pair=False)
            net_df.to_csv(network_file, sep="\t", index=False)

            print("Network written to  " + network_file)

        # Use this iteration's flag. Testing the `absoluted` list itself is
        # always true, which wrote absoluted outputs even for the
        # "pos-thresh" and "neg-thresh" entries flagged False.
        if take_abs:
            abs_matr = np.absolute(thresh_matr)

            abs_prefix = (args.name + "-unshuffled-" + kind + "-absoluted-FDR-" +
                          str(args.fdr) + "-stratby-" + stratify_by)
            abs_network_file = abs_prefix + "-network.txt"

            # NOTE(review): this file name has no ".txt" suffix, unlike the
            # non-absoluted matrix above; kept as-is for existing consumers.
            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr, genes)

            write_readme(abs_matr, abs_prefix, args.fdr, abs_prefix + '-README.txt',
                         abs_prefix + "-matrix")

            if args.make_network:
                abs_net_df = nh.matr_to_net(abs_df, args.name, make_pair=False)
                abs_net_df.to_csv(abs_network_file, sep="\t", index=False)

                print("Network written to  " + abs_network_file)