def load_and_run(args):
    """Turn fitted and randomized coefficients into matrices, networks, plots and FDR cuts.

    Reads the expression data, the pickled coefficient/intercept arrays and
    the fit-result tables named on ``args`` and then, in order:

    1. summarizes the real fit with ``cp.summarize_fit``;
    2. aligns the coefficients (``lc.align_coefs``), strips the alphas
       (``lc.remove_alphas``), and writes one matrix and one network file per
       coefficient for both the real and the randomized fit, plus a union
       network for each;
    3. plots the non-zero coefficient distributions under ``plots/betas``;
    4. thresholds the real coefficients against the randomized ones with
       ``fc.get_abs_thresh`` at FDR 0.01, 0.05, 0.1 and 0.2, writing matrices,
       thresholds, networks, READMEs and one combined summary table;
    5. writes ``matrices_done.txt`` and, when ``args.plot_coef_fdr`` is set,
       plots the thresholded coefficients.

    Every output path is relative to the current working directory.

    Args:
        args: Parsed command-line namespace. Attributes read: ``lag``,
            ``save_prefix``, ``stratify_by`` ("e" or "n"), ``load_reps``,
            ``data_file``, ``rand_data_file``, ``coef_file``,
            ``intercept_file``, ``fit_result_file``, ``coef_rand_file``,
            ``fit_result_rand_file``, ``best_hyper_file`` (may be None),
            ``test_name``, ``plot_coef_fdr`` and ``plot_all``.

    Raises:
        AssertionError: If ``args.stratify_by`` is not "e" or "n", or if the
            real and randomized data differ in shape or gene order.
        ValueError: If ``args.plot_all`` is set (that path is disabled).
    """

    def _echo(*items):
        # Same output as the legacy ``print a, b`` statement (items joined by
        # a single space), but valid on both Python 2 and Python 3.
        print(" ".join(str(item) for item in items))

    def _load_pickle(path):
        # NOTE(review): unpickling can execute arbitrary code. These files are
        # presumably produced by earlier pipeline steps -- never point this at
        # untrusted input.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _ensure_dir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    def _union_network(nets):
        # Returns (max_net, union_net). With a single coefficient there is
        # nothing to combine, so the union is that network and max_net is None.
        if len(nets) > 1:
            max_net = cp.get_max_network(nets, max_col="AbsWeight",
                                         index_col="Cause-Effect")
            suffixes = [str(k) for k in range(1, len(nets) + 1)] + [""]
            return max_net, cp.get_union_network(nets + [max_net],
                                                 suffixes=suffixes)
        return None, nets[0]

    lag = args.lag
    save_prefix = args.save_prefix
    assert args.stratify_by in {"e", "n"}
    stratify_by = cp.args2stratify_by[args.stratify_by]

    # Load the real and the randomized expression data.
    if args.load_reps:
        _, genes, geneTS, df, timekeys, num_per_keys = gtm.load_rep_file_list(
            args.data_file)
        _, genesr, geneTSr, dfr, _, _ = gtm.load_rep_file_list(
            args.rand_data_file)
        _echo("Timekeys: ", timekeys)
        _echo("Num per key: ", num_per_keys)
    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)
        timekeys = df.columns.values[1:]
        _echo("Timekeys: ", timekeys)
        # Num. replicates per key: not available without replicate files.
        num_per_keys = None

    # The real and the randomized inputs must describe the same genes.
    assert geneTS.shape == geneTSr.shape
    assert (genes == genesr).all()

    coefs = _load_pickle(args.coef_file)
    intercepts = _load_pickle(args.intercept_file)
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")
    coefsr = _load_pickle(args.coef_rand_file)
    # The randomized fit table is not used below (its summary is disabled),
    # but it is still parsed so a missing or malformed file is reported here.
    pd.read_csv(args.fit_result_rand_file, sep="\t")
    if args.best_hyper_file is not None:
        best_hyper = _load_pickle(args.best_hyper_file)
    else:
        best_hyper = None

    print("RESULTS")
    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs, intercepts, fit_result_df,
                     filename="fit_all_summary_normal.txt",
                     hyper=best_hyper, test_name=args.test_name, lag=lag)
    # TODO(Jlu, 1/25/17): also summarize the randomized fit; open question is
    # whether the fit summary can be produced without the random intercepts.

    # Align the coefs, then drop the gene-on-self terms.
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)
    print("Removing alphas (gene-on-self effects) ")
    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    n_coefs = acoefs.shape[0]

    # Save the gene matrices and the per-coefficient networks.
    coef_nets = []
    coefr_nets = []
    for i in range(n_coefs):
        stem = save_prefix + "-" + str(i + 1)
        coef_net_filename = stem + "-network.txt"
        coefr_net_filename = stem + "-r-network.txt"

        coef_matr = gtm.save_gene_matrix(filename=stem + "-matrix.txt",
                                         matrix=acoefs[i], genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=stem + "-r-matrix.txt",
                                          matrix=acoefsr[i], genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = n_coefs
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr, extra_dict=extra_dict,
                                  make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr, extra_dict=extra_dict,
                                   make_type=False)
        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)
        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        _echo("Coef ", i + 1)
        print("Networks written to ")
        print(coef_net_filename)
        print(coefr_net_filename)

    # Union networks across coefficients (real, then randomized).
    union_net_filename = save_prefix + "-union-network.txt"
    union_r_net_filename = save_prefix + "-union-r-network.txt"

    m_net, union_net = _union_network(coef_nets)
    if m_net is not None:
        _echo("Max network edges: ", m_net.shape)
        _echo("Union network edges: ", union_net.shape)
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    _, union_r_net = _union_network(coefr_nets)
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    print("Unioned networks written to ")
    print(union_net_filename)
    print(union_r_net_filename)

    # Plot the betas.
    betas_dir = "plots" + os.sep + "betas"
    _ensure_dir("plots")
    _ensure_dir(betas_dir)
    for i in range(n_coefs):
        real_nz = acoefs[i][np.nonzero(acoefs[i])].flatten()
        rand_nz = acoefsr[i][np.nonzero(acoefsr[i])].flatten()
        # Nothing to compare unless both fits have non-zero coefficients.
        if len(real_nz) > 0 and len(rand_nz) > 0:
            label = str(i + 1)
            nonzero_stem = betas_dir + os.sep + "beta_nonzero_coef-" + label
            abs_stem = betas_dir + os.sep + "beta_abs_coef-" + label

            fc.plot_betas(real_nz, rand_nz,
                          filename=nonzero_stem,
                          title="Causal coefs, Coef " + label,
                          xlabel="Causal Coefficient")
            fc.plot_betas(real_nz, rand_nz,
                          filename=nonzero_stem + "_zoom-in-90",
                          zoom_in_top_percentile=95,
                          zoom_in_bottom_percentile=5,
                          title="Causal coefs, Coef " + label,
                          xlabel="Causal Coefficient")
            fc.plot_betas(np.absolute(real_nz), np.absolute(rand_nz),
                          filename=abs_stem,
                          title="Absolute causal coefs, Coef " + label,
                          xlabel="Absolute Causal Coefficient")
            fc.plot_betas(np.absolute(real_nz), np.absolute(rand_nz),
                          filename=abs_stem + "_zoom-in-bottom-95",
                          zoom_in_top_percentile=95,
                          title="Absolute causal coefs, Coef " + label,
                          xlabel="Absolute Causal Coefficient")
            fc.plot_betas(np.absolute(real_nz), np.absolute(rand_nz),
                          filename=abs_stem + "_zoom-in-top-5",
                          zoom_in_bottom_percentile=95,
                          title="Absolute causal coefs, Coef " + label,
                          xlabel="Absolute Causal Coefficient")
            _echo("Coef ", i + 1)
    print("Plots of betas written to: " + betas_dir)

    # Get the FDR-thresholded coefficients.
    fdrs = [0.01, 0.05, 0.1, 0.2]
    acoefs_fdrs = []
    sf_dfs = []
    for fdr in fdrs:
        fdr_dir = "fdr-" + str(fdr) + "-" + stratify_by
        _ensure_dir(fdr_dir)
        fdr_prefix = fdr_dir + os.sep + save_prefix
        fdr_tag = "-fdr-" + str(fdr) + "-" + stratify_by

        acoefs_fdr = np.zeros(acoefs.shape)
        fdr_nets = []

        print("*************")
        for i in range(n_coefs):
            print("-----")
            _echo("FDR = ", fdr)
            _echo("Lag ", lag)
            _echo("Coef ", i + 1)
            _echo("Stratify ", stratify_by)

            acoefs_fdr[i], threshes = fc.get_abs_thresh(
                acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by)

            coef_stem = fdr_prefix + "-" + str(i + 1) + fdr_tag
            fdr_matr_filename = coef_stem + "-matrix.txt"
            fdr_net_filename = coef_stem + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename,
                                            matrix=acoefs_fdr[i],
                                            genes=genes)
            with open(coef_stem + "-threshes.p", 'wb') as handle:
                pickle.dump(threshes, handle)

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = n_coefs
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr, extra_dict=extra_dict,
                                     make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # Write the summary readme for this coefficient.
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i],
                                     test=args.test_name,
                                     fdr=fdr,
                                     lag=lag,
                                     coef=i + 1,
                                     hyper=best_hyper,
                                     thresh=threshes,
                                     readme_name=coef_stem + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)
            sf_dfs.append(sf_df)

            _echo("Network edges: ", fdr_net.shape[0])

        _, union_net = _union_network(fdr_nets)
        union_net_filename = fdr_prefix + "-union" + fdr_tag + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)
        _echo("Union network edges", union_net.shape[0])
        _echo("Written to ", union_net_filename)

        acoefs_fdrs.append(acoefs_fdr.copy())

    summary_filename = "fit_all_summary_fdr-" + stratify_by + ".txt"
    all_sf_dfs = pd.concat(sf_dfs)
    all_sf_dfs.to_csv(summary_filename, sep="\t", index=False)
    print("********")
    print("Summaries of all fdrs written to " + summary_filename)

    print("Matrices done.")
    # Marker file written once all matrices have been produced.
    with open("matrices_done.txt", 'w') as donefile:
        donefile.write("done\n")

    if args.plot_coef_fdr:
        print("*******")
        print("PLOTS")
        for fdr, acoefs_fdr in zip(fdrs, acoefs_fdrs):
            fdr_plot_dir = "plots" + os.sep + "fdr-" + str(fdr)
            _ensure_dir(fdr_plot_dir)
            plot_prefix = fdr_plot_dir + os.sep + save_prefix

            # Only plot the bar if replicates were loaded.
            cp.plot_all_coef(acoefs_fdr, df, genes, lag,
                             file_prefix=plot_prefix + "-",
                             plot_bar=args.load_reps,
                             keys=timekeys,
                             num_per_keys=num_per_keys,
                             linewidth=2, capsize=5, capwidth=2,
                             verbose=True)
            # Plot them without error bars just to check.
            if args.load_reps:
                cp.plot_all_coef(acoefs_fdr, df, genes, lag,
                                 file_prefix=plot_prefix + "-nobar-",
                                 plot_bar=False,
                                 keys=timekeys,
                                 num_per_keys=num_per_keys,
                                 linewidth=2, capsize=5, capwidth=2)
            _echo("FDR plots written to: ", fdr_plot_dir)

    # Plot all the coefs.
    # NOTE: this will take a long time!
    if args.plot_all:
        raise ValueError(
            "Fix all the below first before trying to do plot all")
        # Intentionally unreachable: kept as the reference implementation
        # until the plot-all path is repaired.
        _ensure_dir("plots" + os.sep + "original")
        cp.plot_all_coef(acoefs, df, genes, lag,
                         file_prefix="plots" + os.sep + "original" + os.sep +
                         save_prefix + "-",
                         plot_bar=args.load_reps,
                         keys=timekeys,
                         num_per_keys=num_per_keys,
                         linewidth=2, capsize=5, capwidth=2)
        _echo("Original plots written to: ", "plots" + os.sep + "original")

        _ensure_dir("plots" + os.sep + "randomized")
        cp.plot_all_coef(acoefsr, dfr, genes, lag,
                         file_prefix="plots" + os.sep + "randomized" +
                         os.sep + save_prefix + "-",
                         plot_bar=args.load_reps,
                         keys=timekeys,
                         num_per_keys=num_per_keys,
                         linewidth=2, capsize=5, capwidth=2)
        _echo("Randomized plots written to: ",
              "plots" + os.sep + "randomized")
def load_and_run(args):
    """Aggregate bootstrap coefficient files into per-lag matrices and networks.

    The bootstrap statistics come either from
    ``cp.bootstrap_matrices_iter_free`` (``args.do_lite``) or from
    ``transpose_bootstrap_matrices`` followed by
    ``compute_bootstrap_stats_matr``. For every lag and every statistic a gene
    matrix is saved and converted to a network; the per-statistic networks of
    a lag are outer-merged, sorted by ``"Bootstrap:Freq"`` and written out,
    and finally a union network over all lags is written.

    Args:
        args: Parsed command-line namespace. Attributes read: ``lag``,
            ``save_prefix``, ``result_save_folder``, ``load_reps``,
            ``data_file``, ``bootstrap_file_with_names``, ``do_lite``,
            ``transpose_bootstrap_folder``, ``outer_save_folder``,
            ``length_before_dump``, ``dump_raw``,
            ``unalign_before_raw_dump``, ``do_align`` and ``test_name``.

    Raises:
        ValueError: If the full (non-lite) computation is requested without
            ``args.transpose_bootstrap_folder``.
    """
    lag = args.lag
    save_prefix = args.save_prefix

    # Fail fast on an unusable configuration, before any data is read.
    if not args.do_lite and args.transpose_bootstrap_folder is None:
        raise ValueError(
            "If doing bootstrap calculation, transpose is required")

    full_save_prefix = os.path.join(args.result_save_folder, save_prefix)

    # Only the gene names are needed from the data file.
    if args.load_reps:
        genes, _ = gtm.load_basic_rep_file_list(args.data_file)
    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, _ = gtm.get_gene_TS(df)

    # One bootstrap result file name per line.
    with open(args.bootstrap_file_with_names, 'r') as names_file:
        filenames = [line.rstrip("\n") for line in names_file]

    if args.do_lite:
        stats_matr_dict = cp.bootstrap_matrices_iter_free(filenames)
    else:
        transpose_bootstrap_folder = os.path.join(
            args.outer_save_folder, args.transpose_bootstrap_folder)
        dump_dir = os.path.join(transpose_bootstrap_folder,
                                "dump-" + str(args.length_before_dump))
        # makedirs also creates the enclosing transpose folder if needed.
        if not os.path.exists(dump_dir):
            os.makedirs(dump_dir)

        transpose_prefix = os.path.join(transpose_bootstrap_folder,
                                        save_prefix)
        dump_prefix = os.path.join(dump_dir, save_prefix)

        start = time.time()
        bootstrap_coef_file_matr = transpose_bootstrap_matrices(
            filenames,
            length_before_dump=args.length_before_dump,
            save_prefix=transpose_prefix,
            dump_prefix=dump_prefix)
        print("Time to transpose: ", time.time() - start)

        bootstrap_coef_filename = dump_prefix + "-NAMES.p"
        with open(bootstrap_coef_filename, 'wb') as handle:
            pickle.dump(bootstrap_coef_file_matr, handle)
        print("Bootstrap coef matrix dumped to ", bootstrap_coef_filename)

        start = time.time()
        stats_matr_dict = compute_bootstrap_stats_matr(
            bootstrap_coef_file_matr)
        print("Time to get stats: ", time.time() - start)

    # Optionally dump the raw statistics before alignment.
    if args.dump_raw:
        for k, raw_stat in stats_matr_dict.items():
            if args.unalign_before_raw_dump:
                raw_stat = lc.unalign_coefs(raw_stat, lag)
            outfile = full_save_prefix + "_raw_" + k + "_coefs.p"
            with open(outfile, 'wb') as handle:
                pickle.dump(raw_stat, handle)
            print("For ", k, "Saved to ", outfile)

    if args.do_align:
        for k in stats_matr_dict:
            stats_matr_dict[k] = lc.align_coefs(stats_matr_dict[k], lag)

    sortby = "Bootstrap:Freq"

    # Save the gene matrices. Each statistic is indexed by lag first
    # (original note: "bootstrap_matr is of form lag x n x n").
    full_nets = []
    for i in range(1, lag + 1):
        print("Lag: ", i)
        print("Aggregating results")

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = lag
        extra_dict["Coef"] = i

        nets = []
        for k in stats_matr_dict:
            matr_filename = (full_save_prefix + "-" + str(i) + "-bootstrap-" +
                             k + "-matrix.txt")
            matr = gtm.save_gene_matrix(matr_filename,
                                        matrix=stats_matr_dict[k][i - 1],
                                        genes=genes)
            print("Saved ", k, " to ", matr_filename)

            edge_name = "Bootstrap:" + k.capitalize()
            if k == "mean":
                # Only the mean also gets an absolute-value column.
                net = nh.matr_to_net(matr, make_type=False,
                                     edge_name=edge_name,
                                     abs_name="AbsBootstrap:" + k.capitalize(),
                                     do_sort=False, extra_dict=extra_dict)
            else:
                net = nh.matr_to_net(matr, make_type=False,
                                     edge_name=edge_name,
                                     no_abs=True,
                                     do_sort=False, extra_dict=extra_dict)
            nets.append(net)

        # Combine the per-statistic networks of this lag into one table.
        full_net = nets[0]
        for other_net in nets[1:]:
            full_net = full_net.merge(other_net, how='outer')
        print("Final net: ", full_net.shape[0])

        print("Sorting by :", sortby)
        full_net.sort_values(sortby, inplace=True, ascending=False)

        full_net_filename = (full_save_prefix + "-" + str(i) +
                             "-bootstrap-network.txt")
        full_net.to_csv(full_net_filename, sep="\t", index=False)
        print("Written to ", full_net_filename)

        full_nets.append(full_net)

    union_net_filename = full_save_prefix + "-union-bootstrap-network.txt"
    if lag > 1:
        m_net = cp.get_max_network(full_nets, max_col="AbsBootstrap:Mean",
                                   index_col="Cause-Effect")
        union_net = cp.get_union_network(
            full_nets + [m_net],
            suffixes=[str(i) for i in range(1, lag + 1)] + [""])
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    else:
        union_net = full_nets[0]

    print("Sorting by :", sortby)
    union_net.sort_values(sortby, inplace=True, ascending=False)
    union_net.to_csv(union_net_filename, sep="\t", index=False)
    print("Unioned bootstrap network written to ", union_net_filename)
def load_and_run(args):
    """Write coefficient matrices, networks, beta plots and FDR-thresholded results.

    Reads the expression data, the pickled coefficient/intercept arrays and
    the fit-result tables named on ``args`` and then, in order:

    1. summarizes the real fit with ``cp.summarize_fit``;
    2. aligns the coefficients (``lc.align_coefs``), strips the alphas
       (``lc.remove_alphas``), and writes one matrix and one network file per
       coefficient for both the real and the randomized fit, plus a union
       network for each;
    3. when ``args.plot_coef`` is set, plots the non-zero coefficient
       distributions under ``<output_folder>/plots/betas``;
    4. thresholds the real coefficients against the randomized ones with
       ``fc.get_abs_thresh`` at FDR 0.01, 0.05, 0.1 and 0.2, writing matrices,
       thresholds, networks, READMEs, a pickle of the thresholded array and
       one combined summary table;
    5. writes ``matrices_done.txt``.

    Everything is written below ``args.output_folder``. As a side effect,
    ``args.output_folder`` is set to ``"."`` when it was None.

    Args:
        args: Parsed command-line namespace. Attributes read: ``lag``,
            ``save_prefix``, ``stratify_by`` ("e" or "n"), ``output_folder``,
            ``load_reps``, ``data_file``, ``rand_data_file``, ``coef_file``,
            ``intercept_file``, ``fit_result_file``, ``coef_rand_file``,
            ``fit_result_rand_file``, ``best_hyper_file`` (may be None),
            ``test_name`` and ``plot_coef``.

    Raises:
        AssertionError: If ``args.stratify_by`` is not "e" or "n", or if the
            real and randomized data differ in shape or gene order.
    """

    def _load_pickle(path):
        # NOTE(review): unpickling can execute arbitrary code. These files are
        # presumably produced by earlier pipeline steps -- never point this at
        # untrusted input.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _ensure_dir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    def _union_network(nets):
        # Returns (max_net, union_net). With a single coefficient there is
        # nothing to combine, so the union is that network and max_net is None.
        if len(nets) > 1:
            max_net = cp.get_max_network(nets, max_col="AbsWeight",
                                         index_col="Cause-Effect")
            suffixes = [str(k) for k in range(1, len(nets) + 1)] + [""]
            return max_net, cp.get_union_network(nets + [max_net],
                                                 suffixes=suffixes)
        return None, nets[0]

    lag = args.lag
    save_prefix = args.save_prefix
    assert args.stratify_by in {"e", "n"}
    stratify_by = cp.args2stratify_by[args.stratify_by]

    # Default to the current directory; stored back on args as before.
    if args.output_folder is None:
        args.output_folder = "."
    out_dir = args.output_folder

    # Load the real and the randomized expression data.
    if args.load_reps:
        genes, geneTS = gtm.load_basic_rep_file_list(args.data_file)
        genesr, geneTSr = gtm.load_basic_rep_file_list(args.rand_data_file)
    else:
        df = pd.read_csv(args.data_file, sep="\t")
        genes, geneTS = gtm.get_gene_TS(df)
        dfr = pd.read_csv(args.rand_data_file, sep="\t")
        genesr, geneTSr = gtm.get_gene_TS(dfr)
        print("Timekeys: ", df.columns.values[1:])

    # The real and the randomized inputs must describe the same genes.
    assert geneTS.shape == geneTSr.shape
    assert (genes == genesr).all()

    coefs = _load_pickle(args.coef_file)
    intercepts = _load_pickle(args.intercept_file)
    fit_result_df = pd.read_csv(args.fit_result_file, sep="\t")
    coefsr = _load_pickle(args.coef_rand_file)
    # The randomized fit table is not used below, but it is still parsed so a
    # missing or malformed file is reported here.
    pd.read_csv(args.fit_result_rand_file, sep="\t")
    if args.best_hyper_file is not None:
        best_hyper = _load_pickle(args.best_hyper_file)
    else:
        best_hyper = None

    print("RESULTS")
    print("*************************")
    print("RESIDUALS: ")
    print("*************************")
    print("NORMAL: ")
    cp.summarize_fit(coefs, intercepts, fit_result_df,
                     filename=os.path.join(out_dir,
                                           "fit_all_summary_normal.txt"),
                     hyper=best_hyper, test_name=args.test_name, lag=lag)

    # Align the coefs, then drop the gene-on-self terms.
    acoefs = lc.align_coefs(coefs, lag)
    acoefsr = lc.align_coefs(coefsr, lag)
    print("Removing alphas (gene-on-self effects) ")
    acoefs = lc.remove_alphas(acoefs, lag)
    acoefsr = lc.remove_alphas(acoefsr, lag)

    n_coefs = acoefs.shape[0]

    # Save the gene matrices and the per-coefficient networks.
    coef_nets = []
    coefr_nets = []
    for i in range(n_coefs):
        stem = os.path.join(out_dir, save_prefix + "-" + str(i + 1))
        coef_net_filename = stem + "-network.txt"
        coefr_net_filename = stem + "-r-network.txt"

        coef_matr = gtm.save_gene_matrix(filename=stem + "-matrix.txt",
                                         matrix=acoefs[i], genes=genes)
        coefr_matr = gtm.save_gene_matrix(filename=stem + "-r-matrix.txt",
                                          matrix=acoefsr[i], genes=genes)

        extra_dict = collections.OrderedDict()
        extra_dict["Test"] = args.test_name
        extra_dict["Lag"] = n_coefs
        extra_dict["Coef"] = i + 1

        coef_net = nh.matr_to_net(coef_matr, extra_dict=extra_dict,
                                  make_type=False)
        coefr_net = nh.matr_to_net(coefr_matr, extra_dict=extra_dict,
                                   make_type=False)
        coef_net.to_csv(coef_net_filename, sep="\t", index=False)
        coefr_net.to_csv(coefr_net_filename, sep="\t", index=False)
        coef_nets.append(coef_net)
        coefr_nets.append(coefr_net)

        print("Coef ", i + 1)
        print("Networks written to ")
        print(coef_net_filename)
        print(coefr_net_filename)

    # Union networks across coefficients (real, then randomized).
    union_net_filename = os.path.join(out_dir,
                                      save_prefix + "-union-network.txt")
    union_r_net_filename = os.path.join(out_dir,
                                        save_prefix + "-union-r-network.txt")

    m_net, union_net = _union_network(coef_nets)
    if m_net is not None:
        print("Max network edges: ", m_net.shape)
        print("Union network edges: ", union_net.shape)
    union_net.to_csv(union_net_filename, sep="\t", index=False)

    _, union_r_net = _union_network(coefr_nets)
    union_r_net.to_csv(union_r_net_filename, sep="\t", index=False)

    print("Unioned networks written to ")
    print(union_net_filename)
    print(union_r_net_filename)

    plots_dir = os.path.join(out_dir, "plots")
    _ensure_dir(plots_dir)

    if args.plot_coef:
        betas_dir = os.path.join(plots_dir, "betas")
        _ensure_dir(betas_dir)

        # Plot the betas.
        for i in range(n_coefs):
            real_nz = acoefs[i][np.nonzero(acoefs[i])].flatten()
            rand_nz = acoefsr[i][np.nonzero(acoefsr[i])].flatten()
            # Nothing to compare unless both fits have non-zero coefficients.
            if len(real_nz) > 0 and len(rand_nz) > 0:
                label = str(i + 1)
                nonzero_stem = os.path.join(betas_dir,
                                            "beta_nonzero_coef-" + label)
                abs_stem = os.path.join(betas_dir, "beta_abs_coef-" + label)

                fc.plot_betas(real_nz, rand_nz,
                              filename=nonzero_stem,
                              title="Causal coefs, Coef " + label,
                              xlabel="Causal Coefficient")
                fc.plot_betas(real_nz, rand_nz,
                              filename=nonzero_stem + "_zoom-in-90",
                              zoom_in_top_percentile=95,
                              zoom_in_bottom_percentile=5,
                              title="Causal coefs, Coef " + label,
                              xlabel="Causal Coefficient")
                fc.plot_betas(np.absolute(real_nz), np.absolute(rand_nz),
                              filename=abs_stem,
                              title="Absolute causal coefs, Coef " + label,
                              xlabel="Absolute Causal Coefficient")
                fc.plot_betas(np.absolute(real_nz), np.absolute(rand_nz),
                              filename=abs_stem + "_zoom-in-bottom-95",
                              zoom_in_top_percentile=95,
                              title="Absolute causal coefs, Coef " + label,
                              xlabel="Absolute Causal Coefficient")
                fc.plot_betas(np.absolute(real_nz), np.absolute(rand_nz),
                              filename=abs_stem + "_zoom-in-top-5",
                              zoom_in_bottom_percentile=95,
                              title="Absolute causal coefs, Coef " + label,
                              xlabel="Absolute Causal Coefficient")
                print("Coef ", i + 1)
        print("Plots of betas written to: ", betas_dir)

    # Get the FDR-thresholded coefficients.
    fdrs = [0.01, 0.05, 0.1, 0.2]
    sf_dfs = []
    for fdr in fdrs:
        fdr_dir = os.path.join(out_dir,
                               "fdr-" + str(fdr) + "-" + stratify_by)
        _ensure_dir(fdr_dir)
        # If save_prefix ever contains a sub-directory, it would have to be
        # created here as well (as the bootstrap case does).
        fdr_prefix = fdr_dir + os.sep + save_prefix
        fdr_tag = "-fdr-" + str(fdr) + "-" + stratify_by

        acoefs_fdr = np.zeros(acoefs.shape)
        fdr_nets = []

        print("*************")
        for i in range(n_coefs):
            print("-----")
            print("FDR = ", fdr)
            print("Lag ", lag)
            print("Coef ", i + 1)
            print("Stratify ", stratify_by)

            acoefs_fdr[i], threshes = fc.get_abs_thresh(
                acoefs[i], acoefsr[i], fdr, stratify_by=stratify_by)

            coef_stem = fdr_prefix + "-" + str(i + 1) + fdr_tag
            fdr_matr_filename = coef_stem + "-matrix.txt"
            fdr_net_filename = coef_stem + "-network.txt"

            fdr_matr = gtm.save_gene_matrix(fdr_matr_filename,
                                            matrix=acoefs_fdr[i],
                                            genes=genes)
            with open(coef_stem + "-threshes.p", 'wb') as handle:
                pickle.dump(threshes, handle)

            extra_dict = collections.OrderedDict()
            extra_dict["Test"] = args.test_name
            extra_dict["Lag"] = n_coefs
            extra_dict["Coef"] = i + 1

            fdr_net = nh.matr_to_net(fdr_matr, extra_dict=extra_dict,
                                     make_type=False)
            fdr_net.to_csv(fdr_net_filename, sep="\t", index=False)
            fdr_nets.append(fdr_net)

            # Write the summary readme for this coefficient.
            sf_df = fc.summarize_fdr(matr=acoefs_fdr[i],
                                     test=args.test_name,
                                     fdr=fdr,
                                     lag=lag,
                                     coef=i + 1,
                                     hyper=best_hyper,
                                     thresh=threshes,
                                     readme_name=coef_stem + "-README.txt",
                                     matrixname=fdr_matr_filename,
                                     filename=fdr_net_filename)
            sf_dfs.append(sf_df)

            print("Network edges: ", fdr_net.shape[0])

        _, union_net = _union_network(fdr_nets)
        union_net_filename = fdr_prefix + "-union" + fdr_tag + "-network.txt"
        union_net.to_csv(union_net_filename, sep="\t", index=False)
        print("Union network edges", union_net.shape[0])
        print("Written to ", union_net_filename)

        fdr_agg_matr_filename = fdr_prefix + "-union" + fdr_tag + "-coefs.p"
        with open(fdr_agg_matr_filename, 'wb') as handle:
            pickle.dump(acoefs_fdr, handle)
        print("Thresholded matrix written as pickle file: ",
              fdr_agg_matr_filename)

    # The combined summary keeps the fixed base name
    # fit_all_summary_fdr-<stratify>.txt and is placed in the output folder,
    # so separate runs only need to pass different output folders.
    all_sf_dfs = pd.concat(sf_dfs)
    save_file = os.path.join(out_dir,
                             "fit_all_summary_fdr-" + stratify_by + ".txt")
    all_sf_dfs.to_csv(save_file, sep="\t", index=False)
    print("********")
    print("Summaries of all fdrs written to ", save_file)

    print("Matrices done.")
    # Marker file written once all matrices have been produced.
    with open(os.path.join(out_dir, "matrices_done.txt"), 'w') as donefile:
        donefile.write("done\n")
def load_and_run(args):
    """Pick the best hyperparameter from per-hyperparameter fit results and plot diagnostics.

    Loads the pickled hyperparameter list and, for every column of the table
    at ``args.int_name_dfname``, the fit-result file named in that column's
    first row. Hyperparameters whose result file is missing or empty are
    dropped. The remaining results are summarized with
    ``cp.summarize_hyper_fit_dfs`` and ranked with ``cp.get_best_hyper``; the
    best hyperparameter is pickled to ``args.output_name`` and the summary
    table is written to ``args.result_dfname``. MSE and r^2 correlation
    matrices between hyperparameters are saved under ``hyper/`` and plotted,
    together with box plots of r^2, MSE and degrees of freedom, under
    ``plots/hyper``.

    Args:
        args: Parsed command-line namespace. Attributes read: ``test_name``,
            ``hyper_file``, ``int_name_dfname``, ``sort_by``, ``output_name``
            and ``result_dfname``.
    """
    # Prefix used in plot titles/labels; empty when no test name was given.
    if args.test_name == "":
        name = ""
    else:
        name = args.test_name.capitalize() + " "

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this pipeline.
    with open(args.hyper_file, 'rb') as handle:
        hyperlist = pickle.load(handle)

    int_name_df = pd.read_csv(args.int_name_dfname, sep="\t")
    print("Loading integrated")
    print(int_name_df.head())

    # One fit-result table per column; None where the file does not exist.
    hyper_fit_dfs = []
    for column in int_name_df:
        fit_path = int_name_df[column].values[0]
        if os.path.exists(fit_path):
            hyper_fit_dfs.append(pd.read_csv(fit_path, sep="\t"))
        else:
            hyper_fit_dfs.append(None)

    # Drop hyperparameters for which there is no information, keeping the
    # hyperparameter list positionally in step with the result tables.
    dropped = {
        idx for idx, fit_df in enumerate(hyper_fit_dfs)
        if fit_df is None or fit_df.empty
    }
    hyper_fit_dfs = [
        fit_df for idx, fit_df in enumerate(hyper_fit_dfs)
        if idx not in dropped
    ]
    hyperlist = [h for idx, h in enumerate(hyperlist) if idx not in dropped]

    # Get the best hyper.
    hyper_df = cp.summarize_hyper_fit_dfs(hyper_fit_dfs, hyperlist)
    best_hyper, best, hyper_df = cp.get_best_hyper(hyper_df,
                                                   sort_by=args.sort_by)

    # Write the hypers out.
    with open(args.output_name, 'wb') as handle:
        pickle.dump(best_hyper, handle)
    hyper_df.to_csv(args.result_dfname, sep="\t", index=False)

    print("Test is ", name)
    print("Best hyper is ", best_hyper)
    print("Best hyper result is ", best)
    print("Best hyper written to ", args.output_name)
    print("Hyper result written to ", args.result_dfname)

    if not os.path.exists("hyper"):
        os.makedirs("hyper")

    # Get correlations between hyperparameters (one row per hyperparameter).
    mse_corr_filename = "hyper" + os.sep + "mse_corr.txt"
    mse_vec = np.array([
        np.array(fit_df["mse"].values) for fit_df in hyper_fit_dfs
    ])
    print(mse_vec.shape)
    mse_corr = np.corrcoef(mse_vec)
    gtm.save_gene_matrix(mse_corr_filename, mse_corr, hyperlist)
    print("MSE Correlation:")
    print(mse_corr)
    print("MSE corr. matrix saved to ", mse_corr_filename)

    r2_corr_filename = "hyper" + os.sep + "r2_corr.txt"
    r2_vec = np.array([fit_df["r2"].values for fit_df in hyper_fit_dfs])
    r2_corr = np.corrcoef(r2_vec)
    gtm.save_gene_matrix(r2_corr_filename, r2_corr, hyperlist)
    print("R2 Correlation")
    print(r2_corr)
    print("R^2 corr. matrix saved to ", r2_corr_filename)

    # Plot the hyperparameters.
    plot_dir = "plots" + os.sep + "hyper"
    if not os.path.exists(plot_dir):
        # Also creates "plots" itself when it is missing.
        os.makedirs(plot_dir)

    labels = cp.hyperlist_to_labellist(hyperlist)
    best_label = cp.hyper_to_label(best_hyper)

    cp.plot_corr_matrix(mse_corr, labels,
                        title="MSE correlation among " + name + "hyperparams",
                        filename=plot_dir + os.sep + "mse_corr")
    cp.plot_corr_matrix(r2_corr, labels,
                        title="$r^2$ correlation among " + name +
                        "hyperparams",
                        filename=plot_dir + os.sep + "r2_corr")

    best_r2 = best["r2_avg"].values[0]
    cp.plot_hyper_boxplot(
        labels, hyper_fit_dfs, "r2",
        xlabel=name + "Hyperparameter",
        ylabel="$r^2$",
        title=name + "Hyperparameter VS $r^2$",
        filename=plot_dir + os.sep + "hyperVSr2",
        hyper_color_labels=[
            (best_label, "k",
             "Best: " + best_label + ", $r^2$ = " +
             str(np.round(best_r2, 1)))
        ],
        horizontal_line_color_labels=[(best_r2, 'k', None)])

    best_mse = best["mse_avg"].values[0]
    cp.plot_hyper_boxplot(
        labels, hyper_fit_dfs, "mse",
        xlabel=name + "Hyperparameter",
        ylabel="Mean-Squared Error",
        title=name + "Hyperparameter VS MSE",
        filename=plot_dir + os.sep + "hyperVSmse",
        hyper_color_labels=[
            (best_label, "k",
             "Best: " + best_label + ", MSE = " +
             str(np.round(best_mse, 1)))
        ],
        horizontal_line_color_labels=[(best_mse, 'k', None)])

    best_dof = best["df_avg"].values[0]
    cp.plot_hyper_boxplot(
        labels, hyper_fit_dfs, "avg_df",
        xlabel=name + "Hyperparameter",
        ylabel="Degrees of Freedom",
        title=name + "Hyperparameter VS df",
        filename=plot_dir + os.sep + "hyperVSdof",
        hyper_color_labels=[
            (best_label, "k",
             "Best: " + best_label + ", df = " +
             str(int(np.round(best_dof))))
        ],
        horizontal_line_color_labels=[(best_dof, 'k', None)])

    print("Correlation between hyperparameter results", plot_dir)
    print("Hyper box plots of r^2, mse, avg d.o.f. written to ", plot_dir)
def run(args):
    """Apply FDR thresholding to one coefficient slice and write matrices/networks.

    Takes slice ``args.coef_num - 1`` along the last axis of the original and
    the randomized pickled coefficient arrays, saves both as gene matrices,
    optionally plots and caps them, and then runs ``get_abs_thresh`` and
    ``get_pos_neg_thresh`` at ``args.fdr``. For each threshold type the
    thresholded matrix, its thresholds and (with ``args.make_network``) its
    network are written, followed by the absolute-valued matrix/network.

    Args:
        args: Parsed command-line namespace. Attributes read:
            ``stratify_by`` ("e" or "n"), ``original_data``,
            ``randomized_data``, ``original_matrix``, ``randomized_matrix``,
            ``coef_num``, ``name``, ``plot_prefix`` (may be None), ``cap_by``
            (may be None), ``fdr`` and ``make_network``.

    Raises:
        ValueError: If ``args.stratify_by`` is not "e" or "n" (checked before
            any file is read), or if the original and randomized data list
            different genes.
    """

    def _echo(*items):
        # Same output as the legacy ``print a, b`` statement (items joined by
        # a single space), but valid on both Python 2 and Python 3.
        print(" ".join(str(item) for item in items))

    # Validate the cheap option first so a typo fails before any loading.
    stratify_names = {"e": "effect", "n": "none"}
    if args.stratify_by not in stratify_names:
        raise ValueError(
            "Stratify_by must be either 'e' for effect or 'n' for none")
    stratify_by = stratify_names[args.stratify_by]

    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this pipeline.
    with open(args.original_matrix, 'rb') as handle:
        matr = pickle.load(handle)[:, :, args.coef_num - 1]
    with open(args.randomized_matrix, 'rb') as handle:
        rand_matr = pickle.load(handle)[:, :, args.coef_num - 1]

    print("")
    _echo("Beginning FDR control, stratifying the matrix by ", stratify_by)

    genes = data["gene"]
    rand_genes = rand_data["gene"]
    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    unshuffled_filename = args.name + "-unshuffled-matrix.txt"
    _echo("Original matrix for ", args.name, "saved to", unshuffled_filename)
    gtm.save_gene_matrix(matrix=matr, filename=unshuffled_filename,
                         genes=genes)

    shuffled_filename = args.name + "-shuffled-matrix.txt"
    _echo("Randomized matrix for ", args.name, "saved to", shuffled_filename)
    gtm.save_gene_matrix(matrix=rand_matr, filename=shuffled_filename,
                         genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix)
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95",
                   zoom_in_percentile=95)

    if args.cap_by is not None:
        print("First capping original and randomized matrix")
        matr = cap_matr(matr, args.cap_by, name="Original")
        rand_matr = cap_matr(rand_matr, args.cap_by, name="Randomized")

    print("Using original")
    _echo("Trying to have an FDR of ", args.fdr)
    print(args.name)

    # (threshold function, output tag, also write the absolute-valued result)
    thresholders = [
        (get_abs_thresh, "abs-thresh", True),
        (get_pos_neg_thresh, "pos-neg-thresh", True),
    ]
    net_name = args.name + "-sb-" + args.stratify_by
    fdr_suffix = "-FDR-" + str(args.fdr) + "-stratby-" + stratify_by

    for thresh_fn, kind, want_abs in thresholders:
        print("")
        print("*******************")
        print(kind)
        print("*******************")

        print("making matrix")
        out_prefix = args.name + "-unshuffled-" + kind + fdr_suffix
        thresh_matr, threshes = thresh_fn(matr, rand_matr, args.fdr,
                                          stratify_by=stratify_by)
        matr_df = gtm.save_gene_matrix(out_prefix + "-matrix.txt",
                                       thresh_matr, genes)
        # Pickles must go through a binary-mode handle.
        with open(out_prefix + "-threshes.p", 'wb') as handle:
            pickle.dump(threshes, handle)
        _echo("Matrix written to ", out_prefix + "-matrix.txt")
        _echo("Threshes written to ", out_prefix + "-threshes.p")
        # README generation (write_readme) is disabled in this variant.

        if args.make_network:
            print("making network")
            net_df = nh.matr_to_net(matr_df, net_name, make_pair=False)
            net_df.to_csv(out_prefix + "-network.txt", sep="\t", index=False)
            _echo("Network written to ", out_prefix + "-network.txt")

        # Use this iteration's flag. The original tested the whole flag list,
        # which is always truthy; both flags are True here, so the outputs
        # are unchanged.
        if want_abs:
            print("Making absoluted matrix ")
            abs_matr = np.absolute(thresh_matr)
            abs_prefix = (args.name + "-unshuffled-" + kind + "-absoluted" +
                          fdr_suffix)
            # NOTE(review): unlike the other matrices this file name has no
            # ".txt" extension; left as is for downstream compatibility.
            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr,
                                          genes)

            if args.make_network:
                print("Making absoluted network")
                abs_net_df = nh.matr_to_net(abs_df, net_name,
                                            make_pair=False)
                abs_net_df.to_csv(abs_prefix + "-network.txt", sep="\t",
                                  index=False)
                _echo("Network written to ", abs_prefix + "-network.txt")

    print("FINISHED")
    print("#################################################")
    print("")
def run(args):
    """Apply four FDR thresholding schemes to one coefficient slice.

    Takes slice ``args.coef_num - 1`` along the last axis of the original and
    the randomized pickled coefficient arrays, saves both as gene matrices,
    optionally plots them, and then runs ``get_abs_thresh``,
    ``get_pos_thresh``, ``get_neg_thresh`` and ``get_pos_neg_thresh`` at
    ``args.fdr``. For each scheme the thresholded matrix, its thresholds, a
    README and (with ``args.make_network``) its network are written. The
    absolute-valued matrix/README/network are written only for the schemes
    flagged for it (abs-thresh and pos-neg-thresh).

    Args:
        args: Parsed command-line namespace. Attributes read:
            ``stratify_by`` ("e" or "n"), ``original_data``,
            ``randomized_data``, ``original_matrix``, ``randomized_matrix``,
            ``coef_num``, ``name``, ``plot_prefix`` (may be None), ``fdr`` and
            ``make_network``.

    Raises:
        ValueError: If ``args.stratify_by`` is not "e" or "n" (checked before
            any file is read), or if the original and randomized data list
            different genes.
    """

    def _echo(*items):
        # Same output as the legacy ``print a, b`` statement (items joined by
        # a single space), but valid on both Python 2 and Python 3.
        print(" ".join(str(item) for item in items))

    # Validate the cheap option first so a typo fails before any loading.
    stratify_names = {"e": "effect", "n": "none"}
    if args.stratify_by not in stratify_names:
        raise ValueError(
            "Stratify_by must be either 'e' for effect or 'n' for none")
    stratify_by = stratify_names[args.stratify_by]

    data = gtm.load_file_and_avg(args.original_data)
    rand_data = gtm.load_file_and_avg(args.randomized_data)

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this pipeline.
    with open(args.original_matrix, 'rb') as handle:
        matr = pickle.load(handle)[:, :, args.coef_num - 1]
    with open(args.randomized_matrix, 'rb') as handle:
        rand_matr = pickle.load(handle)[:, :, args.coef_num - 1]

    genes = data["gene"]
    rand_genes = rand_data["gene"]
    if (genes != rand_genes).any():
        raise ValueError("Genes are not the same!")

    unshuffled_filename = args.name + "-unshuffled-matrix.txt"
    _echo("Original matrix for ", args.name, "saved to", unshuffled_filename)
    gtm.save_gene_matrix(matrix=matr, filename=unshuffled_filename,
                         genes=genes)

    shuffled_filename = args.name + "-shuffled-matrix.txt"
    _echo("Randomized matrix for ", args.name, "saved to", shuffled_filename)
    gtm.save_gene_matrix(matrix=rand_matr, filename=shuffled_filename,
                         genes=rand_genes)

    if args.plot_prefix is not None:
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix)
        plot_betas(matr.flatten(), rand_matr.flatten(),
                   filename=args.plot_prefix + "_zoom-in-95",
                   zoom_in_percentile=95)

    print("Using original")
    _echo("Trying to have an FDR of ", args.fdr)
    print(args.name)

    # (threshold function, output tag, also write the absolute-valued result)
    thresholders = [
        (get_abs_thresh, "abs-thresh", True),
        (get_pos_thresh, "pos-thresh", False),
        (get_neg_thresh, "neg-thresh", False),
        (get_pos_neg_thresh, "pos-neg-thresh", True),
    ]
    fdr_suffix = "-FDR-" + str(args.fdr) + "-stratby-" + stratify_by

    for thresh_fn, kind, want_abs in thresholders:
        out_prefix = args.name + "-unshuffled-" + kind + fdr_suffix
        thresh_matr, threshes = thresh_fn(matr, rand_matr, args.fdr,
                                          stratify_by=stratify_by)
        matr_df = gtm.save_gene_matrix(out_prefix + "-matrix.txt",
                                       thresh_matr, genes)
        # Pickles must go through a binary-mode handle.
        with open(out_prefix + "-threshes.p", 'wb') as handle:
            pickle.dump(threshes, handle)
        _echo("Matrix written to ", out_prefix + "-matrix.txt")
        _echo("Threshes written to ", out_prefix + "-threshes.p")
        write_readme(thresh_matr, out_prefix, args.fdr,
                     out_prefix + '-README.txt', out_prefix + "-matrix")

        if args.make_network:
            net_df = nh.matr_to_net(matr_df, args.name, make_pair=False)
            net_df.to_csv(out_prefix + "-network.txt", sep="\t", index=False)
            _echo("Network written to ", out_prefix + "-network.txt")

        # Use this iteration's flag. The original tested the whole flag list,
        # which is always truthy, so the absoluted outputs were also written
        # for pos-thresh and neg-thresh despite their False flags.
        if want_abs:
            abs_matr = np.absolute(thresh_matr)
            abs_prefix = (args.name + "-unshuffled-" + kind + "-absoluted" +
                          fdr_suffix)
            # NOTE(review): unlike the other matrices this file name has no
            # ".txt" extension; left as is for downstream compatibility.
            abs_df = gtm.save_gene_matrix(abs_prefix + "-matrix", abs_matr,
                                          genes)
            write_readme(abs_matr, abs_prefix, args.fdr,
                         abs_prefix + '-README.txt', abs_prefix + "-matrix")

            if args.make_network:
                abs_net_df = nh.matr_to_net(abs_df, args.name,
                                            make_pair=False)
                abs_net_df.to_csv(abs_prefix + "-network.txt", sep="\t",
                                  index=False)
                _echo("Network written to ", abs_prefix + "-network.txt")