def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
    """
    Run one LOCO GWAS step: test the SNP columns ``test_snp_idx`` of ``G``.

    The similarity SNPs are the columns ``train_snp_idx`` of ``G``; when
    ``self.selected_snps`` is set they are first intersected (by SNP name,
    via ``self.snp_reader.rs``) with that selection. When ``self.num_pcs``
    is non-zero, the first ``self.num_pcs`` columns of the precomputed PC
    file for step ``i`` are passed to ``FastGwas`` as ``train_pcs``.

    Parameters
    ----------
    i : value handed to ``PrecomputeLocoPcs.create_out_fn`` together with
        ``self.pc_prefix`` to build the PC file name
    train_snp_idx : sequence of column indices into ``G`` (similarity SNPs)
    test_snp_idx : sequence of column indices into ``G`` (SNPs to test)
    result : returned unchanged as the third element of the return value
    G : array with SNPs along axis 1; must not be Fortran-ordered
    y : phenotype, forwarded to ``FastGwas``

    Returns
    -------
    tuple ``(test_snp_idx, gwas.p_values, result)``

    Raises
    ------
    AssertionError
        if ``G`` is Fortran-ordered, if PCs are requested
        (``self.num_pcs != 0``) but ``self.pc_prefix`` is None, or if the
        number of p-values does not match ``len(test_snp_idx)``.
    """
    logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

    # intersect selected SNPs with train snps
    if self.selected_snps is not None:
        # intersect snp names
        logging.info("intersecting train snps with selected snps for LOCO")
        int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx], self.selected_snps)
        sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
    else:
        sim_keeper_idx = train_snp_idx

    # subset data
    # fast indexing (needs to be C-order)
    assert not np.isfortran(G)
    G_sim = G.take(sim_keeper_idx, axis=1)
    G_test = G.take(test_snp_idx, axis=1)

    t0 = time.time()

    if self.num_pcs == 0:
        pcs = None
    elif self.pc_prefix is not None:
        out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
        logging.info("loading pc from file: %s" % out_fn)
        pcs = load(out_fn)[:, 0:self.num_pcs]
        logging.info("..done")
    else:
        # Raised explicitly (rather than ``assert False``) so the check is
        # not stripped under ``python -O``, which would leave ``pcs`` unbound.
        raise AssertionError("please precompute PCs")

    logging.info("done after %.4f seconds" % (time.time() - t0))

    # only use PCs
    if self.pcs_only:
        G_sim = None
        logging.info("Using PCs only in LocoGWAS")

    gwas = FastGwas(G_sim, G_test, y, self.delta, train_pcs=pcs, mixing=self.mixing)
    gwas.run_gwas()

    assert len(gwas.p_values) == len(test_snp_idx)

    # wrap up results
    return test_snp_idx, gwas.p_values, result
def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
    """
    Run one LOCO GWAS step: test the SNP columns ``test_snp_idx`` of ``G``.

    The similarity SNPs are the columns ``train_snp_idx`` of ``G``; when
    ``self.selected_snps`` is set they are first intersected (by SNP name,
    via ``self.snp_reader.rs``) with that selection. When ``self.num_pcs``
    is non-zero, the first ``self.num_pcs`` columns of the precomputed PC
    file for step ``i`` are passed to ``FastGwas`` as ``train_pcs``.

    Parameters
    ----------
    i : value handed to ``PrecomputeLocoPcs.create_out_fn`` together with
        ``self.pc_prefix`` to build the PC file name
    train_snp_idx : sequence of column indices into ``G`` (similarity SNPs)
    test_snp_idx : sequence of column indices into ``G`` (SNPs to test)
    result : returned unchanged as the third element of the return value
    G : array with SNPs along axis 1; must not be Fortran-ordered
    y : phenotype, forwarded to ``FastGwas``

    Returns
    -------
    tuple ``(test_snp_idx, gwas.p_values, result)``

    Raises
    ------
    AssertionError
        if ``G`` is Fortran-ordered, if PCs are requested
        (``self.num_pcs != 0``) but ``self.pc_prefix`` is None, or if the
        number of p-values does not match ``len(test_snp_idx)``.
    """
    logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

    # intersect selected SNPs with train snps
    if self.selected_snps is not None:
        # intersect snp names
        logging.info("intersecting train snps with selected snps for LOCO")
        int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx], self.selected_snps)
        sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
    else:
        sim_keeper_idx = train_snp_idx

    # subset data
    # fast indexing (needs to be C-order)
    assert not np.isfortran(G)
    G_sim = G.take(sim_keeper_idx, axis=1)
    G_test = G.take(test_snp_idx, axis=1)

    t0 = time.time()

    if self.num_pcs == 0:
        pcs = None
    elif self.pc_prefix is not None:
        out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
        logging.info("loading pc from file: %s" % out_fn)
        pcs = load(out_fn)[:, 0:self.num_pcs]
        logging.info("..done")
    else:
        # Raised explicitly (rather than ``assert False``) so the check is
        # not stripped under ``python -O``, which would leave ``pcs`` unbound.
        raise AssertionError("please precompute PCs")

    logging.info("done after %.4f seconds" % (time.time() - t0))

    # only use PCs
    if self.pcs_only:
        G_sim = None
        logging.info("Using PCs only in LocoGWAS")

    gwas = FastGwas(G_sim, G_test, y, self.delta, train_pcs=pcs, mixing=self.mixing)
    gwas.run_gwas()

    assert len(gwas.p_values) == len(test_snp_idx)

    # wrap up results
    return test_snp_idx, gwas.p_values, result
def run_sim_and_compare(self, name, method):
    """
    Run the simulation for a single method and compare against a reference.

    The simulation is run on the ``data/mouse/alldata`` SNP file (resolved
    relative to this file) with a ``Local`` runner, and the combined output
    is compared via ``compare_nested`` with the object loaded from
    ``expected/<name>.bzip`` next to this file.

    Parameters
    ----------
    name : used in the run description and in the expected-result file name
    method : the single method handed to ``run_simulation``
    """
    logging.info('in test_all')
    # kept as in the original; the local name ``runner`` is rebound below
    import fastlmm.util.runner as runner

    # all paths are resolved relative to the directory containing this file
    here = os.path.dirname(os.path.realpath(__file__))
    bed_path = os.path.realpath(here + "/../../data/mouse/alldata")
    tmp_prefix = here + "/tempdir/mouse_"
    reference_dir = here + "/expected/"

    run_label = "test_run_{0}".format(name)
    runner = Local()

    # simulation settings
    n_causal, n_repeat, n_pc = 500, 1, 5

    simulated = run_simulation(
        bed_path, tmp_prefix, [method], n_causal, n_repeat, n_pc,
        run_label, runner, plot_fn="out.png", seed=42)

    from fastlmm.util.pickle_io import load
    expected = load("%s%s.bzip" % (reference_dir, name))
    compare_nested(simulated, expected)
def compute_core(input_tuple):
    """
    Simulate one phenotype and run every benchmark method on it.

    Leave-two-chromosome-out evaluation scheme:
    Chr1: no causals, used for T1-error evaluation
    Chr2: has causals, not conditioned on, used for power evaluation
    Rest: has causals, conditioned on

      T1   Pow  [     cond     ]
     ===== ===== ===== .... =====
            x x   x x      xx

    Parameters
    ----------
    input_tuple : tuple
        ``(methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id)``.
        ``methods`` is an iterable of callables, each invoked as
        ``method_function(test_snps, pheno, G0, covar)`` and returning a pair
        of dicts. ``snp_fn`` is the Bed file name, ``eigen_fn`` the file that
        ``load`` reads the eigendecomposition (key ``"pcs"``) from. ``seed``
        may be None, in which case the NumPy global RNG is not re-seeded.
        ``sim_id`` is unpacked but not used here.

    Returns
    -------
    tuple ``(result, indices)``
        ``result`` is the merged first dict of all methods; ``indices`` holds
        ``causal_idx``, ``chr1_idx``, ``chr2_idx``, the original
        ``input_tuple`` and the merged second dict under ``fs_result``.
    """
    methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id = input_tuple

    # partially load bed file
    from pysnptools.snpreader import Bed
    snp_reader = Bed(snp_fn)

    # determine indices for generation and evaluation
    ##################################################################
    chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(snp_reader.pos)

    causal_candidates_idx = np.concatenate((chr2_idx, rest_idx))
    # only compute t1-error (condition on all chr with causals on them)
    #causal_candidates_idx = rest_idx
    test_idx = np.concatenate((chr1_idx, chr2_idx))

    if seed is not None:
        # sys.maxint exists only on Python 2; sys.maxsize is the fallback so
        # the same code also runs on Python 3 without changing Python 2 results.
        max_int = getattr(sys, "maxint", sys.maxsize)
        np.random.seed(int(seed % max_int))

    causal_idx = np.random.permutation(causal_candidates_idx)[0:num_causal]

    # generate phenotype
    ###################################################################
    genetic_var = 0.5
    noise_var = 0.5

    y = generate_phenotype(Bed(snp_fn).read(order='C').standardize(), causal_idx, genetic_var, noise_var)
    y.flags.writeable = False

    ############### only alter part until here --> modularize this

    # load pcs
    ###################################################################
    logging.info("loading eigendecomp from file %s" % eigen_fn)
    eig_dec = load(eigen_fn)
    G_pc = eig_dec["pcs"]
    G_pc.flags.writeable = False

    # keep the first num_pcs columns; standardize a copy because G_pc is read-only
    G_pc_ = G_pc[:, 0:num_pcs]
    G_pc_norm = DiagKtoN(G_pc_.shape[0]).standardize(G_pc_.copy())
    G_pc_norm.flags.writeable = False

    # run feature selection
    #########################################################

    # generate pheno data structure
    pheno = {"iid": snp_reader.iid, "vals": y, "header": []}
    covar = {"iid": snp_reader.iid, "vals": G_pc_norm, "header": []}

    # subset readers
    G0 = snp_reader[:, rest_idx]
    test_snps = snp_reader[:, test_idx]

    result = {}
    fs_result = {}

    # additional methods can be defined and included in the benchmark
    for method_function in methods:
        result_, fs_result_ = method_function(test_snps, pheno, G0, covar)
        result.update(result_)
        fs_result.update(fs_result_)

    # save indices
    indices = {
        "causal_idx": causal_idx,
        "chr1_idx": chr1_idx,
        "chr2_idx": chr2_idx,
        "input_tuple": input_tuple,
        "fs_result": fs_result,
    }
    #test_idx

    return result, indices
def combine_results(input_tuple):
    """
    compute performance statistics from p-values of method

    Parameters
    ----------
    input_tuple : tuple
        ``(method, results_fn)``. ``results_fn`` is read with ``load`` and
        must yield an iterable of ``(result, idx)`` pairs, where
        ``result[method]`` holds the p-values of one simulation and ``idx``
        provides ``"chr1_idx"``, ``"chr2_idx"`` and ``"causal_idx"``.

    Returns
    -------
    dict with keys ``"roc"``, ``"prc"``, ``"t1err"``, ``"power"`` (outputs of
    the corresponding ``gw.compute_*_data`` functions), ``"method"`` and
    ``"num_trials"`` (number of pooled chr1 p-values).
    """
    method, results_fn = input_tuple

    logging.info("reading file: %s" % results_fn)
    output_list = load(results_fn)

    p_values_all = []
    p_values_chr1 = []
    p_values_chr2 = []
    mask_all = []

    t0 = time.time()
    logging.info("concatenating p-values")

    for result, idx in output_list:
        causals_chr2_idx = np.intersect1d(idx["chr2_idx"], idx["causal_idx"])

        assert len(result[method]) == len(idx["chr1_idx"]) + len(idx["chr2_idx"])

        # NOTE(review): the p-value array has one entry per tested SNP
        # (chr1 + chr2) but is indexed with chr1_idx / causal chr2 indices
        # directly -- presumably those indices coincide with positions in
        # the test set; confirm against split_chr1_chr2_rest.
        p_vals_t1_err = result[method][idx["chr1_idx"]]
        p_vals_power = result[method][causals_chr2_idx]

        p_values_chr1.extend(p_vals_t1_err)
        p_values_chr2.extend(p_vals_power)

        p_values_all.extend(p_vals_t1_err)
        p_values_all.extend(p_vals_power)

        # label chr1 SNPs as non-causal (False) and chr2 causals as True;
        # builtin bool is used because the np.bool alias was removed from NumPy
        mask_t1_err = np.zeros(len(idx["chr1_idx"]), dtype=bool)
        mask_power = np.ones(len(causals_chr2_idx), dtype=bool)

        mask_all.extend(mask_t1_err)
        mask_all.extend(mask_power)

    logging.info("done concatenating p-values (%s)" % (str(time.time()-t0)))

    result = {}

    # p-values are negated before being passed as scores to roc/prc
    t0 = time.time()
    result["roc"] = gw.compute_roc_data(np.array(mask_all, dtype=bool), -np.array(p_values_all))
    logging.info("computed roc in (%s)" % (str(time.time()-t0)))

    t0 = time.time()
    result["prc"] = gw.compute_prc_data(np.array(mask_all, dtype=bool), -np.array(p_values_all))
    logging.info("computed prc in (%s)" % (str(time.time()-t0)))

    t0 = time.time()
    result["t1err"] = gw.compute_t1err_data(np.array(p_values_chr1), np.zeros(len(p_values_chr1), dtype=bool))
    logging.info("computed t1err in (%s)" % (str(time.time()-t0)))

    t0 = time.time()
    result["power"] = gw.compute_power_data(np.array(p_values_chr2), np.ones(len(p_values_chr2), dtype=bool))
    logging.info("computed power in (%s)" % (str(time.time()-t0)))

    result["method"] = method
    result["num_trials"] = len(p_values_chr1)

    return result
def merge_results(results_dir, fn_filter_list, mindist):
    """
    visualize gwas results based on results file names

    For every filter string in ``fn_filter_list``, all files in
    ``results_dir`` whose name ends with ``"pickle"`` and contains the filter
    are loaded and pooled, and PRC, ROC, type-1-error and power curves are
    drawn into a 2x2 grid of subplots. The ``"lin"`` curves (from
    ``p_values_lin_uncut``) are drawn only for the first filter.

    Parameters
    ----------
    results_dir : directory containing the result files
    fn_filter_list : sequence of substrings selecting the files per method;
        underscores are removed from a filter to form its plot label
    mindist : forwarded to ``cut_snps_close_to_causals``

    Returns
    -------
    None; the figure is displayed with ``pylab.show()``.
    """
    files = [fn for fn in os.listdir(results_dir) if fn.endswith("pickle")]

    import pylab
    pylab.figure()

    for fn_idx, fn_filter in enumerate(fn_filter_list):
        method_files = [fn for fn in files if fn_filter in fn]

        p_values = []
        p_values_lin = []
        i_causal = []

        for method_fn in method_files:
            tmp_fn = results_dir + "/" + method_fn
            # print() call form works on both Python 2 and Python 3
            print(tmp_fn)
            dat = load(tmp_fn)

            pv_m, i_causal_m = cut_snps_close_to_causals(
                dat["p_values_uncut"], dat["pos"], dat["causal_idx"], mindist=mindist)
            pv_lin_m, i_causal_m2 = cut_snps_close_to_causals(
                dat["p_values_lin_uncut"], dat["pos"], dat["causal_idx"], mindist=mindist)

            # both cuts must yield the same causal labels
            np.testing.assert_array_equal(i_causal_m, i_causal_m2)

            p_values.extend(pv_m)
            p_values_lin.extend(pv_lin_m)
            i_causal.extend(i_causal_m)

        p_values = np.array(p_values)
        p_values_lin = np.array(p_values_lin)
        i_causal = np.array(i_causal)

        method_label = fn_filter.replace("_", "")  # underscore prefix hides label

        pylab.subplot(221)
        plot_prc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_prc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(222)
        plot_roc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_roc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(223)
        plot_t1err_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_t1err_noshow(p_values_lin, i_causal, label="lin")

        pylab.subplot(224)
        plot_power_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_power_noshow(p_values_lin, i_causal, label="lin")

        print(p_values)
        print(i_causal)

    pylab.show()
def merge_results(results_dir, fn_filter_list, mindist):
    """
    visualize gwas results based on results file names

    For every filter string in ``fn_filter_list``, all files in
    ``results_dir`` whose name ends with ``"pickle"`` and contains the filter
    are loaded and pooled, and PRC, ROC, type-1-error and power curves are
    drawn into a 2x2 grid of subplots. The ``"lin"`` curves (from
    ``p_values_lin_uncut``) are drawn only for the first filter.

    Parameters
    ----------
    results_dir : directory containing the result files
    fn_filter_list : sequence of substrings selecting the files per method;
        underscores are removed from a filter to form its plot label
    mindist : forwarded to ``cut_snps_close_to_causals``

    Returns
    -------
    None; the figure is displayed with ``pylab.show()``.
    """
    files = [fn for fn in os.listdir(results_dir) if fn.endswith("pickle")]

    import pylab
    pylab.figure()

    for fn_idx, fn_filter in enumerate(fn_filter_list):
        method_files = [fn for fn in files if fn_filter in fn]

        p_values = []
        p_values_lin = []
        i_causal = []

        for method_fn in method_files:
            tmp_fn = results_dir + "/" + method_fn
            # print() call form works on both Python 2 and Python 3
            print(tmp_fn)
            dat = load(tmp_fn)

            pv_m, i_causal_m = cut_snps_close_to_causals(
                dat["p_values_uncut"], dat["pos"], dat["causal_idx"], mindist=mindist)
            pv_lin_m, i_causal_m2 = cut_snps_close_to_causals(
                dat["p_values_lin_uncut"], dat["pos"], dat["causal_idx"], mindist=mindist)

            # both cuts must yield the same causal labels
            np.testing.assert_array_equal(i_causal_m, i_causal_m2)

            p_values.extend(pv_m)
            p_values_lin.extend(pv_lin_m)
            i_causal.extend(i_causal_m)

        p_values = np.array(p_values)
        p_values_lin = np.array(p_values_lin)
        i_causal = np.array(i_causal)

        method_label = fn_filter.replace("_", "")  # underscore prefix hides label

        pylab.subplot(221)
        plot_prc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_prc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(222)
        plot_roc_noshow(i_causal, -p_values, label=method_label)
        if fn_idx == 0:
            plot_roc_noshow(i_causal, -p_values_lin, label="lin")

        pylab.subplot(223)
        plot_t1err_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_t1err_noshow(p_values_lin, i_causal, label="lin")

        pylab.subplot(224)
        plot_power_noshow(p_values, i_causal, label=method_label)
        if fn_idx == 0:
            plot_power_noshow(p_values_lin, i_causal, label="lin")

        print(p_values)
        print(i_causal)

    pylab.show()
def compute_core(input_tuple):
    """
    Simulate one phenotype and run every benchmark method on it.

    Leave-two-chromosome-out evaluation scheme:
    Chr1: no causals, used for T1-error evaluation
    Chr2: has causals, not conditioned on, used for power evaluation
    Rest: has causals, conditioned on

      T1   Pow  [     cond     ]
     ===== ===== ===== .... =====
            x x   x x      xx

    Parameters
    ----------
    input_tuple : tuple
        ``(methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id)``.
        ``methods`` is an iterable of callables, each invoked as
        ``method_function(test_snps, pheno, G0, covar)`` and returning a pair
        of dicts. ``snp_fn`` is the Bed file name, ``eigen_fn`` the file that
        ``load`` reads the eigendecomposition (key ``"pcs"``) from. ``seed``
        may be None, in which case the NumPy global RNG is not re-seeded.
        ``sim_id`` is unpacked but not used here.

    Returns
    -------
    tuple ``(result, indices)``
        ``result`` is the merged first dict of all methods; ``indices`` holds
        ``causal_idx``, ``chr1_idx``, ``chr2_idx``, the original
        ``input_tuple`` and the merged second dict under ``fs_result``.
    """
    methods, snp_fn, eigen_fn, num_causal, num_pcs, seed, sim_id = input_tuple

    # partially load bed file
    from pysnptools.snpreader import Bed
    snp_reader = Bed(snp_fn)

    # determine indices for generation and evaluation
    ##################################################################
    chr1_idx, chr2_idx, rest_idx = split_data_helper.split_chr1_chr2_rest(
        snp_reader.pos)

    causal_candidates_idx = np.concatenate((chr2_idx, rest_idx))
    # only compute t1-error (condition on all chr with causals on them)
    #causal_candidates_idx = rest_idx
    test_idx = np.concatenate((chr1_idx, chr2_idx))

    if seed is not None:
        # sys.maxint exists only on Python 2; sys.maxsize is the fallback so
        # the same code also runs on Python 3 without changing Python 2 results.
        max_int = getattr(sys, "maxint", sys.maxsize)
        np.random.seed(int(seed % max_int))

    causal_idx = np.random.permutation(causal_candidates_idx)[0:num_causal]

    # generate phenotype
    ###################################################################
    genetic_var = 0.5
    noise_var = 0.5

    y = generate_phenotype(
        Bed(snp_fn).read(order='C').standardize(), causal_idx, genetic_var,
        noise_var)
    y.flags.writeable = False

    ############### only alter part until here --> modularize this

    # load pcs
    ###################################################################
    logging.info("loading eigendecomp from file %s" % eigen_fn)
    eig_dec = load(eigen_fn)
    G_pc = eig_dec["pcs"]
    G_pc.flags.writeable = False

    # keep the first num_pcs columns; standardize a copy because G_pc is read-only
    G_pc_ = G_pc[:, 0:num_pcs]
    G_pc_norm = DiagKtoN(G_pc_.shape[0]).standardize(G_pc_.copy())
    G_pc_norm.flags.writeable = False

    # run feature selection
    #########################################################

    # generate pheno data structure
    pheno = {"iid": snp_reader.iid, "vals": y, "header": []}
    covar = {"iid": snp_reader.iid, "vals": G_pc_norm, "header": []}

    # subset readers
    G0 = snp_reader[:, rest_idx]
    test_snps = snp_reader[:, test_idx]

    result = {}
    fs_result = {}

    # additional methods can be defined and included in the benchmark
    for method_function in methods:
        result_, fs_result_ = method_function(test_snps, pheno, G0, covar)
        result.update(result_)
        fs_result.update(fs_result_)

    # save indices
    indices = {
        "causal_idx": causal_idx,
        "chr1_idx": chr1_idx,
        "chr2_idx": chr2_idx,
        "input_tuple": input_tuple,
        "fs_result": fs_result,
    }
    #test_idx

    return result, indices
def combine_results(input_tuple):
    """
    compute performance statistics from p-values of method

    Parameters
    ----------
    input_tuple : tuple
        ``(method, results_fn)``. ``results_fn`` is read with ``load`` and
        must yield an iterable of ``(result, idx)`` pairs, where
        ``result[method]`` holds the p-values of one simulation and ``idx``
        provides ``"chr1_idx"``, ``"chr2_idx"`` and ``"causal_idx"``.

    Returns
    -------
    dict with keys ``"roc"``, ``"prc"``, ``"t1err"``, ``"power"`` (outputs of
    the corresponding ``gw.compute_*_data`` functions), ``"method"`` and
    ``"num_trials"`` (number of pooled chr1 p-values).
    """
    method, results_fn = input_tuple

    logging.info("reading file: %s" % results_fn)
    output_list = load(results_fn)

    p_values_all = []
    p_values_chr1 = []
    p_values_chr2 = []
    mask_all = []

    t0 = time.time()
    logging.info("concatenating p-values")

    for result, idx in output_list:
        causals_chr2_idx = np.intersect1d(idx["chr2_idx"], idx["causal_idx"])

        assert len(
            result[method]) == len(idx["chr1_idx"]) + len(idx["chr2_idx"])

        # NOTE(review): the p-value array has one entry per tested SNP
        # (chr1 + chr2) but is indexed with chr1_idx / causal chr2 indices
        # directly -- presumably those indices coincide with positions in
        # the test set; confirm against split_chr1_chr2_rest.
        p_vals_t1_err = result[method][idx["chr1_idx"]]
        p_vals_power = result[method][causals_chr2_idx]

        p_values_chr1.extend(p_vals_t1_err)
        p_values_chr2.extend(p_vals_power)

        p_values_all.extend(p_vals_t1_err)
        p_values_all.extend(p_vals_power)

        # label chr1 SNPs as non-causal (False) and chr2 causals as True;
        # builtin bool is used because the np.bool alias was removed from NumPy
        mask_t1_err = np.zeros(len(idx["chr1_idx"]), dtype=bool)
        mask_power = np.ones(len(causals_chr2_idx), dtype=bool)

        mask_all.extend(mask_t1_err)
        mask_all.extend(mask_power)

    logging.info("done concatenating p-values (%s)" % (str(time.time() - t0)))

    result = {}

    # p-values are negated before being passed as scores to roc/prc
    t0 = time.time()
    result["roc"] = gw.compute_roc_data(np.array(mask_all, dtype=bool),
                                        -np.array(p_values_all))
    logging.info("computed roc in (%s)" % (str(time.time() - t0)))

    t0 = time.time()
    result["prc"] = gw.compute_prc_data(np.array(mask_all, dtype=bool),
                                        -np.array(p_values_all))
    logging.info("computed prc in (%s)" % (str(time.time() - t0)))

    t0 = time.time()
    result["t1err"] = gw.compute_t1err_data(
        np.array(p_values_chr1), np.zeros(len(p_values_chr1), dtype=bool))
    logging.info("computed t1err in (%s)" % (str(time.time() - t0)))

    t0 = time.time()
    result["power"] = gw.compute_power_data(
        np.array(p_values_chr2), np.ones(len(p_values_chr2), dtype=bool))
    logging.info("computed power in (%s)" % (str(time.time() - t0)))

    result["method"] = method
    result["num_trials"] = len(p_values_chr1)

    return result