Example #1
0
    def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
        """
        Run FastGwas for one train/test split of the SNP columns of G.

        i             : split index; only used to build the file name of the
                        precomputed PCs through
                        PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
        train_snp_idx : column indices of G usable for the similarity matrix
        test_snp_idx  : column indices of G that are tested
        result        : returned unchanged as the third element of the output
        G             : array indexed with take(..., axis=1); must not be
                        Fortran-ordered
        y             : forwarded unchanged to FastGwas as its third argument

        Returns the tuple (test_snp_idx, gwas.p_values, result).

        Raises AssertionError if G is Fortran-ordered, if PCs are requested
        (self.num_pcs != 0) but self.pc_prefix is None, or if the number of
        p-values differs from len(test_snp_idx).
        """
        logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

        # Restrict the training SNPs to the pre-selected ones, if any.
        if self.selected_snps is not None:
            logging.info("intersecting train snps with selected snps for LOCO")
            # positions within train_snp_idx whose SNP name is in selected_snps
            int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx],
                                            self.selected_snps)
            sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
        else:
            sim_keeper_idx = train_snp_idx

        # Column subsetting via take() is meant to run on C-ordered data.
        assert not np.isfortran(G), "G must not be Fortran-ordered"
        G_sim = G.take(sim_keeper_idx, axis=1)
        G_test = G.take(test_snp_idx, axis=1)

        if self.num_pcs == 0:
            pcs = None
        elif self.pc_prefix is not None:
            out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
            logging.info("loading pc from file: %s" % out_fn)
            # keep only the first num_pcs columns of the stored PCs
            pcs = load(out_fn)[:, 0:self.num_pcs]
            logging.info("..done")
        else:
            # Raised explicitly (instead of `assert False`) so the check also
            # fires under `python -O`; otherwise `pcs` would be left unbound
            # and the failure would surface later as an unrelated error.
            raise AssertionError("please precompute PCs")

        # only use PCs: drop the SNP-based similarity matrix entirely
        if self.pcs_only:
            G_sim = None
            logging.info("Using PCs only in LocoGWAS")

        gwas = FastGwas(G_sim,
                        G_test,
                        y,
                        self.delta,
                        train_pcs=pcs,
                        mixing=self.mixing)
        gwas.run_gwas()

        # sanity check: one p-value per tested SNP
        assert len(gwas.p_values) == len(test_snp_idx)

        # wrap up results
        return test_snp_idx, gwas.p_values, result
Example #2
0
def execute_fs(test_snps, pheno, G0, covar):
    """
    Run feature selection on G0 and test test_snps against the selected SNPs.

    test_snps : forwarded unchanged to single_snp
    pheno     : mapping whose "vals" entry is written out as the phenotype
                column; also forwarded unchanged to single_snp
    G0        : SNP data exposing .iid (two id columns), .sid and
                column indexing G0[:, idx]
    covar     : mapping whose "vals" entry is a 2-d array; each column is
                written out as one covariate ("pc_0", "pc_1", ...)

    Returns (result, fs_result):
      result["fs_all"]                 p-values from single_snp with G0
                                       restricted to the selected SNPs,
                                       ordered by ("Chr", "ChrPos")
      result["fs_all_cov"]             same, additionally passing covar
      fs_result["result_uncond_all"]   the 4-tuple returned by the feature
                                       selection run

    NOTE: the phenotype and covariate files are written to the current
    working directory and are not removed by this function.
    """

    def _pvalues_by_position(frame):
        # DataFrame.sort(columns) was superseded by sort_values (pandas 0.17)
        # and later removed; fall back to it only on very old pandas.
        sort_by = getattr(frame, "sort_values", None)
        if sort_by is None:
            sort_by = frame.sort
        # .values replaces the removed .as_matrix()
        return sort_by(["Chr", "ChrPos"])["PValue"].values

    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    # short random tag shared by both temporary file names
    tmp_uuid = str(uuid.uuid4())[0:13]

    # write out phenotype
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    out_data = pd.DataFrame(
        {
            "id1": G0.iid[:, 0],
            "id2": G0.iid[:, 1],
            "y": pheno["vals"],
        },
        columns=["id1", "id2", "y"],
    )
    out_data.to_csv(out_fn, sep=" ", header=False, index=False)

    # write out covariates
    items = [
        ('id1', G0.iid[:, 0]),
        ('id2', G0.iid[:, 1]),
    ]
    # range instead of the Python-2-only xrange
    items += [("pc_%i" % i, covar["vals"][:, i])
              for i in range(covar["vals"].shape[1])]
    # DataFrame.from_items was removed from pandas; passing the column order
    # explicitly keeps the same layout on every pandas/Python version.
    cov_df = pd.DataFrame(dict(items), columns=[name for name, _ in items])
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)
    cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

    #TODO: fix include_all!!
    fsd = create_feature_selection_distributable(G0,
                                                 out_fn,
                                                 None,
                                                 0,
                                                 "fs_out",
                                                 include_all=False,
                                                 cov_fn=cov_fn)
    fs_result["result_uncond_all"] = Local().run(fsd)
    # only the selected SNP names are needed below
    _, _, _, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)

    G_fs = G0[:, fs_idx]

    result["fs_all"] = _pvalues_by_position(
        single_snp(test_snps, pheno, G0=G_fs))
    result["fs_all_cov"] = _pvalues_by_position(
        single_snp(test_snps, pheno, G0=G_fs, covar=covar))

    return result, fs_result
Example #3
0
    def dowork(self, i, train_snp_idx, test_snp_idx, result, G, y):
        """
        Run FastGwas for one train/test split of the SNP columns of G.

        i             : split index; only used to build the file name of the
                        precomputed PCs through
                        PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
        train_snp_idx : column indices of G usable for the similarity matrix
        test_snp_idx  : column indices of G that are tested
        result        : returned unchanged as the third element of the output
        G             : array indexed with take(..., axis=1); must not be
                        Fortran-ordered
        y             : forwarded unchanged to FastGwas as its third argument

        Returns the tuple (test_snp_idx, gwas.p_values, result).

        Raises AssertionError if G is Fortran-ordered, if PCs are requested
        (self.num_pcs != 0) but self.pc_prefix is None, or if the number of
        p-values differs from len(test_snp_idx).
        """
        logging.info("{0}, {1}".format(len(train_snp_idx), len(test_snp_idx)))

        # Restrict the training SNPs to the pre-selected ones, if any.
        if self.selected_snps is not None:
            logging.info("intersecting train snps with selected snps for LOCO")
            # positions within train_snp_idx whose SNP name is in selected_snps
            int_snp_idx = argintersect_left(self.snp_reader.rs[train_snp_idx], self.selected_snps)
            sim_keeper_idx = np.array(train_snp_idx)[int_snp_idx]
        else:
            sim_keeper_idx = train_snp_idx

        # Column subsetting via take() is meant to run on C-ordered data.
        assert not np.isfortran(G), "G must not be Fortran-ordered"
        G_sim = G.take(sim_keeper_idx, axis=1)
        G_test = G.take(test_snp_idx, axis=1)

        if self.num_pcs == 0:
            pcs = None
        elif self.pc_prefix is not None:
            out_fn = PrecomputeLocoPcs.create_out_fn(self.pc_prefix, i)
            logging.info("loading pc from file: %s" % out_fn)
            # keep only the first num_pcs columns of the stored PCs
            pcs = load(out_fn)[:, 0:self.num_pcs]
            logging.info("..done")
        else:
            # Raised explicitly (instead of `assert False`) so the check also
            # fires under `python -O`; otherwise `pcs` would be left unbound
            # and the failure would surface later as an unrelated error.
            raise AssertionError("please precompute PCs")

        # only use PCs: drop the SNP-based similarity matrix entirely
        if self.pcs_only:
            G_sim = None
            logging.info("Using PCs only in LocoGWAS")

        gwas = FastGwas(G_sim, G_test, y, self.delta, train_pcs=pcs, mixing=self.mixing)
        gwas.run_gwas()

        # sanity check: one p-value per tested SNP
        assert len(gwas.p_values) == len(test_snp_idx)

        # wrap up results
        return test_snp_idx, gwas.p_values, result
Example #4
0
def execute_fs(test_snps, pheno, G0, covar):
    """
    Run feature selection on G0 and test test_snps against the selected SNPs.

    test_snps : forwarded unchanged to single_snp
    pheno     : mapping whose "vals" entry is written out as the phenotype
                column; also forwarded unchanged to single_snp
    G0        : SNP data exposing .iid (two id columns), .sid and
                column indexing G0[:, idx]
    covar     : mapping whose "vals" entry is a 2-d array; each column is
                written out as one covariate ("pc_0", "pc_1", ...)

    Returns (result, fs_result):
      result["fs_all"]                 p-values from single_snp with G0
                                       restricted to the selected SNPs,
                                       ordered by ("Chr", "ChrPos")
      result["fs_all_cov"]             same, additionally passing covar
      fs_result["result_uncond_all"]   the 4-tuple returned by the feature
                                       selection run

    NOTE: the phenotype and covariate files are written to the current
    working directory and are not removed by this function.
    """

    def _pvalues_by_position(frame):
        # DataFrame.sort(columns) was superseded by sort_values (pandas 0.17)
        # and later removed; fall back to it only on very old pandas.
        sort_by = getattr(frame, "sort_values", None)
        if sort_by is None:
            sort_by = frame.sort
        # .values replaces the removed .as_matrix()
        return sort_by(["Chr", "ChrPos"])["PValue"].values

    result = {}
    fs_result = {}

    # fs unconditioned
    ########################
    # short random tag shared by both temporary file names
    tmp_uuid = str(uuid.uuid4())[0:13]

    # write out phenotype
    out_fn = "tmp_pheno_%s.txt" % (tmp_uuid)
    out_data = pd.DataFrame(
        {"id1": G0.iid[:,0], "id2": G0.iid[:,1], "y": pheno["vals"]},
        columns=["id1", "id2", "y"],
    )
    out_data.to_csv(out_fn, sep=" ", header=False, index=False)

    # write out covariates
    items = [
                ('id1', G0.iid[:,0]),
                ('id2', G0.iid[:,1]),
            ]

    # range instead of the Python-2-only xrange
    items += [("pc_%i" % i, covar["vals"][:,i]) for i in range(covar["vals"].shape[1])]
    # DataFrame.from_items was removed from pandas; passing the column order
    # explicitly keeps the same layout on every pandas/Python version.
    cov_df = pd.DataFrame(dict(items), columns=[name for name, _ in items])
    cov_fn = "tmp_cov_%s.txt" % (tmp_uuid)
    cov_df.to_csv(cov_fn, sep=" ", header=False, index=False)

    #TODO: fix include_all!!
    fsd = create_feature_selection_distributable(G0, out_fn, None, 0, "fs_out", include_all=False, cov_fn=cov_fn)
    fs_result["result_uncond_all"] = Local().run(fsd)
    # only the selected SNP names are needed below
    _, _, _, best_snps = fs_result["result_uncond_all"]
    fs_idx = argintersect_left(G0.sid, best_snps)

    G_fs = G0[:,fs_idx]

    result["fs_all"] = _pvalues_by_position(single_snp(test_snps, pheno, G0=G_fs))
    result["fs_all_cov"] = _pvalues_by_position(single_snp(test_snps, pheno, G0=G_fs, covar=covar))

    return result, fs_result