Esempio n. 1
0
    def test_c_reader_pheno(self):
        """Exercise the ``Pheno`` reader against the toy phenotype file.

        Checks, in order: the dtype of the values read, a write/read round
        trip that includes a missing value, construction from the
        dictionaries returned by ``loadOnePhen`` (default and
        ``vectorize=True``), the ``iid_if_none`` fallback, and the
        ``.id.phe`` / ``.fid.phe`` example files.
        """
        toy_phe = self.currentFolder + "/examples/toydata.phe"
        snpdata1 = Pheno(toy_phe).read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        # Inject a missing value to test writing and reading missing values.
        # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in NumPy 2.0.
        snpdata1.val[1, 0] = np.nan
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        str(snpreader)  # only checks that the reader can be rendered as a string
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        # Re-read the original file so later comparisons use unmodified values.
        snpdata1 = Pheno(toy_phe).read()
        import pysnptools.util.pheno as pstpheno
        pheno_dict = pstpheno.loadOnePhen(toy_phe, missing="")
        snpdata3 = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        pheno_dict = pstpheno.loadOnePhen(toy_phe, missing="", vectorize=True)
        assert len(pheno_dict['vals'].shape) == 1, "test 1-d array of values"
        snpdata3 = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        # With no input, the reader takes its rows from iid_if_none and has no columns.
        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)
        snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
Esempio n. 2
0
    def test_c_reader_pheno(self):
        """Check the ``Pheno`` reader on the toy phenotype example.

        The test writes the data (with one missing value) to a temporary
        file and reads it back, builds readers from ``loadOnePhen``
        dictionaries (2-D and vectorized 1-D values), checks the
        ``iid_if_none`` fallback and finally compares the ``.id.phe`` and
        ``.fid.phe`` example files with the reference values.
        """
        examples = self.currentFolder + "/examples"
        snpdata1 = Pheno(examples + "/toydata.phe").read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        # Inject a missing value to test writing and reading missing values.
        # The ``np.NaN`` spelling was removed in NumPy 2.0; ``np.nan`` works everywhere.
        snpdata1.val[1, 0] = np.nan
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        str(snpreader)  # the string conversion itself is what is being exercised
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata2.val,
                                             decimal=10)

        # Start again from the unmodified file for the remaining comparisons.
        snpdata1 = Pheno(examples + "/toydata.phe").read()
        import pysnptools.util.pheno as pstpheno
        loaded = pstpheno.loadOnePhen(examples + "/toydata.phe", missing="")
        snpdata3 = Pheno(loaded).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        loaded = pstpheno.loadOnePhen(examples + "/toydata.phe",
                                      missing="",
                                      vectorize=True)
        assert len(loaded['vals'].shape) == 1, "test 1-d array of values"
        snpdata3 = Pheno(loaded).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        # No input at all: rows come from iid_if_none and there are no columns.
        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(examples + "/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata5.val,
                                             decimal=10)
        snpdata6 = Pheno(examples + "/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata6.val,
                                             decimal=10)
Esempio n. 3
0
    def load_data(self):
        """Load phenotype, covariates and SNPs and align them with ``intersect_apply``.

        Sets ``self.sid``, ``self.ind_iid``, ``self.X``, ``self.y`` and
        ``self.G``; ``self.X`` and ``self.y`` are marked read-only afterwards.

        Raises
        ------
        Exception
            If ``self.num_snps_in_memory`` is not greater than
            ``self.snpreader.iid_count``.
        """
        tt0 = time.time()
        logging.info("loading data...")

        iid_count = self.snpreader.iid_count
        if self.num_snps_in_memory <= iid_count:
            # Report the value that was actually compared (the message used to
            # format an unrelated attribute, self.total_num_ind).
            raise Exception("Expect self.num_snps_in_memory, {0} > self.snpreader.iid_count, {1}".format(self.num_snps_in_memory, iid_count))

        self.sid = pd.Series(self.snpreader.sid)

        # load phenotype
        pheno = pstpheno.loadOnePhen(self.pheno_fn, self.mpheno, vectorize=True)
        self.ind_iid = pheno['iid'] #!!LATER: bug? It looks like we record the pre-intersect iids only to write out the pcs later? Why?

        # load covariates
        self.X, cov_iid = self.load_covariates(pheno)

        # Set up the snps
        # G is the standardized snps. The GClass.factory will either load them into memory or will note their file and read them as needed.
        self.G = GClass.factory(self.snpreader, self.num_snps_in_memory, self.standardizer, self.blocksize)

        #!!LATER Should we give preference to self.G since reordering it is the most expensive?
        # The iids returned for y and X are not needed after the intersection.
        (self.y, _), (self.X, _), self.G = pstutil.intersect_apply([(pheno['vals'], pheno['iid']), (self.X, cov_iid), self.G], sort_by_dataset=False)

        # make sure input data isn't modified
        self.X.flags.writeable = False
        self.y.flags.writeable = False

        logging.info("...done. Loading time %.2f s", time.time() - tt0)
Esempio n. 4
0
    def setUpClass(self):
        """Load the mouse test data set once and keep read-only arrays for the tests.

        Stores the unit-standardized SNP values as ``G``, the first phenotype
        column as ``y`` and a single column of ones as ``G_cov``.
        """
        here = os.path.dirname(os.path.realpath(__file__))
        mouse_dir = here + "/../../tests/datasets/mouse"
        self.snp_fn = mouse_dir + "/alldata"
        self.pheno_fn = mouse_dir + "/pheno_10_causals.txt"

        # load the data and intersect the sample ids of SNPs and phenotype
        bed_reader, phen = pysnptools.util.intersect_apply(
            [Bed(self.snp_fn), pstpheno.loadOnePhen(self.pheno_fn)])

        raw_snps = bed_reader.read(order='C').val
        self.G = stdizer.Unit().standardize(raw_snps)
        self.G.flags.writeable = False

        self.y = phen['vals'][:, 0]
        self.y.flags.writeable = False

        # no covariate file is read here; a constant column is used instead
        sample_count = len(self.y)
        self.G_cov = np.ones((sample_count, 1))
        self.G_cov.flags.writeable = False
Esempio n. 5
0
    def load_data(self):
        """Load phenotype, covariates and SNPs and align them with ``intersect_apply``.

        The whole load runs with the ``ARRAY_MODULE`` environment variable
        patched to ``'numpy'``. Sets ``self.sid``, ``self.ind_iid``,
        ``self.X``, ``self.y`` and ``self.G``; ``self.X`` and ``self.y`` are
        marked read-only afterwards.

        Raises
        ------
        Exception
            If ``self.num_snps_in_memory`` is not greater than
            ``self.snpreader.iid_count``.
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            tt0 = time.time()
            logging.info("loading data...")

            iid_count = self.snpreader.iid_count
            if self.num_snps_in_memory <= iid_count:
                # Report the value that was actually compared (the message used
                # to format an unrelated attribute, self.total_num_ind).
                raise Exception("Expect self.num_snps_in_memory, {0} > self.snpreader.iid_count, {1}".format(self.num_snps_in_memory, iid_count))

            self.sid = pd.Series(self.snpreader.sid)

            # load phenotype
            pheno = pstpheno.loadOnePhen(self.pheno_fn, self.mpheno, vectorize=True)
            self.ind_iid = pheno['iid'] #!!LATER: bug? It looks like we record the pre-intersect iids only to write out the pcs later? Why?

            # load covariates
            self.X, cov_iid = self.load_covariates(pheno)

            # Set up the snps
            # G is the standardized snps. The GClass.factory will either load them into memory or will note their file and read them as needed.
            self.G = GClass.factory(self.snpreader, self.num_snps_in_memory, self.standardizer, self.blocksize, count_A1=self.count_A1)

            #!!LATER Should we give preference to self.G since reordering it is the most expensive?
            # The iids returned for y and X are not needed after the intersection.
            (self.y, _), (self.X, _), self.G = pstutil.intersect_apply([(pheno['vals'], pheno['iid']), (self.X, cov_iid), self.G], sort_by_dataset=False)

            # make sure input data isn't modified
            self.X.flags.writeable = False
            self.y.flags.writeable = False

            logging.info("...done. Loading time %.2f s", time.time() - tt0)
Esempio n. 6
0
def loadPheno(bed, phenoFile, missingPhenotype="-9", keepDict=False):
    """Load one phenotype from ``phenoFile`` and intersect it with ``bed``.

    Returns ``(bed, pheno)`` after ``intersect_apply``. ``pheno`` is the
    whole phenotype dictionary when ``keepDict`` is true, otherwise only
    its ``"vals"`` entry.
    """
    phenoDict = phenoUtils.loadOnePhen(phenoFile, vectorize=True, missing=missingPhenotype)
    checkIntersection(bed, phenoDict, "phenotypes")
    bed, phenoDict = pstutil.intersect_apply([bed, phenoDict])
    if keepDict:
        return bed, phenoDict
    return bed, phenoDict["vals"]
Esempio n. 7
0
    def setUpClass(self):
        """Prepare the shared fixtures from the mouse data set.

        After this runs, ``G`` holds the unit-standardized SNP values, ``y``
        the first phenotype column and ``G_cov`` one column of ones; all
        three arrays are flagged read-only.
        """
        base = os.path.dirname(os.path.realpath(__file__))
        dataset_dir = base + "/../../tests/datasets/mouse"
        self.snp_fn = dataset_dir + "/alldata"
        self.pheno_fn = dataset_dir + "/pheno_10_causals.txt"

        # read both inputs, then intersect their sample ids
        readers = [Bed(self.snp_fn), pstpheno.loadOnePhen(self.pheno_fn)]
        snps, phenotype = pysnptools.util.intersect_apply(readers)

        unstandardized = snps.read(order='C').val
        self.G = stdizer.Unit().standardize(unstandardized)
        self.G.flags.writeable = False

        self.y = phenotype['vals'][:, 0]
        self.y.flags.writeable = False

        # a constant column stands in for covariates (no covariate file is loaded)
        self.G_cov = np.ones((len(self.y), 1))
        self.G_cov.flags.writeable = False
Esempio n. 8
0
def load_snp_data(snpreader,
                  pheno_fn,
                  cov_fn=None,
                  offset=True,
                  mpheno=0,
                  standardizer=Unit()):
    """Load SNPs, phenotype and covariates and align them by individual.

    Parameters
    ----------
    snpreader : snpreader object
        object to read in binary SNP file

    pheno_fn : str
        File name of phenotype file

    cov_fn : str
        File name of covariates file. When None, a single column of ones
        is used as the covariate matrix.

    offset : bool, default=True
        Adds an offset (column of ones) to the covariates if no covariate
        column is constant

    mpheno : int, default=0
        Passed to ``loadOnePhen`` as its second positional argument

    standardizer : standardizer object, default=Unit()
        Applied to the SNP data right after reading


    Returns
    -------
    G : result of ``read(order='C', view_ok=True)`` on the intersected SNP data

    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)

    y : array, shape = [n_samples]
        Phenotype (target) vector

    """

    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        cov = {'vals': np.ones((len(pheno['iid']), 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    # the iids returned alongside y and X are not needed afterwards
    (y, _), G, (X, _) = pstutil.intersect_apply(
        [(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])],
        sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present
    # (the row count comes from X itself; the previous code referenced an
    # undefined name and used NumPy aliases that SciPy no longer exports)
    if offset and np.all(X.std(0) != 0):
        X = np.hstack((X, np.ones((X.shape[0], 1))))

    return G, X, y
Esempio n. 9
0
def loadPheno(bed, phenoFile, missingPhenotype='-9', keepDict=False):
    """Read a single phenotype and restrict ``bed`` and the phenotype to common individuals.

    The return value is ``(bed, pheno)``; ``pheno`` is the phenotype
    dictionary if ``keepDict`` is set and its ``'vals'`` entry otherwise.
    """
    loaded = phenoUtils.loadOnePhen(phenoFile,
                                    missing=missingPhenotype,
                                    vectorize=True)
    checkIntersection(bed, loaded, 'phenotypes')
    bed, loaded = pstutil.intersect_apply([bed, loaded])
    result = loaded if keepDict else loaded['vals']
    return bed, result
Esempio n. 10
0
def loadCovars(bed, covarFile):
    """Load covariates from ``covarFile``, intersect with ``bed`` and standardize them.

    Each column is centered and scaled in place to unit standard deviation.
    A constant column has zero standard deviation; it is left as all zeros
    after centering instead of being divided by zero (which produced NaN).

    Returns the standardized covariate array (the ``"vals"`` entry of the
    intersected covariate dictionary).
    """
    covarsDict = phenoUtils.loadOnePhen(covarFile, vectorize=False)
    checkIntersection(bed, covarsDict, "covariates", checkSuperSet=True)
    _, covarsDict = pstutil.intersect_apply([bed, covarsDict])
    covar = covarsDict["vals"]
    covar -= np.mean(covar, axis=0)
    scale = np.std(covar, axis=0)
    # avoid 0/0 for constant columns: dividing by 1 keeps them at zero
    covar /= np.where(scale == 0, 1.0, scale)
    return covar
Esempio n. 11
0
def loadCovars(bed, covarFile):
    """Read the covariates file, align it with ``bed`` and z-score every column.

    Centering and scaling happen in place on the ``'vals'`` array of the
    intersected covariate dictionary, which is also the return value.
    Columns with zero standard deviation (constant covariates) are only
    centered, so they end up as zeros rather than NaN.
    """
    covarsDict = phenoUtils.loadOnePhen(covarFile, vectorize=False)
    checkIntersection(bed, covarsDict, 'covariates', checkSuperSet=True)
    _, covarsDict = pstutil.intersect_apply([bed, covarsDict])
    covar = covarsDict['vals']
    covar -= np.mean(covar, axis=0)
    column_std = np.std(covar, axis=0)
    # replace zero deviations by 1 so constant columns are not divided by zero
    covar /= np.where(column_std == 0, 1.0, column_std)
    return covar
Esempio n. 12
0
def loadRelatedFile(bed, relFile):
    """Load a relatedness file and return a boolean keep-mask.

    The values are loaded with ``loadOnePhen`` and intersected with
    ``bed``; an entry of the returned array is True when the corresponding
    value is below 0.5. The number of entries that are not kept is printed.
    """
    relatedDict = phenoUtils.loadOnePhen(relFile, vectorize=True)
    checkIntersection(bed, relatedDict, "relatedness", checkSuperSet=True)
    _, relatedDict = pstutil.intersect_apply([bed, relatedDict])
    related = relatedDict["vals"]
    keepArr = related < 0.5
    # single pre-formatted argument: prints the same text on Python 2 and 3
    print("%d individuals will be removed due to high relatedness" % np.sum(~keepArr))
    return keepArr
Esempio n. 13
0
def loadRelatedFile(bed, relFile):
    """Build a mask of individuals to keep from a relatedness file.

    Values are read with ``loadOnePhen`` and intersected with ``bed``.
    The returned boolean array is True where the value is below 0.5; the
    count of the remaining (False) entries is reported on stdout.
    """
    relatedDict = phenoUtils.loadOnePhen(relFile, vectorize=True)
    checkIntersection(bed, relatedDict, 'relatedness', checkSuperSet=True)
    _, relatedDict = pstutil.intersect_apply([bed, relatedDict])
    related = relatedDict['vals']
    keepArr = (related < 0.5)
    removed = np.sum(~keepArr)
    # formatted into one string so the call is valid on both Python 2 and 3
    print('%d individuals will be removed due to high relatedness' % removed)
    return keepArr
Esempio n. 14
0
def main():
    """
    example that compares output to fastlmmc

    Runs ``WindowingGwas`` and the reference ``GwasTest`` on the toy data
    with the same delta, plots the log p-values against each other,
    asserts that they agree to 3 decimals and draws a Manhattan plot.
    """

    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom.bed"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    # Protect y like G; the original statement lacked the assignment and
    # therefore did nothing.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn,
                      phen_fn,
                      snp_pos_sim,
                      snp_pos_test,
                      delta,
                      REML=REML,
                      excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    # diagonal reference line
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv),
                                         np.log(gwas_c.p_values),
                                         decimal=3)

    simple_manhattan_plot(pv)
Esempio n. 15
0
def _fixup_pheno(pheno, bed=None, missingPhenotype='-9'):
    """Turn a phenotype file name into loaded phenotype data.

    If ``pheno`` is a string it is loaded (and intersected with ``bed``
    when one is given); anything else is passed through unchanged.
    Returns ``(bed, pheno)`` when ``bed`` is given, otherwise just the
    phenotype.
    """
    has_bed = bed is not None

    if not isinstance(pheno, str):
        # already loaded: nothing to do
        if has_bed:
            return bed, pheno
        return pheno

    if has_bed:
        bed, phenoDict = loadPheno(bed, pheno, missingPhenotype, keepDict=True)
        return bed, phenoDict

    return phenoUtils.loadOnePhen(pheno, missing=missingPhenotype, vectorize=True)
Esempio n. 16
0
def load_snp_data(snpreader, pheno_fn, cov_fn=None, offset=True, mpheno=0, standardizer=Unit()):
    """Load SNPs, phenotype and covariates and align them by individual.

    Parameters
    ----------
    snpreader : snpreader object
        object to read in binary SNP file

    pheno_fn : str
        File name of phenotype file

    cov_fn : str
        File name of covariates file. When None, a single column of ones
        is used as the covariate matrix.

    offset : bool, default=True
        Adds an offset (column of ones) to the covariates if no covariate
        column is constant

    mpheno : int, default=0
        Passed to ``loadOnePhen`` as its second positional argument

    standardizer : standardizer object, default=Unit()
        Applied to the SNP data right after reading


    Returns
    -------
    G : result of ``read(order='C', view_ok=True)`` on the intersected SNP data

    X : array, shape = [n_samples, n_covariates]
        Matrix of covariates (e.g. age, gender)

    y : array, shape = [n_samples]
        Phenotype (target) vector

    """

    #TODO: completely remove this
    pheno = pstpheno.loadOnePhen(pheno_fn, mpheno, vectorize=True)
    geno = snpreader.read(order='C').standardize(standardizer)

    # load covariates or generate vector of ones (for bias)
    if cov_fn is None:
        cov = {'vals': np.ones((len(pheno['iid']), 1)), 'iid': pheno['iid']}
    else:
        cov = pstpheno.loadPhen(cov_fn)

    # the iids returned alongside y and X are not needed afterwards
    (y, _), G, (X, _) = pstutil.intersect_apply([(pheno['vals'], pheno['iid']), geno, (cov['vals'], cov['iid'])], sort_by_dataset=False)
    G = G.read(order='C', view_ok=True)

    # add bias column if not present
    # (sized from X; the earlier code used the undefined name ``indarr`` and
    # NumPy aliases that SciPy no longer exports)
    if offset and np.all(X.std(0) != 0):
        bias = np.ones((X.shape[0], 1))
        X = np.hstack((X, bias))

    return G, X, y
    def test_preload_files(self):
        """Run ``single_snp`` with phenotype and covariates loaded up front.

        The first 10 SNPs of the Bed file are tested and the result is
        compared with the "one" reference.
        """
        logging.info("TestSingleSnp test_preload_files")
        bed_fn = self.bedbase
        loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        loaded_covar = pstpheno.loadPhen(self.cov_fn)
        bed = Bed(bed_fn, count_A1=False)

        out_fn = self.file_name("preload_files")

        result = single_snp(
            test_snps=bed[:, :10],
            pheno=loaded_pheno,
            G0=bed_fn,
            mixing=0,
            leave_out_one_chrom=False,
            covar=loaded_covar,
            output_file_name=out_fn,
            count_A1=False,
        )
        self.compare_files(result, "one")
Esempio n. 18
0
def _fixup_pheno(pheno, bed=None, missingPhenotype='-9'):
    """Load ``pheno`` if it is a file name; otherwise return it as given.

    With a ``bed`` the result is the pair ``(bed, pheno)`` (for a file
    name, both come from ``loadPheno`` with ``keepDict=True``); without
    one, only the phenotype is returned.
    """
    if isinstance(pheno, str):
        if bed is None:
            return phenoUtils.loadOnePhen(pheno,
                                          missing=missingPhenotype,
                                          vectorize=True)
        bed, loaded = loadPheno(bed, pheno, missingPhenotype, keepDict=True)
        return bed, loaded

    # pheno was supplied in loaded form
    if bed is None:
        return pheno
    return bed, pheno
Esempio n. 19
0
def main():
    """
    example that compares output to fastlmmc

    Runs ``WindowingGwas`` and the reference ``GwasTest`` on the toy data
    with the same delta, plots the log p-values against each other,
    asserts that they agree to 3 decimals and draws a Manhattan plot.
    """

    # set up data
    phen_fn = "../feature_selection/examples/toydata.phe"
    snp_fn = "../feature_selection/examples/toydata.5chrom"
    #chrom_count = 5

    # load data
    ###################################################################
    snp_reader = Bed(snp_fn)
    pheno = pstpheno.loadOnePhen(phen_fn)

    cov = None
    #cov = pstpheno.loadPhen(self.cov_fn)

    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    G = snp_reader.read(order='C').val
    G = stdizer.Unit().standardize(G)
    G.flags.writeable = False
    y = pheno['vals'][:, 0]
    # Mark y read-only as well; the bare attribute access that stood here
    # had no effect.
    y.flags.writeable = False

    # load pcs
    #G_pc = cov['vals']
    #G_pc.flags.writeable = False
    delta = 2.0
    gwas = WindowingGwas(G, y, delta=delta)
    pv = gwas.run_gwas()

    from fastlmm.association.tests.test_gwas import GwasTest
    REML = False
    snp_pos_sim = snp_reader.sid
    snp_pos_test = snp_reader.sid
    os.environ["FastLmmUseAnyMklLib"] = "1"
    gwas_c = GwasTest(snp_fn, phen_fn, snp_pos_sim, snp_pos_test, delta, REML=REML, excludeByPosition=0)
    gwas_c.run_gwas()

    import pylab
    pylab.plot(np.log(pv), np.log(gwas_c.p_values), "+")
    # diagonal reference line
    pylab.plot(np.arange(-18, 0), np.arange(-18, 0), "-k")
    pylab.show()

    np.testing.assert_array_almost_equal(np.log(pv), np.log(gwas_c.p_values), decimal=3)

    simple_manhattan_plot(pv)
Esempio n. 20
0
    def test_preload_files(self):
        """Check ``single_snp`` when phenotype and covariates are passed already loaded.

        Tests the first 10 SNPs and compares the output with the "one"
        reference.
        """
        logging.info("TestSingleSnp test_preload_files")
        from pysnptools.snpreader import Bed
        bed_fn = self.bedbase
        phen = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        cov = pstpheno.loadPhen(self.cov_fn)
        reader = Bed(bed_fn)

        out_fn = self.file_name("preload_files")

        result = single_snp(
            test_snps=reader[:, :10],
            pheno=phen,
            G0=bed_fn,
            mixing=0,
            covar=cov,
            output_file_name=out_fn,
        )
        self.compare_files(result, "one")
Esempio n. 21
0
def load_intersect(snp_reader, pheno_fn_or_none, snp_set=AllSnps()):
    """
    load SNPs and phenotype, intersect ids
    ----------------------------------------------------------------------
    Input:
    snp_reader       : SnpReader object (e.g. BedReader)
    pheno_fn_or_none : str, file name of phenotype file, or None to skip
                       loading a phenotype
    snp_set          : SNP set passed on to snp_reader.read
    ----------------------------------------------------------------------
    Output:
    G         : numpy array containing the standardized SNP data
    y         : numpy (1d) containing phenotype, or None
    snp_names : the 'rs' entry returned by the reader
    chr_ids   : first column of the 'pos' entry returned by the reader
    ----------------------------------------------------------------------
    """

    standardizer = stdizer.Unit()

    geno = snp_reader.read(order='C', snp_set=snp_set)
    G = geno['snps']
    G = standardizer.standardize(G)

    snp_names = geno['rs']
    chr_ids = geno['pos'][:, 0]

    if pheno_fn_or_none is not None:

        # load phenotype
        pheno = pstpheno.loadOnePhen(pheno_fn_or_none, 0)
        y = pheno['vals'][:, 0]

        # load covariates and intersect ids
        import warnings
        warnings.warn(
            "This intersect_ids is deprecated. Pysnptools includes newer versions of intersect_ids",
            DeprecationWarning)
        indarr = util.intersect_ids([pheno['iid'], snp_reader.original_iids])

        if not (indarr[:, 0] == indarr[:, 1]).all():
            assert False, "ERROR: this code assumes the same order for snp and phen file"

            # Only reached when asserts are disabled (python -O).
            # Parenthesized single argument prints identically on Python 2 and 3.
            print("reindexing")
            y = y[indarr[:, 0]]
            G = G[indarr[:, 1]]
    else:
        y = None

    return G, y, snp_names, chr_ids
Esempio n. 22
0
def load_intersect(snp_reader, pheno_fn_or_none,snp_set=AllSnps()):
    """
    load SNPs and phenotype, intersect ids
    ----------------------------------------------------------------------
    Input:
    snp_reader       : SnpReader object (e.g. BedReader)
    pheno_fn_or_none : str, file name of phenotype file, or None to skip
                       loading a phenotype
    snp_set          : SNP set passed on to snp_reader.read
    ----------------------------------------------------------------------
    Output:
    G         : numpy array containing the standardized SNP data
    y         : numpy (1d) containing phenotype, or None
    snp_names : the 'rs' entry returned by the reader
    chr_ids   : first column of the 'pos' entry returned by the reader
    ----------------------------------------------------------------------
    """

    standardizer = stdizer.Unit()

    geno = snp_reader.read(order='C',snp_set=snp_set)
    G = geno['snps']
    G = standardizer.standardize(G)

    snp_names = geno['rs']
    chr_ids = geno['pos'][:,0]

    if pheno_fn_or_none is not None:

        # load phenotype
        pheno = pstpheno.loadOnePhen(pheno_fn_or_none, 0)
        y = pheno['vals'][:,0]

        # load covariates and intersect ids
        import warnings
        warnings.warn("This intersect_ids is deprecated. Pysnptools includes newer versions of intersect_ids", DeprecationWarning)
        indarr = util.intersect_ids([pheno['iid'], snp_reader.original_iids])

        if not (indarr[:,0] == indarr[:,1]).all():
            assert False, "ERROR: this code assumes the same order for snp and phen file"

            # Only reached when asserts are disabled (python -O).
            # Parenthesized single argument prints identically on Python 2 and 3.
            print("reindexing")
            y = y[indarr[:,0]]
            G = G[indarr[:,1]]
    else:
        y = None

    return G, y, snp_names, chr_ids
    def test_SNC(self):
        """Run ``single_snp`` on data in which one SNP column is constant.

        Column 2 of the in-memory SNP values is overwritten with zeros
        before the first 10 SNPs are tested; the result is compared with
        the "snc" reference.
        """
        logging.info("TestSNC")
        bed_fn = self.bedbase
        loaded_pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        loaded_covar = pstpheno.loadPhen(self.cov_fn)
        snc = Bed(bed_fn, count_A1=False).read()
        # make SNP #2 have constant values (aka a SNC)
        snc.val[:, 2] = [0] * snc.iid_count

        out_fn = self.file_name("snc")

        result = single_snp(
            test_snps=snc[:, :10],
            pheno=loaded_pheno,
            G0=snc,
            mixing=0,
            leave_out_one_chrom=False,
            covar=loaded_covar,
            output_file_name=out_fn,
            count_A1=False,
        )
        self.compare_files(result, "snc")
    def test_cid_intersect(self):
        """Run ``single_snp`` with a phenotype whose individuals are reordered and extended.

        The phenotype rows are reversed and a 'Bogus' individual is
        appended; the result must still match the "one" reference.
        """
        logging.info("TestSingleSnp test_cid_intersect")
        snps = Bed(self.bedbase, count_A1=False)
        pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

        # reverse the order and add one individual with a dummy value
        flipped_iid = pheno['iid'][::-1]
        pheno['iid'] = np.vstack([flipped_iid, [['Bogus', 'Bogus']]])
        flipped_vals = pheno['vals'][::-1]
        pheno['vals'] = np.hstack([flipped_vals, [-34343]])

        covar_fn = self.cov_fn
        out_fn = self.file_name("cid_intersect")
        result = single_snp(
            test_snps=snps[:, :10],
            pheno=pheno,
            G0=snps,
            leave_out_one_chrom=False,
            covar=covar_fn,
            mixing=0,
            output_file_name=out_fn,
            count_A1=False,
        )

        self.compare_files(result, "one")
Esempio n. 25
0
    def test_preload_files(self):
        """Verify ``single_snp`` with preloaded phenotype and covariate data.

        Only the first 10 SNPs are tested; the output is compared with the
        "one" reference.
        """
        logging.info("TestSingleSnp test_preload_files")
        bed_fn = self.bedbase
        phen = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        cov = pstpheno.loadPhen(self.cov_fn)
        reader = Bed(bed_fn)

        out_fn = self.file_name("preload_files")

        settings = dict(pheno=phen,
                        G0=bed_fn,
                        mixing=0,
                        leave_out_one_chrom=False,
                        covar=cov,
                        output_file_name=out_fn)
        result = single_snp(test_snps=reader[:, :10], **settings)
        self.compare_files(result, "one")
Esempio n. 26
0
    def test_preload_files(self):
        """Run ``epistasis`` with phenotype and covariates loaded up front.

        SNP ids 0-9 are paired with SNP ids 5-14 and the resulting
        SNP0/SNP1/PValue columns are compared with the "one" reference.
        """
        logging.info("TestEpistasis test_preload_files")
        from pysnptools.snpreader import Bed
        bed_fn = self.bedbase
        phen = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        cov = pstpheno.loadPhen(self.cov_fn)
        reader = Bed(bed_fn)

        out_fn = self.file_name("preload_files")

        result = epistasis(
            bed_fn,
            phen,
            G0=bed_fn,
            covar=cov,
            sid_list_0=reader.sid[:10],    # first 10 snps
            sid_list_1=reader.sid[5:15],   # skip 5 snps, use next 10
            output_file_name=out_fn,
        )
        sid0 = np.array(result['SNP0'])
        sid1 = np.array(result['SNP1'])
        pvalue_list = np.array(result['PValue'])
        self.compare_files(sid0, sid1, pvalue_list, "one")
Esempio n. 27
0
    def test_cid_intersect(self):
        """Check ``single_snp`` when the phenotype individuals differ from the SNP data.

        The phenotype is reversed and gets one extra 'Bogus' individual;
        the output is compared with the "one" reference.
        """
        logging.info("TestSingleSnp test_cid_intersect")
        snps = Bed(self.bedbase)
        pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

        # reversed order plus one individual with a dummy value
        bogus_row = [['Bogus', 'Bogus']]
        pheno['iid'] = np.vstack([pheno['iid'][::-1], bogus_row])
        bogus_value = [-34343]
        pheno['vals'] = np.hstack([pheno['vals'][::-1], bogus_value])

        out_fn = self.file_name("cid_intersect")
        result = single_snp(
            test_snps=snps[:, :10],
            pheno=pheno,
            G0=snps,
            leave_out_one_chrom=False,
            covar=self.cov_fn,
            mixing=0,
            output_file_name=out_fn,
        )

        self.compare_files(result, "one")
Esempio n. 28
0
    def _run_once(self):
        """Resolve file-name inputs into loaded objects; does nothing after the first successful run.

        ``_ran_once`` is set to None while the work is in progress and to
        True only at the very end.
        """
        if self._ran_once:
            return
        self._ran_once = None

        # string inputs are treated as file names
        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)

        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            self.pheno = pstpheno.loadOnePhen(self.pheno, vectorize=True, missing='NaN')

        # None is never a str, so no separate None check is needed
        if isinstance(self.covar, str):
            self.covar = pstpheno.loadPhen(self.covar, missing='NaN')

        if isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        # default both sid lists to all test SNPs
        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid

        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add only of no constant columns - will need to add a test case for this
        ones_column = np.ones((self.test_snps.iid_count, 1))
        if self.covar is None:
            self.covar = ones_column
        else:
            self.covar = np.hstack((self.covar['vals'], ones_column))
        self.n_cov = self.covar.shape[1]

        prefix = "" if self.output_file_or_none is None else self.output_file_or_none
        self.__tempdirectory = prefix + ".working"

        self._ran_once = True
Esempio n. 29
0
    def _run_once(self):
        """Load any inputs given as file names and fill in defaults, once.

        Returns immediately if ``_ran_once`` is already true; otherwise the
        flag is None during the work and True afterwards.
        """
        if self._ran_once:
            return
        self._ran_once = None

        # string inputs are treated as file names
        if isinstance(self.test_snps, str):
            self.test_snps = Bed(self.test_snps)

        if isinstance(self.G0, str):
            self.G0 = Bed(self.G0)

        if isinstance(self.pheno, str):
            #!! what about missing=-9?
            self.pheno = pstpheno.loadOnePhen(self.pheno, vectorize=True)

        # None is never a str, so no separate None check is needed
        if isinstance(self.covar, str):
            #!! what about missing=-9?
            self.covar = pstpheno.loadPhen(self.covar)

        if isinstance(self.G1_or_none, str):
            self.G1_or_none = Bed(self.G1_or_none)

        # default both sid lists to all test SNPs
        if self.sid_list_0 is None:
            self.sid_list_0 = self.test_snps.sid

        if self.sid_list_1 is None:
            self.sid_list_1 = self.test_snps.sid

        self.set_sid_sets()

        #!!Should fix up to add only of no constant columns - will need to add a test case for this
        bias = np.ones((self.test_snps.iid_count, 1))
        self.covar = bias if self.covar is None else np.hstack((self.covar['vals'], bias))
        self.n_cov = self.covar.shape[1]

        if self.output_file_or_none is not None:
            self.__tempdirectory = self.output_file_or_none + ".working"
        else:
            self.__tempdirectory = ".working"

        self._ran_once = True
Esempio n. 30
0
    def test_SNC(self):
        """Check ``single_snp`` when one of the tested SNPs is constant.

        SNP column 2 is set to zero in memory, the first 10 SNPs are
        tested and the result is compared with the "snc" reference.
        """
        logging.info("TestSNC")
        bed_fn = self.bedbase
        phen = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        cov = pstpheno.loadPhen(self.cov_fn)
        snc = Bed(bed_fn, count_A1=False).read()
        # make SNP #2 have constant values (aka a SNC)
        snc.val[:, 2] = 0

        out_fn = self.file_name("snc")

        options = dict(pheno=phen,
                       G0=snc,
                       mixing=0,
                       leave_out_one_chrom=False,
                       covar=cov,
                       output_file_name=out_fn,
                       count_A1=False)
        result = single_snp(test_snps=snc[:, :10], **options)
        self.compare_files(result, "snc")
Esempio n. 31
0
    def test_cid_intersect(self):
        """Run ``epistasis`` with a reversed phenotype that has an extra individual.

        SNP ids 0-9 are paired with SNP ids 5-14; the SNP0/SNP1/PValue
        columns of the result are compared with the "one" reference.
        """
        logging.info("TestEpistasis test_cid_intersect")
        from pysnptools.snpreader import Bed
        snps = Bed(self.bedbase)
        pheno = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)

        # reverse the individuals and append a bogus one with a dummy value
        reversed_iid = pheno['iid'][::-1]
        pheno['iid'] = np.vstack([reversed_iid, [['Bogus', 'Bogus']]])
        reversed_vals = pheno['vals'][::-1]
        pheno['vals'] = np.hstack([reversed_vals, [-34343]])

        covar_fn = self.cov_fn
        out_fn = self.file_name("cid_intersect")
        result = epistasis(
            snps,
            pheno,
            G0=snps,
            covar=covar_fn,
            sid_list_0=snps.sid[:10],    # first 10 snps
            sid_list_1=snps.sid[5:15],   # skip 5 snps, use next 10
            output_file_name=out_fn,
        )

        sid0 = np.array(result['SNP0'])
        sid1 = np.array(result['SNP1'])
        pvalue_list = np.array(result['PValue'])
        self.compare_files(sid0, sid1, pvalue_list, "one")
Esempio n. 32
0
    def test_SNC(self):
        """Exercise ``single_snp`` with a SNP that has no variation.

        The values of SNP column 2 are replaced by zeros before the first
        10 SNPs are tested; the output is compared with the "snc" reference.
        """
        logging.info("TestSNC")
        from pysnptools.snpreader import Bed
        bed_fn = self.bedbase
        phen = pstpheno.loadOnePhen(self.phen_fn, vectorize=True)
        cov = pstpheno.loadPhen(self.cov_fn)
        snc = Bed(bed_fn).read()
        # make SNP #2 have constant values (aka a SNC)
        zeros = [0] * snc.iid_count
        snc.val[:, 2] = zeros

        out_fn = self.file_name("snc")

        result = single_snp(
            test_snps=snc[:, :10],
            pheno=phen,
            G0=snc,
            mixing=0,
            covar=cov,
            output_file_name=out_fn,
        )
        self.compare_files(result, "snc")
Esempio n. 33
0
def run_fastlmmc(dataset,
                 output_dir,
                 pheno_index,
                 covFile=None,
                 species='mouse',
                 maxthreads=1,
                 featsel=False,
                 exclude=False,
                 condition=None):
    """Run ``single_snp`` chromosome by chromosome and write one combined table.

    For every chromosome number of ``species`` the SNPs of
    ``<dataset>.FILTERED`` on that chromosome are tested, with the SNPs of
    ``<dataset>.FULL`` on all other chromosomes passed as ``K0``. The
    per-chromosome results are stacked and written tab-separated to
    ``<output_dir>/<phenotype name>.gwas``.

    Parameters
    ----------
    dataset : str
        Common prefix of the ``.FILTERED`` / ``.FULL`` Bed files and of
        ``<dataset>.pheno.txt``.
    output_dir : str
        Directory the ``.gwas`` file is written to.
    pheno_index : int
        Passed to ``loadOnePhen`` as ``i_pheno``.
    covFile : optional
        Passed to ``single_snp`` as ``covar`` when truthy.
    species : str
        Key into ``species_chroms`` giving the number of chromosomes.
    maxthreads, featsel, exclude, condition
        Accepted for interface compatibility; not used by this function.
    """
    # Imported locally so the function does not depend on a module-level alias.
    import pandas as pd

    # temporary kludge because -excludeByPosition option is slow (at least for v2.05 and v2.06)
    bfile = dataset
    filtered_snp_reader = Bed('%s.FILTERED' % bfile)
    full_snp_reader = Bed('%s.FULL' % bfile)

    pheno_file = loadOnePhen('%s.pheno.txt' % dataset, i_pheno=pheno_index)
    phenotype_name = pheno_file['header'][0]

    per_chrom_results = []

    # loop through chromosomes and run
    for chrom in range(1, species_chroms[species] + 1):

        # separate by chromosome for LOOCV
        test_snps = filtered_snp_reader[:, filtered_snp_reader.pos[:, 0] == chrom]
        matrix_snps = full_snp_reader[:, full_snp_reader.pos[:, 0] != chrom]

        # run snp, with covar only when one was supplied
        snp_kwargs = dict(test_snps=test_snps, pheno=pheno_file, K0=matrix_snps)
        if covFile:
            snp_kwargs['covar'] = covFile
        df = single_snp(**snp_kwargs)

        # format outputs
        out_df = df.loc[:, ['SNP', 'Chr', 'ChrPos', 'PValue', 'SnpWeight']]
        out_df.columns = ['SNP', 'CHR', 'BP', 'P', 'Beta']
        per_chrom_results.append(out_df)

    # DataFrame.append no longer exists in pandas 2.0; concat stacks the
    # frames in the same order and keeps their indices.
    final = pd.concat(per_chrom_results)

    # output to csv (the path is formatted directly instead of pushing all
    # locals into the module globals)
    final.to_csv('%s/%s.gwas' % (output_dir, phenotype_name),
                 sep='\t',
                 index=False)
Esempio n. 34
0
def loadPheno(bed, phenoFile, missingPhenotype='-9', keepDict=False):
    """Load a single phenotype file and intersect it with ``bed``.

    Returns the intersected ``bed`` together with either the phenotype
    dictionary (``keepDict=True``) or just its ``'vals'`` entry.
    """
    phenoDict = phenoUtils.loadOnePhen(
        phenoFile, missing=missingPhenotype, vectorize=True)
    checkIntersection(bed, phenoDict, 'phenotypes')
    bed, phenoDict = pstutil.intersect_apply([bed, phenoDict])
    if keepDict:
        return bed, phenoDict
    return bed, phenoDict['vals']