Esempio n. 1
0
    def setUpClass(self):
        """Prepare the output directory and the shared synthetic-data readers.

        Creates ``self.tempout_dir`` if needed, stores in ``self.pythonpath``
        the absolute path three directory levels above this file, and opens
        (without reading) the Bed genotype reader plus the covariate and
        phenotype Pheno readers under ``tests/datasets/synth``.
        """
        from fastlmm.util.util import create_directory_if_necessary

        create_directory_if_necessary(self.tempout_dir, isfile=False)

        # Resolve symlinks first so the relative hops start from the real file location.
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.pythonpath = os.path.abspath(
            os.path.join(this_dir, "..", "..", ".."))

        synth_prefix = self.pythonpath + "/tests/datasets/synth/"
        self.snpreader_whole = Bed(synth_prefix + "all")
        self.covariate_whole = Pheno(synth_prefix + "cov.txt")
        self.pheno_whole = Pheno(synth_prefix + "pheno_10_causals.txt")
Esempio n. 2
0
def getData(filename="", mph=3, UseCov=False):
    """Open the genotype and phenotype readers that share the prefix *filename*.

    The phenotype reader starts as ``<filename>.fam``.  If ``<filename>.cov``
    exists and *UseCov* is true, genotypes, phenotypes and covariates are
    intersected on their individuals.  If ``<filename>.phen`` exists it
    replaces the phenotype reader and is intersected with the genotypes.

    Args:
        filename: common path prefix of the Bed/.fam/.cov/.phen files.
        mph: accepted for interface compatibility; not used here.
        UseCov: whether to take a ``.cov`` file into account.

    Returns:
        ``[phenotype_reader, genotype_reader]``.
    """
    genotypes = Bed(filename)
    phenotypes = Pheno(filename + ".fam")

    cov_path = filename + ".cov"
    if isfile(cov_path) and UseCov:
        covariates = Pheno(cov_path)
        # The covariate reader only restricts the shared individuals; it is not returned.
        genotypes, phenotypes, covariates = intersect_apply(
            [genotypes, phenotypes, covariates])

    phen_path = filename + ".phen"
    if isfile(phen_path):
        phenotypes = Pheno(phen_path)
        genotypes, phenotypes = intersect_apply([genotypes, phenotypes])

    return [phenotypes, genotypes]
Esempio n. 3
0
def read_covariates(covar_file, ids_to_match, missing):
    """Read a covariate file and reorder its rows to match *ids_to_match*.

    An intercept column of ones is prepended to the covariates.  Rows with any
    missing covariate value are dropped before matching.

    Args:
        covar_file: path of the covariate file, read with ``Pheno``.
        ids_to_match: 2-D array of individual IDs (one row per individual);
            each row is turned into a tuple for lookup.
        missing: missing-value marker forwarded to ``Pheno``.

    Returns:
        ``[X, X_names, pheno_in]`` where ``X`` holds the covariate rows of the
        matched individuals in the order they occur in *ids_to_match*,
        ``X_names`` holds the column names (``'Intercept'`` first) and
        ``pheno_in`` is a boolean mask over *ids_to_match* marking the
        individuals that have covariates.
    """
    # Read covariate file
    covar_f = Pheno(covar_file, missing=missing).read()
    ids = covar_f.iid
    # Get covariate values, with a leading intercept column
    n_X = covar_f._col.shape[0] + 1
    X = np.ones((covar_f.val.shape[0], n_X))
    X[:, 1:n_X] = covar_f.val
    # Get covariate names.  The buffer must be as wide as the 'S20' cast
    # below, otherwise names longer than 10 bytes are silently truncated.
    X_names = np.zeros((n_X), dtype='S20')
    X_names[0] = 'Intercept'
    X_names[1:n_X] = np.array(covar_f._col, dtype='S20')
    # Remove NAs
    NA_rows = np.isnan(X).any(axis=1)
    n_NA_row = np.sum(NA_rows)
    if n_NA_row > 0:
        print(
            'Number of rows removed from covariate file due to missing observations: '
            + str(n_NA_row))
        X = X[~NA_rows]
        ids = ids[~NA_rows]
    id_dict = id_dict_make(ids)
    # Match with pheno_ids.  set(id_dict) yields the keys on both Python 2
    # and Python 3 (dict.viewkeys() exists only on Python 2).
    ids_to_match_tuples = [tuple(x) for x in ids_to_match]
    common_ids = set(id_dict) & set(ids_to_match_tuples)
    # dtype=bool keeps the mask usable for indexing even when it is empty
    pheno_in = np.array([x in common_ids for x in ids_to_match_tuples],
                        dtype=bool)
    match_ids = ids_to_match[pheno_in, :]
    X_id_match = np.array([id_dict[tuple(x)] for x in match_ids])
    X = X[X_id_match, :]
    return [X, X_names, pheno_in]
Esempio n. 4
0
    def estVar(self, num, epsilon):
        """Estimate the two variance components, adding Laplace noise when epsilon >= 0.

        The per-part estimates come from ``self.divideData``.  With a negative
        *epsilon* the first of them is returned unchanged.  Otherwise *epsilon*
        is split 10% / 45% / 45% between the phenotype-variance estimate and
        the two averaged components, each of which receives ``Lap`` noise and
        is then clamped.

        Args:
            num: number of parts the data is divided into.
            epsilon: budget to split; negative disables the noise.

        Returns:
            ``varEsts[0]`` when ``epsilon < 0``, otherwise ``[sg2, se2]``.
        """
        bed_path = self.BED.filename
        y = Pheno(bed_path + ".fam").read().val[:, 3]
        estimates = self.divideData(bed_path, num=num)
        if epsilon < 0:
            return estimates[0]

        eps_y = .1 * epsilon
        eps_e = .45 * epsilon
        eps_g = .45 * epsilon
        parts = float(num)

        vary = self.estVarY(y, eps_y)

        # Second entry of every estimate, averaged, plus noise; kept within [0, vary].
        mean_e = sum(est[1] for est in estimates) / parts
        se2 = mean_e + Lap(0.0, vary / (eps_e * parts))
        if se2 < 0:
            se2 = 0
        if se2 > vary:
            se2 = vary

        # First entry of every estimate, averaged, plus noise; a negative
        # value is replaced by 1% of vary rather than by zero.
        mean_g = sum(est[0] for est in estimates) / parts
        sg2 = mean_g + Lap(0.0, vary / (eps_g * parts))
        if sg2 < 0:
            sg2 = .01 * vary
        if sg2 > vary:
            sg2 = vary

        return [sg2, se2]
Esempio n. 5
0
def read_phenotype(phenofile, missing_char = 'NA', phen_index = 1):
    """Read a phenotype file and remove missing values.

    Args:
        phenofile : :class:`str`
            path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ...
        missing_char : :class:`str`
            The character that denotes a missing phenotype value; 'NA' by default.
        phen_index : :class:`int`
           The index of the phenotype (counting from 1) if multiple phenotype columns present in phenofile

    Returns:
        ``gtarray`` built from the column vector of non-missing phenotype
        values of the selected column, with the corresponding individual
        IDs (IID) passed as ``ids``.

    Raises:
        ValueError: if *phen_index* is smaller than 1.
    """
    if phen_index < 1:
        # phen_index - 1 would otherwise wrap around to the last column
        raise ValueError('phen_index counts from 1')
    pheno = Pheno(phenofile, missing=missing_char)[:,phen_index-1].read()
    y = np.array(pheno.val)
    # reshape returns a new array; the result has to be kept to guarantee
    # the 2-D column shape the indexing below relies on
    y = y.reshape((y.shape[0],1))
    pheno_ids = np.array(pheno.iid)[:,1]
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y[:,0]))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan,:]
        pheno_ids = pheno_ids[y_not_nan]
    print('Number of non-missing phenotype observations: ' + str(y.shape[0]))
    return gtarray(y,ids=pheno_ids)
Esempio n. 6
0
def test_single_snp(args):
    """Run a FastLMM single-SNP association test for each trait and save the results.

    Reads the phenotype table from ``args.phenotype_file``, builds the test
    SNP data and the K0 data with ``get_snpdata`` and then, for each of
    ``trait1``, ``trait2`` and ``trait3``, writes a headerless tab-separated
    pheno file into ``args.output_dir``, runs ``single_snp`` and stores the
    resulting frame with ``to_hdf``.  When ``args.manhattan`` is truthy a
    Manhattan plot is additionally saved as PDF.

    Args:
        args: namespace providing ``phenotype_file``, ``sample_indices_file``,
            ``snp_file``, ``k0_file``, ``output_dir``, ``GB_goal`` and
            ``manhattan``.
    """
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # The same id is used for both columns of the (family id, individual id) pair
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.sample_indices_file is not None:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(
            (phenotypes['type'] == 'training').values)[0]
    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes[phenotypes['type'] == 'training'].copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno[['fid', 'iid', trait]].to_csv(pheno_file,
                                               index=False,
                                               sep='\t',
                                               header=False)
        pheno = Pheno(pheno_file)
        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps,
                                pheno,
                                K0=K0,
                                count_A1=True,
                                GB_goal=args.GB_goal)
        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_file)
        results_df.to_hdf(result_file, trait)

        if args.manhattan:
            plot_file = os.path.join(args.output_dir,
                                     'manhattan.%s.pdf' % trait)
            logger.info('create Manhattan plot: ' + plot_file)
            plt.clf()
            # DataFrame.as_matrix was removed in pandas 1.0; column selection
            # followed by .values yields the same array on old and new pandas.
            plot_columns = results_df[["Chr", "ChrPos", "PValue"]].values
            flutil.manhattan_plot(plot_columns,
                                  pvalue_line=1e-5,
                                  xaxis_unit_bp=False)
            plt.savefig(plot_file)
Esempio n. 7
0
    def test_intersection(self):
        """Check the reader types produced by intersecting and slicing a SnpKernel."""
        from pysnptools.standardizer import Unit
        from pysnptools.kernelreader import SnpKernel
        from pysnptools.snpreader import Pheno
        from pysnptools.kernelreader._subset import _KernelSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
        kernel = SnpKernel(Bed(bed_path, count_A1=False), stdizer.Identity())

        # To test intersection we remove an iid from pheno
        phenotypes = Pheno(self.currentFolder + "/../examples/toydata.phe")
        phenotypes = phenotypes[1:, :]

        # SnpKernel is special because it standardizes AFTER intersecting:
        # the subset is expected on the inner snpreader, not on the kernel.
        intersected, phenotypes = intersect_apply([kernel, phenotypes])
        assert isinstance(intersected.snpreader, _SnpSubset)
        assert not isinstance(intersected, _KernelSubset)

        # What happens with fancy selection?
        every_other = kernel[::2]
        assert isinstance(every_other, SnpKernel)

        logging.info("Done with test_intersection")
Esempio n. 8
0
	def __init__(self,filename,snpfile="",params="",n0=-1,n1=-1):
		"""Load genotypes and phenotypes for the prefix *filename*.

		Args:
			filename: path prefix of the Bed file set; phenotypes are read
				from column index 3 of ``<filename>.fam`` and shifted by -1.
			snpfile: optional text file with one SNP name per line; when
				empty, the SNP ids of the Bed reader are used.
			params: stored on the instance, but reset to "" at the end
				of construction (see note below).
			n0: when positive, keep at most this many individuals whose
				shifted phenotype equals 0.0.
			n1: number of individuals with shifted phenotype 1.0 to keep
				when n0 is positive.

		Exits the process via ``sys.exit()`` if the SNP list cannot be loaded.
		"""
		self.BED=Bed(filename)
		self.pheno=Pheno(filename+".fam")
		self.y=self.pheno.read().val[:,3]
		self.y=self.y-1.0
		self.params=params
		n=len(self.y)

		if n0>0:
			print("Initiate with n0")
			I0=[i for i in range(0,n) if self.y[i]==0.0]
			I0=I0[:n0]
			I1=[i for i in range(0,n) if self.y[i]==1.0]
			# NOTE(review): with the default n1=-1 this slice drops the last
			# matching individual - confirm callers always pass n1 with n0.
			I1=I1[:n1]
			I0.extend(I1)
			self.y=self.y[I0]
			self.BED=self.BED[I0,:]

		try:
			if len(snpfile)>0:
				# 'with' closes the file even if reading fails
				with open(snpfile) as fil:
					self.snps=[l.strip() for l in fil.readlines()]
			else:
				self.snps=self.BED.sid
		except Exception:
			# Exception (rather than a bare except) lets KeyboardInterrupt
			# and SystemExit propagate untouched
			print("Error loading SNPs!")
			sys.exit()
		self.setUp()
		self.n=len(self.y)
		print("Number of individuals: "+str(self.n))
		self.Cov=[]
		# NOTE(review): this discards the params argument stored above; kept
		# as in the original because setUp() may depend on the earlier value.
		self.params=""
Esempio n. 9
0
    def divideData(self, filename, num=5, mph=3, delet=True):
        """Estimate variance components on *num* random parts of the data.

        Individuals are shuffled with one permutation that is applied to the
        genotype reader and the phenotype reader alike, then cut into *num*
        consecutive chunks.  ``self.VarCalc.RealVar`` is called on each chunk.

        Args:
            filename: path prefix of the Bed file set; phenotypes come from
                ``<filename>.fam``.
            num: number of chunks.
            mph: column index of the phenotype within the values read from
                the ``.fam`` file (default 3).
            delet: accepted for interface compatibility; not used here.

        Returns:
            List with one ``RealVar`` result per chunk.
        """
        print("Estimating heritability using " + str(num) + " components")
        sFil = Bed(filename)
        yFil = Pheno(filename + ".fam")
        n = sFil.iid_count
        reOrd = perm(n)
        yFil = yFil[reOrd, :]
        sFil = sFil[reOrd, :]

        y = yFil.read().val[:, mph]

        # Chunk boundaries: div[i]..div[i + 1] delimits chunk i
        div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

        varEsts = []

        for i in range(0, num):
            print("For component " + str(i))
            # Slice the permuted reader: y is in permuted order, so taking
            # rows from the unpermuted self.BED would pair each phenotype
            # with another individual's genotypes.
            sFilTemp = sFil[div[i]:div[i + 1], :]
            Xtemp = sFilTemp.read().standardize().val
            ytemp = y[div[i]:div[i + 1]]

            varEsts.append(self.VarCalc.RealVar(ytemp, Xtemp))

        return varEsts
Esempio n. 10
0
def loadData(filename):
    """Return ``[labels, genotype_reader]`` for the file prefix *filename*.

    The labels are the values in column index 3 of ``<filename>.fam`` as read
    by ``Pheno``, each decreased by one, collected in a plain list.
    """
    pheno_column = 3
    genotypes = Bed(filename)
    fam_values = Pheno(filename + ".fam").read().val

    labels = []
    for value in fam_values[:, pheno_column]:
        labels.append(value - 1)
    return [labels, genotypes]
Esempio n. 11
0
 def read_phen(self, fn_phen=None):
     """
     Read the phenotype file *fn_phen*.

     Stores the phenotype values in ``self.Y`` and the second column of the
     individual ids in ``self.SID``.
     """
     phen_data = Pheno(fn_phen).read()
     self.Y = phen_data.val
     # iid rows are id pairs; only the second entry is kept
     self.SID = phen_data.iid[:, 1]
Esempio n. 12
0
def getData(filename):
    """Return ``[y, genotype_reader]`` for the file prefix *filename*.

    ``y`` is column index 3 of the values ``Pheno`` reads from
    ``<filename>.fam``.  The genotype reader is returned unread; the earlier
    version also read and standardized every genotype into a local that was
    never used, which has been dropped.
    """
    mph = 3
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    y = yFil.read().val[:, mph]
    return [y, sFil]
Esempio n. 13
0
def getData(filename):
    """Open the Bed reader for *filename* and load shifted labels from its .fam file.

    Returns:
        ``[y, bed_reader]`` where ``y`` is a list holding column index 3 of
        the ``.fam`` values minus one, and ``bed_reader`` is the unread
        ``Bed`` object opened with ``count_A1=False``.
    """
    state_column = 3
    bed_reader = Bed(filename, count_A1=False)

    # Per the original author, the last column of the .fam file holds the
    # disease states of the data owners.
    fam_data = Pheno(filename + ".fam").read()
    states = fam_data.val[:, state_column]
    y = [state - 1 for state in states]
    return [y, bed_reader]
Esempio n. 14
0
def read_covariates(covar, pheno_ids=None, missing_char = 'NA'):
    """Load covariates into a ``gtarray`` keyed by individual id.

    Args:
        covar: path of the covariate file, read with ``Pheno``.
        pheno_ids: optional iterable of individual ids that must all be
            present in the covariate file.
        missing_char: marker for missing values; 'NA' by default.

    Returns:
        The ``gtarray`` of covariate values after ``fill_NAs()`` was called.

    Raises:
        ValueError: if any id in *pheno_ids* has no covariate row.
    """
    covar_data = Pheno(covar, missing=missing_char).read()
    values = np.array(covar_data.val)
    # Second column of the id pairs identifies the individual
    individual_ids = np.array(covar_data.iid)[:,1]
    X = gtarray(values, ids=individual_ids)

    if pheno_ids is not None:
        has_covariates = np.array([pid in X.id_dict for pid in pheno_ids])
        n_without = np.sum(~has_covariates)
        if n_without > 0:
            raise ValueError('Missing covariate values for some phenotyped individuals')

    X.fill_NAs()
    return X
Esempio n. 15
0
def _pheno_fixup(pheno_input, iid_if_none=None, missing='-9'):
    """Turn *pheno_input* into a reader, preferring ``Pheno``.

    ``Pheno`` is tried first and its ``iid`` is touched to force the load.
    If that fails for any ordinary error the input is handed to
    ``_snps_fixup`` instead.

    Args:
        pheno_input: whatever ``Pheno`` or ``_snps_fixup`` accept.
        iid_if_none: forwarded to both constructors.
        missing: missing-value marker forwarded to ``Pheno``.

    Returns:
        The ``Pheno`` reader, or the result of ``_snps_fixup``.
    """
    try:
        ret = Pheno(pheno_input, iid_if_none, missing=missing)
        ret.iid  #doing this just to force file load
        return ret
    except Exception:
        # Exception instead of a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback.
        return _snps_fixup(pheno_input, iid_if_none=iid_if_none)
Esempio n. 16
0
    def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
        """Run ``single_snp_select`` on the synthetic data with auto-selected PCs as covariates.

        The principal components are computed with ``compute_auto_pcs`` and
        written to ``sel_plus_pc.pcs.txt`` in the temp output directory
        (caching is switched off by a local flag).  The results are compared
        against the reference via ``self.compare_files``.

        Args:
            h2: passed to ``single_snp_select``; also picks the result file
                name ("h2IsHalf" when it equals .5, otherwise "h2Search").
            force_low_rank: passed to ``single_snp_select``.
            force_full_rank: passed to ``single_snp_select``.
            count_A1: passed to ``compute_auto_pcs`` only;
                ``single_snp_select`` is always called with ``count_A1=False``.
        """
        do_plot = False
        use_cache = False

        # define file names
        synth_dir = self.pythonpath + "/tests/datasets/synth/"
        bed_fn = synth_dir + "all.bed"
        phen_fn = synth_dir + "pheno_10_causals.txt"
        folds_fn = synth_dir + "DebugEmitFolds.txt"
        pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

        if use_cache and os.path.exists(pcs_fn):
            logging.info("Using top pcs's cache")
            covar = Pheno(pcs_fn)
        else:
            from fastlmm.util import compute_auto_pcs
            covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
            n_pcs = covar["vals"].shape[1]
            logging.info("selected number of PCs: {0}".format(n_pcs))
            pcs_data = SnpData(iid=covar['iid'],
                               sid=covar['header'],
                               val=covar['vals'])
            Pheno.write(pcs_fn, pcs_data)

        mf_name = "lmp"  #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        # Built but not handed to single_snp_select (the argument is commented out below)
        runner = mf_to_runner_function(mf_name)(20)

        logging.info(
            "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
                h2, force_low_rank, force_full_rank))

        h2_tag = "h2IsHalf" if h2 == .5 else "h2Search"
        result_file_name = "sel_plus_pc_{0}".format(h2_tag)
        output_file_name = os.path.join(self.tempout_dir,
                                        result_file_name) + ".txt"
        k_values = [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90,
            100, 125, 160, 200, 250, 320, 400, 500, 630, 800, 1000
        ]
        results = single_snp_select(
            test_snps=bed_fn,
            G=bed_fn,
            pheno=phen_fn,
            k_list=k_values,
            h2=h2,
            n_folds=folds_fn,
            covar=covar,
            output_file_name=output_file_name,
            force_low_rank=force_low_rank,
            force_full_rank=force_full_rank,
            GB_goal=2,
            count_A1=False
            #runner = runner
        )
        logging.info(results.head())
        self.compare_files(results, result_file_name)
Esempio n. 17
0
    def test_c_reader_pheno(self):
        """Exercise the ``Pheno`` reader.

        Covers a write/read round trip with a missing value, construction
        from the dictionaries returned by ``loadOnePhen`` (2-D and vectorized
        values), construction from ``None`` plus an iid list, and two
        alternative example files that must yield the same values.
        """
        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        # Inject a missing value to test writing and reading missing values.
        # np.nan is the spelling supported by every NumPy release (the
        # np.NaN alias was removed in NumPy 2.0).
        snpdata1.val[1, 0] = np.nan
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)  # only checks that str() works on the reader
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata2.val,
                                             decimal=10)

        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
        import pysnptools.util.pheno as pstpheno
        # Named pheno_dict so the builtin dict is not shadowed
        pheno_dict = pstpheno.loadOnePhen(self.currentFolder +
                                          "/examples/toydata.phe",
                                          missing="")
        snpdata3 = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        pheno_dict = pstpheno.loadOnePhen(self.currentFolder +
                                          "/examples/toydata.phe",
                                          missing="",
                                          vectorize=True)
        assert len(pheno_dict['vals'].shape) == 1, "test 1-d array of values"
        snpdata3 = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(self.currentFolder +
                         "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata5.val,
                                             decimal=10)
        snpdata6 = Pheno(self.currentFolder +
                         "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata6.val,
                                             decimal=10)
Esempio n. 18
0
 def test_covar_by_chrom_mixing(self):
     """Run ``single_snp`` with both a global covariate and per-chromosome covariates.

     The global covariate is the covariate file re-wrapped as ``SnpData``
     with the single column name "pheno-1"; chromosomes 1 to 5 each map to
     the covariate file itself.  The result is checked against the
     "covar_by_chrom_mixing" reference.
     """
     logging.info(
         "TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
     test_snps = Bed(self.bedbase)
     pheno = self.phen_fn
     covar = Pheno(self.cov_fn).read()
     covar = SnpData(iid=covar.iid, sid=["pheno-1"], val=covar.val)
     # range() runs on Python 2 and 3 alike; xrange exists only on Python 2
     covar_by_chrom = {chrom: self.cov_fn for chrom in range(1, 6)}
     output_file = self.file_name("covar_by_chrom_mixing")
     frame = single_snp(test_snps,
                        pheno,
                        covar=covar,
                        covar_by_chrom=covar_by_chrom,
                        output_file_name=output_file)
     self.compare_files(frame, "covar_by_chrom_mixing")
Esempio n. 19
0
    def test_intersection_Dist2Snp(self):
        """Check the reader types produced by intersecting and slicing a dist-as-snp reader."""
        from pysnptools.snpreader._dist2snp import _Dist2Snp
        from pysnptools.snpreader import Pheno
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        examples = self.currentFolder + "/../examples/"
        as_snp = DistNpz(examples + "toydata.dist.npz").as_snp(max_weight=25)

        # To test intersection we remove an iid from pheno
        phenotypes = Pheno(examples + "toydata.phe")[1:,:]

        # The subset is expected on the wrapped distreader, not on the wrapper
        intersected, phenotypes = intersect_apply([as_snp, phenotypes])
        assert isinstance(intersected.distreader, _DistSubset)
        assert not isinstance(intersected, _SnpSubset)

        # What happens with fancy selection?
        every_other = as_snp[::2,:]
        assert isinstance(every_other, _Dist2Snp)

        logging.info("Done with test_intersection")
Esempio n. 20
0
    def test_intersection_Snp2Dist(self):
        """Check the reader types produced by intersecting and slicing a snp-as-dist reader."""
        from pysnptools.distreader._snp2dist import _Snp2Dist
        from pysnptools.snpreader import Pheno, Bed
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        examples = self.currentFolder + "/../examples/"
        bed_reader = Bed(examples + "toydata.5chrom.bed", count_A1=True)
        as_dist = bed_reader.as_dist(max_weight=2)

        # To test intersection we remove an iid from pheno
        phenotypes = Pheno(examples + "toydata.phe")[1:,:]

        # The subset is expected on the wrapped snpreader, not on the wrapper
        intersected, phenotypes = intersect_apply([as_dist, phenotypes])
        assert isinstance(intersected.snpreader, _SnpSubset)
        assert not isinstance(intersected, _DistSubset)

        # What happens with fancy selection?
        every_other = as_dist[::2,:]
        assert isinstance(every_other, _Snp2Dist)

        logging.info("Done with test_intersection")
Esempio n. 21
0
    def test_multipheno(self):
        """Compare ``single_snp_scale`` on several phenotypes with per-phenotype ``single_snp`` runs.

        For 2, 5 and 1 random phenotype columns the multi-phenotype result
        must have as many rows as the concatenated single-phenotype results,
        and for every SNP the sorted p-values of both must agree to 1e-5.
        """
        logging.info("test_multipheno")

        random_state = RandomState(29921)
        pheno_reference = Pheno(self.phen_fn).read()
        for pheno_count in [2, 5, 1]:
            val = random_state.normal(loc=pheno_count,
                                      scale=pheno_count,
                                      size=(pheno_reference.iid_count,
                                            pheno_count))
            pheno_col = ['pheno{0}'.format(i) for i in range(pheno_count)]
            pheno_multi = SnpData(iid=pheno_reference.iid,
                                  sid=pheno_col,
                                  val=val)

            reference = pd.concat([
                single_snp(test_snps=self.bed,
                           pheno=pheno_multi[:, pheno_index],
                           covar=self.cov_fn)
                for pheno_index in range(pheno_count)
            ])
            frame = single_snp_scale(test_snps=self.bed,
                                     pheno=pheno_multi,
                                     covar=self.cov_fn)

            # The message no longer refers to an undefined name, which used to
            # turn a failing length check into a NameError.
            assert len(frame) == len(reference), (
                "# of pairs differs from reference: {0} vs {1}".format(
                    len(frame), len(reference)))
            for sid in sorted(
                    set(reference.SNP
                        )):  #This ignores which pheno produces which pvalue
                pvalue_frame = np.array(
                    sorted(frame[frame['SNP'] == sid].PValue))
                pvalue_reference = np.array(
                    sorted(reference[reference['SNP'] == sid].PValue))
                # .all() must be called: the bare bound method is always
                # truthy, so the comparison was never actually checked.
                assert (
                    abs(pvalue_frame - pvalue_reference) < 1e-5
                ).all(), "pair {0} differs too much from reference".format(sid)
Esempio n. 22
0
        default='NA')
    parser.add_argument('--no_h2_estimate',
                        action='store_true',
                        default=False,
                        help='Suppress output of h2 estimate')

    args = parser.parse_args()

    ##### Check minimal model is specified #####
    if args.mean_covar is None and args.var_covar is None and args.random_gts is None:
        raise (ValueError(
            'Must specify at least one of: mean_covar, var_covar, random_gts'))

    ####################### Read in data #########################
    #### Read phenotype ###
    pheno = Pheno(args.phenofile, missing=args.missing_char).read()
    y = np.array(pheno.val)
    pheno_ids = np.array(pheno.iid)
    if y.ndim == 1:
        pass
    elif y.ndim == 2:
        y = y[:, args.phen_index - 1]
    else:
        raise (ValueError('Incorrect dimensions of phenotype array'))
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan]
        pheno_ids = pheno_ids[y_not_nan, :]
    # Make id dictionary
    print('Number of non-missing y observations: ' + str(y.shape[0]))
Esempio n. 23
0
# Via NumPy-style indexing, these allow reading by name and genetic property

#Topic: Other SnpReaders and how to write

#Read from the PLINK phenotype file (text) instead of a Bed file
# Looks like:
#cid0P0 cid0P0 0.4853395139922632
#cid1P0 cid1P0 -0.2076984565752155
#cid2P0 cid2P0 1.4909084058931985
#cid3P0 cid3P0 -1.2128996652683697
#cid4P0 cid4P0 0.4293203431508744
#...

from pysnptools.snpreader import Pheno

# Constructing the reader names the file to read; the values are loaded by read() below.
phenoreader = Pheno("pheno_10_causals.txt")
# Python 2 print statement: shows the reader, its iid and sid counts, the sid names and pos
print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos
# Expected output:
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan  nan  nan]]
# read() loads the values into memory; .val is the array printed below
phenodata = phenoreader.read()
print phenodata.val
# Expected output (truncated):
#[[  4.85339514e-01]
# [ -2.07698457e-01]
# [  1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write 1st 10 iids and sids of Bed data into Pheno format
# (Bed is not imported in this excerpt - presumably imported earlier in the tutorial; verify)
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

#Write it to Bed format
Esempio n. 24
0
                val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
            val = val[iid_index, :]  #reorder or trim any extra allocation
            if not SnpReader._array_properties_are_ok(val, order, dtype):
                val = val.copy(order=order)
            self._close_bed()

        return val


# Script entry point: configure logging, then run this module's doctests.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Always-on block: resolves the path of the example phenotype file.
    # pheno_fn is not used again in this block - presumably kept as a
    # smoke test of example_file; verify.
    if True:
        from pysnptools.util import example_file
        pheno_fn = example_file("pysnptools/examples/toydata.phe")

    # Disabled block (never runs): converts the toy Pheno data to Bed format
    # under tempdir/.
    if False:
        from pysnptools.snpreader import Pheno, Bed
        import pysnptools.util as pstutil
        import os
        print(os.getcwd())
        snpdata = Pheno(
            '../examples/toydata.phe').read()  # Read data from Pheno format
        pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.bed")
        Bed.write("tempdir/toydata.5chrom.bed", snpdata,
                  count_A1=False)  # Write data in Bed format

    # ELLIPSIS lets "..." in expected doctest output match any text
    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS)
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
Esempio n. 25
0
def run_fastlmm(args):
    """Fit a FastLMM model per trait on the training samples and predict the test samples.

    A random subset of SNPs is drawn from ``args.snp_file``.  For each of
    ``trait1``, ``trait2`` and ``trait3`` a pheno file with the training
    rows is written to ``args.output_dir``, a ``FastLMM`` model is fitted,
    predictions for the test rows are stored in an HDF5 file and the fitted
    model is pickled.

    Args:
        args: namespace providing ``phenotype_file``, ``cvindex_file``,
            ``snp_file``, ``n_snps``, ``transpose_x``, ``k0_file``,
            ``transpose_k0``, ``seed``, ``output_dir``, ``GB_goal`` and
            ``penalty``.
    """
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    # Seed before anything random happens.  The earlier code called the
    # non-existent np.seed (AttributeError), and did so only after the SNP
    # subset below had already been drawn.
    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # The same id is used for both columns of the (family id, individual id) pair
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    # NOTE(review): np.random.choice samples with replacement by default, so
    # sel_snps may contain duplicates - confirm this is intended.
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid,
                            args.snp_file,
                            transpose=args.transpose_x,
                            snp_indices=sel_snps,
                            std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' %
                test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(pheno_file,
                                                                index=False,
                                                                sep='\t',
                                                                header=False)
        pheno = Pheno(pheno_file)
        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :],
                  y=pheno,
                  K0_train=K0,
                  penalty=args.penalty,
                  Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)
        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values
        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)

        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
Esempio n. 26
0
    def test_old(self):
        """Run the older two-kernel workflow and compare it with the "old" reference.

        SNPs off chromosome 5 form the background kernel G0, SNPs on
        chromosome 5 are tested.  ``FeatureSelectionInSample`` picks the
        SNPs for a second kernel G1, and ``single_snp`` is run with both
        kernels and the selected mixing weight.
        """
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        synth_dir = self.pythonpath + "/tests/datasets/synth/"
        bed_fn = synth_dir + "all.bed"
        pheno_fn = synth_dir + "pheno_10_causals.txt"
        cov_fn = synth_dir + "cov.txt"

        # Open the three readers and restrict them to the shared sample ids
        snp_reader, pheno, cov = intersect_apply(
            [Bed(bed_fn, count_A1=False), Pheno(pheno_fn), Pheno(cov_fn)])

        # Partition SNPs on chr5 vs rest
        test_chr = 5
        background_snps = snp_reader[:, snp_reader.pos[:, 0] != test_chr]
        G0 = background_snps.read(order='C').standardize()
        chr5_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr]
        test_snps = chr5_snps.read(order='C').standardize()

        # Center and scale the first phenotype column in place
        y = pheno.read().val[:, 0]
        y -= y.mean()
        y /= y.std()

        # Covariates are made read-only before they are handed on
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # Feature selection decides which SNPs build G1
        logging.info(
            "running feature selection conditioned on background kernel")
        select = FeatureSelectionInSample(max_log_k=7,
                                          n_folds=7,
                                          order_by_lmm=True,
                                          measure="ll",
                                          random_state=42)
        # G0's values are passed for both of the first two arguments
        best_k, feat_idx, best_mix, best_delta = select.run_select(
            G0.val, G0.val, y, cov=X_cov)

        # Plot out-of-sample error
        if do_plot:
            select.plot_results(measure="ll")

        # Report the selection outcome
        for label, value in (("best_k", best_k), ("best_mix", best_mix),
                             ("best_delta", best_delta)):
            logging.info("{0}:{1}".format(label, value))

        # Use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:, feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps,
                                pheno,
                                G0=G0,
                                G1=G1,
                                mixing=best_mix,
                                h2=None,
                                leave_out_one_chrom=False,
                                output_file_name=output_file_name,
                                count_A1=False)

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
Esempio n. 27
0
        delim = ' '
    if cols[0] == 'FID' and cols[1]== 'IID':
        pass
    else:
        raise ValueError('First two columns of PGS must be FID, IID')
    f.close()
    ids = np.loadtxt(args.pgs, dtype='U', usecols=(0,1), delimiter=delim, skiprows=1)
    pgs_vals = np.loadtxt(args.pgs, usecols=tuple([x for x in range(2, cols.shape[0])]),delimiter=delim, skiprows=1)
    pg = gtarray(pgs_vals.reshape((pgs_vals.shape[0],1)), ids[:, 1], sid=cols[2:cols.shape[0]], fams=ids[:, 0])
    print('Normalising PGS to have mean zero and variance 1')
    pg.mean_normalise()
    pg.scale()

    # Read phenotype
    print('Reading '+str(args.phenofile))
    pheno = Pheno(args.phenofile, missing=args.missing_char).read()
    # pheno = Pheno('phenotypes/eduyears_resid.ped', missing='NA').read()
    y = np.array(pheno.val)
    pheno_ids = np.array(pheno.iid)[:, 1]
    if y.ndim == 1:
        pass
    elif y.ndim == 2:
        y = y[:, args.phen_index - 1]
    else:
        raise ValueError('Incorrect dimensions of phenotype array')
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan]
        pheno_ids = pheno_ids[y_not_nan]
    y = y-np.mean(y)
# Load FaST-LMM basic association test:
from fastlmm.association import single_snp
from pysnptools.snpreader import Ped
from pysnptools.snpreader import Pheno
from pysnptools.snpreader import wrap_plink_parser
import numpy as np
from sys import argv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import fastlmm.util.util as flutil

# Command line: <script> <ped file> <pheno file> <results output> <manhattan plot output>
script, inped_file, inpheno_file, results_dataframe, output_manhattan = argv

# Load snp data (print calls use the single-argument form, valid on Python 2 and 3):
print("Loading variant data...")
ped_file = Ped(inped_file)
print("Loading phenotype data...")
pheno_fn = Pheno(inpheno_file)

# Run basic association test:
print("Running FaST-LMM single_snp test...")
results_df = single_snp(test_snps=ped_file, pheno=pheno_fn, leave_out_one_chrom=0, output_file_name=results_dataframe)

# DataFrame.as_matrix was removed in pandas 1.0; column selection followed by
# .values yields the same array on old and new pandas.
chromosome_starts = flutil.manhattan_plot(results_df[["Chr", "ChrPos", "PValue"]].values, pvalue_line=4.4e-7, xaxis_unit_bp=True)
# The Agg backend selected above cannot display windows, so plt.show() left
# the plot unsaved; write it to the path requested on the command line.
plt.savefig(output_manhattan)