コード例 #1
0
ファイル: estHerit.py プロジェクト: seanken/PrivSTRAT
def divideData(filename,direct,num=5,mph=3,delet=True):
	"""Estimate variance components on `num` random chunks of the data.

	The individuals are shuffled with ``perm``, cut into ``num`` contiguous
	chunks, and every chunk is written to temporary Bed/Pheno files below
	``direct`` and handed to ``varRes``.

	Args:
		filename: dataset prefix forwarded to ``getData``.
		direct: directory that receives the ``tempFile_<i>*`` files.
		num: number of chunks.
		mph: forwarded to ``getData`` as ``mph``.
		delet: when true, remove a chunk's temporary files as soon as its
			estimate has been computed.

	Returns:
		A list with one ``varRes`` result per chunk.
	"""
	import glob  # local import: only needed for the temp-file cleanup

	print("Estimating heritability using "+str(num)+" components")
	[yFil,sFil]=getData(filename,mph=mph)
	n=sFil.iid_count
	reOrd=perm(n)
	yFil=yFil[reOrd,:]
	sFil=sFil[reOrd,:]

	# Chunk i covers rows div[i] .. div[i+1]-1 of the shuffled data.
	div=[int(math.ceil( i*n/float(num) )) for i in range(0,num+1)]

	varEsts=[]
	for i in range(0,num):
		print("For component "+str(i))
		sFilTemp=sFil[div[i]:div[i+1],:]
		yFilTemp=yFil[div[i]:div[i+1],:]

		fileTemp=direct+"/tempFile_"+str(i)
		Bed.write(fileTemp,sFilTemp.read())
		Pheno.write(fileTemp+".phen",yFilTemp.read())

		varEsts.append(varRes(fileTemp,direct))

		if delet:
			# Remove the files directly instead of composing an "rm" shell
			# command, which misbehaves when `direct` contains spaces or
			# shell metacharacters.
			for leftover in glob.glob(fileTemp+"*"):
				os.remove(leftover)

	return varEsts
コード例 #2
0
	def __init__(self,filename,snpfile="",params="",n0=-1,n1=-1):
		"""Open the Bed/.fam pair for `filename` and prepare the instance.

		Args:
			filename: prefix passed to ``Bed``; ``filename + ".fam"`` is read
				through ``Pheno``.
			snpfile: optional text file with one SNP name per line; when empty
				the SNP list is taken from ``self.BED.sid``.
			params: stored on ``self.params`` before ``setUp()`` runs.
			n0: when positive, keep at most ``n0`` individuals with y == 0.0.
			n1: slice bound applied to the individuals with y == 1.0 (only
				used when ``n0`` is positive).
		"""
		self.BED=Bed(filename)
		self.pheno=Pheno(filename+".fam")
		# Column 3 of the .fam values, shifted down by one.
		self.y=self.pheno.read().val[:,3]-1.0
		self.params=params
		total=len(self.y)

		if n0>0:
			print("Initiate with n0")
			zeros=[idx for idx in range(total) if self.y[idx]==0.0][:n0]
			# NOTE(review): with the default n1=-1 this slice drops the last
			# matching individual -- confirm that is intended.
			ones=[idx for idx in range(total) if self.y[idx]==1.0][:n1]
			keep=zeros+ones
			self.y=self.y[keep]
			self.BED=self.BED[keep,:]

		try:
			if len(snpfile)>0:
				with open(snpfile) as handle:
					raw_lines=handle.readlines()
				self.snps=[entry.strip() for entry in raw_lines]
			else:
				self.snps=self.BED.sid
		except:
			print("Error loading SNPs!")
			sys.exit()

		self.setUp()
		self.n=len(self.y)
		print("Number of individuals: "+str(self.n))
		self.Cov=[]
		# NOTE(review): this discards the `params` argument stored above once
		# setUp() has run -- confirm that is intended.
		self.params=""
コード例 #3
0
ファイル: MU_LMM.py プロジェクト: seanken/PrivSTRAT
	def divideData(self,filename,num=5,mph=3,delet=True):
		"""Estimate variance components on `num` random chunks of the data.

		The Bed/.fam pair for ``filename`` is shuffled with one permutation,
		cut into ``num`` contiguous chunks, and ``self.VarCalc.RealVar`` is
		evaluated on each chunk's phenotypes and standardized genotypes.

		Args:
			filename: prefix passed to ``Bed``; ``filename + ".fam"`` is read
				through ``Pheno``.
			num: number of chunks.
			mph: accepted for interface compatibility; not used here.
			delet: accepted for interface compatibility; not used here.

		Returns:
			A list with one ``RealVar`` result per chunk.
		"""
		print("Estimating heritability using "+str(num)+" components")
		sFil=Bed(filename)
		yFil=Pheno(filename+".fam")
		n=sFil.iid_count
		reOrd=perm(n)
		yFil=yFil[reOrd,:]
		sFil=sFil[reOrd,:]

		y=yFil.read().val[:,3]

		# Chunk i covers rows div[i] .. div[i+1]-1 of the shuffled data.
		div=[int(math.ceil( i*n/float(num) )) for i in range(0,num+1)]

		varEsts=[]
		for i in range(0,num):
			print("For component "+str(i))
			# Slice the *permuted* reader so genotype rows line up with the
			# permuted phenotypes in `y`; slicing the unpermuted self.BED
			# paired each phenotype with another individual's genotypes.
			sFilTemp=sFil[div[i]:div[i+1],:]
			Xtemp=sFilTemp.read().standardize().val
			ytemp=y[div[i]:div[i+1]]

			varEsts.append(self.VarCalc.RealVar(ytemp,Xtemp))

		return varEsts
コード例 #4
0
    def divideData(self, filename, num=5, mph=3, delet=True):
        """Estimate variance components on `num` random chunks of the data.

        The Bed/.fam pair for ``filename`` is shuffled with one permutation,
        cut into ``num`` contiguous chunks, and ``self.VarCalc.RealVar`` is
        evaluated on each chunk's phenotypes and standardized genotypes.

        Args:
            filename: prefix passed to ``Bed``; ``filename + ".fam"`` is read
                through ``Pheno``.
            num: number of chunks.
            mph: accepted for interface compatibility; not used here.
            delet: accepted for interface compatibility; not used here.

        Returns:
            A list with one ``RealVar`` result per chunk.
        """
        print("Estimating heritability using " + str(num) + " components")
        sFil = Bed(filename)
        yFil = Pheno(filename + ".fam")
        n = sFil.iid_count
        reOrd = perm(n)
        yFil = yFil[reOrd, :]
        sFil = sFil[reOrd, :]

        y = yFil.read().val[:, 3]

        # Chunk i covers rows div[i] .. div[i+1]-1 of the shuffled data.
        div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

        varEsts = []
        for i in range(0, num):
            print("For component " + str(i))
            # Slice the *permuted* reader so genotype rows line up with the
            # permuted phenotypes in `y`; slicing the unpermuted self.BED
            # paired each phenotype with another individual's genotypes.
            sFilTemp = sFil[div[i]:div[i + 1], :]
            Xtemp = sFilTemp.read().standardize().val
            ytemp = y[div[i]:div[i + 1]]

            varEsts.append(self.VarCalc.RealVar(ytemp, Xtemp))

        return varEsts
コード例 #5
0
ファイル: estHerit.py プロジェクト: wenwenyu/PrivSTRAT
def divideData(filename, direct, num=5, mph=3, delet=True):
    """Estimate variance components on `num` random chunks of the data.

    The individuals are shuffled with ``perm``, cut into ``num`` contiguous
    chunks, and every chunk is written to temporary Bed/Pheno files below
    ``direct`` and handed to ``varRes``.

    Args:
        filename: dataset prefix forwarded to ``getData``.
        direct: directory that receives the ``tempFile_<i>*`` files.
        num: number of chunks.
        mph: forwarded to ``getData`` as ``mph``.
        delet: when true, remove a chunk's temporary files as soon as its
            estimate has been computed.

    Returns:
        A list with one ``varRes`` result per chunk.
    """
    import glob  # local import: only needed for the temp-file cleanup

    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]

    # Chunk i covers rows div[i] .. div[i+1]-1 of the shuffled data.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

    varEsts = []
    for i in range(0, num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]

        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())

        varEsts.append(varRes(fileTemp, direct))

        if delet:
            # Remove the files directly instead of composing an "rm" shell
            # command, which misbehaves when `direct` contains spaces or
            # shell metacharacters.
            for leftover in glob.glob(fileTemp + "*"):
                os.remove(leftover)

    return varEsts
コード例 #6
0
ファイル: loadData.py プロジェクト: seanken/LogAttack
def loadData(filename):
    """Open the Bed reader for `filename` and load labels from its .fam file.

    Returns:
        ``[y, sFil]`` where ``y`` is a list holding column 3 of the .fam
        values with 1 subtracted from every entry, and ``sFil`` is the
        ``Bed`` reader.
    """
    label_column = 3
    snp_reader = Bed(filename)
    fam_values = Pheno(filename + ".fam").read().val
    labels = [raw - 1 for raw in fam_values[:, label_column]]
    return [labels, snp_reader]
コード例 #7
0
ファイル: loadFile.py プロジェクト: seanken/PrivSTRAT
def getData(filename):
	"""Open the Bed reader for `filename` and load labels from its .fam file.

	Returns:
		``[y, sFil]`` where ``y`` is a list holding column 3 of the .fam
		values with 1 subtracted from every entry, and ``sFil`` is the
		``Bed`` reader.
	"""
	label_column=3
	snp_reader=Bed(filename)
	fam_values=Pheno(filename+".fam").read().val
	labels=[raw-1 for raw in fam_values[:,label_column]]
	return [labels,snp_reader]
コード例 #8
0
ファイル: phenotypes.py プロジェクト: kuod/pygcta
 def read_phen(self,fn_phen = None):
     """
     Read the phenotype file `fn_phen` with ``Pheno`` and store its values on
     ``self.Y`` and the second column of its ``iid`` array on ``self.SID``.
     """
     loaded = Pheno(fn_phen).read()
     self.Y = loaded.val
     self.SID = loaded.iid[:,1]
コード例 #9
0
    def setUpClass(self):
        """Create the temp output directory and open the shared synth readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)

        # Base path: three directory levels above the folder holding this file.
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.pythonpath = os.path.abspath(os.path.join(this_dir, "..", "..", ".."))
        base = self.pythonpath

        self.snpreader_whole = Bed(base + "/tests/datasets/synth/all")
        self.covariate_whole = Pheno(base + "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(base + "/tests/datasets/synth/pheno_10_causals.txt")
コード例 #10
0
ファイル: loadFile.py プロジェクト: wenwenyu/PrivSTRAT
def getData(filename):
    """Open the Bed reader for `filename` and load labels from its .fam file.

    Returns:
        ``[y, sFil]`` where ``y`` is column 3 of the .fam values (unshifted)
        and ``sFil`` is the ``Bed`` reader.
    """
    # Indentation is now uniform; the original mixed spaces and tabs, which
    # Python rejects. The standardized genotype matrix that used to be read
    # here was never used, so that full in-memory read has been dropped.
    mph = 3
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    y = yFil.read().val[:, mph]
    return [y, sFil]
コード例 #11
0
    def setUpClass(self):
        """Create the temp output directory and open the shared synth readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)

        # Base path: three directory levels above the folder holding this file.
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.pythonpath = os.path.abspath(os.path.join(this_dir, "..", "..", ".."))
        base = self.pythonpath

        self.snpreader_whole = Bed(base + "/tests/datasets/synth/all")
        self.covariate_whole = Pheno(base + "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(base + "/tests/datasets/synth/pheno_10_causals.txt")
コード例 #12
0
 def read_phen(self, fn_phen=None):
     """
     Read the phenotype file `fn_phen` with ``Pheno`` and store its values on
     ``self.Y`` and the second column of its ``iid`` array on ``self.SID``.
     """
     loaded = Pheno(fn_phen).read()
     self.Y = loaded.val
     self.SID = loaded.iid[:, 1]
コード例 #13
0
ファイル: loadFile.py プロジェクト: jianhao666/SourceCode
def getData(filename):
    """Open the Bed reader for `filename` and load labels from its .fam file.

    Returns:
        ``[y, sFil]`` where ``y`` is a list holding column 3 of the .fam
        values with 1 subtracted from every entry, and ``sFil`` is the
        ``Bed`` reader (opened with ``count_A1=False``).
    """
    label_column = 3
    snp_reader = Bed(filename, count_A1=False)
    fam_values = Pheno(filename + ".fam").read().val
    # Per the original author: this .fam column holds the disease states of
    # the data owners.
    labels = [raw - 1 for raw in fam_values[:, label_column]]
    return [labels, snp_reader]
コード例 #14
0
    def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
        """Run ``single_snp_select`` on the synth dataset with PC covariates.

        The covariates come from ``compute_auto_pcs`` (and are written to a
        pcs file) unless caching is enabled and that file already exists.
        The results are logged and checked with ``self.compare_files``.

        Args:
            h2: forwarded to ``single_snp_select``; also selects the result
                file name ("h2IsHalf" when it equals .5, else "h2Search").
            force_low_rank: forwarded to ``single_snp_select``.
            force_full_rank: forwarded to ``single_snp_select``.
            count_A1: forwarded to ``compute_auto_pcs`` only.
        """
        use_cache = False

        # Input and output locations.
        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        phen_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        folds_fn = self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt"
        pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

        if use_cache and os.path.exists(pcs_fn):
            logging.info("Using top pcs's cache")
            covar = Pheno(pcs_fn)
        else:
            from fastlmm.util import compute_auto_pcs
            covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
            pc_count = covar["vals"].shape[1]
            logging.info("selected number of PCs: {0}".format(pc_count))
            pcs = SnpData(iid=covar['iid'], sid=covar['header'], val=covar['vals'])
            Pheno.write(pcs_fn, pcs)

        # Other names seen here: "lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        mf_name = "lmp"
        runner = mf_to_runner_function(mf_name)(20)  # built but not handed to the call below

        logging.info(
            "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
                h2, force_low_rank, force_full_rank))
        suffix = "h2IsHalf" if h2 == .5 else "h2Search"
        result_file_name = "sel_plus_pc_{0}".format(suffix)
        output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"

        k_values = list(range(11)) + [
            20, 30, 40, 50, 60, 70, 80, 90, 100, 125,
            160, 200, 250, 320, 400, 500, 630, 800, 1000,
        ]
        results = single_snp_select(
            test_snps=bed_fn,
            G=bed_fn,
            pheno=phen_fn,
            k_list=k_values,
            h2=h2,
            n_folds=folds_fn,
            covar=covar,
            output_file_name=output_file_name,
            force_low_rank=force_low_rank,
            force_full_rank=force_full_rank,
            GB_goal=2,
            count_A1=False,
        )
        logging.info(results.head())
        self.compare_files(results, result_file_name)
コード例 #15
0
ファイル: estHerit.py プロジェクト: wenwenyu/PrivSTRAT
def getData(filename="", mph=3, UseCov=False):
    """Open the genotype and phenotype readers for `filename`.

    Starts from ``Bed(filename)`` and ``Pheno(filename + ".fam")``. When
    ``filename + ".phen"`` exists it replaces the phenotype reader. The
    readers are passed through ``intersect_apply`` whenever a ``.cov`` file
    (only if ``UseCov``) or a ``.phen`` file is present.

    Args:
        filename: dataset prefix.
        mph: accepted but not used in this function.
        UseCov: whether an existing ``.cov`` file takes part in the
            intersection.

    Returns:
        ``[yFil, sFil]``: the phenotype reader followed by the SNP reader.
    """
    snp_reader = Bed(filename)
    pheno_reader = Pheno(filename + ".fam")

    cov_path = filename + ".cov"
    if isfile(cov_path) and UseCov:
        cov_reader = Pheno(cov_path)
        snp_reader, pheno_reader, cov_reader = intersect_apply(
            [snp_reader, pheno_reader, cov_reader])

    phen_path = filename + ".phen"
    if isfile(phen_path):
        pheno_reader = Pheno(phen_path)
        snp_reader, pheno_reader = intersect_apply([snp_reader, pheno_reader])

    return [pheno_reader, snp_reader]
コード例 #16
0
def read_phenotype(phenofile, missing_char = 'NA', phen_index = 1):
    """Read a phenotype file and remove missing values.

    Args:
        phenofile : :class:`str`
            path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ...
        missing_char : :class:`str`
            The character that denotes a missing phenotype value; 'NA' by default.
        phen_index : :class:`int`
           The index of the phenotype (counting from 1) if multiple phenotype columns present in phenofile

    Returns:
        ``gtarray(y, ids=pheno_ids)`` where ``y`` is the (n, 1) array of
        non-missing phenotype values from the selected column and
        ``pheno_ids`` is the matching vector of individual IDs (IID).
    """
    pheno = Pheno(phenofile, missing=missing_char)[:,phen_index-1].read()
    y = np.array(pheno.val)
    # reshape returns a new array; the result used to be thrown away, which
    # left this normalization to a single column without any effect.
    y = y.reshape((y.shape[0],1))
    pheno_ids = np.array(pheno.iid)[:,1]
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y[:,0]))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan,:]
        pheno_ids = pheno_ids[y_not_nan]
    print('Number of non-missing phenotype observations: ' + str(y.shape[0]))
    return gtarray(y,ids=pheno_ids)
コード例 #17
0
    def estVar(self, num, epsilon):
        """Return variance estimates, with Laplace noise when epsilon >= 0.

        Args:
            num: number of chunks handed to ``self.divideData``.
            epsilon: when negative, the first ``divideData`` estimate is
                returned untouched; otherwise it is split 10% / 45% / 45%
                between ``estVarY`` and the two ``Lap`` draws.

        Returns:
            ``varEsts[0]`` for negative epsilon, else ``[sg2, se2]`` with
            both values capped at the ``estVarY`` result.
        """
        filename = self.BED.filename
        y = Pheno(filename + ".fam").read().val[:, 3]
        varEsts = self.divideData(filename, num=num)
        if epsilon < 0:
            return varEsts[0]

        eps_y = .1 * epsilon
        eps_e = .45 * epsilon
        eps_g = .45 * epsilon
        vary = self.estVarY(y, eps_y)
        chunks = float(num)

        # Mean of the second component over all chunks, plus noise.
        mean_e = sum([est[1] for est in varEsts]) / chunks
        se2 = mean_e + Lap(0.0, vary / (eps_e * chunks))
        if se2 < 0:
            se2 = 0
        se2 = min(se2, vary)

        # Mean of the first component over all chunks, plus noise.
        mean_g = sum([est[0] for est in varEsts]) / chunks
        sg2 = mean_g + Lap(0.0, vary / (eps_g * chunks))
        if sg2 < 0:
            # A negative estimate is replaced by 1% of vary, not by zero.
            sg2 = .01 * vary
        sg2 = min(sg2, vary)

        return [sg2, se2]
コード例 #18
0
def read_covariates(covar_file, ids_to_match, missing):
    """Read a covariate file and reorder its rows to match `ids_to_match`.

    Args:
        covar_file: path handed to ``Pheno``.
        ids_to_match: 2-D array of IDs; each row is compared as a tuple.
        missing: missing-value marker handed to ``Pheno``.

    Returns:
        ``[X, X_names, pheno_in]``: the covariate matrix with a leading
        column of ones, reordered to the matched IDs; the column names
        ('Intercept' first); and a boolean array marking which rows of
        ``ids_to_match`` were found among the covariate rows without NaNs.
    """
    # Read covariate file
    covar_f = Pheno(covar_file, missing=missing).read()
    ids = covar_f.iid
    # Get covariate values, with a leading intercept column of ones
    n_X = covar_f._col.shape[0] + 1
    X = np.ones((covar_f.val.shape[0], n_X))
    X[:, 1:n_X] = covar_f.val
    # Get covariate names. The buffer is as wide as the 'S20' names copied
    # into it; it used to be 'S10', which silently cut names to 10 bytes.
    X_names = np.zeros((n_X), dtype='S20')
    X_names[0] = 'Intercept'
    X_names[1:n_X] = np.array(covar_f._col, dtype='S20')
    # Remove NAs
    NA_rows = np.isnan(X).any(axis=1)
    n_NA_row = np.sum(NA_rows)
    if n_NA_row > 0:
        print(
            'Number of rows removed from covariate file due to missing observations: '
            + str(n_NA_row))
        X = X[~NA_rows]
        ids = ids[~NA_rows]
    # presumably maps an ID tuple to its row index in X -- verify id_dict_make
    id_dict = id_dict_make(ids)
    # Match with pheno_ids
    ids_to_match_tuples = [tuple(x) for x in ids_to_match]
    common_ids = id_dict.viewkeys() & set(ids_to_match_tuples)
    pheno_in = np.array([(tuple(x) in common_ids) for x in ids_to_match])
    match_ids = ids_to_match[pheno_in, :]
    X_id_match = np.array([id_dict[tuple(x)] for x in match_ids])
    X = X[X_id_match, :]
    return [X, X_names, pheno_in]
コード例 #19
0
    def test_intersection(self):
        """Check the types produced by intersect_apply and slicing on a SnpKernel."""

        from pysnptools.standardizer import Unit
        from pysnptools.kernelreader import SnpKernel
        from pysnptools.snpreader import Pheno
        from pysnptools.kernelreader._subset import _KernelSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        bed_reader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed",
                         count_A1=False)
        kernel = SnpKernel(bed_reader, stdizer.Identity())

        # Drop the first iid from the phenotypes so the intersection has work to do.
        phenotypes = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:, :]

        # SnpKernel is special because it standardizes AFTER intersecting.
        intersected = intersect_apply([kernel, phenotypes])
        kernel_common, phenotypes = intersected
        assert isinstance(kernel_common.snpreader,
                          _SnpSubset) and not isinstance(kernel_common, _KernelSubset)

        # What happens with fancy selection?
        every_other = kernel[::2]
        assert isinstance(every_other, SnpKernel)

        logging.info("Done with test_intersection")
コード例 #20
0
ファイル: run_fastlmm.py プロジェクト: caudjcc/emaize-1
def test_single_snp(args):
    """Run FastLMM ``single_snp`` for trait1..trait3 and save the results.

    Reads the phenotype table and SNP data named in `args`, writes one Pheno
    text file per trait into ``args.output_dir``, runs ``single_snp`` and
    stores each result frame with ``to_hdf``. When ``args.manhattan`` is
    set, a Manhattan plot PDF is written for each trait as well.
    """
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # Two identical columns built from the 'id' column.
    id_column = phenotypes['id'].values.astype('S')[:, np.newaxis]
    iid = np.repeat(id_column, 2, axis=1)

    if args.sample_indices_file is None:
        training_mask = (phenotypes['type'] == 'training').values
        sample_indices = np.nonzero(training_mask)[0]
    else:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    df_pheno = phenotypes[phenotypes['type'] == 'training'].copy()
    for id_name in ('fid', 'iid'):
        df_pheno[id_name] = df_pheno['id']

    for trait in ('trait1', 'trait2', 'trait3'):
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        trait_table = df_pheno[['fid', 'iid', trait]]
        trait_table.to_csv(pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)

        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps,
                                pheno,
                                K0=K0,
                                count_A1=True,
                                GB_goal=args.GB_goal)
        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_file)
        results_df.to_hdf(result_file, trait)

        if not args.manhattan:
            continue
        plot_file = os.path.join(args.output_dir, 'manhattan.%s.pdf' % trait)
        logger.info('create Manhattan plot: ' + plot_file)
        plt.clf()
        plot_input = results_df.as_matrix(["Chr", "ChrPos", "PValue"])
        flutil.manhattan_plot(plot_input,
                              pvalue_line=1e-5,
                              xaxis_unit_bp=False)
        plt.savefig(plot_file)
コード例 #21
0
ファイル: test.py プロジェクト: MicrosoftGenomics/PySnpTools
    def test_c_reader_pheno(self):
        """Round-trip a Pheno file and compare alternate ways of loading it."""
        toy_phe = self.currentFolder + "/examples/toydata.phe"
        snpdata1 = Pheno(toy_phe).read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        # Inject a missing value to test writing and reading missing values
        snpdata1.val[1,0] = np.NaN
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        reloaded_reader = Pheno(output)
        _fortesting_JustCheckExists().input(reloaded_reader)
        _ = str(reloaded_reader)  # exercise the string conversion
        snpdata2 = reloaded_reader.read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        # Fresh copy without the injected NaN for the remaining comparisons.
        snpdata1 = Pheno(toy_phe).read()
        import pysnptools.util.pheno as pstpheno
        pheno_dict = pstpheno.loadOnePhen(toy_phe,missing="")
        from_dict = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, from_dict.val, decimal=10)

        pheno_dict = pstpheno.loadOnePhen(toy_phe,missing="",vectorize=True)
        assert len(pheno_dict['vals'].shape)==1, "test 1-d array of values"
        from_dict = Pheno(pheno_dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, from_dict.val, decimal=10)

        ids_only = Pheno(None,iid_if_none=snpdata1.iid)
        assert (ids_only.row == snpdata1.row).all() and ids_only.col_count == 0

        for variant in ("/examples/toydata.id.phe", "/examples/toydata.fid.phe"):
            other = Pheno(self.currentFolder + variant).read()
            np.testing.assert_array_almost_equal(snpdata1.val, other.val, decimal=10)
コード例 #22
0
def read_covariates(covar, pheno_ids=None, missing_char = 'NA'):
    """Read a covariate file into a ``gtarray`` keyed by IID.

    Args:
        covar: covariate file handed to ``Pheno``.
        pheno_ids: optional IDs that must all be present in the covariates.
        missing_char: missing-value marker handed to ``Pheno``.

    Returns:
        The ``gtarray`` of covariate values after ``fill_NAs()`` was called.

    Raises:
        ValueError: if any entry of ``pheno_ids`` is absent from the
            covariate IDs.
    """
    covar_data = Pheno(covar, missing=missing_char).read()
    values = np.array(covar_data.val)
    X = gtarray(values, ids=np.array(covar_data.iid)[:,1])
    if pheno_ids is not None:
        in_covar = np.array([x in X.id_dict for x in pheno_ids])
        if (~in_covar).any():
            raise ValueError('Missing covariate values for some phenotyped individuals')
    X.fill_NAs()
    return X
コード例 #23
0
ファイル: fastlmmmodel.py プロジェクト: DSLituiev/FaST-LMM
def _pheno_fixup(pheno_input, iid_if_none=None, missing='-9'):
    """Turn `pheno_input` into a reader, trying ``Pheno`` first.

    Args:
        pheno_input: input handed to ``Pheno``; on failure handed to
            ``_snps_fixup`` instead.
        iid_if_none: forwarded to whichever constructor is used.
        missing: missing-value marker handed to ``Pheno``.

    Returns:
        The ``Pheno`` reader when it loads, otherwise the result of
        ``_snps_fixup``.
    """
    try:
        ret = Pheno(pheno_input, iid_if_none, missing=missing)
        ret.iid  #doing this just to force file load
    except Exception:
        # Only ordinary errors trigger the fallback; KeyboardInterrupt and
        # SystemExit are no longer swallowed as a bare `except:` did.
        return _snps_fixup(pheno_input, iid_if_none=iid_if_none)
    return ret
コード例 #24
0
ファイル: loadFile.py プロジェクト: seanken/AllelicTest
def getData(filename):
	"""Split the genotypes by .fam label and summarize each group.

	Labels are column 3 of the .fam values minus 1. Rows with a label > 0
	form the case group and rows with a label < 1 the control group; each
	group's genotype matrix is passed to ``getMarginals``.

	Returns:
		``[r, s, snpList]``: the ``getMarginals`` result for controls, the
		one for cases, and the ``sid`` list of the Bed reader.
	"""
	label_column=3
	snp_reader=Bed(filename)
	fam_reader=Pheno(filename+".fam")
	snpList=snp_reader.sid
	labels=[raw-1 for raw in fam_reader.read().val[:,label_column]]

	case_rows=[pos for pos,label in enumerate(labels) if label>0]
	control_rows=[pos for pos,label in enumerate(labels) if label<1]
	case_reader=snp_reader[case_rows,:]
	control_reader=snp_reader[control_rows,:]

	control_values=control_reader.read().val
	case_values=case_reader.read().val

	r=getMarginals(control_values)
	s=getMarginals(case_values)

	return [r,s,snpList]
コード例 #25
0
    def _sel_plus_pc(self,h2,force_low_rank,force_full_rank,count_A1=None):
        """Run ``single_snp_select`` on the synth dataset with PC covariates.

        The covariates come from ``compute_auto_pcs`` (and are written to a
        pcs file) unless caching is enabled and that file already exists.
        The results are logged and checked with ``self.compare_files``.

        Args:
            h2: forwarded to ``single_snp_select``; also selects the result
                file name ("h2IsHalf" when it equals .5, else "h2Search").
            force_low_rank: forwarded to ``single_snp_select``.
            force_full_rank: forwarded to ``single_snp_select``.
            count_A1: forwarded to ``compute_auto_pcs`` only.
        """
        use_cache = False

        # Input and output locations.
        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        phen_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        folds_fn = self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt"
        pcs_fn = os.path.join(self.tempout_dir,"sel_plus_pc.pcs.txt")

        if use_cache and os.path.exists(pcs_fn):
            logging.info("Using top pcs's cache")
            covar = Pheno(pcs_fn)
        else:
            from fastlmm.util import compute_auto_pcs
            covar = compute_auto_pcs(bed_fn,count_A1=count_A1)
            pc_count = covar["vals"].shape[1]
            logging.info("selected number of PCs: {0}".format(pc_count))
            pcs = SnpData(iid=covar['iid'],sid=covar['header'],val=covar['vals'])
            Pheno.write(pcs_fn,pcs)

        # Other names seen here: "lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
        mf_name = "lmp"
        runner = mf_to_runner_function(mf_name)(20)  # built but not handed to the call below

        logging.info("Working on h2={0},force_low_rank={1},force_full_rank={2}".format(h2,force_low_rank,force_full_rank))
        suffix = "h2IsHalf" if h2 == .5 else "h2Search"
        result_file_name = "sel_plus_pc_{0}".format(suffix)
        output_file_name = os.path.join(self.tempout_dir,result_file_name)+".txt"

        k_values = list(range(11)) + [
            20,30,40,50,60,70,80,90,100,125,
            160,200,250,320,400,500,630,800,1000,
        ]
        results = single_snp_select(
            test_snps=bed_fn,
            G=bed_fn,
            pheno=phen_fn,
            k_list=k_values,
            h2=h2,
            n_folds=folds_fn,
            covar=covar,
            output_file_name=output_file_name,
            force_low_rank=force_low_rank,
            force_full_rank=force_full_rank,
            GB_goal=2,
            count_A1=False,
        )
        logging.info(results.head())
        self.compare_files(results,result_file_name)
コード例 #26
0
 def test_covar_by_chrom_mixing(self):
     """Run single_snp with both `covar` and `covar_by_chrom` and compare the output."""
     logging.info(
         "TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
     snps = Bed(self.bedbase)
     pheno = self.phen_fn

     # Covariate data re-wrapped as SnpData under the column name "pheno-1".
     loaded = Pheno(self.cov_fn).read()
     covar = SnpData(iid=loaded.iid, sid=["pheno-1"], val=loaded.val)

     # The same covariate file is used for each of chromosomes 1..5.
     covar_by_chrom = dict((chrom, self.cov_fn) for chrom in xrange(1, 6))

     output_file = self.file_name("covar_by_chrom_mixing")
     frame = single_snp(snps,
                        pheno,
                        covar=covar,
                        covar_by_chrom=covar_by_chrom,
                        output_file_name=output_file)
     self.compare_files(frame, "covar_by_chrom_mixing")
コード例 #27
0
    def test_intersection_Dist2Snp(self):
        """Check the types produced by intersect_apply and slicing on a dist-as-snp reader."""
        from pysnptools.snpreader._dist2snp import _Dist2Snp
        from pysnptools.snpreader import Pheno
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        dist_reader = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
        as_snp = dist_reader.as_snp(max_weight=25)

        # Drop the first iid from the phenotypes so the intersection has work to do.
        phenotypes = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:,:]

        intersected = intersect_apply([as_snp,phenotypes])
        common, phenotypes = intersected
        assert isinstance(common.distreader,_DistSubset) and not isinstance(common,_SnpSubset)

        # What happens with fancy selection?
        every_other = as_snp[::2,:]
        assert isinstance(every_other,_Dist2Snp)

        logging.info("Done with test_intersection")
コード例 #28
0
    def test_intersection_Snp2Dist(self):
        """Check the types produced by intersect_apply and slicing on a snp-as-dist reader."""
        from pysnptools.distreader._snp2dist import _Snp2Dist
        from pysnptools.snpreader import Pheno, Bed
        from pysnptools.distreader._subset import _DistSubset
        from pysnptools.snpreader._subset import _SnpSubset
        from pysnptools.util import intersect_apply

        bed_reader = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed",count_A1=True)
        as_dist = bed_reader.as_dist(max_weight=2)

        # Drop the first iid from the phenotypes so the intersection has work to do.
        phenotypes = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:,:]

        intersected = intersect_apply([as_dist,phenotypes])
        common, phenotypes = intersected
        assert isinstance(common.snpreader,_SnpSubset) and not isinstance(common,_DistSubset)

        # What happens with fancy selection?
        every_other = as_dist[::2,:]
        assert isinstance(every_other,_Snp2Dist)

        logging.info("Done with test_intersection")
コード例 #29
0
    def test_multipheno(self):
        """Compare ``single_snp_scale`` on multi-column phenotypes with per-column ``single_snp``.

        For 2, 5 and 1 random phenotype columns, the p-values of one
        multi-phenotype run must match (within 1e-5, per SNP, ignoring
        which phenotype produced which value) the concatenated results of
        running ``single_snp`` on each column separately.
        """
        logging.info("test_multipheno")

        random_state = RandomState(29921)
        pheno_reference = Pheno(self.phen_fn).read()
        for pheno_count in [2, 5, 1]:
            val = random_state.normal(loc=pheno_count,
                                      scale=pheno_count,
                                      size=(pheno_reference.iid_count,
                                            pheno_count))
            pheno_col = ['pheno{0}'.format(i) for i in range(pheno_count)]
            pheno_multi = SnpData(iid=pheno_reference.iid,
                                  sid=pheno_col,
                                  val=val)

            reference = pd.concat([
                single_snp(test_snps=self.bed,
                           pheno=pheno_multi[:, pheno_index],
                           covar=self.cov_fn)
                for pheno_index in range(pheno_count)
            ])
            frame = single_snp_scale(test_snps=self.bed,
                                     pheno=pheno_multi,
                                     covar=self.cov_fn)

            # The old message formatted an undefined name (`reffile`), so a
            # length mismatch raised NameError instead of a useful failure.
            assert len(frame) == len(reference), (
                "# of rows ({0}) differs from reference ({1}) for pheno_count={2}"
                .format(len(frame), len(reference), pheno_count))
            # This ignores which pheno produces which pvalue
            for sid in sorted(set(reference.SNP)):
                pvalue_frame = np.array(
                    sorted(frame[frame['SNP'] == sid].PValue))
                pvalue_reference = np.array(
                    sorted(reference[reference['SNP'] == sid].PValue))
                # `.all` has to be *called*; the bare bound method is always
                # truthy, so this assertion could never fail before.
                assert (
                    abs(pvalue_frame - pvalue_reference) < 1e-5
                ).all(), "pair {0} differs too much from reference".format(sid)
コード例 #30
0
ファイル: fit_hlmm_model.py プロジェクト: AlexTISYoung/hlmm
        n_V = 1
        V_names = np.array(['Intercept'])
    n_pars = n_X + n_V + 1
    print(str(n_pars) + ' parameters in model')

    # Get sample size
    n = y.shape[0]
    if n == 0:
        raise (ValueError('No non-missing observations with both phenotype and genotype data'))
    print(str(n) + ' individuals with no missing phenotype or covariate observations')
    n = float(n)

    #### Read random effect genotypes ####
    if args.random_gts is not None:
        if args.random_gts_txt:
            random_gts_f = Pheno(args.random_gts)
        else:
            random_gts_f = Bed(args.random_gts)
        random_gts_ids = np.array(random_gts_f.iid)
        random_gts_f = random_gts_f.read()
        # Match to phenotypes
        pheno_id_dict = id_dict_make(pheno_ids)
        G_random = random_gts_f.val
        G = np.empty((y.shape[0], G_random.shape[1]))
        G[:] = np.nan
        for i in xrange(0, random_gts_ids.shape[0]):
            if tuple(random_gts_ids[i, :]) in pheno_id_dict:
                G[pheno_id_dict[tuple(random_gts_ids[i, :])], :] = G_random[i, :]
        del G_random
        # Check for NAs
        random_isnan = np.isnan(G)
コード例 #31
0
        default='NA')
    parser.add_argument('--no_h2_estimate',
                        action='store_true',
                        default=False,
                        help='Suppress output of h2 estimate')

    args = parser.parse_args()

    ##### Check minimal model is specified #####
    if args.mean_covar is None and args.var_covar is None and args.random_gts is None:
        raise (ValueError(
            'Must specify at least one of: mean_covar, var_covar, random_gts'))

    ####################### Read in data #########################
    #### Read phenotype ###
    pheno = Pheno(args.phenofile, missing=args.missing_char).read()
    y = np.array(pheno.val)
    pheno_ids = np.array(pheno.iid)
    if y.ndim == 1:
        pass
    elif y.ndim == 2:
        y = y[:, args.phen_index - 1]
    else:
        raise (ValueError('Incorrect dimensions of phenotype array'))
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan]
        pheno_ids = pheno_ids[y_not_nan, :]
    # Make id dictionary
    print('Number of non-missing y observations: ' + str(y.shape[0]))
コード例 #32
0
ファイル: run_fastlmm.py プロジェクト: caudjcc/emaize-1
def run_fastlmm(args):
    """Train and evaluate a FastLMM model for trait1..trait3.

    Reads the phenotype table and SNP data named in `args`, samples
    ``args.n_snps`` SNP indices at random, and for every trait fits a
    ``FastLMM`` model on the training rows, predicts the test rows, writes
    the predictions to an HDF5 file and pickles the fitted model, all
    inside ``args.output_dir``.
    """
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # Two identical columns built from the 'id' column.
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    # Seed before any random draw so the SNP sampling below is reproducible.
    # The original called the non-existent `np.seed` (AttributeError whenever
    # a seed was given) and only did so after the sampling had happened.
    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    # NOTE(review): np.random.choice samples with replacement by default, so
    # duplicate SNP indices are possible -- confirm that is intended.
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid,
                            args.snp_file,
                            transpose=args.transpose_x,
                            snp_indices=sel_snps,
                            std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' %
                test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        # Write the training rows of this trait as a headerless Pheno file.
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(pheno_file,
                                                                index=False,
                                                                sep='\t',
                                                                header=False)
        pheno = Pheno(pheno_file)
        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :],
                  y=pheno,
                  K0_train=K0,
                  penalty=args.penalty,
                  Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)
        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values
        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)

        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
コード例 #33
0
# Via NumPy-style indexing, these allow reading by name and genetic property

#Topic: Other SnpReaders and how to write
#
# This excerpt shows that a PLINK phenotype text file can be opened with the
# same reader interface as a Bed file, and that SNP data read from a Bed file
# can be written back out in Pheno format.
# NOTE: the `print` lines below use Python 2 statement syntax.

#Read from the PLINK phenotype file (text) instead of a Bed file
# Looks like:
#cid0P0 cid0P0 0.4853395139922632
#cid1P0 cid1P0 -0.2076984565752155
#cid2P0 cid2P0 1.4909084058931985
#cid3P0 cid3P0 -1.2128996652683697
#cid4P0 cid4P0 0.4293203431508744
#...

from pysnptools.snpreader import Pheno

# Constructing the reader only names the file; the values are loaded by read() below.
phenoreader = Pheno("pheno_10_causals.txt")
# Show the reader together with its iid count, sid count, sid names and positions.
print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos
# Expected output:
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan  nan  nan]]
phenodata = phenoreader.read()
print phenodata.val
# Expected output (one row per individual, one column per phenotype):
#[[  4.85339514e-01]
# [ -2.07698457e-01]
# [  1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write 1st 10 iids and sids of Bed data into Pheno format
# NOTE(review): `Bed` is not imported in this excerpt -- presumably it is
# imported earlier in the tutorial; confirm before running this part alone.
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

#Write it to Bed format
コード例 #34
0
class TestHeritabilitySpatialCorrection(unittest.TestCase):
    """Regression tests for ``heritability_spatial_correction``.

    Every ``test_one``/``test_two``/``test_three`` case writes the returned
    table as a tab-separated file under ``tempout_dir`` and compares it with
    the stored reference file of the same name.
    """

    @classmethod
    def setUpClass(cls):
        """Create the output directory and open the shared synthetic readers."""
        from pysnptools.util import create_directory_if_necessary
        create_directory_if_necessary(cls.tempout_dir, isfile=False)
        # Three directory levels above this test file.
        cls.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "..", ".."))
        cls.snpreader_whole = Bed(cls.pythonpath +
                                  "/tests/datasets/synth/all",
                                  count_A1=False)
        cls.pheno_whole = Pheno(cls.pythonpath +
                                "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/heritability_spatial_correction"

    def file_name(self, testcase_name):
        """Return the output path for *testcase_name*, deleting any stale file."""
        temp_fn = os.path.join(self.tempout_dir, testcase_name)
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    @staticmethod
    def _alpha_list():
        """Return the alpha values shared by all tests.

        Two log-spaced values between 10**2 and 4000, truncated to ``int``.
        """
        return [int(v) for v in np.logspace(2, np.log10(4000), 2)]

    def _check_against_reference(self, dataframe, fn, tmp_outfile):
        """Write *dataframe* to *tmp_outfile* and compare it with reference *fn*."""
        dataframe.to_csv(tmp_outfile, sep="\t", index=False)
        reference_outfile = TestFeatureSelection.reference_file(
            "heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmp_outfile, reference_outfile, tolerance)
        self.assertTrue(
            out,
            "msg='{0}', ref='{1}', tmp='{2}'".format(msg, reference_outfile,
                                                     tmp_outfile))

    def test_one(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.

        Uses every individual, a two-column phenotype (the same column
        twice) and ``just_testing=True``.
        '''
        fn = "one.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        half = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid,
                        sid=["pheno0", "pheno1"],
                        val=np.c_[half, half])

        # One (i, -i) coordinate pair per individual.
        spatial_coor = [[i, -i]
                        for i in range(self.snpreader_whole.iid_count)]
        dataframe = heritability_spatial_correction(self.snpreader_whole,
                                                    spatial_coor,
                                                    self.snpreader_whole.iid,
                                                    self._alpha_list(),
                                                    2,
                                                    pheno,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=True)

        self._check_against_reference(dataframe, fn, tmpOutfile)

    def test_two(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.

        Uses only the first 10 individuals with jackknife and permutation
        counts switched on and ``just_testing=False``.
        '''
        fn = "two.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        snpreader = self.snpreader_whole[:10, :]

        spatial_coor = [[i, -i] for i in range(snpreader.iid_count)]
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    self._alpha_list(),
                                                    2,
                                                    self.pheno_whole,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=False)

        self._check_against_reference(dataframe, fn, tmpOutfile)

    def test_three(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.

        Same data as ``test_two`` but with the jackknife and permutation
        counts all set to zero.
        '''
        fn = "three.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        snpreader = self.snpreader_whole[:10, :]

        spatial_coor = [[i, -i] for i in range(snpreader.iid_count)]
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    self._alpha_list(),
                                                    2,
                                                    self.pheno_whole,
                                                    jackknife_count=0,
                                                    permute_plus_count=0,
                                                    permute_times_count=0,
                                                    just_testing=False)

        self._check_against_reference(dataframe, fn, tmpOutfile)

    def test_doctest(self):
        """Run the doctests of ``heritability_spatial_correction.py``.

        The doctests are run from the directory above this file; the
        previous working directory is restored even if running them raises.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        try:
            result = doctest.testfile("../heritability_spatial_correction.py")
        finally:
            # Never leave later tests running in a changed directory.
            os.chdir(old_dir)
        self.assertEqual(result.failed, 0, "failed doc test: " + __file__)
コード例 #35
0
ファイル: bed.py プロジェクト: eric-czech/PySnpTools
                val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
            val = val[iid_index, :]  #reorder or trim any extra allocation
            if not SnpReader._array_properties_are_ok(val, order, dtype):
                val = val.copy(order=order)
            self._close_bed()

        return val


if __name__ == "__main__":
    # Script entry point: configure logging, run the developer toggles below,
    # then execute this module's doctests.
    logging.basicConfig(level=logging.INFO)

    # Always-on toggle: resolve the toy phenotype example file.
    # NOTE(review): pheno_fn is not used again in this block. doctest.testmod()
    # runs examples with this module's globals, so a doctest may rely on it
    # (or on a side effect of example_file) -- confirm before removing.
    if True:
        from pysnptools.util import example_file
        pheno_fn = example_file("pysnptools/examples/toydata.phe")

    # Disabled scratch code (never runs while the condition is False): converts
    # the toy Pheno file to Bed format under tempdir/.
    if False:
        from pysnptools.snpreader import Pheno, Bed
        import pysnptools.util as pstutil
        import os
        print(os.getcwd())
        snpdata = Pheno(
            '../examples/toydata.phe').read()  # Read data from Pheno format
        pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.bed")
        Bed.write("tempdir/toydata.5chrom.bed", snpdata,
                  count_A1=False)  # Write data in Bed format

    import doctest
    # ELLIPSIS lets "..." in an expected output match any text.
    doctest.testmod(optionflags=doctest.ELLIPSIS)
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
コード例 #36
0
class TestLinRegTrain(unittest.TestCase):
    """Regression tests for ``LinearRegression`` fit/predict on synthetic data."""

    @classmethod
    def setUpClass(cls):
        """Create the output directory and open the shared synthetic readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(cls.tempout_dir, isfile=False)
        # Three directory levels above this test file.
        cls.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))

        cls.snpreader_whole = Bed(cls.pythonpath + "/tests/datasets/synth/all")
        cls.covariate_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/cov.txt")
        cls.pheno_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/linear_regression"

    def file_name(self,testcase_name):
        """Return the ``.dat`` output path for *testcase_name*, deleting any stale file."""
        temp_fn = os.path.join(self.tempout_dir,testcase_name+".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_lr_real(self):
        """Fit, save, reload and predict with ``LinearRegression``.

        The covariate is replaced by 0, 1, 2, ... and the phenotype by
        ``2*covar + 100 + 10*normal(0, 1)`` (seeded).  The model is trained
        on every individual except the first 10, round-tripped through
        joblib, and its predictions on the training set and on the 10
        held-out individuals are compared with stored reference files.
        """
        do_plot = False
        if do_plot:
            # Only needed for the optional plots; keeps the test runnable
            # without a plotting stack installed.
            import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in range(covar.iid_count)])
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Reference ordinary least-squares solution computed directly with
        # NumPy (design matrix = covariate plus a column of ones).
        Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
        # rcond=-1 pins the historical default and avoids NumPy's FutureWarning.
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0], rcond=-1)
        bs=lsqSol[0] #weights
        r2=lsqSol[1] #squared residuals
        D=lsqSol[2]  #rank of design matrix
        N=pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
            nLL -= 0.5*D*np.log(2*np.pi*sigma2) #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name,K0_train,K0_whole_test in [("Identity Kernel",None,None)]:

            # Reference files are named after the first configuration only.
            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train, X=covariate_train, y=pheno_train)

            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = model.predict(K0_whole_test=K0_train, X=covariate_train) #test on train
                output_file = self.file_name("lr_reala_"+name)
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_pheno,"lr2a_"+first_name)
                self.compare_files(covar2,"lr2a.cov_"+first_name)

            #Predict with model (test on test)
            predicted_pheno, covar  = model.predict(K0_whole_test=K0_whole_test, X=covariate_test) #test on test
            output_file = self.file_name("lr_realb_"+name)
            Dat.write(output_file,predicted_pheno)
            covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_"+name)
            Dat.write(output_file,covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                pylab.show()

            self.compare_files(predicted_pheno,"lr2b_"+first_name)
            self.compare_files(covar2,"lr2b.cov_"+first_name)

    def compare_files(self,answer,ref_base):
        """Assert that *answer* matches the reference ``.dat`` file *ref_base*.

        Row ids and column ids must be equal and every value must be within
        1e-4 of the reference value.
        """
        reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat") #Uses same results folder as lmm_train
        reference=Dat(reffile).read()
        assert np.array_equal(answer.col,reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(answer.row,reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in range(reference.row_count):
            for sid_index in range(reference.col_count):
                a_v = answer.val[iid_index,sid_index]
                r_v = reference.val[iid_index,sid_index]
                assert abs(a_v - r_v) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile)

    def test_doctest(self):
        """Run the doctests of ``linear_regression.py``.

        The doctests are run from the directory above this file; the
        previous working directory is restored even if running them raises.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..")
        try:
            result = doctest.testfile("../linear_regression.py")
        finally:
            # Never leave later tests running in a changed directory.
            os.chdir(old_dir)
        self.assertEqual(result.failed, 0, "failed doc test: " + __file__)
コード例 #37
0
class TestLinRegTrain(unittest.TestCase):
    """Regression tests for ``LinearRegression`` fit/predict on synthetic data."""

    @classmethod
    def setUpClass(cls):
        """Create the output directory and open the shared synthetic readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(cls.tempout_dir, isfile=False)
        # Three directory levels above this test file.
        cls.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "..", ".."))

        cls.snpreader_whole = Bed(cls.pythonpath +
                                  "/tests/datasets/synth/all",
                                  count_A1=False)
        cls.covariate_whole = Pheno(cls.pythonpath +
                                    "/tests/datasets/synth/cov.txt")
        cls.pheno_whole = Pheno(cls.pythonpath +
                                "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/linear_regression"

    def file_name(self, testcase_name):
        """Return the ``.dat`` output path for *testcase_name*, deleting any stale file."""
        temp_fn = os.path.join(self.tempout_dir, testcase_name + ".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_lr_real(self):
        """Fit, save, reload and predict with ``LinearRegression``.

        The covariate is replaced by 0, 1, 2, ... and the phenotype by
        ``2*covar + 100 + 10*normal(0, 1)`` (seeded).  The model is trained
        on every individual except the first 10, round-tripped through
        joblib, and its predictions on the training set and on the 10
        held-out individuals are compared with stored reference files.
        """
        do_plot = False
        if do_plot:
            # Only needed for the optional plots; keeps the test runnable
            # without a plotting stack installed.
            import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
        test_idx = np.r_[0:10]  # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in range(covar.iid_count)])
        covariate_train = covar[train_idx, :].read()
        covariate_test = covar[test_idx, :].read()
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(
            size=covar.val.shape) * 10

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".",
                       covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Reference ordinary least-squares solution computed directly with
        # NumPy (design matrix = covariate plus a column of ones).
        Xtrain = np.c_[covariate_train.val,
                       np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val,
                      np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  #weights
        r2 = lsqSol[1]  #squared residuals
        D = lsqSol[2]  #rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)  #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.",
                       covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val,
                           predicted,
                           yerr,
                           linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:

            # Reference files are named after the first configuration only.
            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train,
                                            X=covariate_train,
                                            y=pheno_train)

            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = model.predict(
                    K0_whole_test=K0_train, X=covariate_train)  #test on train
                output_file = self.file_name("lr_reala_" + name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(
                    iid=covar.row, sid=covar.col[:, 1],
                    val=covar.val)  #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_" + name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.",
                               covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val,
                                   predicted,
                                   yerr,
                                   linestyle='None')
                    pylab.suptitle(
                        name +
                        ": test on train: train X to true target (green) and prediction (red)"
                    )
                    pylab.show()

                self.compare_files(predicted_pheno, "lr2a_" + first_name)
                self.compare_files(covar2, "lr2a.cov_" + first_name)

            #Predict with model (test on test)
            predicted_pheno, covar = model.predict(
                K0_whole_test=K0_whole_test, X=covariate_test)  #test on test
            output_file = self.file_name("lr_realb_" + name)
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(
                iid=covar.row, sid=covar.col[:, 1],
                val=covar.val)  #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.",
                           covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val,
                               predicted,
                               yerr,
                               linestyle='None')
                pylab.suptitle(
                    name +
                    ": test on test: test X to true target (green) and prediction (red)"
                )
                pylab.show()

            self.compare_files(predicted_pheno, "lr2b_" + first_name)
            self.compare_files(covar2, "lr2b.cov_" + first_name)

    def compare_files(self, answer, ref_base):
        """Assert that *answer* matches the reference ``.dat`` file *ref_base*.

        Row ids and column ids must be equal and every value must be within
        1e-4 of the reference value.
        """
        reffile = TestFeatureSelection.reference_file(
            "fastlmm/" + ref_base +
            ".dat")  #Uses same results folder as lmm_train
        reference = Dat(reffile).read()
        assert np.array_equal(
            answer.col,
            reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(
            answer.row,
            reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in range(reference.row_count):
            for sid_index in range(reference.col_count):
                a_v = answer.val[iid_index, sid_index]
                r_v = reference.val[iid_index, sid_index]
                assert abs(
                    a_v - r_v
                ) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(
                    iid_index, sid_index, reffile)

    def test_doctest(self):
        """Run the doctests of ``linear_regression.py``.

        The doctests are run from the directory above this file; the
        previous working directory is restored even if running them raises.
        """
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        try:
            result = doctest.testfile("../linear_regression.py")
        finally:
            # Never leave later tests running in a changed directory.
            os.chdir(old_dir)
        self.assertEqual(result.failed, 0, "failed doc test: " + __file__)
コード例 #38
0
    def test_old(self):
        """Feature selection on the non-chr5 SNPs, then ``single_snp`` on chr5.

        Loads the synthetic Bed/phenotype/covariate files, selects SNPs with
        ``FeatureSelectionInSample`` using the SNPs that are not on
        chromosome 5, and tests the chromosome 5 SNPs with a mix of the full
        background kernel and the selected-SNP kernel.  The result table is
        compared against the stored "old" reference.
        """
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        do_plot = False
        logging.info("TestSingleSnpAllPlusSelect test_old")

        # Open the three inputs and restrict them to the shared sample ids.
        readers = [
            Bed(self.pythonpath + "/tests/datasets/synth/all.bed"),
            Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"),
            Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt"),
        ]
        snp_reader, pheno, cov = intersect_apply(readers)

        # Split the SNPs: chromosome 5 is tested, everything else is background.
        held_out_chrom = 5
        chrom = snp_reader.pos[:, 0]
        background = snp_reader[:, chrom != held_out_chrom].read(order='C').standardize()
        test_snps = snp_reader[:, chrom == held_out_chrom].read(order='C').standardize()

        # Standardize the first phenotype column in place.
        target = pheno.read().val[:, 0]
        target -= target.mean()
        target /= target.std()

        # Covariates are handed over read-only.
        covariates = cov.read().val
        covariates.flags.writeable = False

        logging.info("running feature selection conditioned on background kernel")
        selector = FeatureSelectionInSample(
            max_log_k=7,
            n_folds=7,
            order_by_lmm=True,
            measure="ll",
            random_state=42,
        )
        selection = selector.run_select(background.val, background.val, target, cov=covariates)
        best_k, feat_idx, best_mix, best_delta = selection

        # Optional plot of the out-of-sample error.
        if do_plot:
            selector.plot_results(measure="ll")

        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        # The selected SNPs form the second kernel.
        logging.info(feat_idx)
        selected = background[:, feat_idx]

        results_df = single_snp(
            test_snps,
            pheno,
            G0=background,
            G1=selected,
            mixing=best_mix,
            h2=None,
            leave_out_one_chrom=False,
            output_file_name=self.file_name("old"),
        )

        logging.info("results:")
        logging.info("#"*40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
コード例 #39
0
    def test_old(self):
        """Feature selection on the non-chr5 SNPs, then ``single_snp`` on chr5.

        Loads the synthetic Bed/phenotype/covariate files (``count_A1=False``),
        selects SNPs with ``FeatureSelectionInSample`` using the SNPs that are
        not on chromosome 5, and tests the chromosome 5 SNPs with a mix of
        the full background kernel and the selected-SNP kernel.  The result
        table is compared against the stored "old" reference.
        """
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        do_plot = False
        logging.info("TestSingleSnpAllPlusSelect test_old")

        # Open the inputs, then keep only the sample ids common to all three.
        bed_reader = Bed(self.pythonpath + "/tests/datasets/synth/all.bed",
                         count_A1=False)
        pheno_reader = Pheno(self.pythonpath +
                             "/tests/datasets/synth/pheno_10_causals.txt")
        cov_reader = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        bed_reader, pheno_reader, cov_reader = intersect_apply(
            [bed_reader, pheno_reader, cov_reader])

        # Chromosome 5 is held out for testing; the rest is the background.
        test_chr = 5
        chrom_of_snp = bed_reader.pos[:, 0]
        rest_snps = bed_reader[:, chrom_of_snp != test_chr].read(
            order='C').standardize()
        chr5_snps = bed_reader[:, chrom_of_snp == test_chr].read(
            order='C').standardize()

        # First phenotype column, standardized in place.
        y_std = pheno_reader.read().val[:, 0]
        y_std -= y_std.mean()
        y_std /= y_std.std()

        # Covariate matrix, frozen before it is passed on.
        cov_matrix = cov_reader.read().val
        cov_matrix.flags.writeable = False

        logging.info(
            "running feature selection conditioned on background kernel")
        chooser = FeatureSelectionInSample(max_log_k=7,
                                           n_folds=7,
                                           order_by_lmm=True,
                                           measure="ll",
                                           random_state=42)
        picked = chooser.run_select(rest_snps.val,
                                    rest_snps.val,
                                    y_std,
                                    cov=cov_matrix)
        best_k, feat_idx, best_mix, best_delta = picked

        # Optional plot of the out-of-sample error.
        if do_plot:
            chooser.plot_results(measure="ll")

        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        # Build the second kernel from the selected background SNPs.
        logging.info(feat_idx)
        chosen_snps = rest_snps[:, feat_idx]

        results_df = single_snp(chr5_snps,
                                pheno_reader,
                                G0=rest_snps,
                                G1=chosen_snps,
                                mixing=best_mix,
                                h2=None,
                                leave_out_one_chrom=False,
                                output_file_name=self.file_name("old"),
                                count_A1=False)

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
コード例 #40
0
# The iid_to_index and sid_to_index methods turn iid's and sid's into indexes
# Via NumPy-style indexing, these allow reading by name and genetic property

#Topic: Other SnpReaders and how to write
#
# This excerpt shows that a PLINK phenotype text file can be opened with the
# same reader interface as a Bed file, and that SNP data read from a Bed file
# can be written back out in Pheno format.
# NOTE: the `print` lines below use Python 2 statement syntax.

#Read from the PLINK phenotype file (text) instead of a Bed file
# Looks like:
#cid0P0 cid0P0 0.4853395139922632
#cid1P0 cid1P0 -0.2076984565752155
#cid2P0 cid2P0 1.4909084058931985
#cid3P0 cid3P0 -1.2128996652683697
#cid4P0 cid4P0 0.4293203431508744
#...

from pysnptools.snpreader import Pheno
# Constructing the reader only names the file; the values are loaded by read() below.
phenoreader = Pheno("pheno_10_causals.txt")
# Show the reader together with its iid count, sid count, sid names and positions.
print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos
# Expected output:
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan  nan  nan]]
phenodata = phenoreader.read()
print phenodata.val
# Expected output (one row per individual, one column per phenotype):
#[[  4.85339514e-01]
# [ -2.07698457e-01]
# [  1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write 1st 10 iids and sids of Bed data into Pheno format
# NOTE(review): `Bed` is not imported in this excerpt -- presumably it is
# imported earlier in the tutorial; confirm before running this part alone.
snpdata1010 = Bed("all.bed")[:10,:10].read()
Pheno.write("deleteme1010.txt",snpdata1010)

#Write it to Bed format
コード例 #41
0
ファイル: fPGS_obs.py プロジェクト: ccrobertson/SNIPar
        delim = ' '
    if cols[0] == 'FID' and cols[1]== 'IID':
        pass
    else:
        raise ValueError('First two columns of PGS must be FID, IID')
    f.close()
    ids = np.loadtxt(args.pgs, dtype='U', usecols=(0,1), delimiter=delim, skiprows=1)
    pgs_vals = np.loadtxt(args.pgs, usecols=tuple([x for x in range(2, cols.shape[0])]),delimiter=delim, skiprows=1)
    pg = gtarray(pgs_vals.reshape((pgs_vals.shape[0],1)), ids[:, 1], sid=cols[2:cols.shape[0]], fams=ids[:, 0])
    print('Normalising PGS to have mean zero and variance 1')
    pg.mean_normalise()
    pg.scale()

    # Read phenotype
    print('Reading '+str(args.phenofile))
    pheno = Pheno(args.phenofile, missing=args.missing_char).read()
    # pheno = Pheno('phenotypes/eduyears_resid.ped', missing='NA').read()
    y = np.array(pheno.val)
    pheno_ids = np.array(pheno.iid)[:, 1]
    if y.ndim == 1:
        pass
    elif y.ndim == 2:
        y = y[:, args.phen_index - 1]
    else:
        raise ValueError('Incorrect dimensions of phenotype array')
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan]
        pheno_ids = pheno_ids[y_not_nan]
    y = y-np.mean(y)
コード例 #42
0
class TestHeritabilitySpatialCorrection(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))
        self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all")
        self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/heritability_spatial_correction"

    def file_name(self,testcase_name):
        temp_fn = os.path.join(self.tempout_dir,testcase_name)
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_one(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "one.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        half = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid,sid=["pheno0","pheno1"],val=np.c_[half,half])

        spatial_coor = [[i,-i] for i in xrange(self.snpreader_whole.iid_count)]
        alpha_list = alpha_list_big=[int(v) for v in np.logspace(2,np.log10(4000), 2)]
        dataframe = heritability_spatial_correction(self.snpreader_whole,spatial_coor,self.snpreader_whole.iid,alpha_list,pheno,jackknife_count=2,permute_plus_count=1,permute_times_count=1,just_testing=True)

        dataframe.to_csv(tmpOutfile,sep="\t",index=False)
        referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/"+fn)
        out,msg=ut.compare_files(tmpOutfile, referenceOutfile, tolerance)                
        self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))

    def test_two(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "two.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        snpreader = self.snpreader_whole[:10,:]

        spatial_coor = [[i,-i] for i in xrange(snpreader.iid_count)]
        alpha_list = alpha_list_big=[int(v) for v in np.logspace(2,np.log10(4000), 2)]
        dataframe = heritability_spatial_correction(snpreader,spatial_coor,snpreader.iid,alpha_list,self.pheno_whole,jackknife_count=2,permute_plus_count=1,permute_times_count=1,just_testing=False)

        dataframe.to_csv(tmpOutfile,sep="\t",index=False)
        referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/"+fn)
        out,msg=ut.compare_files(tmpOutfile, referenceOutfile, tolerance)                
        self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))


    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..")
        result = doctest.testfile("../heritability_spatial_correction.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
コード例 #43
0
# Load FaST-LMM basic association test:
from fastlmm.association import single_snp
from pysnptools.snpreader import Ped
from pysnptools.snpreader import Pheno
from pysnptools.snpreader import wrap_plink_parser
import numpy as np
from sys import argv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import fastlmm.util.util as flutil

# Command line: <inped_file> <inpheno_file> <results_dataframe> <output_manhattan>
# Fail with a usage message rather than an opaque tuple-unpacking error.
if len(argv) != 5:
    raise SystemExit("usage: {0} <inped_file> <inpheno_file> <results_dataframe> <output_manhattan>".format(argv[0]))
script, inped_file, inpheno_file, results_dataframe, output_manhattan = argv

# Load snp data:
print("Loading variant data...")
ped_file = Ped(inped_file)
print("Loading phenotype data...")
pheno_fn = Pheno(inpheno_file)

# Run basic association test:
print("Running FaST-LMM single_snp test...")
results_df = single_snp(test_snps=ped_file, pheno=pheno_fn, leave_out_one_chrom=0, output_file_name=results_dataframe)

# Column selection + .values is equivalent to the older DataFrame.as_matrix(columns).
chromosome_starts = flutil.manhattan_plot(results_df[["Chr", "ChrPos", "PValue"]].values, pvalue_line=4.4e-7, xaxis_unit_bp=True)
# The non-interactive 'Agg' backend is selected above, so plt.show() would display nothing;
# write the plot to the requested output file instead.
# NOTE(review): assumes manhattan_plot draws on the current pyplot figure -- confirm.
plt.savefig(output_manhattan)
コード例 #44
0
class TestFastLMM(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        '''
        One-time fixture: make sure the temp output directory exists and open the synthetic
        SNP, covariate and phenotype readers shared by the tests.
        '''
        from fastlmm.util.util import create_directory_if_necessary

        # Because of @classmethod, 'self' is the class object; everything below is stored on the class.
        create_directory_if_necessary(self.tempout_dir, isfile=False)

        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.pythonpath = os.path.abspath(os.path.join(this_dir, "..", "..", ".."))

        synth_prefix = self.pythonpath + "/tests/datasets/synth/"
        self.snpreader_whole = Bed(synth_prefix + "all",count_A1=False)
        self.covariate_whole = Pheno(synth_prefix + "cov.txt")
        self.pheno_whole = Pheno(synth_prefix + "pheno_10_causals.txt")

    tempout_dir = "tempout/fastlmm"

    def file_name(self,testcase_name):
        temp_fn = os.path.join(self.tempout_dir,testcase_name+".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_api(self):
        '''
        Walk through the reader/kernel/standardizer API by hand, then lock in FastLMM predictions
        for a precomputed whole-vs-test kernel (reference "whole") and for plain SNP readers (reference "one").
        '''
        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #####################################################
        # Train and standardize cov and then apply to test
        #####################################################

        # cov_train and cov_test are not read again in this test; only the calls themselves are exercised.
        cov_train, unit_trained = self.covariate_whole[train_idx,:].read().standardize(Unit(),return_trained=True)
        cov_test = self.covariate_whole[test_idx,:].read().standardize(unit_trained)

        #####################################################
        # standardize whole kernel from snps (both ways) and then pull out the 3 parts
        #####################################################
        
        # NOTE(review): the kernel here is built from covariate_whole although the heading says "from snps" -- confirm intent.
        # whole_kernel, train_kernel and test_test_kernel are reassigned further down; test_kernel is not used again.
        whole_kernel = SnpKernel(self.covariate_whole,Unit()).read().standardize(DiagKtoN())
        train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True)
        test_kernel = whole_kernel[train_idx,test_idx].read(order='A',view_ok=True)
        test_test_kernel = whole_kernel[test_idx,test_idx].read(order='A',view_ok=True)

        #####################################################
        # create train_train, train_test, and test_test based on just the training snps (both standardizations)
        #####################################################

        K_train = SnpKernel(self.snpreader_whole[train_idx,:],Unit(),block_size=100)
        train_train_kernel, snp_trained, kernel_trained = K_train._read_with_standardizing(to_kerneldata=True, kernel_standardizer=DiagKtoN(), return_trained=True)

        # The standardizers trained above (snp_trained, kernel_trained) are reused for the train-vs-test and test-vs-test parts.
        K_whole_test = _SnpWholeTest(train=self.snpreader_whole[train_idx,:],test=self.snpreader_whole[test_idx,:],standardizer=snp_trained,block_size=100)
        train_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[train_idx]) #The new reader may have the iids in a different order than the original reader
        train_test_kernel = K_whole_test[train_idx2,:].read().standardize(kernel_trained)

        test_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[test_idx])
        test_test_kernel = K_whole_test[test_idx2,:].read().standardize(kernel_trained)

        #####################################################
        # How does predict look with whole_test as input?
        #####################################################

        # a. - standardize whole up front
        # The kernel is already standardized, so the model is given identity standardizers.
        whole_kernel = SnpKernel(self.snpreader_whole,Unit(),block_size=100).read().standardize()
        train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True)
        whole_test_kernel = whole_kernel[:,test_idx].read(order='A',view_ok=True)
        fastlmm1 = FastLMM(snp_standardizer=SS_Identity(), kernel_standardizer=KS_Identity())
        fastlmm1.fit(K0_train=train_kernel, X=self.covariate_whole, y=self.pheno_whole) #iid intersection means we won't really be using whole covar or pheno
        predicted_pheno, covar = fastlmm1.predict(K0_whole_test=whole_test_kernel, X=self.covariate_whole,count_A1=False)
        output_file = self.file_name("whole")
        Dat.write(output_file,predicted_pheno)
        self.compare_files(predicted_pheno,"whole")

        # b -- just files
        # Same scenario driven directly from the readers; no output file is written for this part.
        fastlmm2 = FastLMM()
        fastlmm2.fit(K0_train=self.snpreader_whole[train_idx,:], X=self.covariate_whole, y=self.pheno_whole[train_idx,:]) #iid intersection means we won't really be using whole covar
        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=self.snpreader_whole[test_idx,:], X=self.covariate_whole,count_A1=False)
        self.compare_files(predicted_pheno,"one")

    def test_notebook1(self):
        '''
        Fit on the first 400 iids, then predict on those same iids and on iids 400-499,
        asserting each time that predictions come back in phenotype iid order. Plotting is off by default.
        '''
        do_plot=False

        import matplotlib.pyplot as plt
        from pysnptools.snpreader import Pheno,Bed

        def plot_if_enabled(actual_val, score, cov_of_score, x_label, y_label):
            # Scatter of actual vs. predicted with error bars; skipped unless do_plot is switched on.
            if not do_plot:
                return
            plt.plot(actual_val,score.val,"b.",[-5,5],[-5,5],"-r")
            plt.errorbar(actual_val,score.val, yerr=np.sqrt(np.diag(cov_of_score.val)),fmt='.')
            plt.xlabel(x_label)
            plt.ylabel(y_label)
            plt.show()

        synth_prefix = self.pythonpath + "/tests/datasets/synth/"
        bed = Bed(synth_prefix + "all",count_A1=False)
        cov = Pheno(synth_prefix + "cov.txt")
        pheno = Pheno(synth_prefix + "pheno_10_causals.txt").read()

        # Now we learn from the first 400 students.
        training = bed[:400,:] #!!!later: the learning code doesn't like it if there are two instances of bed[:400] that are not "is -equal"
        fastlmm2 = FastLMM(GB_goal=2).fit(K0_train=training,
                                            X=cov[:400,:],
                                            y=pheno[:400,:])

        # Predict on training data:
        predicted_score,covariance = fastlmm2.predict(K0_whole_test=training,
                                                            X=cov[:400,:],count_A1=False)
        assert np.array_equal(pheno.iid[:400],predicted_score.iid), "for plots to make sense, the iids must be in the order"
        if do_plot:
            plot_if_enabled(pheno.val[:400,:], predicted_score, covariance,
                            'score (actual train)', 'predicted (test on train with stdev)')

        # How well does this model predict the (unseen) TEST data?
        predicted_score,covariance = fastlmm2.predict(K0_whole_test=bed[400:500,:],
                                                            X=cov[400:500,:],count_A1=False)
        assert np.array_equal(pheno.iid[400:500],predicted_score.iid), "for plots to make sense, the iids must be in the order"
        if do_plot:
            plot_if_enabled(pheno.val[400:500,:], predicted_score, covariance,
                            'score (actual test)', 'predicted')

    def test_one(self):
        '''
        Train on iids 10 and up, round-trip the model through joblib, predict the first 10 iids,
        and compare the prediction with reference "one".
        '''
        logging.info("TestLmmTrain test_one")

        whole_snps = self.snpreader_whole
        fit_rows = np.r_[10:whole_snps.iid_count] # iids 10 and on
        held_out_rows  = np.r_[0:10] # the first 10 iids

        trained = FastLMM(GB_goal=2).fit(
            K0_train=whole_snps[fit_rows,:],
            X=self.covariate_whole[fit_rows,:],
            y=self.pheno_whole[fit_rows,:])

        # Save the fitted model and load it back, so prediction uses the deserialized copy.
        model_path = self.tempout_dir + "/model_one.flm.p"
        pstutil.create_directory_if_necessary(model_path)
        joblib.dump(trained, model_path)
        reloaded = joblib.load(model_path)

        # predict on test set
        predicted_pheno, covar = reloaded.predict(
            K0_whole_test=whole_snps[held_out_rows,:],
            X=self.covariate_whole[held_out_rows,:],
            count_A1=False)

        Dat.write(self.file_name("one"),predicted_pheno)

        # Actual phenotype of the held-out iids; only needed by the optional plot below.
        pheno_actual = self.pheno_whole[held_out_rows,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()

        self.compare_files(predicted_pheno,"one")

    def test_str(self):
        '''
        Same flow as the reader-based tests, but the SNP and phenotype inputs are given as file-name strings
        and there are no covariates; predicts on the training data and compares with reference "str".
        '''
        logging.info("TestLmmTrain test_str")

        synth_prefix = self.pythonpath + "/tests/datasets/synth/"
        snp_path = synth_prefix + "all"
        pheno_path = synth_prefix + "pheno_10_causals.txt"
        no_covariates = None

        trained = FastLMM(GB_goal=2).fit(K0_train=snp_path, X=no_covariates, y=pheno_path,count_A1=False)

        # Round-trip the model through joblib before predicting.
        model_path = self.tempout_dir + "/model_str.flm.p"
        pstutil.create_directory_if_necessary(model_path)
        joblib.dump(trained, model_path)
        reloaded = joblib.load(model_path)

        # predict on same
        predicted_pheno, covar = reloaded.predict(K0_whole_test=snp_path, X=no_covariates,count_A1=False)

        Dat.write(self.file_name("str"),predicted_pheno)

        #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()

        self.compare_files(predicted_pheno,"str")

    def test_lr_no_K0(self):
        '''
        Fit FastLMM with covariates only (no K0) on synthetic data y = 2*x+100+normal(0,1),
        predict on the training data through an identity kernel, and compare with reference "lr_no_k0".
        '''
        logging.info("TestLinRegTrain test_lr_no_k0")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        # Replace the covariate values with 0.0, 1.0, 2.0, ... (one column).
        x_data = self.covariate_whole[train_idx,:].read()
        x_data.val = np.array([[float(row)] for row in xrange(x_data.iid_count)])

        # Replace the phenotype with a noisy linear function of the covariate; the seed keeps it reproducible.
        y_data = self.pheno_whole[train_idx,:].read()
        np.random.seed(0)
        noise = np.random.normal(size=x_data.val.shape)
        y_data.val = x_data.val * 2.0 + 100 + noise # y = 2*x+100+normal(0,1)

        #Learn model, save, load
        model_path = self.tempout_dir + "/model3.flm.p"
        joblib.dump(FastLMM(GB_goal=2).fit(X=x_data, y=y_data), model_path)
        reloaded = joblib.load(model_path)

        #Predict with model (test on train)
        predicted_pheno, covariance = reloaded.predict(K0_whole_test=KernelIdentity(y_data.iid), X=x_data,count_A1=False) #test on train
        Dat.write(self.file_name("lr_no_k0"),predicted_pheno)

        self.compare_files(predicted_pheno,"lr_no_k0")

    def test_lr_as_lmm(self):
            '''
            Fit FastLMM with the covariate as K0 (and X=None) on synthetic linear data, once with
            force_full_rank=True and once with False, and compare predictions and prediction covariances
            (test on train and test on test) with the "lr_as_lmma*" / "lr_as_lmmb*" references.
            '''
            do_plot = False
            #later why does this test case generate two intersect info messages instead of just one?
            import pylab
            logging.info("TestLmmTrain test_lr_as_lmm")

            ###############################################################
            # Create a linear data set with just a little noise
            ###############################################################

            train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
            test_idx  = np.r_[0:10] # the first 10 iids

            #make covar just numbers 0,1,...
            covar = self.covariate_whole.read()
            covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
            covar._name = 'np.array([[float(num)] for num in xrange(covar.iid_count)])'
            covariate_train = covar[train_idx,:].read()
            covariate_test = covar[test_idx,:].read()


            #make pheno  # pheno = 2*covar+100+normal(0,1)*10
            pheno = self.pheno_whole.read()
            np.random.seed(0) # fixed seed so the noise, and hence the references, are reproducible
            pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle("Plot training x and y, testing x and y")
                pylab.show()

            ###############################################################
            # Show that linear regression does a good job predicting
            ###############################################################

            # Ordinary least squares with an intercept column. Its results (predicted, yerr) feed only
            # the optional plot below and are reassigned inside the loop; nLL is not read again in this test.
            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
                nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle("real linear regression: actual to prediction")
                pylab.show()

            ###############################################################
            # Use LMM as LR and apply test on train
            ###############################################################
            # Both settings write to the same temp files and are compared with the same references.
            for force_full_rank in [True, False]:
                #Learn model, save, load
                fastlmmx = FastLMM(GB_goal=2,force_full_rank=force_full_rank).fit(K0_train=covariate_train, X=None, y=pheno_train)
                
                
                filename = self.tempout_dir + "/model_lr_as_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename) 
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    # From here on 'covar' is the covariance returned by predict, no longer the covariate data.
                    predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_train, X=None,count_A1=False) #test on train
                    output_file = self.file_name("lr_as_lmma_")
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                    output_file = self.file_name("lr_as_lmma.cov_")
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar.val))
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle("test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lr_as_lmma_")
                    self.compare_files(covar2,"lr_as_lmma.cov_")

                ###############################################################
                # Use LMM as LR and apply test on test
                ###############################################################

                #Predict with model (test on test)
                predicted_pheno, covar  = fastlmm.predict(K0_whole_test=covariate_test, X=None,count_A1=False) #test on test
                output_file = self.file_name("lr_as_lmmb_")
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_as_lmmb.cov_")
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle("test on test: test X to true target (green) and prediction (red)")
                    pylab.show()
                    ## Plot y and predicted y (test on train)
                    #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                    #pylab.suptitle(name+": test on test: true target to prediction")
                    #pylab.show()

                self.compare_files(predicted_pheno,"lr_as_lmmb_")
                self.compare_files(covar2,"lr_as_lmmb.cov_")

    def test_lr2(self):
        '''
        Fit FastLMM with an identity kernel as K0 and the covariate as X on synthetic linear data,
        then compare predictions and prediction covariances (test on train and test on test)
        with the "lr2a*" / "lr2b*" references.
        '''
        do_plot = False

        import pylab
        logging.info("TestLmmTrain test_lr2")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()
        # NOTE: this value is never read; the loop below rebinds K0_whole_test as one of its loop variables.
        K0_whole_test = KernelIdentity(covar.iid,covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0) # fixed seed so the noise, and hence the references, are reproducible
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Ordinary least squares with an intercept column. Its results (predicted, yerr) feed only
        # the optional plot below and are reassigned inside the loop; nLL is not read again in this test.
        Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
        bs=lsqSol[0] #weights
        r2=lsqSol[1] #squared residuals
        D=lsqSol[2]  #rank of design matrix
        N=pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
            nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        # Output files are named after each entry's own name, but every entry is compared with the
        # references of the first entry (first_name). Only one entry is currently enabled.
        first_name = None
        for name,K0_train,K0_whole_test in [("Identity Kernel",
                                            KernelIdentity(self.snpreader_whole.iid[train_idx]),
                                            KernelIdentity(self.snpreader_whole.iid,test=self.snpreader_whole.iid[test_idx])),
                                      #!!!later("sid_count=0", self.snpreader_whole[train_idx,[]],self.snpreader_whole[test_idx,[]])
                                      ]:
            logging.info(name)
            first_name = first_name or name
            #Learn model, save, load
            fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                
                
            filename = self.tempout_dir + "/model_lr2.flm.p"
            joblib.dump(fastlmmx, filename) 
            fastlmm = joblib.load(filename)


            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                # From here on 'covar' is the covariance returned by predict, no longer the covariate data.
                predicted_pheno, covar = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train
                output_file = self.file_name("lr2a_"+name)
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr2a.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_pheno,"lr2a_"+first_name)
                self.compare_files(covar2,"lr2a.cov_"+first_name)

            #Predict with model (test on test)
            predicted_pheno, covar  = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test
            output_file = self.file_name("lr2b_"+name)
            Dat.write(output_file,predicted_pheno)
            covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr2b.cov_"+name)
            Dat.write(output_file,covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                pylab.show()
                ## Plot y and predicted y (test on train)
                #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                #pylab.suptitle(name+": test on test: true target to prediction")
                #pylab.show()

            self.compare_files(predicted_pheno,"lr2b_"+first_name)
            self.compare_files(covar2,"lr2b.cov_"+first_name)


    def test_str2(self):
        '''
        Train from a kernel saved to a .npz file (passed to fit as a file name), round-trip the model
        through joblib, predict the first 10 iids from the whole-vs-test kernel, and compare with reference "str2".
        '''
        logging.info("TestLmmTrain test_str2")

        from pysnptools.kernelreader import KernelNpz

        #Standardize train and test together
        whole_kernel = self.snpreader_whole.read_kernel(Unit())

        fit_rows = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        held_out_rows  = np.r_[0:10] # the first 10 iids
        covariate_train = self.covariate_whole[fit_rows,:]
        pheno_train = self.pheno_whole[fit_rows,:]

        # Persist the train-vs-train part of the kernel so that fit can be given a path instead of an object.
        kernel_path = self.tempout_dir + "/model_str2.kernel.npz"
        pstutil.create_directory_if_necessary(kernel_path)
        train_part = whole_kernel[fit_rows].read(order='A',view_ok=True)
        KernelNpz.write(kernel_path,train_part)

        trained = FastLMM(GB_goal=2).fit(K0_train=kernel_path, X=covariate_train, y=pheno_train)
        model_path = self.tempout_dir + "/model_str2.flm.p"
        pstutil.create_directory_if_necessary(model_path)
        joblib.dump(trained, model_path)
        reloaded = joblib.load(model_path)

        # predict on test set
        G0_test = self.snpreader_whole[held_out_rows,:]
        covariate_test = self.covariate_whole[held_out_rows,:]
        whole_vs_test = whole_kernel[:,held_out_rows].read(order='A',view_ok=True)

        predicted_pheno, covar = reloaded.predict(K0_whole_test=whole_vs_test, X=covariate_test,count_A1=False)

        Dat.write(self.file_name("str2"),predicted_pheno)

        #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]
        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()

        self.compare_files(predicted_pheno,"str2")

    #Creating multiple tests so that will run faster when on cluster.
    def test_fasttwoK(self):
        '''
        Two-kernel scenario with force_low_rank=None and GB_goal=None.
        '''
        logging.info("TestLmmTrain test_fasttwoK")
        self._fasttwoK(force_low_rank=None, GB_goal=None)

    def test_fasttwoK_force_low_rank(self):
        '''
        Two-kernel scenario with force_low_rank=True and GB_goal=None.
        '''
        logging.info("TestLmmTrain test_fasttwoK_force_low_rank")
        self._fasttwoK(force_low_rank=True, GB_goal=None)

    def test_fasttwoK_GB2(self):
        '''
        Two-kernel scenario with force_low_rank=None and GB_goal=2.
        '''
        logging.info("TestLmmTrain test_fasttwoK_GB2")
        self._fasttwoK(force_low_rank=None, GB_goal=2)

    def test_fasttwoK_force_low_rank_GB2(self):
        '''
        Two-kernel scenario with force_low_rank=True and GB_goal=2.
        '''
        logging.info("TestLmmTrain test_fasttwoK_force_low_rank_GB2")
        self._fasttwoK(force_low_rank=True, GB_goal=2)

    def _fasttwoK(self,force_low_rank,GB_goal):
        '''
        Shared body of the test_fasttwoK* tests.

        Fits FastLMM with two kernels -- the training SNPs and a copy of them with renamed SNP ids --
        using mixing=.1, round-trips the model through joblib, predicts the first 10 iids, and compares
        the prediction with reference "one".

        :param force_low_rank: passed through to FastLMM(force_low_rank=...).
        :param GB_goal: passed through to FastLMM(GB_goal=...).
        '''

        def with_renamed_sids(reader):
            # In-memory copy of reader whose SNP ids carry a "_1" suffix, so it can serve as a second kernel.
            return SnpData(iid=reader.iid,sid=[item+"_1" for item in reader.sid],val=reader.read().val,pos=reader.pos,name="Different SNP names for {0}".format(reader))

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        # One name per variant, used for every temp artifact. The variants are separate tests so that
        # they can run at the same time on a cluster; sharing one model file between them would let
        # one variant overwrite the pickle another is about to load.
        variant = "fasttwoK"+("_force_low" if force_low_rank else "")+("GB{0}".format(GB_goal) if GB_goal is not None else "")

        G0_train = self.snpreader_whole[train_idx,:]
        G1_train = with_renamed_sids(G0_train)
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        logging.info("force_low_rank = {0}".format(force_low_rank))
        fastlmm1 = FastLMM(force_low_rank=force_low_rank,GB_goal=GB_goal).fit(K0_train=G0_train, K1_train=G1_train, X=covariate_train, y=pheno_train, mixing=.1)

        filename = self.tempout_dir + "/model_" + variant + ".flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename)
        fastlmm2 = joblib.load(filename)

        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        G1_test = with_renamed_sids(G0_test)
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G1_test, X=covariate_test,count_A1=False)

        output_file = self.file_name(variant)
        Dat.write(output_file,predicted_pheno)

        # Every variant is checked against the same reference.
        self.compare_files(predicted_pheno,"one")

    def test_lowrank(self):
        '''
        With only the first 100 SNPs, check that force_low_rank and force_full_rank give
        numerically close predicted means and covariances.
        '''
        logging.info("TestLmmTrain test_lowrank")

        snpreader = self.snpreader_whole[:,:100]

        train_idx = np.r_[10:snpreader.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = snpreader[train_idx,:]
        G0_test = snpreader[test_idx,:]

        # Rescale and shift the phenotype in place (x100, then +1000).
        pheno_whole = self.pheno_whole.read()
        pheno_whole.val *= 100
        pheno_whole.val += 1000

        def fit_then_predict(**rank_option):
            # Identical fit/predict pipeline for both runs; only the rank-forcing option differs.
            model = FastLMM(GB_goal=2, **rank_option).fit(K0_train=G0_train, y=pheno_whole[train_idx,:], X=self.covariate_whole[train_idx,:])
            return model.predict(K0_whole_test=G0_test,X=self.covariate_whole[test_idx,:],count_A1=False)

        mean_low, covar_low = fit_then_predict(force_low_rank=True)
        mean_full, covar_full = fit_then_predict(force_full_rank=True)

        np.testing.assert_allclose(mean_low.val, mean_full.val)
        np.testing.assert_allclose(covar_low.val,covar_full.val)

        logging.info("finished with TestLmmTrain test_lowrank")

    def test_twoK(self):
        '''
        Fit with the same SNP reader as both K0 and K1, round-trip the model through joblib,
        predict the first 10 iids, and compare the prediction with reference "one".
        '''
        logging.info("TestLmmTrain test_twoK")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, K1_train=G0_train, X=covariate_train, y=pheno_train)
        # Use temp artifact names of this test's own, so that it does not overwrite (or race with)
        # the "model_one.flm.p" / "one.dat" files that test_one writes.
        filename = self.tempout_dir + "/model_twoK.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename)
        fastlmm2 = joblib.load(filename)

        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("twoK")
        Dat.write(output_file,predicted_pheno)

        # The reference is deliberately still "one"; only the temp file names changed.
        self.compare_files(predicted_pheno,"one")

    def test_lr(self):
        """Check FastLMM on a purely linear synthetic phenotype in full- and low-rank mode.

        The covariate of the training rows (iids 10 and on) is replaced by the column
        0,1,2,... and the phenotype by ``2*x + 100 + normal(0,1)`` (seeded with 0).
        For ``force_full_rank`` and then ``force_low_rank`` the model is fitted,
        round-tripped through joblib, used to predict its own training rows, and the
        prediction is written to the "lr" output file and compared with the "lr"
        reference file.

        No plotting library is imported: the plots this test once produced are disabled,
        so the test must not fail merely because matplotlib is not installed.
        """
        logging.info("TestLmmTrain test_lr")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        # One float column holding 0,1,2,... (one value per training iid)
        covariate_train3.val = np.arange(covariate_train3.iid_count,dtype=float).reshape(-1,1)
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        np.random.seed(0)
        pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape) # y = 2*x+100+normal(0,1)

        # The model file is the same for both rank settings, so prepare it once.
        filename = self.tempout_dir + "/model_lr.flm.p"
        pstutil.create_directory_if_necessary(filename)

        for force_full_rank,force_low_rank in [(True,False),(False,True)]:
            #Learn model, save, load
            fastlmm3x = FastLMM(force_full_rank=force_full_rank,force_low_rank=force_low_rank,GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
            joblib.dump(fastlmm3x, filename)
            fastlmm3 = joblib.load(filename)

            #Predict with model (test on train)
            predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train
            output_file = self.file_name("lr")
            Dat.write(output_file,predicted_pheno)

            # Both rank settings are checked against the same reference file.
            self.compare_files(predicted_pheno,"lr")

    def test_lmm(self):
        """Fit FastLMM on two synthetic kernels and check its predictions.

        Two 500x500 kernels are built: "clones" (value 1 when two iid indexes agree
        modulo 5, else 0) and "line_world" (value decreasing with the distance between
        random positions drawn from [0,1)). For each kernel a phenotype is simulated as
        a linear function of a single covariate plus independent noise plus
        kernel-correlated noise. Then, for three successive rescalings of the kernel,
        a model is fitted on iids 10 and on, round-tripped through joblib, and used to
        predict

        * the training rows (compared with the "lmma_*" reference files),
        * the first 10, held-out, rows (compared with the "lmmb_*" reference files),
        * a mixture of training and held-out rows, which must agree with the two
          predictions above.

        Predictions for a single row are also checked against the first row of the
        corresponding batch prediction.

        Set ``do_plot`` to True to show diagnostic plots; only then is pylab imported.
        """
        do_plot = False
        iid_count = 500
        seed = 0

        if do_plot:
            # Imported lazily so the test does not need matplotlib unless plots are requested.
            import pylab
        logging.info("TestLmmTrain test_lmm")

        # Each iid is a pair of identical strings, "cid<index>P<index//250>".
        iid = [["cid{0}P{1}".format(iid_index,iid_index//250)]*2 for iid_index in xrange(iid_count)]
        train_idx = np.r_[10:iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids


        #Every person is 100% related to everyone in one of 5 families
        # NOTE(review): the name below is the same as K0b's; it looks like a copy-paste - confirm before changing.
        K0a = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index0 in xrange(iid_count):
            for iid_index1 in xrange(iid_count):
                K0a.val[iid_index0,iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0
                if iid_index1 < iid_index0:
                    # The mirrored entry was filled on an earlier pass, so symmetry can be checked here.
                    assert K0a.val[iid_index0,iid_index1] == K0a.val[iid_index1,iid_index0]

        #every person lives on a line from 0 to 1
        # They are related to every other person as a function of distance on the line
        np.random.seed(seed)
        home = np.random.random([iid_count])
        K0b = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index in xrange(iid_count):
            K0b.val[iid_index,:] = 1 - np.abs(home-home[iid_index])**.1

        #make covar just numbers 0,1,...
        covar = SnpData(iid=iid,sid=["x"],val=np.array([[float(num)] for num in xrange(iid_count)]))
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()

        for name, h2, K0 in [("clones", 1, K0a),("line_world",.75,K0b)]:

            # Split a total variance of 100 into a kernel part (varg) and a noise part (vare).
            sigma2x = 100
            varg = sigma2x * h2
            vare = sigma2x * (1-h2)

            #######################################################################
            #make pheno  # pheno = 2*covar+100+normal(0,1)*sqrt(vare)+normal(0,K)*sqrt(varg)
            #######################################################################
            #random.multivariate_normal is sensitive to mkl_num_thread, so we control it.
            mkl_num_thread = os.environ.get('MKL_NUM_THREADS')
            os.environ['MKL_NUM_THREADS'] = '1'
            try:
                np.random.seed(seed)
                p1 = covar.val * 2.0 + 100
                p2 = np.random.normal(size=covar.val.shape)*np.sqrt(vare)
                p3 = (np.random.multivariate_normal(np.zeros(iid_count),K0.val)*np.sqrt(varg)).reshape(-1,1)
            finally:
                # Put the environment back as it was, even if drawing the samples failed.
                if mkl_num_thread is not None:
                    os.environ['MKL_NUM_THREADS'] = mkl_num_thread
                else:
                    del os.environ['MKL_NUM_THREADS']
            pheno = SnpData(iid=iid,sid=["pheno0"],val= p1 + p2 + p3)

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle(name + ": Plot training x and y, testing x and y")
                pylab.show()

            # Plain least-squares fit of pheno on [covariate, 1]; its results are only used by the plot below.
            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
                nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name + ": real linear regression: actual to prediction")
                pylab.show()

            # K0 is rebound to the rescaled kernel on every pass, so the scaling accumulates
            # (overall 1, then 100, then 100*.02). Every pass is compared with the same
            # reference files.
            for factor in [1,100,.02]:
                K0 = K0.read()
                K0.val *= factor

                K0_train = K0[train_idx]
                K0_whole_test = K0[:,test_idx]

                #Learn model, save, load
                fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                v2 = np.var(p2)
                v3 = np.var(p3)
                logging.debug("Original h2 of {0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3/(v2+v3), fastlmmx.h2raw))


                filename = self.tempout_dir + "/model_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename)
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train
                    output_file = self.file_name("lmma_"+name)
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar_pheno.row,sid=covar_pheno.col[:,1],val=covar_pheno.val) #kludge to write kernel to text format
                    output_file = self.file_name("lmma.cov_"+name)
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar_pheno.val))
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lmma_"+name)
                    self.compare_files(covar2,"lmma.cov_"+name)

                    predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:,0], X=covariate_train[0,:],count_A1=False) #test on train #0
                    assert np.abs(predicted_pheno0.val[0,0] - predicted_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                    assert np.abs(covar_pheno0.val[0,0] - covar_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"


                #Predict with model (test on test)
                predicted_phenoB, covar_phenoB  = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test
                output_file = self.file_name("lmmb_"+name)
                Dat.write(output_file,predicted_phenoB)
                covar2 = SnpData(iid=covar_phenoB.row,sid=covar_phenoB.col[:,1],val=covar_phenoB.val) #kludge to write kernel to text format
                output_file = self.file_name("lmmb.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar_phenoB.val))
                predicted = predicted_phenoB.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_phenoB,"lmmb_"+name)
                self.compare_files(covar2,"lmmb.cov_"+name)

                predicted_phenoB0, covar_phenoB0  = fastlmm.predict(K0_whole_test=K0_whole_test[:,0], X=covariate_test[0,:],count_A1=False) #test on a single test case
                assert np.abs(predicted_phenoB0.val[0,0] - predicted_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                assert np.abs(covar_phenoB0.val[0,0] - covar_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

                #Predict with model test on some train and some test
                # (every iid except the first training iid and the first test iid)
                some_idx = range(covar.iid_count)
                some_idx.remove(train_idx[0])
                some_idx.remove(test_idx[0])
                covariate_some = covar[some_idx,:]
                K0_whole_some = K0[:,some_idx]
                predicted_phenoC, covar_phenoC  = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some,count_A1=False)
                for idxC, iidC in enumerate(predicted_phenoC.iid):
                    meanC = predicted_phenoC.val[idxC]
                    varC = covar_phenoC.val[idxC,idxC]
                    # Look the iid up in whichever earlier prediction (train or test) contains it.
                    if iidC in predicted_pheno.iid:
                        predicted_pheno_ref = predicted_pheno
                        covar_pheno_ref = covar_pheno
                    else:
                        assert iidC in predicted_phenoB.iid
                        predicted_pheno_ref = predicted_phenoB
                        covar_pheno_ref = covar_phenoB
                    idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0]
                    mean_ref = predicted_pheno_ref.val[idx_ref]
                    var_ref = covar_pheno_ref.val[idx_ref,idx_ref]
                    assert np.abs(meanC - mean_ref) < 1e-6
                    assert np.abs(varC - var_ref) < 1e-6


    def test_snps(self):
        """Fit on SNPs with a phenotype derived from the first SNP and check test-on-train.

        The phenotype of the training rows (iids 10 and on) is replaced by twice the
        value of the first SNP. The model is fitted with the SNPs as ``K0_train``,
        round-tripped through joblib, and used to predict its own training rows. The
        prediction is written to the "snps" output file and compared with the "snps"
        reference file.
        """
        logging.info("TestLmmTrain test_snps")

        rows_fit = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on

        # Training inputs; the covariate and phenotype are read into memory.
        snps_fit = self.snpreader_whole[rows_fit,:]
        covariates_fit = self.covariate_whole[rows_fit,:].read()
        target = self.pheno_whole[rows_fit,:].read()
        first_snp = snps_fit[:,0:1].read()
        target.val = first_snp.val*2

        #pylab.plot(snps_fit[:,0:1].read().val[:,0], target.val[:,0],".")
        #pylab.show()

        # Learn the model, then save and reload it.
        trained = FastLMM(GB_goal=2).fit(K0_train=snps_fit, X=covariates_fit, y=target)
        model_path = self.tempout_dir + "/model_snps.flm.p"
        pstutil.create_directory_if_necessary(model_path)
        joblib.dump(trained, model_path)
        reloaded = joblib.load(model_path)

        # Predict on the very rows the model was fitted on.
        prediction, _ = reloaded.predict(K0_whole_test=snps_fit, X=covariates_fit,count_A1=False)
        Dat.write(self.file_name("snps"),prediction)

        self.compare_files(prediction,"snps")

    def test_kernel(self):
        """Same scenario as test_snps, but with a kernel computed up front from the SNPs.

        The kernel of the training rows (iids 10 and on) is computed with
        ``read_kernel(Unit())`` and the phenotype is replaced by twice the value of the
        first SNP. The model is fitted, round-tripped through joblib, and used to predict
        its own training rows. The prediction is written to the "kernel" output file and
        must match the "snps" reference file.

        The pickled model goes to its own file, ``model_kernel.flm.p``, so that this test
        and test_snps do not overwrite each other's model when they share a temp directory.
        """
        logging.info("TestLmmTrain test_kernel")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on

        # Kernel computed from the training SNPs, standardized with Unit()
        K0_train = self.snpreader_whole[train_idx,:].read_kernel(Unit())
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        pheno_train3.val = self.snpreader_whole[train_idx,0:1].read().val*2
        assert np.array_equal(K0_train.iid,covariate_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
        assert np.array_equal(K0_train.iid,pheno_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

        #Learn model, save, load
        fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model_kernel.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm3x, filename)
        fastlmm3 = joblib.load(filename)

        #Predict with model (test on train)
        predicted_pheno, covar = fastlmm3.predict(K0_whole_test=K0_train, X=covariate_train3,count_A1=False) #test on train
        output_file = self.file_name("kernel")
        Dat.write(output_file,predicted_pheno)

        self.compare_files(predicted_pheno,"snps") #"kernel" and "snps" test cases should give the same results

    def test_kernel_one(self):
        """Fit with a SnpKernel and check held-out predictions against the "one" reference.

        The training kernel is a ``SnpKernel`` over the SNPs of iids 10 and on with a
        ``Unit()`` standardizer. The fitted model is round-tripped through joblib and
        used to predict the first 10 iids from their SNPs. The prediction is written to
        the "kernel_one" output file and must match the "one" reference file.
        """
        logging.info("TestLmmTrain test_kernel_one")

        rows_fit = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        rows_eval = np.r_[0:10] # the first 10 iids

        kernel_fit = SnpKernel(self.snpreader_whole[rows_fit,:],standardizer=Unit())
        covariates_fit = self.covariate_whole[rows_fit,:]
        pheno_fit = self.pheno_whole[rows_fit,:]

        # The kernel, covariate and phenotype must list the same iids.
        same_iid_msg = "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
        for other in (covariates_fit, pheno_fit):
            assert np.array_equal(kernel_fit.iid,other.iid), same_iid_msg

        trained = FastLMM(GB_goal=2).fit(K0_train=kernel_fit, X=covariates_fit, y=pheno_fit)

        # Round-trip the fitted model through a pickle file.
        model_path = self.tempout_dir + "/model_kernel_one.flm.p"
        pstutil.create_directory_if_necessary(model_path)
        joblib.dump(trained, model_path)
        reloaded = joblib.load(model_path)

        # predict on test set
        snps_eval = self.snpreader_whole[rows_eval,:]
        prediction, _ = reloaded.predict(
            K0_whole_test=snps_eval,
            X=self.covariate_whole[rows_eval,:],
            count_A1=False)

        Dat.write(self.file_name("kernel_one"),prediction)

        # Observed phenotype of the held-out rows; only the optional plot below uses it.
        observed = self.pheno_whole[rows_eval,:].read().val[:,0]

        #pylab.plot(observed, prediction.val,".")
        #pylab.show()

        self.compare_files(prediction,"one") #Expect same results as SNPs "one"

    def compare_files(self,answer,ref_base):
        """Assert that `answer` matches the reference file ``fastlmm/<ref_base>.dat``.

        :param answer: object with ``row``, ``col`` and a 2-D ``val``, as read by ``Dat``.
        :param ref_base: base name (no directory, no extension) of the reference file.

        The row ids and column ids must be equal. Every value must be within 1e-4 of the
        reference value in absolute terms, or within 1e-5 of it in relative terms.

        Raises AssertionError naming the reference file (and, for values, the offending
        position) on the first mismatch.
        """
        reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat")
        reference=Dat(reffile).read()
        assert np.array_equal(answer.col,reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(answer.row,reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in xrange(reference.row_count):
            for sid_index in xrange(reference.col_count):
                a_v = answer.val[iid_index,sid_index]
                r_v = reference.val[iid_index,sid_index]
                abs_diff = abs(a_v - r_v)
                # The relative test is skipped for a zero reference value (it would divide by zero);
                # such a value must then pass the absolute test.
                close_enough = abs_diff < 1e-4 or (r_v != 0 and abs_diff/abs(r_v) < 1e-5)
                assert close_enough, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile)

    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..")
        result = doctest.testfile("../fastlmm_predictor.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__