Example #1
	def divideData(self,filename,num=5,mph=3,delet=True):
		print "Estimating heritability using "+str(num)+" components"
		direct="TEMP"
		sFil=Bed(filename);
		yFil=Pheno(filename+".fam");
		n=sFil.iid_count	
		reOrd=perm(n);
		yFil=yFil[reOrd,:];
		sFil=sFil[reOrd,:];

		y=yFil.read().val[:,3];

		div=[int(math.ceil( i*n/float(num) )) for i in range(0,num+1)];
		
		varEsts=[];

		for i in range(0,num):
			print "For component "+str(i);
			sFilTemp=self.BED[div[i]:div[i+1],:];
			Xtemp=sFilTemp.read().standardize().val;
			ytemp=y[div[i]:div[i+1]];

			varEsts.append(self.VarCalc.RealVar(ytemp,Xtemp));
		
		return varEsts;
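The snippet above relies on several names that are not defined in the excerpt (perm, self.BED, self.VarCalc). A minimal sketch of the imports it assumes, with one plausible definition of perm as a random permutation of individual indices (an assumption, not the original helper):

import math
import numpy as np
from pysnptools.snpreader import Bed, Pheno

def perm(n):
    # Assumed helper: a random permutation of the n individual indices,
    # used above to shuffle individuals before splitting them into components.
    return np.random.permutation(n)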
Example #2
    def test_c_reader_pheno(self):
        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        snpdata1.val[1,0] = np.NaN # Inject a missing value to test writing and reading missing values
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
        import pysnptools.util.pheno as pstpheno
        dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe",missing="")
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)


        dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe",missing="",vectorize=True)
        assert len(dict['vals'].shape)==1, "test 1-d array of values"
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

        snpdata4 = Pheno(None,iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)
        snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
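Outside the unittest harness, the round trip exercised above boils down to a few lines. A minimal sketch, assuming toydata.phe is any PLINK-style phenotype file:

import numpy as np
from pysnptools.snpreader import Pheno

snpdata = Pheno("toydata.phe").read()       # SnpData with .iid, .sid and a NumPy .val matrix
snpdata.val[0, 0] = np.nan                  # missing values are stored as NaN
Pheno.write("toydata_copy.phe", snpdata)    # write back out in Pheno text format
roundtrip = Pheno("toydata_copy.phe").read()
np.testing.assert_array_almost_equal(snpdata.val, roundtrip.val, decimal=10)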
Example #3
    def divideData(self, filename, num=5, mph=3, delet=True):
        print "Estimating heritability using " + str(num) + " components"
        direct = "TEMP"
        sFil = Bed(filename)
        yFil = Pheno(filename + ".fam")
        n = sFil.iid_count
        reOrd = perm(n)
        yFil = yFil[reOrd, :]
        sFil = sFil[reOrd, :]

        y = yFil.read().val[:, 3]

        div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]

        varEsts = []

        for i in range(0, num):
            print "For component " + str(i)
            sFilTemp = self.BED[div[i]:div[i + 1], :]
            Xtemp = sFilTemp.read().standardize().val
            ytemp = y[div[i]:div[i + 1]]

            varEsts.append(self.VarCalc.RealVar(ytemp, Xtemp))

        return varEsts
Example #4
def loadData(filename):
    mph = 3
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")

    y = yFil.read().val[:, mph]
    y = [i - 1 for i in y]
    return [y, sFil]
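A hypothetical call site for loadData, where "mydata" stands for a PLINK prefix (mydata.bed/.bim/.fam) with case/control status in the .fam phenotype column:

# y is a list of 0/1 phenotypes, sFil is a lazy Bed reader (genotypes not loaded yet)
y, sFil = loadData("mydata")
X = sFil.read().standardize().val   # materialize and standardize genotypes only when needed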
Example #5
 def read_phen(self, fn_phen=None):
     """
     read phenotype file
     """
     PH = Pheno(fn_phen)
     PHOB = PH.read()
     self.Y = PHOB.val
     self.SID = PHOB.iid[:, 1]
Example #6
 def read_phen(self,fn_phen = None):
     """
     read phenotype file
     """
     PH = Pheno(fn_phen)
     PHOB = PH.read()
     self.Y = PHOB.val
     self.SID = PHOB.iid[:,1]
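For reference, PHOB.iid is an array of (family id, individual id) string pairs, so iid[:, 1] keeps only the individual ids. A small sketch of what that looks like, using the phenotype file from the tutorial example further down this page:

from pysnptools.snpreader import Pheno

pheno = Pheno("pheno_10_causals.txt").read()
print(pheno.iid[:3])      # pairs such as [['cid0P0' 'cid0P0'] ['cid1P0' 'cid1P0'] ['cid2P0' 'cid2P0']]
print(pheno.iid[:3, 1])   # just the individual-id column, as kept in self.SID above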
Example #7
def getData(filename):
    mph=3;
    sFil=Bed(filename);
    yFil=Pheno(filename+".fam");

    X=sFil.read().standardize().val;
    y=yFil.read().val[:,mph];
    return [y,sFil];
Example #8
def getData(filename):
	mph=3;
	sFil=Bed(filename);
	yFil=Pheno(filename+".fam");
	
	y=yFil.read().val[:,mph];
	y=[i-1 for i in y]
	return [y,sFil];
Example #9
def getData(filename):
    mph = 3
    sFil = Bed(filename, count_A1=False)
    # Bed object
    yFil = Pheno(filename + ".fam")

    y = yFil.read().val[:, mph]
    y = [i - 1 for i in y]  # the last column of .fam file is the disease states of data owners
    return [y, sFil]
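PLINK .fam files code case/control status as 1 (control) and 2 (case), so the i - 1 recode above maps the column to 0/1. Because .val is already a NumPy array, the same recode can be done without a list comprehension; a vectorized sketch (getDataVectorized is a hypothetical name):

from pysnptools.snpreader import Bed, Pheno

def getDataVectorized(filename, mph=3):
    sFil = Bed(filename, count_A1=False)
    y = Pheno(filename + ".fam").read().val[:, mph] - 1.0   # 1/2 coding -> 0.0/1.0
    return [y, sFil]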
Example #10
    def test_c_reader_pheno(self):
        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()

        self.assertEqual(np.float64, snpdata1.val.dtype)

        snpdata1.val[1, 0] = np.NaN  # Inject a missing value to test writing and reading missing values
        output = "tempdir/snpreader/toydata.phe"
        create_directory_if_necessary(output)
        Pheno.write(output, snpdata1)
        snpreader = Pheno(output)
        _fortesting_JustCheckExists().input(snpreader)
        s = str(snpreader)
        snpdata2 = snpreader.read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata2.val,
                                             decimal=10)

        snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
        import pysnptools.util.pheno as pstpheno
        dict = pstpheno.loadOnePhen(self.currentFolder +
                                    "/examples/toydata.phe",
                                    missing="")
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        dict = pstpheno.loadOnePhen(self.currentFolder +
                                    "/examples/toydata.phe",
                                    missing="",
                                    vectorize=True)
        assert len(dict['vals'].shape) == 1, "test 1-d array of values"
        snpdata3 = Pheno(dict).read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata3.val,
                                             decimal=10)

        snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
        assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

        snpdata5 = Pheno(self.currentFolder +
                         "/examples/toydata.id.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata5.val,
                                             decimal=10)
        snpdata6 = Pheno(self.currentFolder +
                         "/examples/toydata.fid.phe").read()
        np.testing.assert_array_almost_equal(snpdata1.val,
                                             snpdata6.val,
                                             decimal=10)
Example #11
def getData(filename):
	mph=3;
	sFil=Bed(filename);
	yFil=Pheno(filename+".fam");
	snpList=sFil.sid;
	y=yFil.read().val[:,mph];
	y=[i-1 for i in y]
	Icases=[i for i in range(0,len(y)) if y[i]>0];
	Icont=[i for i in range(0,len(y)) if y[i]<1];
	sFilcases=sFil[Icases,:]
	sFilcont=sFil[Icont,:]


	Dcont=sFilcont.read().val;
	Dcases=sFilcases.read().val;

	
	r=getMarginals(Dcont);
	s=getMarginals(Dcases);

	return [r,s,snpList];
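getMarginals is not defined in this excerpt; from its use on the raw case and control genotype matrices it presumably computes per-SNP marginal statistics. A purely hypothetical stand-in that returns per-SNP genotype counts, shown only to make the snippet self-contained:

import numpy as np

def getMarginals(D):
    # Hypothetical: for each SNP, count individuals carrying 0, 1 or 2 copies of the allele.
    # D is an (individuals x SNPs) matrix of raw genotypes as returned by .read().val.
    return np.array([np.sum(D == g, axis=0) for g in (0.0, 1.0, 2.0)])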
Example #12
    def test_old(self):
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        #load data
        ###################################################################
        snp_reader = Bed(bed_fn, count_A1=False)
        pheno = Pheno(pheno_fn)
        cov = Pheno(cov_fn)

        # intersect sample ids
        snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

        # read in snps

        # partition snps on chr5 vs rest
        test_chr = 5
        G0 = snp_reader[:, snp_reader.pos[:, 0] != test_chr].read(
            order='C').standardize()
        test_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr].read(
            order='C').standardize()

        y = pheno.read().val[:, 0]
        y -= y.mean()
        y /= y.std()

        # load covariates
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # invoke feature selection to learn which SNPs to use to build G1
        logging.info(
            "running feature selection conditioned on background kernel")
        # partition data into the first 50 SNPs on chr1 and all but chr1

        select = FeatureSelectionInSample(max_log_k=7,
                                          n_folds=7,
                                          order_by_lmm=True,
                                          measure="ll",
                                          random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val,
                                                                   G0.val,
                                                                   y,
                                                                   cov=X_cov)

        # plot out of sample error
        if do_plot: select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

        # print results
        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))

        ###############################
        # use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:, feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps,
                                pheno,
                                G0=G0,
                                G1=G1,
                                mixing=best_mix,
                                h2=None,
                                leave_out_one_chrom=False,
                                output_file_name=output_file_name,
                                count_A1=False)

        logging.info("results:")
        logging.info("#" * 40)
        logging.info(results_df.head())
        self.compare_files(results_df, "old")
Example #13
class Analysis:
	##
	##Reads in covariate, pheno and geno info from filename (filename in plink format)
	##
	def __init__(self,filename,snpfile="",params="",n0=-1,n1=-1):
		self.BED=Bed(filename);
		self.pheno=Pheno(filename+".fam");
		self.y=self.pheno.read().val[:,3];
		self.y=self.y-1.0;
		self.params=params;
		n=len(self.y)
		
		if n0>0:
			print "Initiate with n0"
			I0=[i for i in range(0,n) if self.y[i]==0.0]
			I0=I0[:n0]
			I1=[i for i in range(0,n) if self.y[i]==1.0]
			I1=I1[:n1]
			I0.extend(I1);
			self.y=self.y[I0]
			self.BED=self.BED[I0,:]

		try:
			if len(snpfile)>0:
				fil=open(snpfile)
				lines=fil.readlines();
				fil.close();
				self.snps=[l.strip() for l in lines]
			else:
				self.snps=self.BED.sid;
		except:
			print "Error loading SNPs!"
			sys.exit();
		self.setUp();
		self.n=len(self.y)
		print "Number of individuals: "+str(self.n)
		self.Cov=[];
		self.params="";


	##
	##Loads Cov;
	##
	def loadCov(self,covfile):
		print "Not yet implemented!"


	##
	##Set the SNPs!
	##
	def setSNPs(self,snpfile="",SNPs=[]):
		if len(SNPs)>0:
			self.snps=[i for i in SNPs]
		else:
			try:
				fil=open(snpfile)
				lines=fil.readlines();
				fil.close();
				self.snps=[l.strip() for l in lines]
			except:
				print "Error loading SNPs!"
				sys.exit();
		self.setUp();


	##
	##Sets up for analysis; specific to each subclass. This base-class method raises an error.
	##
	def setUp(self):
		raise NotImplementedError;


	##
	##gets analysis results using permutation
	##
	def getAnalysis(self,perm=[]):
		if len(perm)==0:
			return self.runAnalysis(self.y);
		else:
			return self.runAnalysis([self.y[i] for i in perm])


	##
	##Noisy analysis!
	##
	def getNoisyAnalysis(self,noise,err):
		exact=self.getAnalysis();
		m=len(exact);
		for i in range(0,m):
			pert=lap(scale=noise);
			while pert>err and err>0:
				pert=lap(scale=noise);
			exact[i]=exact[i]+pert;
		return exact;



	##
	##Runs analysis. Implemented in subclasses; assumes the result is a 1-D array.
	##
	def runAnalysis(self,y):
		raise NotImplementedError;

	##
	##Returns y
	##
	def getY(self):
		return [i for i in self.y];
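Analysis is an abstract base class: setUp and runAnalysis raise NotImplementedError and are meant to be supplied by subclasses, while getAnalysis and getNoisyAnalysis add permutation and Laplace-noise wrappers on top. A minimal sketch of what a subclass could look like; the per-SNP statistic here is made up purely for illustration:

import numpy as np

class MeanDiffAnalysis(Analysis):
    # Hypothetical subclass: read and standardize the genotypes once during setUp ...
    def setUp(self):
        self.X = self.BED.read().standardize().val

    # ... and return, per SNP, the mean genotype difference between cases and controls.
    def runAnalysis(self, y):
        y = np.asarray(y)
        return list(self.X[y == 1.0].mean(axis=0) - self.X[y == 0.0].mean(axis=0))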
class TestHeritabilitySpatialCorrection(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from pysnptools.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "..", ".."))
        self.snpreader_whole = Bed(self.pythonpath +
                                   "/tests/datasets/synth/all",
                                   count_A1=False)
        self.pheno_whole = Pheno(self.pythonpath +
                                 "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/heritability_spatial_correction"

    def file_name(self, testcase_name):
        temp_fn = os.path.join(self.tempout_dir, testcase_name)
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_one(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "one.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        half = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid,
                        sid=["pheno0", "pheno1"],
                        val=np.c_[half, half])

        spatial_coor = [[i, -i]
                        for i in xrange(self.snpreader_whole.iid_count)]
        alpha_list = alpha_list_big = [
            int(v) for v in np.logspace(2, np.log10(4000), 2)
        ]
        dataframe = heritability_spatial_correction(self.snpreader_whole,
                                                    spatial_coor,
                                                    self.snpreader_whole.iid,
                                                    alpha_list,
                                                    2,
                                                    pheno,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=True)

        dataframe.to_csv(tmpOutfile, sep="\t", index=False)
        referenceOutfile = TestFeatureSelection.reference_file(
            "heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
        self.assertTrue(
            out,
            "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile,
                                                     tmpOutfile))

    def test_two(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "two.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        snpreader = self.snpreader_whole[:10, :]

        spatial_coor = [[i, -i] for i in xrange(snpreader.iid_count)]
        alpha_list = alpha_list_big = [
            int(v) for v in np.logspace(2, np.log10(4000), 2)
        ]
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    alpha_list,
                                                    2,
                                                    self.pheno_whole,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=False)

        dataframe.to_csv(tmpOutfile, sep="\t", index=False)
        referenceOutfile = TestFeatureSelection.reference_file(
            "heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
        self.assertTrue(
            out,
            "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile,
                                                     tmpOutfile))

    def test_three(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "three.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        snpreader = self.snpreader_whole[:10, :]

        spatial_coor = [[i, -i] for i in xrange(snpreader.iid_count)]
        alpha_list = alpha_list_big = [
            int(v) for v in np.logspace(2, np.log10(4000), 2)
        ]
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    alpha_list,
                                                    2,
                                                    self.pheno_whole,
                                                    jackknife_count=0,
                                                    permute_plus_count=0,
                                                    permute_times_count=0,
                                                    just_testing=False)

        dataframe.to_csv(tmpOutfile, sep="\t", index=False)
        referenceOutfile = TestFeatureSelection.reference_file(
            "heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
        self.assertTrue(
            out,
            "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile,
                                                     tmpOutfile))

    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        result = doctest.testfile("../heritability_spatial_correction.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
Example #15
        raise (ValueError(
            'No non-missing observations with both phenotype and genotype data'
        ))
    print(
        str(n) +
        ' individuals with no missing phenotype or covariate observations')
    n = float(n)

    #### Read random effect genotypes ####
    if args.random_gts is not None:
        if args.random_gts_txt:
            random_gts_f = Pheno(args.random_gts)
        else:
            random_gts_f = Bed(args.random_gts)
        random_gts_ids = np.array(random_gts_f.iid)
        random_gts_f = random_gts_f.read()
        # Match to phenotypes
        pheno_id_dict = id_dict_make(pheno_ids)
        G_random = random_gts_f.val
        G = np.empty((y.shape[0], G_random.shape[1]))
        G[:] = np.nan
        for i in xrange(0, random_gts_ids.shape[0]):
            if tuple(random_gts_ids[i, :]) in pheno_id_dict:
                G[pheno_id_dict[tuple(random_gts_ids[i, :])], :] = G_random[
                    i, :]
        del G_random
        # Check for NAs
        random_isnan = np.isnan(G)
        random_gts_NAs = np.sum(random_isnan, axis=0)
        gts_with_obs = list()
        if np.sum(random_gts_NAs) > 0:
Example #16
    # Get sample size
    n = y.shape[0]
    if n == 0:
        raise (ValueError('No non-missing observations with both phenotype and genotype data'))
    print(str(n) + ' individuals with no missing phenotype or covariate observations')
    n = float(n)

    #### Read random effect genotypes ####
    if args.random_gts is not None:
        if args.random_gts_txt:
            random_gts_f = Pheno(args.random_gts)
        else:
            random_gts_f = Bed(args.random_gts)
        random_gts_ids = np.array(random_gts_f.iid)
        random_gts_f = random_gts_f.read()
        # Match to phenotypes
        pheno_id_dict = id_dict_make(pheno_ids)
        G_random = random_gts_f.val
        G = np.empty((y.shape[0], G_random.shape[1]))
        G[:] = np.nan
        for i in xrange(0, random_gts_ids.shape[0]):
            if tuple(random_gts_ids[i, :]) in pheno_id_dict:
                G[pheno_id_dict[tuple(random_gts_ids[i, :])], :] = G_random[i, :]
        del G_random
        # Check for NAs
        random_isnan = np.isnan(G)
        random_gts_NAs = np.sum(random_isnan, axis=0)
        gts_with_obs = list()
        if np.sum(random_gts_NAs) > 0:
            print('Mean imputing missing genotypes in random effect design matrix')
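The excerpt stops just as the mean imputation begins. A sketch of the kind of column-wise mean imputation the message refers to, assuming G is the (individuals x SNPs) random-effect genotype matrix with NaN for missing entries as built above; this is an illustration, not the original implementation:

import numpy as np

col_means = np.nanmean(G, axis=0)            # per-SNP means, ignoring NaNs
nan_rows, nan_cols = np.where(np.isnan(G))
G[nan_rows, nan_cols] = col_means[nan_cols]  # SNPs with no observed genotypes stay NaN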
Example #17
#Read from the PLINK phenotype file (text) instead of a Bed file
# Looks like:
#cid0P0 cid0P0 0.4853395139922632
#cid1P0 cid1P0 -0.2076984565752155
#cid2P0 cid2P0 1.4909084058931985
#cid3P0 cid3P0 -1.2128996652683697
#cid4P0 cid4P0 0.4293203431508744
#...

from pysnptools.snpreader import Pheno

phenoreader = Pheno("pheno_10_causals.txt")
print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos
#Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan  nan  nan]]
phenodata = phenoreader.read()
print phenodata.val
#[[  4.85339514e-01]
# [ -2.07698457e-01]
# [  1.49090841e+00]
# [ -1.21289967e+00]
# ...

# Write 1st 10 iids and sids of Bed data into Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

#Write it to Bed format
Bed.write("deleteme1010.bed", snpdata1010)

# Create a snpdata on the fly and write to Bed
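The comment above announces creating a SnpData in memory, but that code is not part of this excerpt. A minimal sketch, with arbitrary made-up ids and values, using the same SnpData constructor and Bed.write call that appear elsewhere on this page:

import numpy as np
from pysnptools.snpreader import Bed, SnpData

snpdata = SnpData(iid=[["fam0", "iid0"], ["fam0", "iid1"]],
                  sid=["snp0", "snp1", "snp2"],
                  val=np.array([[0., 1., 2.], [2., 1., 0.]]))
Bed.write("deleteme_onthefly.bed", snpdata)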
class TestHeritabilitySpatialCorrection(unittest.TestCase):

    @classmethod
    def setUpClass(self):
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))
        self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all")
        self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/heritability_spatial_correction"

    def file_name(self,testcase_name):
        temp_fn = os.path.join(self.tempout_dir,testcase_name)
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_one(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "one.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        half = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid,sid=["pheno0","pheno1"],val=np.c_[half,half])

        spatial_coor = [[i,-i] for i in xrange(self.snpreader_whole.iid_count)]
        alpha_list = alpha_list_big=[int(v) for v in np.logspace(2,np.log10(4000), 2)]
        dataframe = heritability_spatial_correction(self.snpreader_whole,spatial_coor,self.snpreader_whole.iid,alpha_list,pheno,jackknife_count=2,permute_plus_count=1,permute_times_count=1,just_testing=True)

        dataframe.to_csv(tmpOutfile,sep="\t",index=False)
        referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/"+fn)
        out,msg=ut.compare_files(tmpOutfile, referenceOutfile, tolerance)                
        self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))

    def test_two(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "two.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        snpreader = self.snpreader_whole[:10,:]

        spatial_coor = [[i,-i] for i in xrange(snpreader.iid_count)]
        alpha_list = alpha_list_big=[int(v) for v in np.logspace(2,np.log10(4000), 2)]
        dataframe = heritability_spatial_correction(snpreader,spatial_coor,snpreader.iid,alpha_list,self.pheno_whole,jackknife_count=2,permute_plus_count=1,permute_times_count=1,just_testing=False)

        dataframe.to_csv(tmpOutfile,sep="\t",index=False)
        referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/"+fn)
        out,msg=ut.compare_files(tmpOutfile, referenceOutfile, tolerance)                
        self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))


    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..")
        result = doctest.testfile("../heritability_spatial_correction.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
class TestLinRegTrain(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))

        self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all")
        self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/linear_regression"

    def file_name(self,testcase_name):
        temp_fn = os.path.join(self.tempout_dir,testcase_name+".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_lr_real(self):
        do_plot = False

        import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0])
        bs=lsqSol[0] #weights
        r2=lsqSol[1] #squared residuals
        D=lsqSol[2]  #rank of design matrix
        N=pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2;
            nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name,K0_train,K0_whole_test in [("Identity Kernel",None,None)]:

            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                
                
            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename) 
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = model.predict(K0_whole_test=K0_train, X=covariate_train) #test on train
                output_file = self.file_name("lr_reala_"+name)
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_pheno,"lr2a_"+first_name)
                self.compare_files(covar2,"lr2a.cov_"+first_name)

            #Predict with model (test on test)
            predicted_pheno, covar  = model.predict(K0_whole_test=K0_whole_test, X=covariate_test) #test on train
            output_file = self.file_name("lr_realb_"+name)
            Dat.write(output_file,predicted_pheno)
            covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_"+name)
            Dat.write(output_file,covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                pylab.show()
                ## Plot y and predicted y (test on train)
                #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                #pylab.suptitle(name+": test on test: true target to prediction")
                #pylab.show()

            self.compare_files(predicted_pheno,"lr2b_"+first_name)
            self.compare_files(covar2,"lr2b.cov_"+first_name)



    def compare_files(self,answer,ref_base):
        reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat") #Uses same results folder as lmm_train
        reference=Dat(reffile).read()
        assert np.array_equal(answer.col,reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(answer.row,reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in xrange(reference.row_count):
            for sid_index in xrange(reference.col_count):
                a_v = answer.val[iid_index,sid_index]
                r_v = reference.val[iid_index,sid_index]
                assert abs(a_v - r_v) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile)

    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..")
        result = doctest.testfile("../linear_regression.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
    def test_old(self):
        do_plot = False
        from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
        from pysnptools.util import intersect_apply

        logging.info("TestSingleSnpAllPlusSelect test_old")

        bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
        pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
        cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

        #load data
        ###################################################################
        snp_reader = Bed(bed_fn)
        pheno = Pheno(pheno_fn)
        cov = Pheno(cov_fn)

        # intersect sample ids
        snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

        # read in snps

        # partition snps on chr5 vs rest
        test_chr = 5
        G0 = snp_reader[:,snp_reader.pos[:,0] != test_chr].read(order='C').standardize()
        test_snps = snp_reader[:,snp_reader.pos[:,0] == test_chr].read(order='C').standardize()


        y = pheno.read().val[:,0]
        y -= y.mean()
        y /= y.std()

        # load covariates
        X_cov = cov.read().val
        X_cov.flags.writeable = False

        # invoke feature selection to learn which SNPs to use to build G1
        logging.info("running feature selection conditioned on background kernel")
        # partition data into the first 50 SNPs on chr1 and all but chr1

        select = FeatureSelectionInSample(max_log_k=7, n_folds=7, order_by_lmm=True, measure="ll", random_state=42)
        best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val, G0.val, y, cov=X_cov)    

        # plot out of sample error
        if do_plot: select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

        # print results
        logging.info("best_k:{0}".format(best_k))
        logging.info("best_mix:{0}".format(best_mix))
        logging.info("best_delta:{0}".format(best_delta))


        ###############################
        # use selected SNPs to build G1
        logging.info(feat_idx)
        G1 = G0[:,feat_idx]

        output_file_name = self.file_name("old")
        results_df = single_snp(test_snps, pheno, G0=G0, G1=G1, mixing=best_mix, h2=None,leave_out_one_chrom=False,output_file_name=output_file_name)

        logging.info("results:")
        logging.info("#"*40)
        logging.info(results_df.head())
        self.compare_files(results_df,"old")
class TestLinRegTrain(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "..", ".."))

        self.snpreader_whole = Bed(self.pythonpath +
                                   "/tests/datasets/synth/all",
                                   count_A1=False)
        self.covariate_whole = Pheno(self.pythonpath +
                                     "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(self.pythonpath +
                                 "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/linear_regression"

    def file_name(self, testcase_name):
        temp_fn = os.path.join(self.tempout_dir, testcase_name + ".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_lr_real(self):
        do_plot = False

        import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
        test_idx = np.r_[0:10]  # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covariate_train = covar[train_idx, :].read()
        covariate_test = covar[test_idx, :].read()
        K0_test_test = KernelIdentity(covariate_test.iid)

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(
            size=covar.val.shape) * 10

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".",
                       covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val,
                       np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val,
                      np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  #weights
        r2 = lsqSol[1]  #squared residuals
        D = lsqSol[2]  #rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)
            #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.",
                       covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val,
                           predicted,
                           yerr,
                           linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:

            first_name = first_name or name
            #Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train,
                                            X=covariate_train,
                                            y=pheno_train)

            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = model.predict(
                    K0_whole_test=K0_train, X=covariate_train)  #test on train
                output_file = self.file_name("lr_reala_" + name)
                Dat.write(output_file, predicted_pheno)
                covar2 = SnpData(
                    iid=covar.row, sid=covar.col[:, 1],
                    val=covar.val)  #kludge to write kernel to text format
                output_file = self.file_name("lr_reala.cov_" + name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.",
                               covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val,
                                   predicted,
                                   yerr,
                                   linestyle='None')
                    pylab.suptitle(
                        name +
                        ": test on train: train X to true target (green) and prediction (red)"
                    )
                    pylab.show()

                self.compare_files(predicted_pheno, "lr2a_" + first_name)
                self.compare_files(covar2, "lr2a.cov_" + first_name)

            #Predict with model (test on test)
            predicted_pheno, covar = model.predict(
                K0_whole_test=K0_whole_test, X=covariate_test)  #test on train
            output_file = self.file_name("lr_realb_" + name)
            Dat.write(output_file, predicted_pheno)
            covar2 = SnpData(
                iid=covar.row, sid=covar.col[:, 1],
                val=covar.val)  #kludge to write kernel to text format
            output_file = self.file_name("lr_realb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.",
                           covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val,
                               predicted,
                               yerr,
                               linestyle='None')
                pylab.suptitle(
                    name +
                    ": test on test: test X to true target (green) and prediction (red)"
                )
                pylab.show()
                ## Plot y and predicted y (test on train)
                #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                #pylab.suptitle(name+": test on test: true target to prediction")
                #pylab.show()

            self.compare_files(predicted_pheno, "lr2b_" + first_name)
            self.compare_files(covar2, "lr2b.cov_" + first_name)

    def compare_files(self, answer, ref_base):
        reffile = TestFeatureSelection.reference_file(
            "fastlmm/" + ref_base +
            ".dat")  #Uses same results folder as lmm_train
        reference = Dat(reffile).read()
        assert np.array_equal(
            answer.col,
            reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(
            answer.row,
            reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in xrange(reference.row_count):
            for sid_index in xrange(reference.col_count):
                a_v = answer.val[iid_index, sid_index]
                r_v = reference.val[iid_index, sid_index]
                assert abs(
                    a_v - r_v
                ) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(
                    iid_index, sid_index, reffile)

    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        result = doctest.testfile("../linear_regression.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
class TestFastLMM(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..",".."))

        self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all",count_A1=False)
        self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    tempout_dir = "tempout/fastlmm"

    def file_name(self,testcase_name):
        temp_fn = os.path.join(self.tempout_dir,testcase_name+".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_api(self):
        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #####################################################
        # Train and standardize cov and then apply to test
        #####################################################

        cov_train, unit_trained = self.covariate_whole[train_idx,:].read().standardize(Unit(),return_trained=True)
        cov_test = self.covariate_whole[test_idx,:].read().standardize(unit_trained)

        #####################################################
        # standardize whole kernel from snps (both ways) and then pull out the 3 parts
        #####################################################
        
        whole_kernel = SnpKernel(self.covariate_whole,Unit()).read().standardize(DiagKtoN())
        train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True)
        test_kernel = whole_kernel[train_idx,test_idx].read(order='A',view_ok=True)
        test_test_kernel = whole_kernel[test_idx,test_idx].read(order='A',view_ok=True)

        #####################################################
        # create train_train, train_test, and test_test based on just the training snps (both standardizations)
        #####################################################

        K_train = SnpKernel(self.snpreader_whole[train_idx,:],Unit(),block_size=100)
        train_train_kernel, snp_trained, kernel_trained = K_train._read_with_standardizing(to_kerneldata=True, kernel_standardizer=DiagKtoN(), return_trained=True)

        K_whole_test = _SnpWholeTest(train=self.snpreader_whole[train_idx,:],test=self.snpreader_whole[test_idx,:],standardizer=snp_trained,block_size=100)
        train_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[train_idx]) #The new reader may have the iids in a different order than the original reader
        train_test_kernel = K_whole_test[train_idx2,:].read().standardize(kernel_trained)

        test_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[test_idx])
        test_test_kernel = K_whole_test[test_idx2,:].read().standardize(kernel_trained)

        #####################################################
        # How does predict look with whole_test as input?
        #####################################################

        # a. - standardize whole up front
        whole_kernel = SnpKernel(self.snpreader_whole,Unit(),block_size=100).read().standardize()
        train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True)
        whole_test_kernel = whole_kernel[:,test_idx].read(order='A',view_ok=True)
        fastlmm1 = FastLMM(snp_standardizer=SS_Identity(), kernel_standardizer=KS_Identity())
        fastlmm1.fit(K0_train=train_kernel, X=self.covariate_whole, y=self.pheno_whole) #iid intersection means we won't really be using whole covar or pheno
        predicted_pheno, covar = fastlmm1.predict(K0_whole_test=whole_test_kernel, X=self.covariate_whole,count_A1=False)
        output_file = self.file_name("whole")
        Dat.write(output_file,predicted_pheno)
        self.compare_files(predicted_pheno,"whole")

        # b -- just files
        fastlmm2 = FastLMM()
        fastlmm2.fit(K0_train=self.snpreader_whole[train_idx,:], X=self.covariate_whole, y=self.pheno_whole[train_idx,:]) #iid intersection means we won't really be using whole covar
        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=self.snpreader_whole[test_idx,:], X=self.covariate_whole,count_A1=False)
        self.compare_files(predicted_pheno,"one")

    def test_notebook1(self):
        do_plot=False

        import matplotlib.pyplot as plt
        from pysnptools.snpreader import Pheno,Bed
        bed = Bed(self.pythonpath + "/tests/datasets/synth/all",count_A1=False)
        cov = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
        pheno = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt").read()

        # Now we learn from the first 400 students.
        training = bed[:400,:] #!!!later: the learning code doesn't like it if there are two instances of bed[:400] that are not "is -equal"
        fastlmm2 = FastLMM(GB_goal=2).fit(K0_train=training,
                                            X=cov[:400,:],
                                            y=pheno[:400,:])

        # Predict on training data:
        predicted_score,covariance = fastlmm2.predict(K0_whole_test=training,
                                                            X=cov[:400,:],count_A1=False)

        assert np.array_equal(pheno.iid[:400],predicted_score.iid), "for plots to make sense, the iids must be in the order"
        if do_plot:
            plt.plot(pheno.val[:400,:],predicted_score.val,"b.",[-5,5],[-5,5],"-r")
            plt.errorbar(pheno.val[:400,:],predicted_score.val, yerr=np.sqrt(np.diag(covariance.val)),fmt='.')
            plt.xlabel('score (actual train)')
            plt.ylabel('predicted (test on train with stdev)')
            plt.show()

        # How well does this model predict the (unseen) TEST data?
        predicted_score,covariance = fastlmm2.predict(K0_whole_test=bed[400:500,:],
                                                            X=cov[400:500,:],count_A1=False)

        assert np.array_equal(pheno.iid[400:500],predicted_score.iid), "for plots to make sense, the iids must be in the order"
        if do_plot:
            plt.plot(pheno.val[400:500,:],predicted_score.val,"b.",[-5,5],[-5,5],"-r")
            plt.errorbar(pheno.val[400:500,:],predicted_score.val, yerr=np.sqrt(np.diag(covariance.val)),fmt='.')
            plt.xlabel('score (actual test)')
            plt.ylabel('predicted')
            plt.show()

    def test_one(self):
        logging.info("TestLmmTrain test_one")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_one.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)
                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("one")
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one")

    def test_str(self):
        logging.info("TestLmmTrain test_str")

        G0_train = self.pythonpath + "/tests/datasets/synth/all"
        covariate_train = None
        pheno_train = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train, y=pheno_train,count_A1=False)
        filename = self.tempout_dir + "/model_str.flm.p"
        pstutil.create_directory_if_necessary(filename)

        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)
                
        # predict on same
        G0_test = G0_train
        covariate_test = covariate_train

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("str")
        Dat.write(output_file,predicted_pheno)

        #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"str")

    def test_lr_no_K0(self):
        logging.info("TestLinRegTrain test_lr_no_k0")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        covariate_train3 = self.covariate_whole[train_idx,:].read()
        covariate_train3.val = np.array([[float(num)] for num in xrange(covariate_train3.iid_count)])
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        np.random.seed(0)
        pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape) # y = 2*x+100+normal(0,1)

        #Learn model, save, load
        fastlmm3x = FastLMM(GB_goal=2).fit(X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model3.flm.p"
        joblib.dump(fastlmm3x, filename) 
        fastlmm3 = joblib.load(filename)


        #Predict with model (test on train)
        predicted_pheno, covariance = fastlmm3.predict(K0_whole_test=KernelIdentity(pheno_train3.iid), X=covariate_train3,count_A1=False) #test on train
        output_file = self.file_name("lr_no_k0")
        Dat.write(output_file,predicted_pheno)

        self.compare_files(predicted_pheno,"lr_no_k0")

    def test_lr_as_lmm(self):
            do_plot = False
            #later why does this test case generate two intersect info messages instead of just one?
            import pylab
            logging.info("TestLmmTrain test_lr_as_lmm")

            ###############################################################
            # Create a linear data set with just a little noise
            ###############################################################

            train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
            test_idx  = np.r_[0:10] # the first 10 iids

            #make covar just numbers 0,1,...
            covar = self.covariate_whole.read()
            covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
            covar._name = 'np.array([[float(num)] for num in xrange(covar.iid_count)])'
            covariate_train = covar[train_idx,:].read()
            covariate_test = covar[test_idx,:].read()


            #make pheno  # pheno = 2*covar+100+normal(0,1)*10
            pheno = self.pheno_whole.read()
            np.random.seed(0)
            pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle("Plot training x and y, testing x and y")
                pylab.show()

            ###############################################################
            # Show that linear regression does a good job predicting
            ###############################################################

            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
                nLL -= 0.5*D*np.log(2*np.pi*sigma2)  #REML term
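            # Closed forms used above: with residual sum of squares r2 from lstsq and N training
            # cases, the ML estimate is sigma2 = r2/N, at which the negative log-likelihood
            # collapses to N/2*log(2*pi*sigma2) + N/2; the REML branch divides r2 by N-D
            # (D = rank of the design matrix) and adjusts nLL with an extra log term.
            # A minimal standalone version of the ML case (illustrative only; x and y are
            # hypothetical 1-d arrays):
            #   X = np.c_[x, np.ones_like(x)]
            #   b, rss, rank, _ = np.linalg.lstsq(X, y, rcond=-1)
            #   sigma2 = float(rss) / len(y)
            #   nll = 0.5 * len(y) * (np.log(2 * np.pi * sigma2) + 1)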

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle("real linear regression: actual to prediction")
                pylab.show()

            ###############################################################
            # Use LMM as LR and apply test on train
            ###############################################################
            for force_full_rank in [True, False]:
                #Learn model, save, load
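                # "LMM as LR": the covariate itself is passed as the kernel-building data
                # (K0_train) and no fixed-effect covariates are supplied (X=None).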
                fastlmmx = FastLMM(GB_goal=2,force_full_rank=force_full_rank).fit(K0_train=covariate_train, X=None, y=pheno_train)
                
                
                filename = self.tempout_dir + "/model_lr_as_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename) 
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_train, X=None,count_A1=False) #test on train
                    output_file = self.file_name("lr_as_lmma_")
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                    output_file = self.file_name("lr_as_lmma.cov_")
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar.val))
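                    # covar here is the predictive covariance (a KernelData); its diagonal holds
                    # per-individual predictive variances, so the square roots above serve as error bars.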
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle("test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lr_as_lmma_")
                    self.compare_files(covar2,"lr_as_lmma.cov_")

                ###############################################################
                # Use LMM as LR and apply test on test
                ###############################################################

                #Predict with model (test on test)
                predicted_pheno, covar  = fastlmm.predict(K0_whole_test=covariate_test, X=None,count_A1=False) #test on test
                output_file = self.file_name("lr_as_lmmb_")
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr_as_lmmb.cov_")
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle("test on test: test X to true target (green) and prediction (red)")
                    pylab.show()
                    ## Plot y and predicted y (test on train)
                    #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                    #pylab.suptitle(name+": test on test: true target to prediction")
                    #pylab.show()

                self.compare_files(predicted_pheno,"lr_as_lmmb_")
                self.compare_files(covar2,"lr_as_lmmb.cov_")

    def test_lr2(self):
        do_plot = False

        import pylab
        logging.info("TestLmmTrain test_lr2")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        #make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)])
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()
        K0_whole_test = KernelIdentity(covar.iid,covariate_test.iid)
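        # Identity cross-kernel between all iids and the test iids; note that the loop further
        # below builds its own K0_whole_test, shadowing this one.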

        #make pheno  # pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10

        pheno_train = pheno[train_idx,:].read()
        pheno_test = pheno[test_idx,:].read()

        if do_plot:
            #Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
        Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
        bs=lsqSol[0] #weights
        r2=lsqSol[1] #squared residuals
        D=lsqSol[2]  #rank of design matrix
        N=pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2/N)
            nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
        else:
            sigma2 = float(r2 / (N-D))
            nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
            nLL -= 0.5*D*np.log(2*np.pi*sigma2)  #REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        #These should all give the same result
        first_name = None
        for name,K0_train,K0_whole_test in [("Identity Kernel",
                                            KernelIdentity(self.snpreader_whole.iid[train_idx]),
                                            KernelIdentity(self.snpreader_whole.iid,test=self.snpreader_whole.iid[test_idx])),
                                      #!!!later("sid_count=0", self.snpreader_whole[train_idx,[]],self.snpreader_whole[test_idx,[]])
                                      ]:
            logging.info(name)
            first_name = first_name or name
            #Learn model, save, load
            fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                
                
            filename = self.tempout_dir + "/model_lr2.flm.p"
            joblib.dump(fastlmmx, filename) 
            fastlmm = joblib.load(filename)


            do_test_on_train = True
            if do_test_on_train:
                #Predict with model (test on train)
                predicted_pheno, covar = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train
                output_file = self.file_name("lr2a_"+name)
                Dat.write(output_file,predicted_pheno)
                covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
                output_file = self.file_name("lr2a.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_pheno,"lr2a_"+first_name)
                self.compare_files(covar2,"lr2a.cov_"+first_name)

            #Predict with model (test on test)
            predicted_pheno, covar  = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test
            output_file = self.file_name("lr2b_"+name)
            Dat.write(output_file,predicted_pheno)
            covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format
            output_file = self.file_name("lr2b.cov_"+name)
            Dat.write(output_file,covar2)

            yerr = np.sqrt(np.diag(covar.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                pylab.show()
                ## Plot y and predicted y (test on train)
                #pylab.plot(pheno_test.val,predicted_pheno.val,".")
                #pylab.suptitle(name+": test on test: true target to prediction")
                #pylab.show()

            self.compare_files(predicted_pheno,"lr2b_"+first_name)
            self.compare_files(covar2,"lr2b.cov_"+first_name)


    def test_str2(self):
        logging.info("TestLmmTrain test_str2")


        #Standardize train and test together
        whole_kernel = self.snpreader_whole.read_kernel(Unit())

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        K0_train_filename = self.tempout_dir + "/model_str2.kernel.npz"
        pstutil.create_directory_if_necessary(K0_train_filename)
        from pysnptools.kernelreader import KernelNpz
        KernelNpz.write(K0_train_filename,whole_kernel[train_idx].read(order='A',view_ok=True))
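        # Writing the training block of the precomputed kernel to .npz lets fit() be given a
        # kernel file name (a string) rather than an in-memory kernel.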

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train_filename, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_str2.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)

                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=whole_kernel[:,test_idx].read(order='A',view_ok=True), X=covariate_test,count_A1=False)

        output_file = self.file_name("str2")
        Dat.write(output_file,predicted_pheno)

        #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]
        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"str2")

    #Creating multiple test methods so that they run faster when distributed on a cluster.
    def test_fasttwoK(self):
        logging.info("TestLmmTrain test_fasttwoK")
        self._fasttwoK(None,None)

    def test_fasttwoK_force_low_rank(self):
        logging.info("TestLmmTrain test_fasttwoK_force_low_rank")
        self._fasttwoK(True,None)

    def test_fasttwoK_GB2(self):
        logging.info("TestLmmTrain test_fasttwoK_GB2")
        self._fasttwoK(None,2)

    def test_fasttwoK_force_low_rank_GB2(self):
        logging.info("TestLmmTrain test_fasttwoK_force_low_rank_GB2")
        self._fasttwoK(True,2)

    def _fasttwoK(self,force_low_rank,GB_goal):

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        G1_train = SnpData(iid=G0_train.iid,sid=[item+"_1" for item in G0_train.sid],val=G0_train.read().val,pos=G0_train.pos,name="Different SNP names for {0}".format(G0_train))
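        # Giving the copied SNP data different sids lets the same genotypes act as a second,
        # distinct kernel (K1) alongside K0 in the two-kernel fit below.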
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        logging.info("force_low_rank = {0}".format(force_low_rank))
        fastlmm1 = FastLMM(force_low_rank=force_low_rank,GB_goal=GB_goal).fit(K0_train=G0_train, K1_train=G1_train, X=covariate_train, y=pheno_train, mixing=.1)

        filename = self.tempout_dir + "/model_fasttwoK.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)
                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        G1_test = SnpData(iid=G0_test.iid,sid=[item+"_1" for item in G0_test.sid],val=G0_test.read().val,pos=G0_test.pos,name="Different SNP names for {0}".format(G0_test))
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G1_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("fasttwoK"+("_force_low" if force_low_rank else "")+("GB{0}".format(GB_goal) if GB_goal is not None else ""))
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one")

    def test_lowrank(self):
        logging.info("TestLmmTrain test_lowrank")

        snpreader = self.snpreader_whole[:,:100]

        train_idx = np.r_[10:snpreader.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = snpreader[train_idx,:]
        G0_test = snpreader[test_idx,:]

        pheno_whole = self.pheno_whole.read()
        pheno_whole.val *= 100
        pheno_whole.val += 1000
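        # With only 100 SNPs the kernel has rank at most 100; the low-rank (factored) path and
        # the full-rank path fit the same model, so their predictions should agree to numerical precision.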

        mean_low, covar_low =   FastLMM(force_low_rank=True,GB_goal=2).fit(K0_train=G0_train, y=pheno_whole[train_idx,:], X=self.covariate_whole[train_idx,:]). predict(K0_whole_test=G0_test,X=self.covariate_whole[test_idx,:],count_A1=False)
        mean_full, covar_full = FastLMM(force_full_rank=True,GB_goal=2).fit(K0_train=G0_train, y=pheno_whole[train_idx,:], X=self.covariate_whole[train_idx,:]).predict(K0_whole_test=G0_test,X=self.covariate_whole[test_idx,:],count_A1=False)

        np.testing.assert_allclose(mean_low.val, mean_full.val)
        np.testing.assert_allclose(covar_low.val,covar_full.val)

        logging.info("finished with TestLmmTrain test_lowrank")

    def test_twoK(self):
        logging.info("TestLmmTrain test_twoK")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, K1_train=G0_train, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_one.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)

                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("one")
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one")

    def test_lr(self):
        import matplotlib.pyplot as plt
        import pylab


        logging.info("TestLmmTrain test_lr")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        covariate_train3.val = np.array([[float(num)] for num in xrange(covariate_train3.iid_count)])
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        np.random.seed(0)
        pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape) # y = 2*x+100+normal(0,1)

        ##Plot training x and y
        #pylab.plot(covariate_train3.val, pheno_train3.val,".")
        #pylab.show()

        for force_full_rank,force_low_rank in [(True,False),(False,True)]:
            #Learn model, save, load
            fastlmm3x = FastLMM(force_full_rank=force_full_rank,force_low_rank=force_low_rank,GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
            filename = self.tempout_dir + "/model_lr.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(fastlmm3x, filename) 
            fastlmm3 = joblib.load(filename)


            #Predict with model (test on train)
            predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train
            output_file = self.file_name("lr")
            Dat.write(output_file,predicted_pheno)

            ## Plot training x and y, and training x with predicted y
            #do_plot = True 
            #if do_plot:
            #    pylab.plot(covariate_train3.val, pheno_train3.val,covariate_train3.val,predicted_pheno.val,".")
            #    pylab.show()

            #    # Plot y and predicted y (test on train)
            #    pheno_actual = pheno_train3.val[:,0]
            #    pylab.plot(pheno_actual,predicted_pheno.val,".")
            #    pylab.show()


            self.compare_files(predicted_pheno,"lr")

    def test_lmm(self):
        do_plot = False
        iid_count = 500
        seed = 0


        import pylab
        logging.info("TestLmmTrain test_lmm")

        iid = [["cid{0}P{1}".format(iid_index,iid_index//250)]*2 for iid_index in xrange(iid_count)]
        train_idx = np.r_[10:iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids


        #Every person is 100% related to everyone in the same one of 5 groups (and unrelated otherwise)
        K0a = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="clones")
        for iid_index0 in xrange(iid_count):
            for iid_index1 in xrange(iid_count):
                K0a.val[iid_index0,iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0
                if iid_index1 < iid_index0:
                    assert K0a.val[iid_index0,iid_index1] == K0a.val[iid_index1,iid_index0]

        #every person lives on a line from 0 to 1
        # They are related to every other person as a function of distance on the line
        np.random.seed(seed)
        home = np.random.random([iid_count])
        K0b = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance")
        for iid_index in xrange(iid_count):
            K0b.val[iid_index,:] = 1 - np.abs(home-home[iid_index])**.1

        #make covar just numbers 0,1,...
        covar = SnpData(iid=iid,sid=["x"],val=np.array([[float(num)] for num in xrange(iid_count)]))
        covariate_train = covar[train_idx,:].read()
        covariate_test = covar[test_idx,:].read()

        for name, h2, K0 in [("clones", 1, K0a),("line_world",.75,K0b)]:

            sigma2x = 100
            varg = sigma2x * h2
            vare = sigma2x * (1-h2)

            #######################################################################
            #make pheno  # pheno = 2*covar + 100 + normal(0,1)*sqrt(vare) + normal(0,K0)*sqrt(varg)
            #######################################################################
            #np.random.multivariate_normal results can depend on MKL_NUM_THREADS, so pin it to 1 here (and restore it below) for reproducibility.
            if 'MKL_NUM_THREADS' in os.environ:
                mkl_num_thread = os.environ['MKL_NUM_THREADS']
            else:
                mkl_num_thread = None
            os.environ['MKL_NUM_THREADS'] = '1'
            np.random.seed(seed)
            p1 = covar.val * 2.0 + 100
            p2 = np.random.normal(size=covar.val.shape)*np.sqrt(vare)
            p3 = (np.random.multivariate_normal(np.zeros(iid_count),K0.val)*np.sqrt(varg)).reshape(-1,1)
            if mkl_num_thread is not None:
                os.environ['MKL_NUM_THREADS'] = mkl_num_thread
            else:
                del os.environ['MKL_NUM_THREADS']
            pheno = SnpData(iid=iid,sid=["pheno0"],val= p1 + p2 + p3)
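            # p1 = fixed effect of the covariate (2*x + 100), p2 = independent noise with
            # variance vare, p3 = correlated effect with variance varg drawn using K0.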

            pheno_train = pheno[train_idx,:].read()
            pheno_test = pheno[test_idx,:].read()

            if do_plot:
                #Plot training x and y, testing x and y
                pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".")
                pylab.suptitle(name + ": Plot training x and y, testing x and y")
                pylab.show()

            Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))]
            Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))]
            lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1)
            bs=lsqSol[0] #weights
            r2=lsqSol[1] #squared residuals
            D=lsqSol[2]  #rank of design matrix
            N=pheno_train.iid_count
            REML = False
            if not REML:
                sigma2 = float(r2/N)
                nLL =  N*0.5*np.log(2*np.pi*sigma2) + N*0.5
            else:
                sigma2 = float(r2 / (N-D))
                nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2
                nLL -= 0.5*D*np.log(2*np.pi*sigma2)  #REML term

            predicted = Xtest.dot(bs)
            yerr = [np.sqrt(sigma2)] * len(predicted)
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                pylab.suptitle(name + ": real linear regression: actual to prediction")
                pylab.show()

            for factor in [1,100,.02]:
                K0 = K0.read()
                K0.val *= factor
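                # Rescaling the kernel by a constant should leave the predictions unchanged (the
                # fitted variance components absorb the scale), which the reference-file comparisons below confirm.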

                K0_train = K0[train_idx]
                K0_whole_test = K0[:,test_idx]
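                # K0[train_idx] is the square train-by-train kernel; K0[:,test_idx] is the
                # cross-kernel from all individuals to the test individuals (K0_whole_test).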

                #Learn model, save, load
                fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
                v2 = np.var(p2)
                v3 = np.var(p3)
                logging.debug("Original h2 of {0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3/(v2+v3), fastlmmx.h2raw))
                
                
                filename = self.tempout_dir + "/model_lmm.flm.p"
                pstutil.create_directory_if_necessary(filename)
                joblib.dump(fastlmmx, filename) 
                fastlmm = joblib.load(filename)


                do_test_on_train = True
                if do_test_on_train:
                    #Predict with model (test on train)
                    predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train
                    output_file = self.file_name("lmma_"+name)
                    Dat.write(output_file,predicted_pheno)
                    covar2 = SnpData(iid=covar_pheno.row,sid=covar_pheno.col[:,1],val=covar_pheno.val) #kludge to write kernel to text format
                    output_file = self.file_name("lmma.cov_"+name)
                    Dat.write(output_file,covar2)

                    yerr = np.sqrt(np.diag(covar_pheno.val))
                    predicted = predicted_pheno.val
                    if do_plot:
                        pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.")
                        pylab.xlim([0, 50])
                        pylab.ylim([100, 200])
                        pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None')
                        pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)")
                        pylab.show()

                    self.compare_files(predicted_pheno,"lmma_"+name)
                    self.compare_files(covar2,"lmma.cov_"+name)

                    predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:,0], X=covariate_train[0,:],count_A1=False) #test on train #0
                    assert np.abs(predicted_pheno0.val[0,0] - predicted_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                    assert np.abs(covar_pheno0.val[0,0] - covar_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"


                #Predict with model (test on test)
                predicted_phenoB, covar_phenoB  = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test
                output_file = self.file_name("lmmb_"+name)
                Dat.write(output_file,predicted_phenoB)
                covar2 = SnpData(iid=covar_phenoB.row,sid=covar_phenoB.col[:,1],val=covar_phenoB.val) #kludge to write kernel to text format
                output_file = self.file_name("lmmb.cov_"+name)
                Dat.write(output_file,covar2)

                yerr = np.sqrt(np.diag(covar_phenoB.val))
                predicted = predicted_phenoB.val
                if do_plot:
                    pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.")
                    pylab.xlim([-1, 10])
                    pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None')
                    pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)")
                    pylab.show()

                self.compare_files(predicted_phenoB,"lmmb_"+name)
                self.compare_files(covar2,"lmmb.cov_"+name)

                predicted_phenoB0, covar_phenoB0  = fastlmm.predict(K0_whole_test=K0_whole_test[:,0], X=covariate_test[0,:],count_A1=False) #test on a single test case
                assert np.abs(predicted_phenoB0.val[0,0] - predicted_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"
                assert np.abs(covar_phenoB0.val[0,0] - covar_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases"

                #Predict with the model on a mix of training and test individuals
                some_idx = range(covar.iid_count)
                some_idx.remove(train_idx[0])
                some_idx.remove(test_idx[0])
                covariate_some = covar[some_idx,:]
                K0_whole_some = K0[:,some_idx]
                predicted_phenoC, covar_phenoC  = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some,count_A1=False)
                for idxC, iidC in enumerate(predicted_phenoC.iid):
                    meanC = predicted_phenoC.val[idxC]
                    varC = covar_phenoC.val[idxC,idxC]
                    if iidC in predicted_pheno.iid:
                        predicted_pheno_ref = predicted_pheno
                        covar_pheno_ref = covar_pheno
                    else:
                        assert iidC in predicted_phenoB.iid
                        predicted_pheno_ref = predicted_phenoB
                        covar_pheno_ref = covar_phenoB
                    idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0]
                    mean_ref = predicted_pheno_ref.val[idx_ref]
                    var_ref = covar_pheno_ref.val[idx_ref,idx_ref]
                    assert np.abs(meanC - mean_ref) < 1e-6
                    assert np.abs(varC - var_ref) < 1e-6


    def test_snps(self):
        logging.info("TestLmmTrain test_snps")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        # Show it using the snps
        G0_train = self.snpreader_whole[train_idx,:]
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        pheno_train3.val = G0_train[:,0:1].read().val*2

        #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".")
        #pylab.show()

        #Learn model, save, load
        fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model_snps.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm3x, filename) 
        fastlmm3 = joblib.load(filename)


        #Predict with model (test on train)
        predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train
        output_file = self.file_name("snps")
        Dat.write(output_file,predicted_pheno)

        ### Plot training x and y, and training x with predicted y
        #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val,".",G0_train[:,0:1].read().val[:,0],predicted_pheno.val,".")
        #pylab.show()

        ### Plot y and predicted y (test on train)
        #pheno_actual = pheno_train3.val[:,0]
        #pylab.plot(pheno_actual,predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"snps")

    def test_kernel(self):
        logging.info("TestLmmTrain test_kernel")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        # Fit using a precomputed kernel built from the SNPs (Unit standardization)
        K0_train = self.snpreader_whole[train_idx,:].read_kernel(Unit())
        covariate_train3 = self.covariate_whole[train_idx,:].read()
        pheno_train3 = self.pheno_whole[train_idx,:].read()
        pheno_train3.val = self.snpreader_whole[train_idx,0:1].read().val*2
        assert np.array_equal(K0_train.iid,covariate_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
        assert np.array_equal(K0_train.iid,pheno_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

        #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".")
        #pylab.show()

        #Learn model, save, load
        fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train3, y=pheno_train3)
        filename = self.tempout_dir + "/model_snps.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm3x, filename) 
        fastlmm3 = joblib.load(filename)


        #Predict with model (test on train)
        predicted_pheno, covar = fastlmm3.predict(K0_whole_test=K0_train, X=covariate_train3,count_A1=False) #test on train
        output_file = self.file_name("kernel")
        Dat.write(output_file,predicted_pheno)

        #### Plot training x and y, and training x with predicted y
        #pylab.plot(self.snpreader_whole[train_idx,0:1].read().val[:,0], pheno_train3.val,".",self.snpreader_whole[train_idx,0:1].read().val[:,0],predicted_pheno.val,".")
        #pylab.show()

        #### Plot y and predicted y (test on train)
        #pheno_actual = pheno_train3.val[:,0]
        #pylab.plot(pheno_actual,predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"snps") #"kernel" and "snps" test cases should give the same results

    def test_kernel_one(self):
        logging.info("TestLmmTrain test_kernel_one")

        train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on
        test_idx  = np.r_[0:10] # the first 10 iids

        K0_train = SnpKernel(self.snpreader_whole[train_idx,:],standardizer=Unit())
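        # SnpKernel pairs the SNP reader with a standardizer; the kernel itself is computed
        # lazily, when it is actually read during fitting.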
        covariate_train = self.covariate_whole[train_idx,:]
        pheno_train = self.pheno_whole[train_idx,:]
        assert np.array_equal(K0_train.iid,covariate_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"
        assert np.array_equal(K0_train.iid,pheno_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)"

        fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
        filename = self.tempout_dir + "/model_kernel_one.flm.p"
        pstutil.create_directory_if_necessary(filename)
        joblib.dump(fastlmm1, filename) 
        fastlmm2 = joblib.load(filename)

                
        # predict on test set
        G0_test = self.snpreader_whole[test_idx,:]
        covariate_test = self.covariate_whole[test_idx,:]

        predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False)

        output_file = self.file_name("kernel_one")
        Dat.write(output_file,predicted_pheno)

        pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0]

        #pylab.plot(pheno_actual, predicted_pheno.val,".")
        #pylab.show()


        self.compare_files(predicted_pheno,"one") #Expect same results as SNPs "one"

    def compare_files(self,answer,ref_base):
        reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat")
        reference=Dat(reffile).read()
        assert np.array_equal(answer.col,reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(answer.row,reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in xrange(reference.row_count):
            for sid_index in xrange(reference.col_count):
                a_v = answer.val[iid_index,sid_index]
                r_v = reference.val[iid_index,sid_index]
                assert abs(a_v - r_v) < 1e-4 or abs(a_v - r_v)/abs(r_v) < 1e-5, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile) # absolute or relative tolerance

    def test_doctest(self):
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..")
        result = doctest.testfile("../fastlmm_predictor.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__