def divideData(filename,direct,num=5,mph=3,delet=True):
    """Estimate variance components separately on `num` random slices of the data.

    The individuals are shuffled with `perm`, cut into `num` contiguous
    slices, and each slice is written to temporary files under `direct`
    (``tempFile_<i>`` via `Bed.write` and ``tempFile_<i>.phen`` via
    `Pheno.write`) before being handed to `varRes`.

    Args:
        filename: dataset prefix passed to `getData`.
        direct: directory that receives the temporary files; also passed
            to `varRes`.
        num: number of slices.
        mph: forwarded to `getData`.
        delet: when true, remove the temporary files of a slice once its
            estimate has been computed.

    Returns:
        list with one `varRes` result per slice.
    """
    import glob  # local import: only needed for temp-file cleanup

    print("Estimating heritability using "+str(num)+" components")
    [yFil,sFil]=getData(filename,mph=mph)
    n=sFil.iid_count
    # Shuffle phenotypes and genotypes with the same permutation so rows stay paired.
    reOrd=perm(n)
    yFil=yFil[reOrd,:]
    sFil=sFil[reOrd,:]
    # div[i]:div[i+1] are the row bounds of slice i.
    div=[int(math.ceil(i*n/float(num))) for i in range(0,num+1)]
    varEsts=[]
    for i in range(0,num):
        print("For component "+str(i))
        sFilTemp=sFil[div[i]:div[i+1],:]
        yFilTemp=yFil[div[i]:div[i+1],:]
        fileTemp=direct+"/tempFile_"+str(i)
        Bed.write(fileTemp,sFilTemp.read())
        Pheno.write(fileTemp+".phen",yFilTemp.read())
        varEsts.append(varRes(fileTemp,direct))
        if delet:
            # Delete through the filesystem API rather than a shell command
            # string, so paths containing spaces or shell metacharacters
            # cannot break (or hijack) the cleanup.
            for tempPath in glob.glob(fileTemp+"*"):
                try:
                    os.remove(tempPath)
                except OSError as err:
                    # Cleanup stays best-effort, as with the ignored `rm` status.
                    print("Could not remove "+tempPath+": "+str(err))
    return varEsts
def __init__(self,filename,snpfile="",params="",n0=-1,n1=-1):
    """Load genotypes, phenotypes and the SNP list for a dataset prefix.

    Args:
        filename: dataset prefix; genotypes come from `Bed(filename)` and
            phenotypes from `Pheno(filename + ".fam")`.
        snpfile: optional path to a text file with one SNP id per line.
            When empty, every SNP id of the BED reader is used.
        params: stored on `self.params` while `setUp` runs.
        n0: when positive, keep only the first `n0` individuals whose
            shifted phenotype equals 0.0 ...
        n1: ... followed by the first `n1` individuals whose shifted
            phenotype equals 1.0.

    Exits the process (status 1) when the SNP list cannot be loaded.
    """
    self.BED=Bed(filename)
    self.pheno=Pheno(filename+".fam")
    # Value column 3 of the .fam-derived data, shifted down by one
    # (presumably PLINK 1/2 coding mapped to 0/1 -- TODO confirm).
    self.y=self.pheno.read().val[:,3]
    self.y=self.y-1.0
    self.params=params
    n=len(self.y)
    if n0>0:
        print("Initiate with n0")
        I0=[i for i in range(0,n) if self.y[i]==0.0]
        I0=I0[:n0]
        I1=[i for i in range(0,n) if self.y[i]==1.0]
        # NOTE(review): with the default n1=-1 this slice drops the last
        # matching individual; callers are presumably expected to pass n1.
        I1=I1[:n1]
        I0.extend(I1)
        self.y=self.y[I0]
        self.BED=self.BED[I0,:]
    try:
        if len(snpfile)>0:
            # `with` closes the file even if reading fails part-way.
            with open(snpfile) as fil:
                self.snps=[l.strip() for l in fil]
        else:
            self.snps=self.BED.sid
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit must not be reported
        # as a SNP-loading problem.
        print("Error loading SNPs!")
        sys.exit(1)
    self.setUp()
    self.n=len(self.y)
    print("Number of individuals: "+str(self.n))
    self.Cov=[]
    # NOTE(review): this discards the `params` argument stored above;
    # kept as in the original -- confirm it is intentional.
    self.params=""
def divideData(self,filename,num=5,mph=3,delet=True):
    """Estimate variance components separately on `num` random slices of the data.

    Args:
        filename: dataset prefix; genotypes are read with `Bed(filename)`
            and phenotypes with `Pheno(filename + ".fam")`.
        num: number of slices.
        mph: index of the phenotype column within the values read from
            the .fam file.
        delet: unused; kept for signature compatibility.

    Returns:
        list with one `self.VarCalc.RealVar(y_slice, X_slice)` result per
        slice.
    """
    print("Estimating heritability using "+str(num)+" components")
    sFil=Bed(filename)
    yFil=Pheno(filename+".fam")
    n=sFil.iid_count
    # One permutation applied to both readers keeps rows paired.
    reOrd=perm(n)
    yFil=yFil[reOrd,:]
    sFil=sFil[reOrd,:]
    y=yFil.read().val[:,mph]
    # div[i]:div[i+1] are the row bounds of slice i.
    div=[int(math.ceil(i*n/float(num))) for i in range(0,num+1)]
    varEsts=[]
    for i in range(0,num):
        print("For component "+str(i))
        # Slice the *permuted* genotype reader. The previous code sliced
        # the un-permuted self.BED, which paired each phenotype with the
        # genotypes of a different individual.
        sFilTemp=sFil[div[i]:div[i+1],:]
        Xtemp=sFilTemp.read().standardize().val
        ytemp=y[div[i]:div[i+1]]
        varEsts.append(self.VarCalc.RealVar(ytemp,Xtemp))
    return varEsts
def divideData(self, filename, num=5, mph=3, delet=True):
    """Estimate variance components separately on `num` random slices of the data.

    Args:
        filename: dataset prefix; genotypes are read with `Bed(filename)`
            and phenotypes with `Pheno(filename + ".fam")`.
        num: number of slices.
        mph: index of the phenotype column within the values read from
            the .fam file.
        delet: unused; kept for signature compatibility.

    Returns:
        list with one `self.VarCalc.RealVar(y_slice, X_slice)` result per
        slice.
    """
    print("Estimating heritability using " + str(num) + " components")
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")
    n = sFil.iid_count
    # One permutation applied to both readers keeps rows paired.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]
    y = yFil.read().val[:, mph]
    # div[i]:div[i + 1] are the row bounds of slice i.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]
    varEsts = []
    for i in range(0, num):
        print("For component " + str(i))
        # Slice the *permuted* genotype reader. The previous code sliced
        # the un-permuted self.BED, which paired each phenotype with the
        # genotypes of a different individual.
        sFilTemp = sFil[div[i]:div[i + 1], :]
        Xtemp = sFilTemp.read().standardize().val
        ytemp = y[div[i]:div[i + 1]]
        varEsts.append(self.VarCalc.RealVar(ytemp, Xtemp))
    return varEsts
def divideData(filename, direct, num=5, mph=3, delet=True):
    """Estimate variance components separately on `num` random slices of the data.

    The individuals are shuffled with `perm`, cut into `num` contiguous
    slices, and each slice is written to temporary files under `direct`
    (``tempFile_<i>`` via `Bed.write` and ``tempFile_<i>.phen`` via
    `Pheno.write`) before being handed to `varRes`.

    Args:
        filename: dataset prefix passed to `getData`.
        direct: directory that receives the temporary files; also passed
            to `varRes`.
        num: number of slices.
        mph: forwarded to `getData`.
        delet: when true, remove the temporary files of a slice once its
            estimate has been computed.

    Returns:
        list with one `varRes` result per slice.
    """
    import glob  # local import: only needed for temp-file cleanup

    print("Estimating heritability using " + str(num) + " components")
    [yFil, sFil] = getData(filename, mph=mph)
    n = sFil.iid_count
    # Shuffle phenotypes and genotypes with the same permutation so rows stay paired.
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]
    # div[i]:div[i + 1] are the row bounds of slice i.
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]
    varEsts = []
    for i in range(0, num):
        print("For component " + str(i))
        sFilTemp = sFil[div[i]:div[i + 1], :]
        yFilTemp = yFil[div[i]:div[i + 1], :]
        fileTemp = direct + "/tempFile_" + str(i)
        Bed.write(fileTemp, sFilTemp.read())
        Pheno.write(fileTemp + ".phen", yFilTemp.read())
        varEsts.append(varRes(fileTemp, direct))
        if delet:
            # Delete through the filesystem API rather than a shell command
            # string, so paths containing spaces or shell metacharacters
            # cannot break (or hijack) the cleanup.
            for tempPath in glob.glob(fileTemp + "*"):
                try:
                    os.remove(tempPath)
                except OSError as err:
                    # Cleanup stays best-effort, as with the ignored `rm` status.
                    print("Could not remove " + tempPath + ": " + str(err))
    return varEsts
def loadData(filename):
    """Return ``[y, reader]`` for the dataset prefix `filename`.

    `reader` is the `Bed` reader for the prefix; `y` is a list holding
    value column 3 of ``filename + ".fam"`` with 1 subtracted from every
    entry.
    """
    pheno_column = 3
    genotype_reader = Bed(filename)
    fam_values = Pheno(filename + ".fam").read().val
    shifted = []
    for value in fam_values[:, pheno_column]:
        shifted.append(value - 1)
    return [shifted, genotype_reader]
def getData(filename):
    """Return ``[y, reader]`` for the dataset prefix `filename`.

    `y` is value column 3 of ``filename + ".fam"`` with 1 subtracted from
    each entry (as a list); `reader` is the un-read `Bed` reader.
    """
    column = 3
    bed_reader = Bed(filename)
    fam_reader = Pheno(filename + ".fam")
    raw = fam_reader.read().val[:, column]
    return [[entry - 1 for entry in raw], bed_reader]
def read_phen(self,fn_phen = None):
    """
    read phenotype file

    Stores the phenotype values on ``self.Y`` and the second iid column
    on ``self.SID``.
    """
    loaded = Pheno(fn_phen).read()
    self.Y = loaded.val
    self.SID = loaded.iid[:,1]
def setUpClass(self):
    """Create the output directory and open the shared synthetic readers."""
    from fastlmm.util.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)

    # Three levels above the directory holding this file.
    this_dir = os.path.dirname(os.path.realpath(__file__))
    root = os.path.abspath(os.path.join(this_dir,"..","..",".."))
    self.pythonpath = root

    self.snpreader_whole = Bed(root + "/tests/datasets/synth/all")
    self.covariate_whole = Pheno(root + "/tests/datasets/synth/cov.txt")
    self.pheno_whole = Pheno(root + "/tests/datasets/synth/pheno_10_causals.txt")
def getData(filename):
    """Return ``[y, reader]`` for the dataset prefix `filename`.

    `y` is value column 3 of ``filename + ".fam"`` (unshifted); `reader`
    is the un-read `Bed` reader for the prefix.

    The earlier version also read and standardized the complete genotype
    matrix into a local that was never used; that load has been removed
    because callers only receive the lazy reader.
    """
    mph=3
    sFil=Bed(filename)
    yFil=Pheno(filename+".fam")
    y=yFil.read().val[:,mph]
    return [y,sFil]
def read_phen(self, fn_phen=None):
    """
    read phenotype file

    Stores the phenotype values on ``self.Y`` and the second iid column
    on ``self.SID``.
    """
    loaded = Pheno(fn_phen).read()
    self.Y = loaded.val
    self.SID = loaded.iid[:, 1]
def getData(filename):
    """Return ``[y, reader]`` for the dataset prefix `filename`.

    `reader` is ``Bed(filename, count_A1=False)``; `y` is value column 3
    of ``filename + ".fam"`` with 1 subtracted from each entry.
    """
    status_column = 3
    bed_reader = Bed(filename, count_A1=False)  # Bed object
    fam_data = Pheno(filename + ".fam").read()
    # the last column of .fam file is the disease states of data owners
    states = fam_data.val[:, status_column]
    y = [state - 1 for state in states]
    return [y, bed_reader]
def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
    """Run `single_snp_select` with automatically chosen PCs as covariates
    and compare the result with the stored reference."""
    do_plot = False
    use_cache = False

    # define file names
    synth_root = self.pythonpath
    bed_fn = synth_root + "/tests/datasets/synth/all.bed"
    phen_fn = synth_root + "/tests/datasets/synth/pheno_10_causals.txt"
    pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")

    if use_cache and os.path.exists(pcs_fn):
        logging.info("Using top pcs's cache")
        covar = Pheno(pcs_fn)
    else:
        from fastlmm.util import compute_auto_pcs
        covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
        pc_count = covar["vals"].shape[1]
        logging.info("selected number of PCs: {0}".format(pc_count))
        pcs = SnpData(iid=covar['iid'], sid=covar['header'], val=covar['vals'])
        Pheno.write(pcs_fn, pcs)

    mf_name = "lmp"  #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
    runner = mf_to_runner_function(mf_name)(20)

    logging.info(
        "Working on h2={0},force_low_rank={1},force_full_rank={2}".format(
            h2, force_low_rank, force_full_rank))

    if h2 == .5:
        suffix = "h2IsHalf"
    else:
        suffix = "h2Search"
    result_file_name = "sel_plus_pc_{0}".format(suffix)
    output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"

    k_list = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        20, 30, 40, 50, 60, 70, 80, 90, 100,
        125, 160, 200, 250, 320, 400, 500, 630, 800, 1000,
    ]
    folds_fn = synth_root + "/tests/datasets/synth/DebugEmitFolds.txt"
    results = single_snp_select(
        test_snps=bed_fn,
        G=bed_fn,
        pheno=phen_fn,
        k_list=k_list,
        h2=h2,
        n_folds=folds_fn,
        covar=covar,
        output_file_name=output_file_name,
        force_low_rank=force_low_rank,
        force_full_rank=force_full_rank,
        GB_goal=2,
        count_A1=False
        #runner = runner
    )

    logging.info(results.head())
    self.compare_files(results, result_file_name)
def getData(filename="", mph=3, UseCov=False):
    """Return ``[phenotype_reader, genotype_reader]`` for a dataset prefix.

    Phenotypes default to ``filename + ".fam"`` and are replaced by
    ``filename + ".phen"`` when that file exists. Whenever an extra file
    is used (``.cov`` with `UseCov`, or ``.phen``) the readers are passed
    through `intersect_apply`. `mph` is accepted but not used here.
    """
    geno_reader = Bed(filename)
    pheno_reader = Pheno(filename + ".fam")

    cov_path = filename + ".cov"
    if isfile(cov_path) and UseCov:
        cov_reader = Pheno(cov_path)
        geno_reader, pheno_reader, cov_reader = intersect_apply(
            [geno_reader, pheno_reader, cov_reader])

    phen_path = filename + ".phen"
    if isfile(phen_path):
        pheno_reader = Pheno(phen_path)
        geno_reader, pheno_reader = intersect_apply([geno_reader, pheno_reader])

    return [pheno_reader, geno_reader]
def read_phenotype(phenofile, missing_char = 'NA', phen_index = 1):
    """Read a phenotype file and remove missing values.

    Args:
        phenofile : :class:`str`
            path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ...
        missing_char : :class:`str`
            The character that denotes a missing phenotype value; 'NA' by default.
        phen_index : :class:`int`
            The index of the phenotype (counting from 1) if multiple phenotype columns present in phenofile

    Returns:
        a ``gtarray`` wrapping the (n, 1) array of non-missing phenotype
        values from the selected column, with ``ids`` set to the
        corresponding individual IDs (IID)
    """
    pheno = Pheno(phenofile, missing=missing_char)[:,phen_index-1].read()
    y = np.array(pheno.val)
    # ndarray.reshape returns a new array instead of modifying in place;
    # the result has to be assigned for the column shape to be enforced.
    y = y.reshape((y.shape[0],1))
    pheno_ids = np.array(pheno.iid)[:,1]
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y[:,0]))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan,:]
        pheno_ids = pheno_ids[y_not_nan]
    print('Number of non-missing phenotype observations: ' + str(y.shape[0]))
    return gtarray(y,ids=pheno_ids)
def estVar(self, num, epsilon):
    """Return ``[sg2, se2]`` variance estimates with Laplace noise added.

    The data are split into `num` parts by `divideData`. When `epsilon`
    is negative the first part's estimate is returned unchanged.
    Otherwise `epsilon` is split 10% / 45% / 45% between `estVarY` and
    the noise added to the averaged second and first components, and
    both noisy averages are clamped using the estimated phenotype
    variance.
    """
    filename = self.BED.filename
    y = Pheno(filename + ".fam").read().val[:, 3]
    part_estimates = self.divideData(filename, num=num)
    if epsilon < 0:
        return part_estimates[0]

    parts = float(num)
    eps_vary = .1 * epsilon
    eps_se = .45 * epsilon
    eps_sg = .45 * epsilon
    vary = self.estVarY(y, eps_vary)

    # Second component of each estimate, averaged, plus noise; kept in [0, vary].
    se2 = sum(est[1] for est in part_estimates) / parts + Lap(
        0.0, vary / (eps_se * parts))
    se2 = 0 if se2 < 0 else se2
    se2 = vary if se2 > vary else se2

    # First component; a negative value is replaced by 1% of vary.
    sg2 = sum(est[0] for est in part_estimates) / parts + Lap(
        0.0, vary / (eps_sg * parts))
    sg2 = .01 * vary if sg2 < 0 else sg2
    sg2 = vary if sg2 > vary else sg2

    return [sg2, se2]
def read_covariates(covar_file, ids_to_match, missing):
    """Read a covariate file and reorder it to match `ids_to_match`.

    Args:
        covar_file: path handed to `Pheno`.
        ids_to_match: 2-D array of ids; each row is converted to a tuple
            and looked up among the covariate ids.
        missing: missing-value marker handed to `Pheno`.

    Returns:
        ``[X, X_names, pheno_in]`` where `X` is the covariate matrix with
        a leading column of ones, restricted to and ordered like the rows
        of `ids_to_match` that have complete covariates; `X_names` holds
        ``'Intercept'`` followed by the covariate names; `pheno_in` is a
        boolean array marking which rows of `ids_to_match` were matched.
    """
    # Read covariate file
    covar_f = Pheno(covar_file, missing=missing).read()
    ids = covar_f.iid
    # Get covariate values
    n_X = covar_f._col.shape[0] + 1
    X = np.ones((covar_f.val.shape[0], n_X))
    X[:, 1:n_X] = covar_f.val
    # Get covariate names. The buffer uses the same width ('S20') as the
    # names copied into it; it used to be 'S10', which silently truncated
    # any covariate name longer than 10 characters.
    X_names = np.zeros((n_X), dtype='S20')
    X_names[0] = 'Intercept'
    X_names[1:n_X] = np.array(covar_f._col, dtype='S20')
    # Remove NAs
    NA_rows = np.isnan(X).any(axis=1)
    n_NA_row = np.sum(NA_rows)
    if n_NA_row > 0:
        print(
            'Number of rows removed from covariate file due to missing observations: '
            + str(n_NA_row))
        X = X[~NA_rows]
        ids = ids[~NA_rows]
    id_dict = id_dict_make(ids)
    # Match with pheno_ids
    ids_to_match_tuples = [tuple(x) for x in ids_to_match]
    # set(id_dict) is the key set; unlike dict.viewkeys() it exists on
    # both Python 2 and Python 3.
    common_ids = set(id_dict) & set(ids_to_match_tuples)
    pheno_in = np.array([(tuple(x) in common_ids) for x in ids_to_match])
    match_ids = ids_to_match[pheno_in, :]
    X_id_match = np.array([id_dict[tuple(x)] for x in match_ids])
    X = X[X_id_match, :]
    return [X, X_names, pheno_in]
def test_intersection(self):
    # Checks the types produced when a SnpKernel is intersected with a
    # phenotype reader that is missing one iid, and when it is sliced.
    from pysnptools.standardizer import Unit
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.snpreader import Pheno
    from pysnptools.kernelreader._subset import _KernelSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
    snps_all = Bed(bed_path, count_A1=False)
    kernel = SnpKernel(snps_all, stdizer.Identity())

    # To test intersection we remove a iid from pheno
    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:, :]

    # SnpKernel is special because it standardizes AFTER intersecting.
    kernel_common, pheno = intersect_apply([kernel, pheno])
    inner_is_subset = isinstance(kernel_common.snpreader, _SnpSubset)
    assert inner_is_subset and not isinstance(kernel_common, _KernelSubset)

    # What happens with fancy selection?
    every_other = kernel[::2]
    assert isinstance(every_other, SnpKernel)

    logging.info("Done with test_intersection")
def test_single_snp(args):
    """Run FastLMM `single_snp` for trait1..trait3 on the training samples.

    For each trait a phenotype file is written under ``args.output_dir``,
    the association results are saved to ``single_snp.<trait>`` with
    `to_hdf`, and, when ``args.manhattan`` is set, a Manhattan plot is
    saved as a PDF.
    """
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')  # select the backend before pyplot is imported
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # The sample id is used for both id columns.
    id_column = phenotypes['id'].values.astype('S')[:, np.newaxis]
    iid = np.repeat(id_column, 2, axis=1)

    if args.sample_indices_file is None:
        training_mask = (phenotypes['type'] == 'training').values
        sample_indices = np.nonzero(training_mask)[0]
    else:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    df_pheno = phenotypes[phenotypes['type'] == 'training'].copy()
    for id_name in ('fid', 'iid'):
        df_pheno[id_name] = df_pheno['id']

    for trait in ('trait1', 'trait2', 'trait3'):
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        trait_table = df_pheno[['fid', 'iid', trait]]
        trait_table.to_csv(pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)

        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps, pheno, K0=K0, count_A1=True,
                                GB_goal=args.GB_goal)

        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_file)
        results_df.to_hdf(result_file, trait)

        if not args.manhattan:
            continue
        plot_file = os.path.join(args.output_dir, 'manhattan.%s.pdf' % trait)
        logger.info('create Manhattan plot: ' + plot_file)
        plt.clf()
        plot_columns = results_df.as_matrix(["Chr", "ChrPos", "PValue"])
        flutil.manhattan_plot(plot_columns, pvalue_line=1e-5,
                              xaxis_unit_bp=False)
        plt.savefig(plot_file)
def test_c_reader_pheno(self):
    # Round-trips the toy phenotype data through Pheno.write / Pheno and
    # checks the other supported ways of constructing a Pheno reader.
    toy_phe = self.currentFolder + "/examples/toydata.phe"

    expected = Pheno(toy_phe).read()
    self.assertEqual(np.float64, expected.val.dtype)

    expected.val[1,0] = np.NaN # Inject a missing value to test writing and reading missing values
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, expected)
    written_reader = Pheno(output)
    _fortesting_JustCheckExists().input(written_reader)
    s = str(written_reader)
    reread = written_reader.read()
    np.testing.assert_array_almost_equal(expected.val, reread.val, decimal=10)

    # Fresh copy without the injected missing value.
    expected = Pheno(toy_phe).read()

    import pysnptools.util.pheno as pstpheno
    pheno_dict = pstpheno.loadOnePhen(toy_phe,missing="")
    from_dict = Pheno(pheno_dict).read()
    np.testing.assert_array_almost_equal(expected.val, from_dict.val, decimal=10)

    pheno_dict = pstpheno.loadOnePhen(toy_phe,missing="",vectorize=True)
    assert len(pheno_dict['vals'].shape)==1, "test 1-d array of values"
    from_dict = Pheno(pheno_dict).read()
    np.testing.assert_array_almost_equal(expected.val, from_dict.val, decimal=10)

    ids_only = Pheno(None,iid_if_none=expected.iid)
    assert (ids_only.row == expected.row).all() and ids_only.col_count == 0

    for variant in ("/examples/toydata.id.phe", "/examples/toydata.fid.phe"):
        variant_data = Pheno(self.currentFolder + variant).read()
        np.testing.assert_array_almost_equal(expected.val, variant_data.val, decimal=10)
def read_covariates(covar, pheno_ids=None, missing_char = 'NA'):
    """Read a covariate file into a ``gtarray`` keyed by individual ID.

    Args:
        covar: path handed to `Pheno`.
        pheno_ids: optional ids that must all be present among the
            covariate ids.
        missing_char: missing-value marker handed to `Pheno`.

    Returns:
        the ``gtarray`` of covariates after ``fill_NAs()`` has been called.

    Raises:
        ValueError: if any entry of `pheno_ids` has no covariate row.
    """
    covar_data = Pheno(covar, missing=missing_char).read()
    values = np.array(covar_data.val)
    individual_ids = np.array(covar_data.iid)[:,1]
    X = gtarray(values, ids=individual_ids)

    if pheno_ids is not None:
        in_covar = np.array([pid in X.id_dict for pid in pheno_ids])
        n_missing = np.sum((~in_covar))
        if n_missing>0:
            raise ValueError('Missing covariate values for some phenotyped individuals')

    X.fill_NAs()
    return X
def _pheno_fixup(pheno_input, iid_if_none=None, missing='-9'):
    """Turn `pheno_input` into a reader, preferring `Pheno`.

    `Pheno` is tried first; if constructing it or loading its iids fails,
    the input is handed to `_snps_fixup` instead.

    Args:
        pheno_input: whatever the caller supplied as the phenotype.
        iid_if_none: forwarded to `Pheno` and to `_snps_fixup`.
        missing: missing-value marker forwarded to `Pheno`.

    Returns:
        the `Pheno` reader, or the result of `_snps_fixup`.
    """
    try:
        ret = Pheno(pheno_input, iid_if_none, missing=missing)
        ret.iid #doing this just to force file load
        return ret
    except Exception:
        # `except Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit propagate instead of triggering the fallback.
        # (The unreachable trailing `return pheno_input` was dropped.)
        return _snps_fixup(pheno_input, iid_if_none=iid_if_none)
def getData(filename):
    """Return ``[r, s, snpList]`` for the dataset prefix `filename`.

    Individuals are split by value column 3 of ``filename + ".fam"``
    minus 1: entries below 1 form one group and entries above 0 the
    other. `r` and `s` are `getMarginals` of the genotype values of the
    first and second group respectively; `snpList` is the reader's SNP
    id list.
    """
    column = 3
    reader = Bed(filename)
    fam = Pheno(filename + ".fam")
    snpList = reader.sid

    status = [value - 1 for value in fam.read().val[:, column]]
    case_rows = [row for row, state in enumerate(status) if state > 0]
    control_rows = [row for row, state in enumerate(status) if state < 1]

    case_reader = reader[case_rows, :]
    control_reader = reader[control_rows, :]
    control_values = control_reader.read().val
    case_values = case_reader.read().val

    r = getMarginals(control_values)
    s = getMarginals(case_values)
    return [r, s, snpList]
def _sel_plus_pc(self,h2,force_low_rank,force_full_rank,count_A1=None):
    """Run `single_snp_select` with automatically chosen PCs as covariates
    and compare the result with the stored reference."""
    do_plot = False
    use_cache = False

    # define file names
    synth_root = self.pythonpath
    bed_fn = synth_root + "/tests/datasets/synth/all.bed"
    phen_fn = synth_root + "/tests/datasets/synth/pheno_10_causals.txt"
    pcs_fn = os.path.join(self.tempout_dir,"sel_plus_pc.pcs.txt")

    if use_cache and os.path.exists(pcs_fn):
        logging.info("Using top pcs's cache")
        covar=Pheno(pcs_fn)
    else:
        from fastlmm.util import compute_auto_pcs
        covar = compute_auto_pcs(bed_fn,count_A1=count_A1)
        pc_count = covar["vals"].shape[1]
        logging.info("selected number of PCs: {0}".format(pc_count))
        pcs = SnpData(iid=covar['iid'],sid=covar['header'],val=covar['vals'])
        Pheno.write(pcs_fn,pcs)

    mf_name = "lmp" #"lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
    runner = mf_to_runner_function(mf_name)(20)

    logging.info("Working on h2={0},force_low_rank={1},force_full_rank={2}".format(h2,force_low_rank,force_full_rank))

    if h2 == .5:
        suffix = "h2IsHalf"
    else:
        suffix = "h2Search"
    result_file_name = "sel_plus_pc_{0}".format(suffix)
    output_file_name = os.path.join(self.tempout_dir,result_file_name)+".txt"

    k_list = [0,1,2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,125,160,200,250,320,400,500,630,800,1000]
    folds_fn = synth_root + "/tests/datasets/synth/DebugEmitFolds.txt"
    results = single_snp_select(test_snps=bed_fn, G=bed_fn, pheno=phen_fn,
                                k_list=k_list,
                                h2=h2,
                                n_folds=folds_fn,
                                covar=covar,
                                output_file_name=output_file_name,
                                force_low_rank=force_low_rank,force_full_rank=force_full_rank,
                                GB_goal=2,
                                count_A1=False
                                #runner = runner
                                )

    logging.info(results.head())
    self.compare_files(results,result_file_name)
def test_covar_by_chrom_mixing(self):
    # Passes both a global covariate (with its column renamed to
    # "pheno-1") and per-chromosome covariates to single_snp.
    logging.info("TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
    test_snps = Bed(self.bedbase)
    pheno = self.phen_fn

    covar_on_disk = Pheno(self.cov_fn).read()
    covar = SnpData(iid=covar_on_disk.iid,
                    sid=["pheno-1"],
                    val=covar_on_disk.val)

    covar_by_chrom = dict((chrom, self.cov_fn) for chrom in xrange(1, 6))

    case_name = "covar_by_chrom_mixing"
    output_file = self.file_name(case_name)
    frame = single_snp(test_snps,
                       pheno,
                       covar=covar,
                       covar_by_chrom=covar_by_chrom,
                       output_file_name=output_file)

    self.compare_files(frame, case_name)
def test_intersection_Dist2Snp(self):
    # Checks the types produced when a distribution-backed SNP reader is
    # intersected with a phenotype reader missing one iid, and when sliced.
    from pysnptools.snpreader._dist2snp import _Dist2Snp
    from pysnptools.snpreader import Pheno
    from pysnptools.distreader._subset import _DistSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    dist_path = self.currentFolder + "/../examples/toydata.dist.npz"
    snps_from_dist = DistNpz(dist_path).as_snp(max_weight=25)

    # To test intersection we remove a iid from pheno
    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:,:]

    common, pheno = intersect_apply([snps_from_dist,pheno])
    inner_is_subset = isinstance(common.distreader,_DistSubset)
    assert inner_is_subset and not isinstance(common,_SnpSubset)

    #What happens with fancy selection?
    every_other = snps_from_dist[::2,:]
    assert isinstance(every_other,_Dist2Snp)

    logging.info("Done with test_intersection")
def test_intersection_Snp2Dist(self):
    # Checks the types produced when a SNP-backed distribution reader is
    # intersected with a phenotype reader missing one iid, and when sliced.
    from pysnptools.distreader._snp2dist import _Snp2Dist
    from pysnptools.snpreader import Pheno, Bed
    from pysnptools.distreader._subset import _DistSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    bed_path = self.currentFolder + "/../examples/toydata.5chrom.bed"
    dist_from_snps = Bed(bed_path,count_A1=True).as_dist(max_weight=2)

    # To test intersection we remove a iid from pheno
    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")[1:,:]

    common, pheno = intersect_apply([dist_from_snps,pheno])
    inner_is_subset = isinstance(common.snpreader,_SnpSubset)
    assert inner_is_subset and not isinstance(common,_DistSubset)

    #What happens with fancy selection?
    every_other = dist_from_snps[::2,:]
    assert isinstance(every_other,_Snp2Dist)

    logging.info("Done with test_intersection")
def test_multipheno(self):
    # Compares single_snp_scale on several random phenotypes at once with
    # single_snp run on each phenotype column separately.
    logging.info("test_multipheno")

    random_state = RandomState(29921)
    pheno_reference = Pheno(self.phen_fn).read()
    for pheno_count in [2, 5, 1]:
        val = random_state.normal(loc=pheno_count,
                                  scale=pheno_count,
                                  size=(pheno_reference.iid_count,
                                        pheno_count))
        pheno_col = ['pheno{0}'.format(i) for i in range(pheno_count)]
        pheno_multi = SnpData(iid=pheno_reference.iid,
                              sid=pheno_col,
                              val=val)

        reference = pd.concat([
            single_snp(test_snps=self.bed,
                       pheno=pheno_multi[:, pheno_index],
                       covar=self.cov_fn)
            for pheno_index in range(pheno_count)
        ])
        frame = single_snp_scale(test_snps=self.bed,
                                 pheno=pheno_multi,
                                 covar=self.cov_fn)

        # The old message formatted an undefined name (`reffile`), so a
        # length mismatch surfaced as NameError instead of this assertion.
        assert len(frame) == len(reference), \
            "# of pairs differs from reference: {0} vs {1}".format(
                len(frame), len(reference))

        for sid in sorted(set(reference.SNP)):  #This ignores which pheno produces which pvalue
            pvalue_frame = np.array(
                sorted(frame[frame['SNP'] == sid].PValue))
            pvalue_reference = np.array(
                sorted(reference[reference['SNP'] == sid].PValue))
            # `.all()` must be called: the bare bound method `.all` is
            # always truthy, so this check could never fail before.
            assert (
                abs(pvalue_frame - pvalue_reference) < 1e-5
            ).all(), "pair {0} differs too much from reference".format(sid)
n_V = 1 V_names = np.array(['Intercept']) n_pars = n_X + n_V + 1 print(str(n_pars) + ' parameters in model') # Get sample size n = y.shape[0] if n == 0: raise (ValueError('No non-missing observations with both phenotype and genotype data')) print(str(n) + ' individuals with no missing phenotype or covariate observations') n = float(n) #### Read random effect genotypes #### if args.random_gts is not None: if args.random_gts_txt: random_gts_f = Pheno(args.random_gts) else: random_gts_f = Bed(args.random_gts) random_gts_ids = np.array(random_gts_f.iid) random_gts_f = random_gts_f.read() # Match to phenotypes pheno_id_dict = id_dict_make(pheno_ids) G_random = random_gts_f.val G = np.empty((y.shape[0], G_random.shape[1])) G[:] = np.nan for i in xrange(0, random_gts_ids.shape[0]): if tuple(random_gts_ids[i, :]) in pheno_id_dict: G[pheno_id_dict[tuple(random_gts_ids[i, :])], :] = G_random[i, :] del G_random # Check for NAs random_isnan = np.isnan(G)
default='NA') parser.add_argument('--no_h2_estimate', action='store_true', default=False, help='Suppress output of h2 estimate') args = parser.parse_args() ##### Check minimal model is specified ##### if args.mean_covar is None and args.var_covar is None and args.random_gts is None: raise (ValueError( 'Must specify at least one of: mean_covar, var_covar, random_gts')) ####################### Read in data ######################### #### Read phenotype ### pheno = Pheno(args.phenofile, missing=args.missing_char).read() y = np.array(pheno.val) pheno_ids = np.array(pheno.iid) if y.ndim == 1: pass elif y.ndim == 2: y = y[:, args.phen_index - 1] else: raise (ValueError('Incorrect dimensions of phenotype array')) # Remove y NAs y_not_nan = np.logical_not(np.isnan(y)) if np.sum(y_not_nan) < y.shape[0]: y = y[y_not_nan] pheno_ids = pheno_ids[y_not_nan, :] # Make id dictionary print('Number of non-missing y observations: ' + str(y.shape[0]))
def run_fastlmm(args):
    """Train and evaluate a FastLMM model for trait1..trait3.

    A random subset of SNPs is read as features, a model is fitted on the
    training samples for each trait and used to predict the test samples.
    Per trait, the predictions are written to ``predictions.<trait>``
    (HDF5) and the pickled model to ``model.fastlmm.<trait>`` under
    ``args.output_dir``.

    Args:
        args: parsed command-line namespace; reads phenotype_file,
            cvindex_file, snp_file, n_snps, transpose_x, k0_file,
            transpose_k0, seed, output_dir, GB_goal and penalty.
    """
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    # Seed NumPy before the first random draw. The earlier code called the
    # non-existent `np.seed` (AttributeError whenever a seed was supplied)
    # and did so only after the SNP subset had already been sampled, so
    # the seed could not have made that sampling reproducible anyway.
    # NOTE(review): the truthiness test is kept from the original, so a
    # seed of 0 is treated as "no seed".
    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    # The sample id is used for both id columns.
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    # NOTE(review): np.random.choice samples with replacement by default,
    # so sel_snps may contain duplicates -- confirm this is intended.
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid,
                            args.snp_file,
                            transpose=args.transpose_x,
                            snp_indices=sel_snps,
                            std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' %
                test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(pheno_file,
                                                                index=False,
                                                                sep='\t',
                                                                header=False)
        pheno = Pheno(pheno_file)

        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :],
                  y=pheno,
                  K0_train=K0,
                  penalty=args.penalty,
                  Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)

        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values

        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)

        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
# Via NumPy-style indexing, these allow reading by name and genetic property #Topic: Other SnpReaders and how to write #Read from the PLINK phenotype file (text) instead of a Bed file # Looks like: #cid0P0 cid0P0 0.4853395139922632 #cid1P0 cid1P0 -0.2076984565752155 #cid2P0 cid2P0 1.4909084058931985 #cid3P0 cid3P0 -1.2128996652683697 #cid4P0 cid4P0 0.4293203431508744 #... from pysnptools.snpreader import Pheno phenoreader = Pheno("pheno_10_causals.txt") print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos #Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan nan nan]] phenodata = phenoreader.read() print phenodata.val #[[ 4.85339514e-01] # [ -2.07698457e-01] # [ 1.49090841e+00] # [ -1.21289967e+00] # ... # Write 1st 10 iids and sids of Bed data into Pheno format snpdata1010 = Bed("all.bed")[:10, :10].read() Pheno.write("deleteme1010.txt", snpdata1010) #Write it to Bed format
class TestHeritabilitySpatialCorrection(unittest.TestCase):
    # Regression tests that lock in the output of
    # heritability_spatial_correction on the synthetic data set.

    tempout_dir = "tempout/heritability_spatial_correction"

    @classmethod
    def setUpClass(self):
        from pysnptools.util import create_directory_if_necessary
        create_directory_if_necessary(self.tempout_dir, isfile=False)
        # Three levels above the directory holding this file.
        this_dir = os.path.dirname(os.path.realpath(__file__))
        self.pythonpath = os.path.abspath(
            os.path.join(this_dir, "..", "..", ".."))
        self.snpreader_whole = Bed(self.pythonpath +
                                   "/tests/datasets/synth/all",
                                   count_A1=False)
        self.pheno_whole = Pheno(self.pythonpath +
                                 "/tests/datasets/synth/pheno_10_causals.txt")

    def file_name(self, testcase_name):
        # Path of a fresh output file: any leftover from an earlier run is removed.
        temp_fn = os.path.join(self.tempout_dir, testcase_name)
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    @staticmethod
    def _diagonal_coordinates(count):
        # One [i, -i] coordinate pair per individual.
        return [[i, -i] for i in xrange(count)]

    @staticmethod
    def _alpha_values():
        # Two log-spaced values between 100 and 4000, truncated to int.
        return [int(v) for v in np.logspace(2, np.log10(4000), 2)]

    def _write_and_compare(self, dataframe, fn, tmpOutfile):
        # Save the result and compare it with the stored reference file.
        dataframe.to_csv(tmpOutfile, sep="\t", index=False)
        referenceOutfile = TestFeatureSelection.reference_file(
            "heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
        self.assertTrue(
            out,
            "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile,
                                                     tmpOutfile))

    def test_one(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "one.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        # Two identical phenotype columns built from the single stored one.
        half = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid,
                        sid=["pheno0", "pheno1"],
                        val=np.c_[half, half])

        spatial_coor = self._diagonal_coordinates(
            self.snpreader_whole.iid_count)
        alpha_list = self._alpha_values()
        dataframe = heritability_spatial_correction(self.snpreader_whole,
                                                    spatial_coor,
                                                    self.snpreader_whole.iid,
                                                    alpha_list,
                                                    2,
                                                    pheno,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=True)

        self._write_and_compare(dataframe, fn, tmpOutfile)

    def test_two(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "two.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        # Only the first 10 individuals.
        snpreader = self.snpreader_whole[:10, :]

        spatial_coor = self._diagonal_coordinates(snpreader.iid_count)
        alpha_list = self._alpha_values()
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    alpha_list,
                                                    2,
                                                    self.pheno_whole,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=False)

        self._write_and_compare(dataframe, fn, tmpOutfile)

    def test_three(self):
        '''
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        '''
        fn = "three.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        # Only the first 10 individuals.
        snpreader = self.snpreader_whole[:10, :]

        spatial_coor = self._diagonal_coordinates(snpreader.iid_count)
        alpha_list = self._alpha_values()
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    alpha_list,
                                                    2,
                                                    self.pheno_whole,
                                                    jackknife_count=0,
                                                    permute_plus_count=0,
                                                    permute_times_count=0,
                                                    just_testing=False)

        self._write_and_compare(dataframe, fn, tmpOutfile)

    def test_doctest(self):
        # Run the module's doctests from the parent of this file's directory.
        old_dir = os.getcwd()
        parent_dir = os.path.dirname(os.path.realpath(__file__)) + "/.."
        os.chdir(parent_dir)
        result = doctest.testfile("../heritability_spatial_correction.py")
        os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree val = val[iid_index, :] #reorder or trim any extra allocation if not SnpReader._array_properties_are_ok(val, order, dtype): val = val.copy(order=order) self._close_bed() return val if __name__ == "__main__": logging.basicConfig(level=logging.INFO) if True: from pysnptools.util import example_file pheno_fn = example_file("pysnptools/examples/toydata.phe") if False: from pysnptools.snpreader import Pheno, Bed import pysnptools.util as pstutil import os print(os.getcwd()) snpdata = Pheno( '../examples/toydata.phe').read() # Read data from Pheno format pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.bed") Bed.write("tempdir/toydata.5chrom.bed", snpdata, count_A1=False) # Write data in Bed format import doctest doctest.testmod(optionflags=doctest.ELLIPSIS) # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
class TestLinRegTrain(unittest.TestCase):
    """Pin ``LinearRegression`` fit/predict output to stored reference files.

    The tests read the synthetic data set under ``tests/datasets/synth`` and
    write their output below ``tempout_dir``.
    """

    # Output directory, relative to the current working directory.
    tempout_dir = "tempout/linear_regression"

    @classmethod
    def setUpClass(cls):
        """Create ``tempout_dir`` and open the SNP, covariate and phenotype readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(cls.tempout_dir, isfile=False)
        cls.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
        cls.snpreader_whole = Bed(cls.pythonpath + "/tests/datasets/synth/all")
        cls.covariate_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/cov.txt")
        cls.pheno_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    def file_name(self, testcase_name):
        """Return ``<tempout_dir>/<testcase_name>.dat``, deleting any file already there."""
        temp_fn = os.path.join(self.tempout_dir, testcase_name + ".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_lr_real(self):
        """Fit, pickle, reload and predict; compare against the ``lr2*`` references.

        The covariate is replaced by 0, 1, 2, ... and the phenotype by
        ``2*covar + 100 + 10*normal(0, 1)`` (seeded), so the data are linear.
        """
        do_plot = False
        import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
        test_idx = np.r_[0:10]  # the first 10 iids

        # make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in range(covar.iid_count)])
        covariate_train = covar[train_idx, :].read()
        covariate_test = covar[test_idx, :].read()
        # Built but not used further in this test.
        K0_test_test = KernelIdentity(covariate_test.iid)

        # make pheno: pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape) * 10

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            # Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".", covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Plain least squares on [x, 1]; only feeds the optional plot below.
        Xtrain = np.c_[covariate_train.val, np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val, np.ones((covariate_test.iid_count, 1))]
        # rcond is passed explicitly so the cutoff does not depend on the
        # NumPy version's default (and no FutureWarning is emitted).
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  # weights
        r2 = lsqSol[1]  # squared residuals
        D = lsqSol[2]  # rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)  # REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        # These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:
            first_name = first_name or name

            # Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train, X=covariate_train, y=pheno_train)
            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                # Predict with model (test on train)
                predicted_pheno, covariance = model.predict(K0_whole_test=K0_train, X=covariate_train)
                output_file = self.file_name("lr_reala_" + name)
                Dat.write(output_file, predicted_pheno)
                # kludge to write kernel to text format
                covar2 = SnpData(iid=covariance.row, sid=covariance.col[:, 1], val=covariance.val)
                output_file = self.file_name("lr_reala.cov_" + name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covariance.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.", covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted, yerr, linestyle='None')
                    pylab.suptitle(name + ": test on train: train X to true target (green) and prediction (red)")
                    pylab.show()
                self.compare_files(predicted_pheno, "lr2a_" + first_name)
                self.compare_files(covar2, "lr2a.cov_" + first_name)

            # Predict with model (test on test)
            predicted_pheno, covariance = model.predict(K0_whole_test=K0_whole_test, X=covariate_test)
            output_file = self.file_name("lr_realb_" + name)
            Dat.write(output_file, predicted_pheno)
            # kludge to write kernel to text format
            covar2 = SnpData(iid=covariance.row, sid=covariance.col[:, 1], val=covariance.val)
            output_file = self.file_name("lr_realb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covariance.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.", covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted, yerr, linestyle='None')
                pylab.suptitle(name + ": test on test: test X to true target (green) and prediction (red)")
                pylab.show()

            self.compare_files(predicted_pheno, "lr2b_" + first_name)
            self.compare_files(covar2, "lr2b.cov_" + first_name)

    def compare_files(self, answer, ref_base):
        """Assert that ``answer`` matches reference file ``fastlmm/<ref_base>.dat``.

        Row ids and column ids must be equal; every value must be within
        1e-4 of the reference value.
        """
        # Uses same results folder as lmm_train
        reffile = TestFeatureSelection.reference_file("fastlmm/" + ref_base + ".dat")
        reference = Dat(reffile).read()
        assert np.array_equal(answer.col, reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(answer.row, reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in range(reference.row_count):
            for sid_index in range(reference.col_count):
                a_v = answer.val[iid_index, sid_index]
                r_v = reference.val[iid_index, sid_index]
                assert abs(a_v - r_v) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(iid_index, sid_index, reffile)

    def test_doctest(self):
        """Run the doctests in ``linear_regression.py`` from this file's parent directory."""
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        try:
            result = doctest.testfile("../linear_regression.py")
        finally:
            # Restore the working directory even if the doctest run raises,
            # so later tests using relative paths are not affected.
            os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
class TestLinRegTrain(unittest.TestCase):
    """Pin ``LinearRegression`` fit/predict output to stored reference files.

    The tests read the synthetic data set under ``tests/datasets/synth`` and
    write their output below ``tempout_dir``.
    """

    # Output directory, relative to the current working directory.
    tempout_dir = "tempout/linear_regression"

    @classmethod
    def setUpClass(cls):
        """Create ``tempout_dir`` and open the SNP, covariate and phenotype readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(cls.tempout_dir, isfile=False)
        cls.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
        cls.snpreader_whole = Bed(cls.pythonpath + "/tests/datasets/synth/all",
                                  count_A1=False)
        cls.covariate_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/cov.txt")
        cls.pheno_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    def file_name(self, testcase_name):
        """Return ``<tempout_dir>/<testcase_name>.dat``, deleting any file already there."""
        temp_fn = os.path.join(self.tempout_dir, testcase_name + ".dat")
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_lr_real(self):
        """Fit, pickle, reload and predict; compare against the ``lr2*`` references.

        The covariate is replaced by 0, 1, 2, ... and the phenotype by
        ``2*covar + 100 + 10*normal(0, 1)`` (seeded), so the data are linear.
        """
        do_plot = False
        import pylab
        logging.info("TestLinRegTrain test_lr_real")

        train_idx = np.r_[10:self.snpreader_whole.iid_count]  # iids 10 and on
        test_idx = np.r_[0:10]  # the first 10 iids

        # make covar just numbers 0,1,...
        covar = self.covariate_whole.read()
        covar.val = np.array([[float(num)] for num in range(covar.iid_count)])
        covariate_train = covar[train_idx, :].read()
        covariate_test = covar[test_idx, :].read()
        # Built but not used further in this test.
        K0_test_test = KernelIdentity(covariate_test.iid)

        # make pheno: pheno = 2*covar+100+normal(0,1)*10
        pheno = self.pheno_whole.read()
        np.random.seed(0)
        pheno.val = covar.val * 2.0 + 100 + np.random.normal(
            size=covar.val.shape) * 10

        pheno_train = pheno[train_idx, :].read()
        pheno_test = pheno[test_idx, :].read()

        if do_plot:
            # Plot training x and y, testing x and y
            pylab.plot(covariate_train.val, pheno_train.val, ".",
                       covariate_test.val, pheno_test.val, ".")
            pylab.suptitle("Plot training x and y, testing x and y")
            pylab.show()

        # Plain least squares on [x, 1]; only feeds the optional plot below.
        Xtrain = np.c_[covariate_train.val,
                       np.ones((covariate_train.iid_count, 1))]
        Xtest = np.c_[covariate_test.val,
                      np.ones((covariate_test.iid_count, 1))]
        lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:, 0], rcond=-1)
        bs = lsqSol[0]  # weights
        r2 = lsqSol[1]  # squared residuals
        D = lsqSol[2]  # rank of design matrix
        N = pheno_train.iid_count
        REML = False
        if not REML:
            sigma2 = float(r2 / N)
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + N * 0.5
        else:
            sigma2 = float(r2 / (N - D))
            nLL = N * 0.5 * np.log(2 * np.pi * sigma2) + 0.5 / sigma2 * r2
            nLL -= 0.5 * D * np.log(2 * np.pi * sigma2)  # REML term

        predicted = Xtest.dot(bs)
        yerr = [np.sqrt(sigma2)] * len(predicted)
        if do_plot:
            pylab.plot(covariate_test.val, pheno_test.val, "g.",
                       covariate_test.val, predicted, "r.")
            pylab.xlim([-1, 10])
            pylab.errorbar(covariate_test.val, predicted, yerr,
                           linestyle='None')
            pylab.suptitle("real linear regression: actual to prediction")
            pylab.show()

        # These should all give the same result
        first_name = None
        for name, K0_train, K0_whole_test in [("Identity Kernel", None, None)]:
            first_name = first_name or name

            # Learn model, save, load
            modelx = LinearRegression().fit(K0_train=K0_train,
                                            X=covariate_train,
                                            y=pheno_train)
            filename = self.tempout_dir + "/model_lr_real.flm.p"
            pstutil.create_directory_if_necessary(filename)
            joblib.dump(modelx, filename)
            model = joblib.load(filename)

            do_test_on_train = True
            if do_test_on_train:
                # Predict with model (test on train)
                predicted_pheno, covariance = model.predict(
                    K0_whole_test=K0_train, X=covariate_train)
                output_file = self.file_name("lr_reala_" + name)
                Dat.write(output_file, predicted_pheno)
                # kludge to write kernel to text format
                covar2 = SnpData(iid=covariance.row,
                                 sid=covariance.col[:, 1],
                                 val=covariance.val)
                output_file = self.file_name("lr_reala.cov_" + name)
                Dat.write(output_file, covar2)

                yerr = np.sqrt(np.diag(covariance.val))
                predicted = predicted_pheno.val
                if do_plot:
                    pylab.plot(covariate_train.val, pheno_train.val, "g.",
                               covariate_train.val, predicted, "r.")
                    pylab.xlim([0, 50])
                    pylab.ylim([100, 200])
                    pylab.errorbar(covariate_train.val, predicted, yerr,
                                   linestyle='None')
                    pylab.suptitle(
                        name +
                        ": test on train: train X to true target (green) and prediction (red)"
                    )
                    pylab.show()
                self.compare_files(predicted_pheno, "lr2a_" + first_name)
                self.compare_files(covar2, "lr2a.cov_" + first_name)

            # Predict with model (test on test)
            predicted_pheno, covariance = model.predict(
                K0_whole_test=K0_whole_test, X=covariate_test)
            output_file = self.file_name("lr_realb_" + name)
            Dat.write(output_file, predicted_pheno)
            # kludge to write kernel to text format
            covar2 = SnpData(iid=covariance.row,
                             sid=covariance.col[:, 1],
                             val=covariance.val)
            output_file = self.file_name("lr_realb.cov_" + name)
            Dat.write(output_file, covar2)

            yerr = np.sqrt(np.diag(covariance.val))
            predicted = predicted_pheno.val
            if do_plot:
                pylab.plot(covariate_test.val, pheno_test.val, "g.",
                           covariate_test.val, predicted, "r.")
                pylab.xlim([-1, 10])
                pylab.errorbar(covariate_test.val, predicted, yerr,
                               linestyle='None')
                pylab.suptitle(
                    name +
                    ": test on test: test X to true target (green) and prediction (red)"
                )
                pylab.show()

            self.compare_files(predicted_pheno, "lr2b_" + first_name)
            self.compare_files(covar2, "lr2b.cov_" + first_name)

    def compare_files(self, answer, ref_base):
        """Assert that ``answer`` matches reference file ``fastlmm/<ref_base>.dat``.

        Row ids and column ids must be equal; every value must be within
        1e-4 of the reference value.
        """
        # Uses same results folder as lmm_train
        reffile = TestFeatureSelection.reference_file(
            "fastlmm/" + ref_base + ".dat")
        reference = Dat(reffile).read()
        assert np.array_equal(
            answer.col,
            reference.col), "sid differs. File '{0}'".format(reffile)
        assert np.array_equal(
            answer.row,
            reference.row), "iid differs. File '{0}'".format(reffile)
        for iid_index in range(reference.row_count):
            for sid_index in range(reference.col_count):
                a_v = answer.val[iid_index, sid_index]
                r_v = reference.val[iid_index, sid_index]
                assert abs(
                    a_v - r_v
                ) < 1e-4, "Value at {0},{1} differs too much from file '{2}'".format(
                    iid_index, sid_index, reffile)

    def test_doctest(self):
        """Run the doctests in ``linear_regression.py`` from this file's parent directory."""
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        try:
            result = doctest.testfile("../linear_regression.py")
        finally:
            # Restore the working directory even if the doctest run raises,
            # so later tests using relative paths are not affected.
            os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
def test_old(self):
    """Select SNPs in-sample, use them as a second kernel in ``single_snp``,
    and compare the result table with the "old" reference."""
    from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
    from pysnptools.util import intersect_apply

    do_plot = False
    logging.info("TestSingleSnpAllPlusSelect test_old")

    # input files
    bed_path = self.pythonpath + "/tests/datasets/synth/all.bed"
    pheno_path = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_path = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # open the readers, then intersect their sample ids
    readers = [Bed(bed_path), Pheno(pheno_path), Pheno(cov_path)]
    snp_reader, pheno, cov = intersect_apply(readers)

    # split the SNPs by chromosome: chr5 is tested, everything else is G0
    test_chr = 5
    background_mask = snp_reader.pos[:, 0] != test_chr
    tested_mask = snp_reader.pos[:, 0] == test_chr
    G0 = snp_reader[:, background_mask].read(order='C').standardize()
    test_snps = snp_reader[:, tested_mask].read(order='C').standardize()

    # phenotype column 0, centred and scaled in place
    y = pheno.read().val[:, 0]
    y -= y.mean()
    y /= y.std()

    # covariates, made read-only
    X_cov = cov.read().val
    X_cov.setflags(write=False)

    # feature selection: G0 is passed both as the candidate SNPs and as the
    # second positional argument
    logging.info("running feature selection conditioned on background kernel")
    selector = FeatureSelectionInSample(max_log_k=7,
                                        n_folds=7,
                                        order_by_lmm=True,
                                        measure="ll",
                                        random_state=42)
    best_k, feat_idx, best_mix, best_delta = selector.run_select(
        G0.val, G0.val, y, cov=X_cov)

    # plot out of sample error
    if do_plot:
        selector.plot_results(measure="ll")

    # report what was selected
    logging.info("best_k:{0}".format(best_k))
    logging.info("best_mix:{0}".format(best_mix))
    logging.info("best_delta:{0}".format(best_delta))
    logging.info(feat_idx)

    # the selected SNPs form G1
    G1 = G0[:, feat_idx]

    results_path = self.file_name("old")
    results_df = single_snp(test_snps,
                            pheno,
                            G0=G0,
                            G1=G1,
                            mixing=best_mix,
                            h2=None,
                            leave_out_one_chrom=False,
                            output_file_name=results_path)

    logging.info("results:")
    logging.info("#" * 40)
    logging.info(results_df.head())
    self.compare_files(results_df, "old")
def test_old(self):
    """Select SNPs in-sample, use them as a second kernel in ``single_snp``,
    and compare the result table with the "old" reference."""
    from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
    from pysnptools.util import intersect_apply

    do_plot = False
    logging.info("TestSingleSnpAllPlusSelect test_old")

    # input files
    bed_path = self.pythonpath + "/tests/datasets/synth/all.bed"
    pheno_path = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_path = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # open the readers, then intersect their sample ids
    readers = [Bed(bed_path, count_A1=False), Pheno(pheno_path), Pheno(cov_path)]
    snp_reader, pheno, cov = intersect_apply(readers)

    # split the SNPs by chromosome: chr5 is tested, everything else is G0
    test_chr = 5
    background_mask = snp_reader.pos[:, 0] != test_chr
    tested_mask = snp_reader.pos[:, 0] == test_chr
    G0 = snp_reader[:, background_mask].read(order='C').standardize()
    test_snps = snp_reader[:, tested_mask].read(order='C').standardize()

    # phenotype column 0, centred and scaled in place
    y = pheno.read().val[:, 0]
    y -= y.mean()
    y /= y.std()

    # covariates, made read-only
    X_cov = cov.read().val
    X_cov.setflags(write=False)

    # feature selection: G0 is passed both as the candidate SNPs and as the
    # second positional argument
    logging.info(
        "running feature selection conditioned on background kernel")
    selector = FeatureSelectionInSample(max_log_k=7,
                                        n_folds=7,
                                        order_by_lmm=True,
                                        measure="ll",
                                        random_state=42)
    best_k, feat_idx, best_mix, best_delta = selector.run_select(
        G0.val, G0.val, y, cov=X_cov)

    # plot out of sample error
    if do_plot:
        selector.plot_results(measure="ll")

    # report what was selected
    logging.info("best_k:{0}".format(best_k))
    logging.info("best_mix:{0}".format(best_mix))
    logging.info("best_delta:{0}".format(best_delta))
    logging.info(feat_idx)

    # the selected SNPs form G1
    G1 = G0[:, feat_idx]

    results_path = self.file_name("old")
    results_df = single_snp(test_snps,
                            pheno,
                            G0=G0,
                            G1=G1,
                            mixing=best_mix,
                            h2=None,
                            leave_out_one_chrom=False,
                            output_file_name=results_path,
                            count_A1=False)

    logging.info("results:")
    logging.info("#" * 40)
    logging.info(results_df.head())
    self.compare_files(results_df, "old")
# The iid_to_index and sid_to_index methods turn iid's and sid's into indexes # Via NumPy-style indexing, these allow reading by name and genetic property #Topic: Other SnpReaders and how to write #Read from the PLINK phenotype file (text) instead of a Bed file # Looks like: #cid0P0 cid0P0 0.4853395139922632 #cid1P0 cid1P0 -0.2076984565752155 #cid2P0 cid2P0 1.4909084058931985 #cid3P0 cid3P0 -1.2128996652683697 #cid4P0 cid4P0 0.4293203431508744 #... from pysnptools.snpreader import Pheno phenoreader = Pheno("pheno_10_causals.txt") print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos #Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan nan nan]] phenodata = phenoreader.read() print phenodata.val #[[ 4.85339514e-01] # [ -2.07698457e-01] # [ 1.49090841e+00] # [ -1.21289967e+00] # ... # Write 1st 10 iids and sids of Bed data into Pheno format snpdata1010 = Bed("all.bed")[:10,:10].read() Pheno.write("deleteme1010.txt",snpdata1010) #Write it to Bed format
delim = ' ' if cols[0] == 'FID' and cols[1]== 'IID': pass else: raise ValueError('First two columns of PGS must be FID, IID') f.close() ids = np.loadtxt(args.pgs, dtype='U', usecols=(0,1), delimiter=delim, skiprows=1) pgs_vals = np.loadtxt(args.pgs, usecols=tuple([x for x in range(2, cols.shape[0])]),delimiter=delim, skiprows=1) pg = gtarray(pgs_vals.reshape((pgs_vals.shape[0],1)), ids[:, 1], sid=cols[2:cols.shape[0]], fams=ids[:, 0]) print('Normalising PGS to have mean zero and variance 1') pg.mean_normalise() pg.scale() # Read phenotype print('Reading '+str(args.phenofile)) pheno = Pheno(args.phenofile, missing=args.missing_char).read() # pheno = Pheno('phenotypes/eduyears_resid.ped', missing='NA').read() y = np.array(pheno.val) pheno_ids = np.array(pheno.iid)[:, 1] if y.ndim == 1: pass elif y.ndim == 2: y = y[:, args.phen_index - 1] else: raise ValueError('Incorrect dimensions of phenotype array') # Remove y NAs y_not_nan = np.logical_not(np.isnan(y)) if np.sum(y_not_nan) < y.shape[0]: y = y[y_not_nan] pheno_ids = pheno_ids[y_not_nan] y = y-np.mean(y)
class TestHeritabilitySpatialCorrection(unittest.TestCase):
    """Pin ``heritability_spatial_correction`` output to stored reference files.

    The tests read the synthetic data set under ``tests/datasets/synth`` and
    write their output below ``tempout_dir``.
    """

    # Output directory, relative to the current working directory.
    tempout_dir = "tempout/heritability_spatial_correction"

    @classmethod
    def setUpClass(cls):
        """Create ``tempout_dir`` and open the SNP and phenotype readers."""
        from fastlmm.util.util import create_directory_if_necessary
        create_directory_if_necessary(cls.tempout_dir, isfile=False)
        cls.pythonpath = os.path.abspath(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
        cls.snpreader_whole = Bed(cls.pythonpath + "/tests/datasets/synth/all")
        cls.pheno_whole = Pheno(cls.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")

    def file_name(self, testcase_name):
        """Return ``<tempout_dir>/<testcase_name>``, deleting any file already there."""
        temp_fn = os.path.join(self.tempout_dir, testcase_name)
        if os.path.exists(temp_fn):
            os.remove(temp_fn)
        return temp_fn

    def test_one(self):
        """
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        """
        fn = "one.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        # Two identical phenotype columns built from the single stored one.
        half = self.pheno_whole.read().val
        pheno = SnpData(iid=self.pheno_whole.iid, sid=["pheno0", "pheno1"], val=np.c_[half, half])

        # Arbitrary coordinates: individual i sits at (i, -i).
        spatial_coor = [[i, -i] for i in range(self.snpreader_whole.iid_count)]
        alpha_list = [int(v) for v in np.logspace(2, np.log10(4000), 2)]
        dataframe = heritability_spatial_correction(self.snpreader_whole,
                                                    spatial_coor,
                                                    self.snpreader_whole.iid,
                                                    alpha_list,
                                                    pheno,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=True)

        dataframe.to_csv(tmpOutfile, sep="\t", index=False)
        referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
        self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))

    def test_two(self):
        """
        Lock in results on arbitrary data -- because meaningful runs take too long to run.
        """
        fn = "two.txt"
        logging.info(fn)
        tmpOutfile = self.file_name(fn)

        # Only the first 10 individuals are used.
        snpreader = self.snpreader_whole[:10, :]

        # Arbitrary coordinates: individual i sits at (i, -i).
        spatial_coor = [[i, -i] for i in range(snpreader.iid_count)]
        alpha_list = [int(v) for v in np.logspace(2, np.log10(4000), 2)]
        dataframe = heritability_spatial_correction(snpreader,
                                                    spatial_coor,
                                                    snpreader.iid,
                                                    alpha_list,
                                                    self.pheno_whole,
                                                    jackknife_count=2,
                                                    permute_plus_count=1,
                                                    permute_times_count=1,
                                                    just_testing=False)

        dataframe.to_csv(tmpOutfile, sep="\t", index=False)
        referenceOutfile = TestFeatureSelection.reference_file("heritability_spatial_correction/" + fn)
        out, msg = ut.compare_files(tmpOutfile, referenceOutfile, tolerance)
        self.assertTrue(out, "msg='{0}', ref='{1}', tmp='{2}'".format(msg, referenceOutfile, tmpOutfile))

    def test_doctest(self):
        """Run the doctests in ``heritability_spatial_correction.py`` from this file's parent directory."""
        old_dir = os.getcwd()
        os.chdir(os.path.dirname(os.path.realpath(__file__)) + "/..")
        try:
            result = doctest.testfile("../heritability_spatial_correction.py")
        finally:
            # Restore the working directory even if the doctest run raises,
            # so later tests using relative paths are not affected.
            os.chdir(old_dir)
        assert result.failed == 0, "failed doc test: " + __file__
# Run a FaST-LMM single-SNP association test and save a Manhattan plot.
#
# Command-line arguments, in order:
#   inped_file         PED file with the variants to test
#   inpheno_file       phenotype file
#   results_dataframe  path single_snp writes its result table to
#   output_manhattan   path the Manhattan plot image is saved to

# Load FaST-LMM basic association test:
from fastlmm.association import single_snp
from pysnptools.snpreader import Ped
from pysnptools.snpreader import Pheno
from pysnptools.snpreader import wrap_plink_parser
import numpy as np
from sys import argv
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import fastlmm.util.util as flutil

script, inped_file, inpheno_file, results_dataframe, output_manhattan = argv

# Load snp data:
# (single-argument parenthesised print gives the same output on Python 2 and 3)
print("Loading variant data...")
ped_file = Ped(inped_file)
print("Loading phenotype data...")
pheno_fn = Pheno(inpheno_file)

# Run basic association test:
print("Running FaST-LMM single_snp test...")
results_df = single_snp(test_snps=ped_file, pheno=pheno_fn, leave_out_one_chrom=0, output_file_name=results_dataframe)

# Column selection + .values replaces DataFrame.as_matrix, which newer pandas
# releases no longer provide.
plot_data = results_df[["Chr", "ChrPos", "PValue"]].values
chromosome_starts = flutil.manhattan_plot(plot_data, pvalue_line=4.4e-7, xaxis_unit_bp=True)

# The Agg backend cannot display a window, so write the figure to the path
# given on the command line instead of calling plt.show().
# NOTE(review): assumes manhattan_plot draws on the current pyplot figure -- confirm.
plt.savefig(output_manhattan)
class TestFastLMM(unittest.TestCase): @classmethod def setUpClass(self): from fastlmm.util.util import create_directory_if_necessary create_directory_if_necessary(self.tempout_dir, isfile=False) self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","..")) self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all",count_A1=False) self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt") self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt") tempout_dir = "tempout/fastlmm" def file_name(self,testcase_name): temp_fn = os.path.join(self.tempout_dir,testcase_name+".dat") if os.path.exists(temp_fn): os.remove(temp_fn) return temp_fn def test_api(self): train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids ##################################################### # Train and standardize cov and then apply to test ##################################################### cov_train, unit_trained = self.covariate_whole[train_idx,:].read().standardize(Unit(),return_trained=True) cov_test = self.covariate_whole[test_idx,:].read().standardize(unit_trained) ##################################################### # standardize whole kernel from snps (both ways) and then pull out the 3 parts ##################################################### whole_kernel = SnpKernel(self.covariate_whole,Unit()).read().standardize(DiagKtoN()) train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True) test_kernel = whole_kernel[train_idx,test_idx].read(order='A',view_ok=True) test_test_kernel = whole_kernel[test_idx,test_idx].read(order='A',view_ok=True) ##################################################### # create train_train, train_test, and test_test based on just the training snps (both standardizations) ##################################################### K_train = 
SnpKernel(self.snpreader_whole[train_idx,:],Unit(),block_size=100) train_train_kernel, snp_trained, kernel_trained = K_train._read_with_standardizing(to_kerneldata=True, kernel_standardizer=DiagKtoN(), return_trained=True) K_whole_test = _SnpWholeTest(train=self.snpreader_whole[train_idx,:],test=self.snpreader_whole[test_idx,:],standardizer=snp_trained,block_size=100) train_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[train_idx]) #The new reader may have the iids in a different order than the original reader train_test_kernel = K_whole_test[train_idx2,:].read().standardize(kernel_trained) test_idx2 = K_whole_test.iid0_to_index(self.snpreader_whole.iid[test_idx]) test_test_kernel = K_whole_test[test_idx2,:].read().standardize(kernel_trained) ##################################################### # How does predict look with whole_test as input? ##################################################### # a. - standardize whole up front whole_kernel = SnpKernel(self.snpreader_whole,Unit(),block_size=100).read().standardize() train_kernel = whole_kernel[train_idx].read(order='A',view_ok=True) whole_test_kernel = whole_kernel[:,test_idx].read(order='A',view_ok=True) fastlmm1 = FastLMM(snp_standardizer=SS_Identity(), kernel_standardizer=KS_Identity()) fastlmm1.fit(K0_train=train_kernel, X=self.covariate_whole, y=self.pheno_whole) #iid intersection means we won't really be using whole covar or pheno predicted_pheno, covar = fastlmm1.predict(K0_whole_test=whole_test_kernel, X=self.covariate_whole,count_A1=False) output_file = self.file_name("whole") Dat.write(output_file,predicted_pheno) self.compare_files(predicted_pheno,"whole") # b -- just files fastlmm2 = FastLMM() fastlmm2.fit(K0_train=self.snpreader_whole[train_idx,:], X=self.covariate_whole, y=self.pheno_whole[train_idx,:]) #iid intersection means we won't really be using whole covar predicted_pheno, covar = fastlmm2.predict(K0_whole_test=self.snpreader_whole[test_idx,:], 
X=self.covariate_whole,count_A1=False) self.compare_files(predicted_pheno,"one") def test_notebook1(self): do_plot=False import matplotlib.pyplot as plt from pysnptools.snpreader import Pheno,Bed bed = Bed(self.pythonpath + "/tests/datasets/synth/all",count_A1=False) cov = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt") pheno = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt").read() # Now we learn from the first 400 students. training = bed[:400,:] #!!!later: the learning code doesn't like it if there are two instances of bed[:400] that are not "is -equal" fastlmm2 = FastLMM(GB_goal=2).fit(K0_train=training, X=cov[:400,:], y=pheno[:400,:]) # Predict on training data: predicted_score,covariance = fastlmm2.predict(K0_whole_test=training, X=cov[:400,:],count_A1=False) assert np.array_equal(pheno.iid[:400],predicted_score.iid), "for plots to make sense, the iids must be in the order" if do_plot: plt.plot(pheno.val[:400,:],predicted_score.val,"b.",[-5,5],[-5,5],"-r") plt.errorbar(pheno.val[:400,:],predicted_score.val, yerr=np.sqrt(np.diag(covariance.val)),fmt='.') plt.xlabel('score (actual train)') plt.ylabel('predicted (test on train with stdev)') plt.show() # How well does this model predict the (unseen) TEST data? 
predicted_score,covariance = fastlmm2.predict(K0_whole_test=bed[400:500,:], X=cov[400:500,:],count_A1=False) assert np.array_equal(pheno.iid[400:500],predicted_score.iid), "for plots to make sense, the iids must be in the order" if do_plot: plt.plot(pheno.val[400:500,:],predicted_score.val,"b.",[-5,5],[-5,5],"-r") plt.errorbar(pheno.val[400:500,:],predicted_score.val, yerr=np.sqrt(np.diag(covariance.val)),fmt='.') plt.xlabel('score (actual test)') plt.ylabel('predicted') plt.show() def test_one(self): logging.info("TestLmmTrain test_one") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids G0_train = self.snpreader_whole[train_idx,:] covariate_train = self.covariate_whole[train_idx,:] pheno_train = self.pheno_whole[train_idx,:] fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train, y=pheno_train) filename = self.tempout_dir + "/model_one.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm1, filename) fastlmm2 = joblib.load(filename) # predict on test set G0_test = self.snpreader_whole[test_idx,:] covariate_test = self.covariate_whole[test_idx,:] predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False) output_file = self.file_name("one") Dat.write(output_file,predicted_pheno) pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0] #pylab.plot(pheno_actual, predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"one") def test_str(self): logging.info("TestLmmTrain test_str") G0_train = self.pythonpath + "/tests/datasets/synth/all" covariate_train = None pheno_train = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt" fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train, y=pheno_train,count_A1=False) filename = self.tempout_dir + "/model_str.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm1, filename) fastlmm2 = joblib.load(filename) # predict on same 
G0_test = G0_train covariate_test = covariate_train predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False) output_file = self.file_name("str") Dat.write(output_file,predicted_pheno) #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0] #pylab.plot(pheno_actual, predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"str") def test_lr_no_K0(self): logging.info("TestLinRegTrain test_lr_no_k0") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids covariate_train3 = self.covariate_whole[train_idx,:].read() covariate_train3.val = np.array([[float(num)] for num in xrange(covariate_train3.iid_count)]) pheno_train3 = self.pheno_whole[train_idx,:].read() np.random.seed(0) pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape) # y = 2*x+100+normal(0,1) #Learn model, save, load fastlmm3x = FastLMM(GB_goal=2).fit(X=covariate_train3, y=pheno_train3) filename = self.tempout_dir + "/model3.flm.p" joblib.dump(fastlmm3x, filename) fastlmm3 = joblib.load(filename) #Predict with model (test on train) predicted_pheno, covariance = fastlmm3.predict(K0_whole_test=KernelIdentity(pheno_train3.iid), X=covariate_train3,count_A1=False) #test on train output_file = self.file_name("lr_no_k0") Dat.write(output_file,predicted_pheno) self.compare_files(predicted_pheno,"lr_no_k0") def test_lr_as_lmm(self): do_plot = False #later why does this test case generate two intersect info messages instead of just one? import pylab logging.info("TestLmmTrain test_lr_as_lmm") ############################################################### # Create a linear data set with just a little noise ############################################################### train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids #make covar just numbers 0,1,... 
covar = self.covariate_whole.read() covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)]) covar._name = 'np.array([[float(num)] for num in xrange(covar.iid_count)])' covariate_train = covar[train_idx,:].read() covariate_test = covar[test_idx,:].read() #make pheno # pheno = 2*covar+100+normal(0,1)*10 pheno = self.pheno_whole.read() np.random.seed(0) pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10 pheno_train = pheno[train_idx,:].read() pheno_test = pheno[test_idx,:].read() if do_plot: #Plot training x and y, testing x and y pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".") pylab.suptitle("Plot training x and y, testing x and y") pylab.show() ############################################################### # Show that linear regression does a good job predicting ############################################################### Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))] Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))] lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1) bs=lsqSol[0] #weights r2=lsqSol[1] #squared residuals D=lsqSol[2] #rank of design matrix N=pheno_train.iid_count REML = False if not REML: sigma2 = float(r2/N) nLL = N*0.5*np.log(2*np.pi*sigma2) + N*0.5 else: sigma2 = float(r2 / (N-D)) nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2; nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term predicted = Xtest.dot(bs) yerr = [np.sqrt(sigma2)] * len(predicted) if do_plot: pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.") pylab.xlim([-1, 10]) pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None') pylab.suptitle("real linear regression: actual to prediction") pylab.show() ############################################################### # Use LMM as LR and apply test on train ############################################################### for force_full_rank in [True, 
False]: #Learn model, save, load fastlmmx = FastLMM(GB_goal=2,force_full_rank=force_full_rank).fit(K0_train=covariate_train, X=None, y=pheno_train) filename = self.tempout_dir + "/model_lr_as_lmm.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmmx, filename) fastlmm = joblib.load(filename) do_test_on_train = True if do_test_on_train: #Predict with model (test on train) predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_train, X=None,count_A1=False) #test on train output_file = self.file_name("lr_as_lmma_") Dat.write(output_file,predicted_pheno) covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format output_file = self.file_name("lr_as_lmma.cov_") Dat.write(output_file,covar2) yerr = np.sqrt(np.diag(covar.val)) predicted = predicted_pheno.val if do_plot: pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.") pylab.xlim([0, 50]) pylab.ylim([100, 200]) pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None') pylab.suptitle("test on train: train X to true target (green) and prediction (red)") pylab.show() self.compare_files(predicted_pheno,"lr_as_lmma_") self.compare_files(covar2,"lr_as_lmma.cov_") ############################################################### # Use LMM as LR and apply test on test ############################################################### #Predict with model (test on test) predicted_pheno, covar = fastlmm.predict(K0_whole_test=covariate_test, X=None,count_A1=False) #test on train output_file = self.file_name("lr_as_lmmb_") Dat.write(output_file,predicted_pheno) covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format output_file = self.file_name("lr_as_lmmb.cov_") Dat.write(output_file,covar2) yerr = np.sqrt(np.diag(covar.val)) predicted = predicted_pheno.val if do_plot: pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.") 
pylab.xlim([-1, 10]) pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None') pylab.suptitle("test on test: test X to true target (green) and prediction (red)") pylab.show() ## Plot y and predicted y (test on train) #pylab.plot(pheno_test.val,predicted_pheno.val,".") #pylab.suptitle(name+": test on test: true target to prediction") #pylab.show() self.compare_files(predicted_pheno,"lr_as_lmmb_") self.compare_files(covar2,"lr_as_lmmb.cov_") def test_lr2(self): do_plot = False import pylab logging.info("TestLmmTrain test_lr2") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids #make covar just numbers 0,1,... covar = self.covariate_whole.read() covar.val = np.array([[float(num)] for num in xrange(covar.iid_count)]) covariate_train = covar[train_idx,:].read() covariate_test = covar[test_idx,:].read() K0_whole_test = KernelIdentity(covar.iid,covariate_test.iid) #make pheno # pheno = 2*covar+100+normal(0,1)*10 pheno = self.pheno_whole.read() np.random.seed(0) pheno.val = covar.val * 2.0 + 100 + np.random.normal(size=covar.val.shape)*10 pheno_train = pheno[train_idx,:].read() pheno_test = pheno[test_idx,:].read() if do_plot: #Plot training x and y, testing x and y pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".") pylab.suptitle("Plot training x and y, testing x and y") pylab.show() Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))] Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))] lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1) bs=lsqSol[0] #weights r2=lsqSol[1] #squared residuals D=lsqSol[2] #rank of design matrix N=pheno_train.iid_count REML = False if not REML: sigma2 = float(r2/N) nLL = N*0.5*np.log(2*np.pi*sigma2) + N*0.5 else: sigma2 = float(r2 / (N-D)) nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2; nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term predicted = Xtest.dot(bs) yerr = 
[np.sqrt(sigma2)] * len(predicted) if do_plot: pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.") pylab.xlim([-1, 10]) pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None') pylab.suptitle("real linear regression: actual to prediction") pylab.show() #These should all give the same result first_name = None for name,K0_train,K0_whole_test in [("Identity Kernel", KernelIdentity(self.snpreader_whole.iid[train_idx]), KernelIdentity(self.snpreader_whole.iid,test=self.snpreader_whole.iid[test_idx])), #!!!later("sid_count=0", self.snpreader_whole[train_idx,[]],self.snpreader_whole[test_idx,[]]) ]: logging.info(name) first_name = first_name or name #Learn model, save, load fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train) filename = self.tempout_dir + "/model_lr2.flm.p" joblib.dump(fastlmmx, filename) fastlmm = joblib.load(filename) do_test_on_train = True if do_test_on_train: #Predict with model (test on train) predicted_pheno, covar = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train output_file = self.file_name("lr2a_"+name) Dat.write(output_file,predicted_pheno) covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format output_file = self.file_name("lr2a.cov_"+name) Dat.write(output_file,covar2) yerr = np.sqrt(np.diag(covar.val)) predicted = predicted_pheno.val if do_plot: pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.") pylab.xlim([0, 50]) pylab.ylim([100, 200]) pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None') pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)") pylab.show() self.compare_files(predicted_pheno,"lr2a_"+first_name) self.compare_files(covar2,"lr2a.cov_"+first_name) #Predict with model (test on test) predicted_pheno, covar = fastlmm.predict(K0_whole_test=K0_whole_test, 
X=covariate_test,count_A1=False) #test on train output_file = self.file_name("lr2b_"+name) Dat.write(output_file,predicted_pheno) covar2 = SnpData(iid=covar.row,sid=covar.col[:,1],val=covar.val) #kludge to write kernel to text format output_file = self.file_name("lr2b.cov_"+name) Dat.write(output_file,covar2) yerr = np.sqrt(np.diag(covar.val)) predicted = predicted_pheno.val if do_plot: pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.") pylab.xlim([-1, 10]) pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None') pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)") pylab.show() ## Plot y and predicted y (test on train) #pylab.plot(pheno_test.val,predicted_pheno.val,".") #pylab.suptitle(name+": test on test: true target to prediction") #pylab.show() self.compare_files(predicted_pheno,"lr2b_"+first_name) self.compare_files(covar2,"lr2b.cov_"+first_name) def test_str2(self): logging.info("TestLmmTrain test_str2") #Standardize train and test together whole_kernel = self.snpreader_whole.read_kernel(Unit()) train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids covariate_train = self.covariate_whole[train_idx,:] pheno_train = self.pheno_whole[train_idx,:] K0_train_filename = self.tempout_dir + "/model_str2.kernel.npz" pstutil.create_directory_if_necessary(K0_train_filename) from pysnptools.kernelreader import KernelNpz KernelNpz.write(K0_train_filename,whole_kernel[train_idx].read(order='A',view_ok=True)) fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train_filename, X=covariate_train, y=pheno_train) filename = self.tempout_dir + "/model_str2.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm1, filename) fastlmm2 = joblib.load(filename) # predict on test set G0_test = self.snpreader_whole[test_idx,:] covariate_test = self.covariate_whole[test_idx,:] predicted_pheno, covar = 
fastlmm2.predict(K0_whole_test=whole_kernel[:,test_idx].read(order='A',view_ok=True), X=covariate_test,count_A1=False) output_file = self.file_name("str2") Dat.write(output_file,predicted_pheno) #pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0] #pylab.plot(pheno_actual, predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"str2") #Creating multiple tests so that will run faster when on cluster. def test_fasttwoK(self): logging.info("TestLmmTrain test_fasttwoK") self._fasttwoK(None,None) def test_fasttwoK_force_low_rank(self): logging.info("TestLmmTrain test_fasttwoK_force_low_rank") self._fasttwoK(True,None) def test_fasttwoK_GB2(self): logging.info("TestLmmTrain test_fasttwoK_GB2") self._fasttwoK(None,2) def test_fasttwoK_force_low_rank_GB2(self): logging.info("TestLmmTrain test_fasttwoK_force_low_rank_GB2") self._fasttwoK(True,2) def _fasttwoK(self,force_low_rank,GB_goal): train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids G0_train = self.snpreader_whole[train_idx,:] G1_train = SnpData(iid=G0_train.iid,sid=[item+"_1" for item in G0_train.sid],val=G0_train.read().val,pos=G0_train.pos,name="Different SNP names for {0}".format(G0_train)) covariate_train = self.covariate_whole[train_idx,:] pheno_train = self.pheno_whole[train_idx,:] logging.info("force_low_rank = {0}".format(force_low_rank)) fastlmm1 = FastLMM(force_low_rank=force_low_rank,GB_goal=GB_goal).fit(K0_train=G0_train, K1_train=G1_train, X=covariate_train, y=pheno_train, mixing=.1) filename = self.tempout_dir + "/model_fasttwoK.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm1, filename) fastlmm2 = joblib.load(filename) # predict on test set G0_test = self.snpreader_whole[test_idx,:] G1_test = SnpData(iid=G0_test.iid,sid=[item+"_1" for item in G0_test.sid],val=G0_test.read().val,pos=G0_test.pos,name="Different SNP names for {0}".format(G0_test)) covariate_test = 
self.covariate_whole[test_idx,:] predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G1_test, X=covariate_test,count_A1=False) output_file = self.file_name("fasttwoK"+("_force_low" if force_low_rank else "")+("GB{0}".format(GB_goal) if GB_goal is not None else "")) Dat.write(output_file,predicted_pheno) pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0] #pylab.plot(pheno_actual, predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"one") def test_lowrank(self): logging.info("TestLmmTrain test_lowrank") snpreader = self.snpreader_whole[:,:100] train_idx = np.r_[10:snpreader.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids G0_train = snpreader[train_idx,:] G0_test = snpreader[test_idx,:] pheno_whole = self.pheno_whole.read() pheno_whole.val *= 100 pheno_whole.val += 1000 mean_low, covar_low = FastLMM(force_low_rank=True,GB_goal=2).fit(K0_train=G0_train, y=pheno_whole[train_idx,:], X=self.covariate_whole[train_idx,:]). 
predict(K0_whole_test=G0_test,X=self.covariate_whole[test_idx,:],count_A1=False) mean_full, covar_full = FastLMM(force_full_rank=True,GB_goal=2).fit(K0_train=G0_train, y=pheno_whole[train_idx,:], X=self.covariate_whole[train_idx,:]).predict(K0_whole_test=G0_test,X=self.covariate_whole[test_idx,:],count_A1=False) np.testing.assert_allclose(mean_low.val, mean_full.val) np.testing.assert_allclose(covar_low.val,covar_full.val) logging.info("finished with TestLmmTrain test_lowrank") def test_twoK(self): logging.info("TestLmmTrain test_twoK") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids G0_train = self.snpreader_whole[train_idx,:] covariate_train = self.covariate_whole[train_idx,:] pheno_train = self.pheno_whole[train_idx,:] fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=G0_train, K1_train=G0_train, X=covariate_train, y=pheno_train) filename = self.tempout_dir + "/model_one.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm1, filename) fastlmm2 = joblib.load(filename) # predict on test set G0_test = self.snpreader_whole[test_idx,:] covariate_test = self.covariate_whole[test_idx,:] predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, K1_whole_test=G0_test, X=covariate_test,count_A1=False) output_file = self.file_name("one") Dat.write(output_file,predicted_pheno) pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0] #pylab.plot(pheno_actual, predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"one") def test_lr(self): import matplotlib.pyplot as plt import pylab logging.info("TestLmmTrain test_lr") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids G0_train = self.snpreader_whole[train_idx,:] covariate_train3 = self.covariate_whole[train_idx,:].read() covariate_train3.val = np.array([[float(num)] for num in xrange(covariate_train3.iid_count)]) pheno_train3 = 
self.pheno_whole[train_idx,:].read() np.random.seed(0) pheno_train3.val = covariate_train3.val * 2.0 + 100 + np.random.normal(size=covariate_train3.val.shape) # y = 2*x+100+normal(0,1) ##Plot training x and y #pylab.plot(covariate_train3.val, pheno_train3.val,".") #pylab.show() for force_full_rank,force_low_rank in [(True,False),(False,True)]: #Learn model, save, load fastlmm3x = FastLMM(force_full_rank=force_full_rank,force_low_rank=force_low_rank,GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3) filename = self.tempout_dir + "/model_lr.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm3x, filename) fastlmm3 = joblib.load(filename) #Predict with model (test on train) predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train output_file = self.file_name("lr") Dat.write(output_file,predicted_pheno) ## Plot training x and y, and training x with predicted y #do_plot = True #if do_plot: # pylab.plot(covariate_train3.val, pheno_train3.val,covariate_train3.val,predicted_pheno.val,".") # pylab.show() # # Plot y and predicted y (test on train) # pheno_actual = pheno_train3.val[:,0] # pylab.plot(pheno_actual,predicted_pheno.val,".") # pylab.show() self.compare_files(predicted_pheno,"lr") def test_lmm(self): do_plot = False iid_count = 500 seed = 0 import pylab logging.info("TestLmmTrain test_lmm") iid = [["cid{0}P{1}".format(iid_index,iid_index//250)]*2 for iid_index in xrange(iid_count)] train_idx = np.r_[10:iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids #Every person is 100% related to everyone in one of 5 families K0a = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance") for iid_index0 in xrange(iid_count): for iid_index1 in xrange(iid_count): K0a.val[iid_index0,iid_index1] = 1 if iid_index0 % 5 == iid_index1 % 5 else 0 if iid_index1 < iid_index0: assert K0a.val[iid_index0,iid_index1] == 
K0a.val[iid_index1,iid_index0] #every person lives on a line from 0 to 1 # They are related to every other person as a function of distance on the line np.random.seed(seed) home = np.random.random([iid_count]) K0b = KernelData(iid=iid,val=np.empty([iid_count,iid_count]),name="related by distance") for iid_index in xrange(iid_count): K0b.val[iid_index,:] = 1 - np.abs(home-home[iid_index])**.1 #make covar just numbers 0,1,... covar = SnpData(iid=iid,sid=["x"],val=np.array([[float(num)] for num in xrange(iid_count)])) covariate_train = covar[train_idx,:].read() covariate_test = covar[test_idx,:].read() for name, h2, K0 in [("clones", 1, K0a),("line_world",.75,K0b)]: sigma2x = 100 varg = sigma2x * h2 vare = sigma2x * (1-h2) ####################################################################### #make pheno # pheno = 2*covar+100+normal(0,1)*2.5+normal(0,K)*7.5 ####################################################################### #random.multivariate_normal is sensitive to mkl_num_thread, so we control it. 
if 'MKL_NUM_THREADS' in os.environ: mkl_num_thread = os.environ['MKL_NUM_THREADS'] else: mkl_num_thread = None os.environ['MKL_NUM_THREADS'] = '1' np.random.seed(seed) p1 = covar.val * 2.0 + 100 p2 = np.random.normal(size=covar.val.shape)*np.sqrt(vare) p3 = (np.random.multivariate_normal(np.zeros(iid_count),K0.val)*np.sqrt(varg)).reshape(-1,1) if mkl_num_thread is not None: os.environ['MKL_NUM_THREADS'] = mkl_num_thread else: del os.environ['MKL_NUM_THREADS'] pheno = SnpData(iid=iid,sid=["pheno0"],val= p1 + p2 + p3) pheno_train = pheno[train_idx,:].read() pheno_test = pheno[test_idx,:].read() if do_plot: #Plot training x and y, testing x and y pylab.plot(covariate_train.val, pheno_train.val,".",covariate_test.val, pheno_test.val,".") pylab.suptitle(name + ": Plot training x and y, testing x and y") pylab.show() Xtrain = np.c_[covariate_train.val,np.ones((covariate_train.iid_count,1))] Xtest = np.c_[covariate_test.val,np.ones((covariate_test.iid_count,1))] lsqSol = np.linalg.lstsq(Xtrain, pheno_train.val[:,0],rcond=-1) bs=lsqSol[0] #weights r2=lsqSol[1] #squared residuals D=lsqSol[2] #rank of design matrix N=pheno_train.iid_count REML = False if not REML: sigma2 = float(r2/N) nLL = N*0.5*np.log(2*np.pi*sigma2) + N*0.5 else: sigma2 = float(r2 / (N-D)) nLL = N*0.5*np.log(2*np.pi*sigma2) + 0.5/sigma2*r2; nLL -= 0.5*D*np.log(2*np.pi*sigma2);#REML term predicted = Xtest.dot(bs) yerr = [np.sqrt(sigma2)] * len(predicted) if do_plot: pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.") pylab.xlim([-1, 10]) pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None') pylab.suptitle(name + ": real linear regression: actual to prediction") pylab.show() for factor in [1,100,.02]: K0 = K0.read() K0.val *= factor K0_train = K0[train_idx] K0_whole_test = K0[:,test_idx] #Learn model, save, load fastlmmx = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train) v2 = np.var(p2) v3 = np.var(p3) logging.debug("Original h2 of 
{0}. Generated h2 of {1}. Learned h2 of {2}".format(h2, v3/(v2+v3), fastlmmx.h2raw)) filename = self.tempout_dir + "/model_lmm.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmmx, filename) fastlmm = joblib.load(filename) do_test_on_train = True if do_test_on_train: #Predict with model (test on train) predicted_pheno, covar_pheno = fastlmm.predict(K0_whole_test=K0_train, X=covariate_train,count_A1=False) #test on train output_file = self.file_name("lmma_"+name) Dat.write(output_file,predicted_pheno) covar2 = SnpData(iid=covar_pheno.row,sid=covar_pheno.col[:,1],val=covar_pheno.val) #kludge to write kernel to text format output_file = self.file_name("lmma.cov_"+name) Dat.write(output_file,covar2) yerr = np.sqrt(np.diag(covar_pheno.val)) predicted = predicted_pheno.val if do_plot: pylab.plot(covariate_train.val, pheno_train.val,"g.",covariate_train.val, predicted,"r.") pylab.xlim([0, 50]) pylab.ylim([100, 200]) pylab.errorbar(covariate_train.val, predicted,yerr,linestyle='None') pylab.suptitle(name+": test on train: train X to true target (green) and prediction (red)") pylab.show() self.compare_files(predicted_pheno,"lmma_"+name) self.compare_files(covar2,"lmma.cov_"+name) predicted_pheno0, covar_pheno0 = fastlmm.predict(K0_whole_test=K0_train[:,0], X=covariate_train[0,:],count_A1=False) #test on train #0 assert np.abs(predicted_pheno0.val[0,0] - predicted_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases" assert np.abs(covar_pheno0.val[0,0] - covar_pheno.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases" #Predict with model (test on test) predicted_phenoB, covar_phenoB = fastlmm.predict(K0_whole_test=K0_whole_test, X=covariate_test,count_A1=False) #test on test output_file = self.file_name("lmmb_"+name) Dat.write(output_file,predicted_phenoB) covar2 = SnpData(iid=covar_phenoB.row,sid=covar_phenoB.col[:,1],val=covar_phenoB.val) #kludge to write kernel to text format 
output_file = self.file_name("lmmb.cov_"+name) Dat.write(output_file,covar2) yerr = np.sqrt(np.diag(covar_phenoB.val)) predicted = predicted_phenoB.val if do_plot: pylab.plot(covariate_test.val, pheno_test.val,"g.",covariate_test.val, predicted,"r.") pylab.xlim([-1, 10]) pylab.errorbar(covariate_test.val, predicted,yerr,linestyle='None') pylab.suptitle(name+": test on test: test X to true target (green) and prediction (red)") pylab.show() self.compare_files(predicted_phenoB,"lmmb_"+name) self.compare_files(covar2,"lmmb.cov_"+name) predicted_phenoB0, covar_phenoB0 = fastlmm.predict(K0_whole_test=K0_whole_test[:,0], X=covariate_test[0,:],count_A1=False) #test on a single test case assert np.abs(predicted_phenoB0.val[0,0] - predicted_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases" assert np.abs(covar_phenoB0.val[0,0] - covar_phenoB.val[0,0]) < 1e-6, "Expect a single case to get the same prediction as a set of cases" #Predict with model test on some train and some test some_idx = range(covar.iid_count) some_idx.remove(train_idx[0]) some_idx.remove(test_idx[0]) covariate_some = covar[some_idx,:] K0_whole_some = K0[:,some_idx] predicted_phenoC, covar_phenoC = fastlmm.predict(K0_whole_test=K0_whole_some, X=covariate_some,count_A1=False) for idxC, iidC in enumerate(predicted_phenoC.iid): meanC = predicted_phenoC.val[idxC] varC = covar_phenoC.val[idxC,idxC] if iidC in predicted_pheno.iid: predicted_pheno_ref = predicted_pheno covar_pheno_ref = covar_pheno else: assert iidC in predicted_phenoB.iid predicted_pheno_ref = predicted_phenoB covar_pheno_ref = covar_phenoB idx_ref = predicted_pheno_ref.iid_to_index([iidC])[0] mean_ref = predicted_pheno_ref.val[idx_ref] var_ref = covar_pheno_ref.val[idx_ref,idx_ref] assert np.abs(meanC - mean_ref) < 1e-6 assert np.abs(varC - var_ref) < 1e-6 def test_snps(self): logging.info("TestLmmTrain test_snps") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = 
np.r_[0:10] # the first 10 iids # Show it using the snps G0_train = self.snpreader_whole[train_idx,:] covariate_train3 = self.covariate_whole[train_idx,:].read() pheno_train3 = self.pheno_whole[train_idx,:].read() pheno_train3.val = G0_train[:,0:1].read().val*2 #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".") #pylab.show() #Learn model, save, load fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=G0_train, X=covariate_train3, y=pheno_train3) filename = self.tempout_dir + "/model_snps.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm3x, filename) fastlmm3 = joblib.load(filename) #Predict with model (test on train) predicted_pheno, covar = fastlmm3.predict(K0_whole_test=G0_train, X=covariate_train3,count_A1=False) #test on train output_file = self.file_name("snps") Dat.write(output_file,predicted_pheno) ### Plot training x and y, and training x with predicted y #pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val,".",G0_train[:,0:1].read().val[:,0],predicted_pheno.val,".") #pylab.show() ### Plot y and predicted y (test on train) #pheno_actual = pheno_train3.val[:,0] #pylab.plot(pheno_actual,predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"snps") def test_kernel(self): logging.info("TestLmmTrain test_kernel") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids # Show it using the snps K0_train = self.snpreader_whole[train_idx,:].read_kernel(Unit()) covariate_train3 = self.covariate_whole[train_idx,:].read() pheno_train3 = self.pheno_whole[train_idx,:].read() pheno_train3.val = self.snpreader_whole[train_idx,0:1].read().val*2 assert np.array_equal(K0_train.iid,covariate_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)" assert np.array_equal(K0_train.iid,pheno_train3.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)" 
#pylab.plot(G0_train[:,0:1].read().val[:,0], pheno_train3.val[:,0],".") #pylab.show() #Learn model, save, load fastlmm3x = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train3, y=pheno_train3) filename = self.tempout_dir + "/model_snps.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm3x, filename) fastlmm3 = joblib.load(filename) #Predict with model (test on train) predicted_pheno, covar = fastlmm3.predict(K0_whole_test=K0_train, X=covariate_train3,count_A1=False) #test on train output_file = self.file_name("kernel") Dat.write(output_file,predicted_pheno) #### Plot training x and y, and training x with predicted y #pylab.plot(self.snpreader_whole[train_idx,0:1].read().val[:,0], pheno_train3.val,".",self.snpreader_whole[train_idx,0:1].read().val[:,0],predicted_pheno.val,".") #pylab.show() #### Plot y and predicted y (test on train) #pheno_actual = pheno_train3.val[:,0] #pylab.plot(pheno_actual,predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"snps") #"kernel" and "snps" test cases should give the same results def test_kernel_one(self): logging.info("TestLmmTrain test_kernel_one") train_idx = np.r_[10:self.snpreader_whole.iid_count] # iids 10 and on test_idx = np.r_[0:10] # the first 10 iids K0_train = SnpKernel(self.snpreader_whole[train_idx,:],standardizer=Unit()) covariate_train = self.covariate_whole[train_idx,:] pheno_train = self.pheno_whole[train_idx,:] assert np.array_equal(K0_train.iid,covariate_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)" assert np.array_equal(K0_train.iid,pheno_train.iid), "Expect iids to be the same (so that early and late Unit standardization will give the same result)" fastlmm1 = FastLMM(GB_goal=2).fit(K0_train=K0_train, X=covariate_train, y=pheno_train) filename = self.tempout_dir + "/model_kernel_one.flm.p" pstutil.create_directory_if_necessary(filename) joblib.dump(fastlmm1, filename) fastlmm2 = 
joblib.load(filename) # predict on test set G0_test = self.snpreader_whole[test_idx,:] covariate_test = self.covariate_whole[test_idx,:] predicted_pheno, covar = fastlmm2.predict(K0_whole_test=G0_test, X=covariate_test,count_A1=False) output_file = self.file_name("kernel_one") Dat.write(output_file,predicted_pheno) pheno_actual = self.pheno_whole[test_idx,:].read().val[:,0] #pylab.plot(pheno_actual, predicted_pheno.val,".") #pylab.show() self.compare_files(predicted_pheno,"one") #Expect same results as SNPs "one" def compare_files(self,answer,ref_base): reffile = TestFeatureSelection.reference_file("fastlmm/"+ref_base+".dat") reference=Dat(reffile).read() assert np.array_equal(answer.col,reference.col), "sid differs. File '{0}'".format(reffile) assert np.array_equal(answer.row,reference.row), "iid differs. File '{0}'".format(reffile) for iid_index in xrange(reference.row_count): for sid_index in xrange(reference.col_count): a_v = answer.val[iid_index,sid_index] r_v = reference.val[iid_index,sid_index] assert abs(a_v - r_v) < 1e-4 or abs(a_v - r_v)/abs(r_v) < 1e5, "Value at {0},{1} differs too much from file '{2}'".format(iid_index,sid_index,reffile) def test_doctest(self): old_dir = os.getcwd() os.chdir(os.path.dirname(os.path.realpath(__file__))+"/..") result = doctest.testfile("../fastlmm_predictor.py") os.chdir(old_dir) assert result.failed == 0, "failed doc test: " + __file__