def setUpClass(self):
    from fastlmm.util.util import create_directory_if_necessary
    create_directory_if_necessary(self.tempout_dir, isfile=False)
    self.pythonpath = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", ".."))
    self.snpreader_whole = Bed(self.pythonpath + "/tests/datasets/synth/all")
    self.covariate_whole = Pheno(self.pythonpath + "/tests/datasets/synth/cov.txt")
    self.pheno_whole = Pheno(self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt")
def getData(filename="", mph=3, UseCov=False): sFil = Bed(filename) yFil = Pheno(filename + ".fam") Q = [] if isfile(filename + ".cov") and UseCov: QFil = Pheno(filename + ".cov") [sFil, yFil, QFil] = intersect_apply([sFil, yFil, QFil]) if isfile(filename + ".phen"): yFil = Pheno(filename + ".phen") [sFil, yFil] = intersect_apply([sFil, yFil]) return [yFil, sFil]
def read_covariates(covar_file, ids_to_match, missing):
    ## Read a covariate file and reorder to match ids_to_match ##
    # Read covariate file
    covar_f = Pheno(covar_file, missing=missing).read()
    ids = covar_f.iid
    # Get covariate values
    n_X = covar_f._col.shape[0] + 1
    X = np.ones((covar_f.val.shape[0], n_X))
    X[:, 1:n_X] = covar_f.val
    # Get covariate names
    X_names = np.zeros((n_X), dtype='S10')
    X_names[0] = 'Intercept'
    X_names[1:n_X] = np.array(covar_f._col, dtype='S20')
    # Remove NAs
    NA_rows = np.isnan(X).any(axis=1)
    n_NA_row = np.sum(NA_rows)
    if n_NA_row > 0:
        print('Number of rows removed from covariate file due to missing observations: ' + str(n_NA_row))
        X = X[~NA_rows]
        ids = ids[~NA_rows]
    id_dict = id_dict_make(ids)
    # Match with pheno_ids
    ids_to_match_tuples = [tuple(x) for x in ids_to_match]
    common_ids = id_dict.viewkeys() & set(ids_to_match_tuples)
    pheno_in = np.array([(tuple(x) in common_ids) for x in ids_to_match])
    match_ids = ids_to_match[pheno_in, :]
    X_id_match = np.array([id_dict[tuple(x)] for x in match_ids])
    X = X[X_id_match, :]
    return [X, X_names, pheno_in]
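# A short usage sketch for read_covariates above (illustrative only: the
# 'covar.txt' path and the example FID/IID pairs are assumptions, and
# id_dict_make is the unshown helper the function relies on).
import numpy as np

ids_to_match = np.array([['fam0', 'iid0'], ['fam1', 'iid1']])
X, X_names, pheno_in = read_covariates('covar.txt', ids_to_match, missing='NA')
print(X_names)  # first entry is the added 'Intercept' column
print(X.shape)  # rows of X are reordered to match ids_to_match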
def estVar(self, num, epsilon):
    filename = self.BED.filename
    y = Pheno(filename + ".fam").read().val[:, 3]
    varEsts = self.divideData(filename, num=num)
    if epsilon < 0:
        return varEsts[0]
    e1 = .1 * epsilon
    e2 = .45 * epsilon
    e3 = .45 * epsilon
    vary = self.estVarY(y, e1)
    se2 = sum([v[1] for v in varEsts]) / float(num) + Lap(0.0, vary / (e2 * float(num)))
    if se2 < 0:
        se2 = 0
    if se2 > vary:
        se2 = vary
    sg2 = sum([v[0] for v in varEsts]) / float(num) + Lap(0.0, vary / (e3 * float(num)))
    if sg2 < 0:
        sg2 = .01 * vary
    if sg2 > vary:
        sg2 = vary
    return [sg2, se2]
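# estVar above draws Laplace noise via an unshown helper Lap(loc, scale),
# consistent with differentially private variance estimation. A minimal
# stand-in (an assumption; the original module's implementation is not shown):
import numpy as np

def Lap(loc, scale):
    # One draw from a Laplace distribution with the given location and scale,
    # as used for the privacy noise added to each variance component.
    return np.random.laplace(loc=loc, scale=scale)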
def read_phenotype(phenofile, missing_char='NA', phen_index=1):
    """Read a phenotype file and remove missing values.

    Args:
        phenofile : :class:`str`
            path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ...
        missing_char : :class:`str`
            The character that denotes a missing phenotype value; 'NA' by default.
        phen_index : :class:`int`
            The index of the phenotype (counting from 1) if multiple phenotype columns are present in phenofile.

    Returns:
        y : :class:`gtarray`
            non-missing phenotype values from the specified column of phenofile, with the corresponding individual IDs (IID) attached.
    """
    pheno = Pheno(phenofile, missing=missing_char)[:, phen_index - 1].read()
    y = np.array(pheno.val)
    y = y.reshape((y.shape[0], 1))  # reshape returns a new array, so the result must be assigned
    pheno_ids = np.array(pheno.iid)[:, 1]
    # Remove y NAs
    y_not_nan = np.logical_not(np.isnan(y[:, 0]))
    if np.sum(y_not_nan) < y.shape[0]:
        y = y[y_not_nan, :]
        pheno_ids = pheno_ids[y_not_nan]
    print('Number of non-missing phenotype observations: ' + str(y.shape[0]))
    return gtarray(y, ids=pheno_ids)
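# A minimal usage sketch for read_phenotype above. Hedged: 'pheno.txt' is a
# hypothetical path, and gtarray is the array class from this codebase (not
# part of pysnptools), so the attributes available on the result depend on it.
y = read_phenotype('pheno.txt', missing_char='NA', phen_index=1)
# y carries both the phenotype values and the matching IIDs in one object,
# so downstream code can align it against genotypes by individual ID.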
def test_single_snp(args):
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis], 2, axis=1)
    if args.sample_indices_file is not None:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero((phenotypes['type'] == 'training').values)[0]
    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes[phenotypes['type'] == 'training'].copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno[['fid', 'iid', trait]].to_csv(pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)
        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps, pheno, K0=K0, count_A1=True, GB_goal=args.GB_goal)
        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_file)
        results_df.to_hdf(result_file, trait)

        if args.manhattan:
            plot_file = os.path.join(args.output_dir, 'manhattan.%s.pdf' % trait)
            logger.info('create Manhattan plot: ' + plot_file)
            plt.clf()
            flutil.manhattan_plot(results_df.as_matrix(["Chr", "ChrPos", "PValue"]), pvalue_line=1e-5, xaxis_unit_bp=False)
            plt.savefig(plot_file)
def test_intersection(self):
    from pysnptools.standardizer import Unit
    from pysnptools.kernelreader import SnpKernel
    from pysnptools.snpreader import Pheno
    from pysnptools.kernelreader._subset import _KernelSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    snps_all = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=False)
    k = SnpKernel(snps_all, stdizer.Identity())

    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")
    pheno = pheno[1:, :]  # To test intersection we remove an iid from pheno

    k1, pheno = intersect_apply([k, pheno])  # SnpKernel is special because it standardizes AFTER intersecting.
    assert isinstance(k1.snpreader, _SnpSubset) and not isinstance(k1, _KernelSubset)

    # What happens with fancy selection?
    k2 = k[::2]
    assert isinstance(k2, SnpKernel)

    logging.info("Done with test_intersection")
def __init__(self, filename, snpfile="", params="", n0=-1, n1=-1):
    self.BED = Bed(filename)
    self.pheno = Pheno(filename + ".fam")
    self.y = self.pheno.read().val[:, 3]
    self.y = self.y - 1.0
    self.params = params
    n = len(self.y)
    if n0 > 0:
        print("Initiate with n0")
        I0 = [i for i in range(0, n) if self.y[i] == 0.0]
        I0 = I0[:n0]
        I1 = [i for i in range(0, n) if self.y[i] == 1.0]
        I1 = I1[:n1]
        I0.extend(I1)
        self.y = self.y[I0]
        self.BED = self.BED[I0, :]
    try:
        if len(snpfile) > 0:
            fil = open(snpfile)
            lines = fil.readlines()
            fil.close()
            self.snps = [l.strip() for l in lines]
        else:
            self.snps = self.BED.sid
    except:
        print("Error loading SNPs!")
        sys.exit()
    self.setUp()
    self.n = len(self.y)
    print("Number of individuals: " + str(self.n))
    self.Cov = []
    self.params = ""  # NOTE: this overwrites the params argument saved above
def divideData(self, filename, num=5, mph=3, delet=True):
    print("Estimating heritability using " + str(num) + " components")
    direct = "TEMP"
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")
    n = sFil.iid_count
    reOrd = perm(n)
    yFil = yFil[reOrd, :]
    sFil = sFil[reOrd, :]
    y = yFil.read().val[:, 3]
    div = [int(math.ceil(i * n / float(num))) for i in range(0, num + 1)]
    varEsts = []
    for i in range(0, num):
        print("For component " + str(i))
        # Slice the reordered reader so the genotypes stay aligned with ytemp
        # (slicing self.BED here would ignore the permutation applied above).
        sFilTemp = sFil[div[i]:div[i + 1], :]
        Xtemp = sFilTemp.read().standardize().val
        ytemp = y[div[i]:div[i + 1]]
        varEsts.append(self.VarCalc.RealVar(ytemp, Xtemp))
    return varEsts
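# divideData above relies on an unshown helper perm(n) to shuffle the sample
# order before splitting into components. A plausible stand-in (an assumption;
# the original module's implementation is not shown):
import numpy as np

def perm(n):
    # Random permutation of the n sample indices
    return np.random.permutation(n)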
def loadData(filename):
    mph = 3
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")
    y = yFil.read().val[:, mph]
    y = [i - 1 for i in y]
    return [y, sFil]
def read_phen(self, fn_phen=None):
    """ read phenotype file """
    PH = Pheno(fn_phen)
    PHOB = PH.read()
    self.Y = PHOB.val
    self.SID = PHOB.iid[:, 1]
def getData(filename):
    mph = 3
    sFil = Bed(filename)
    yFil = Pheno(filename + ".fam")
    X = sFil.read().standardize().val
    y = yFil.read().val[:, mph]
    return [y, sFil]
def getData(filename):
    mph = 3
    sFil = Bed(filename, count_A1=False)  # Bed object
    yFil = Pheno(filename + ".fam")
    y = yFil.read().val[:, mph]
    y = [i - 1 for i in y]  # the last column of the .fam file is the disease state of each data owner
    return [y, sFil]
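# Usage sketch for getData above (illustrative): 'mydata' is a hypothetical
# PLINK prefix, so 'mydata.bed', 'mydata.bim', and 'mydata.fam' must exist.
y, sFil = getData('mydata')
# PLINK .fam files code control/case status as 1/2; after the "i - 1" shift,
# y holds 0/1 labels ready for a classifier or association test.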
def read_covariates(covar, pheno_ids=None, missing_char='NA'):
    covar = Pheno(covar, missing=missing_char).read()
    X = np.array(covar.val)
    X = gtarray(X, ids=np.array(covar.iid)[:, 1])
    if pheno_ids is not None:
        in_covar = np.array([x in X.id_dict for x in pheno_ids])
        if np.sum(~in_covar) > 0:
            raise ValueError('Missing covariate values for some phenotyped individuals')
    X.fill_NAs()
    return X
def _pheno_fixup(pheno_input, iid_if_none=None, missing='-9'):
    try:
        ret = Pheno(pheno_input, iid_if_none, missing=missing)
        ret.iid  # doing this just to force file load
        return ret
    except:
        return _snps_fixup(pheno_input, iid_if_none=iid_if_none)
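# Minimal sketch of how a fixup helper like _pheno_fixup gets used (hedged:
# 'pheno.txt' is a hypothetical path, and _pheno_fixup is a private fastlmm
# helper whose exact behavior may differ between versions). It tries to
# interpret the input as a Pheno source and otherwise falls back to
# _snps_fixup, so callers can pass either a filename or an existing reader.
pheno = _pheno_fixup('pheno.txt', missing='-9')
print(pheno.iid_count)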
def _sel_plus_pc(self, h2, force_low_rank, force_full_rank, count_A1=None):
    do_plot = False
    use_cache = False

    # define file names
    bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
    phen_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    pcs_fn = os.path.join(self.tempout_dir, "sel_plus_pc.pcs.txt")
    if not (use_cache and os.path.exists(pcs_fn)):
        from fastlmm.util import compute_auto_pcs
        covar = compute_auto_pcs(bed_fn, count_A1=count_A1)
        logging.info("selected number of PCs: {0}".format(covar["vals"].shape[1]))
        Pheno.write(pcs_fn, SnpData(iid=covar['iid'], sid=covar['header'], val=covar['vals']))
    else:
        logging.info("Using top pcs's cache")
        covar = Pheno(pcs_fn)

    mf_name = "lmp"  # "lmpl" "local", "coreP", "nodeP", "socketP", "nodeE", "lmp"
    runner = mf_to_runner_function(mf_name)(20)

    logging.info("Working on h2={0},force_low_rank={1},force_full_rank={2}".format(h2, force_low_rank, force_full_rank))
    result_file_name = "sel_plus_pc_{0}".format("h2IsHalf" if h2 == .5 else "h2Search")
    output_file_name = os.path.join(self.tempout_dir, result_file_name) + ".txt"
    results = single_snp_select(test_snps=bed_fn,
                                G=bed_fn,
                                pheno=phen_fn,
                                k_list=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50,
                                        60, 70, 80, 90, 100, 125, 160, 200, 250, 320, 400,
                                        500, 630, 800, 1000],
                                h2=h2,
                                n_folds=self.pythonpath + "/tests/datasets/synth/DebugEmitFolds.txt",
                                covar=covar,
                                output_file_name=output_file_name,
                                force_low_rank=force_low_rank,
                                force_full_rank=force_full_rank,
                                GB_goal=2,
                                count_A1=False
                                #runner = runner
                                )
    logging.info(results.head())
    self.compare_files(results, result_file_name)
def test_c_reader_pheno(self):
    snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()

    self.assertEqual(np.float64, snpdata1.val.dtype)

    snpdata1.val[1, 0] = np.NaN  # Inject a missing value to test writing and reading missing values
    output = "tempdir/snpreader/toydata.phe"
    create_directory_if_necessary(output)
    Pheno.write(output, snpdata1)
    snpreader = Pheno(output)
    _fortesting_JustCheckExists().input(snpreader)
    s = str(snpreader)
    snpdata2 = snpreader.read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata2.val, decimal=10)

    snpdata1 = Pheno(self.currentFolder + "/examples/toydata.phe").read()
    import pysnptools.util.pheno as pstpheno
    dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe", missing="")
    snpdata3 = Pheno(dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    dict = pstpheno.loadOnePhen(self.currentFolder + "/examples/toydata.phe", missing="", vectorize=True)
    assert len(dict['vals'].shape) == 1, "test 1-d array of values"
    snpdata3 = Pheno(dict).read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata3.val, decimal=10)

    snpdata4 = Pheno(None, iid_if_none=snpdata1.iid)
    assert (snpdata4.row == snpdata1.row).all() and snpdata4.col_count == 0

    snpdata5 = Pheno(self.currentFolder + "/examples/toydata.id.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata5.val, decimal=10)

    snpdata6 = Pheno(self.currentFolder + "/examples/toydata.fid.phe").read()
    np.testing.assert_array_almost_equal(snpdata1.val, snpdata6.val, decimal=10)
def test_covar_by_chrom_mixing(self):
    logging.info("TestSingleSnpLeaveOutOneChrom test_covar_by_chrom_mixing")
    test_snps = Bed(self.bedbase)
    pheno = self.phen_fn
    covar = Pheno(self.cov_fn).read()
    covar = SnpData(iid=covar.iid, sid=["pheno-1"], val=covar.val)
    covar_by_chrom = {chrom: self.cov_fn for chrom in xrange(1, 6)}
    output_file = self.file_name("covar_by_chrom_mixing")
    frame = single_snp(test_snps,
                       pheno,
                       covar=covar,
                       covar_by_chrom=covar_by_chrom,
                       output_file_name=output_file)
    self.compare_files(frame, "covar_by_chrom_mixing")
def test_intersection_Dist2Snp(self):
    from pysnptools.snpreader._dist2snp import _Dist2Snp
    from pysnptools.snpreader import Pheno
    from pysnptools.distreader._subset import _DistSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    dist_all = DistNpz(self.currentFolder + "/../examples/toydata.dist.npz")
    k = dist_all.as_snp(max_weight=25)

    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")
    pheno = pheno[1:, :]  # To test intersection we remove an iid from pheno

    k1, pheno = intersect_apply([k, pheno])
    assert isinstance(k1.distreader, _DistSubset) and not isinstance(k1, _SnpSubset)

    # What happens with fancy selection?
    k2 = k[::2, :]
    assert isinstance(k2, _Dist2Snp)

    logging.info("Done with test_intersection")
def test_intersection_Snp2Dist(self):
    from pysnptools.distreader._snp2dist import _Snp2Dist
    from pysnptools.snpreader import Pheno, Bed
    from pysnptools.distreader._subset import _DistSubset
    from pysnptools.snpreader._subset import _SnpSubset
    from pysnptools.util import intersect_apply

    snp_all = Bed(self.currentFolder + "/../examples/toydata.5chrom.bed", count_A1=True)
    k = snp_all.as_dist(max_weight=2)

    pheno = Pheno(self.currentFolder + "/../examples/toydata.phe")
    pheno = pheno[1:, :]  # To test intersection we remove an iid from pheno

    k1, pheno = intersect_apply([k, pheno])
    assert isinstance(k1.snpreader, _SnpSubset) and not isinstance(k1, _DistSubset)

    # What happens with fancy selection?
    k2 = k[::2, :]
    assert isinstance(k2, _Snp2Dist)

    logging.info("Done with test_intersection")
def test_multipheno(self):
    logging.info("test_multipheno")

    random_state = RandomState(29921)
    pheno_reference = Pheno(self.phen_fn).read()
    for pheno_count in [2, 5, 1]:
        val = random_state.normal(loc=pheno_count, scale=pheno_count, size=(pheno_reference.iid_count, pheno_count))
        pheno_col = ['pheno{0}'.format(i) for i in range(pheno_count)]
        pheno_multi = SnpData(iid=pheno_reference.iid, sid=pheno_col, val=val)

        reference = pd.concat([
            single_snp(test_snps=self.bed, pheno=pheno_multi[:, pheno_index], covar=self.cov_fn)
            for pheno_index in range(pheno_count)
        ])
        frame = single_snp_scale(test_snps=self.bed, pheno=pheno_multi, covar=self.cov_fn)

        assert len(frame) == len(reference), "# of pairs differs from reference (pheno_count={0})".format(pheno_count)
        for sid in sorted(set(reference.SNP)):  # This ignores which pheno produces which pvalue
            pvalue_frame = np.array(sorted(frame[frame['SNP'] == sid].PValue))
            pvalue_reference = np.array(sorted(reference[reference['SNP'] == sid].PValue))
            # .all() must be called; a bare ".all" is a method object and always truthy
            assert (abs(pvalue_frame - pvalue_reference) < 1e-5).all(), "pair {0} differs too much from reference".format(sid)
                    default='NA')
parser.add_argument('--no_h2_estimate',
                    action='store_true',
                    default=False,
                    help='Suppress output of h2 estimate')

args = parser.parse_args()

##### Check minimal model is specified #####
if args.mean_covar is None and args.var_covar is None and args.random_gts is None:
    raise ValueError('Must specify at least one of: mean_covar, var_covar, random_gts')

####################### Read in data #########################
#### Read phenotype ###
pheno = Pheno(args.phenofile, missing=args.missing_char).read()
y = np.array(pheno.val)
pheno_ids = np.array(pheno.iid)
if y.ndim == 1:
    pass
elif y.ndim == 2:
    y = y[:, args.phen_index - 1]
else:
    raise ValueError('Incorrect dimensions of phenotype array')
# Remove y NAs
y_not_nan = np.logical_not(np.isnan(y))
if np.sum(y_not_nan) < y.shape[0]:
    y = y[y_not_nan]
    pheno_ids = pheno_ids[y_not_nan, :]
# Make id dictionary
print('Number of non-missing y observations: ' + str(y.shape[0]))
# Via NumPy-style indexing, these allow reading by name and genetic property
# Topic: Other SnpReaders and how to write

# Read from the PLINK phenotype file (text) instead of a Bed file
# Looks like:
# cid0P0 cid0P0 0.4853395139922632
# cid1P0 cid1P0 -0.2076984565752155
# cid2P0 cid2P0 1.4909084058931985
# cid3P0 cid3P0 -1.2128996652683697
# cid4P0 cid4P0 0.4293203431508744
# ...
from pysnptools.snpreader import Pheno
phenoreader = Pheno("pheno_10_causals.txt")
print phenoreader, phenoreader.iid_count, phenoreader.sid_count, phenoreader.sid, phenoreader.pos
# Pheno('pheno_10_causals.txt') 500 1 ['pheno0'] [[ nan nan nan]]
phenodata = phenoreader.read()
print phenodata.val
# [[  4.85339514e-01]
#  [ -2.07698457e-01]
#  [  1.49090841e+00]
#  [ -1.21289967e+00]
#  ...

# Write 1st 10 iids and sids of Bed data into Pheno format
snpdata1010 = Bed("all.bed")[:10, :10].read()
Pheno.write("deleteme1010.txt", snpdata1010)

# Write it to Bed format
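# A minimal sketch of the Bed-format write that the comment above leads into
# (hedged: this reuses the snpdata1010 object created just above and writes to
# a scratch filename of our choosing).
from pysnptools.snpreader import Bed
Bed.write("deleteme1010.bed", snpdata1010, count_A1=False)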
                val[0::4, SNPsIndex:SNPsIndex + 1][bytes >= 3] = byteThree
            val = val[iid_index, :]  # reorder or trim any extra allocation
            if not SnpReader._array_properties_are_ok(val, order, dtype):
                val = val.copy(order=order)
            self._close_bed()
        return val


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    if True:
        from pysnptools.util import example_file
        pheno_fn = example_file("pysnptools/examples/toydata.phe")

    if False:
        from pysnptools.snpreader import Pheno, Bed
        import pysnptools.util as pstutil
        import os
        print(os.getcwd())
        snpdata = Pheno('../examples/toydata.phe').read()  # Read data from Pheno format
        pstutil.create_directory_if_necessary("tempdir/toydata.5chrom.bed")
        Bed.write("tempdir/toydata.5chrom.bed", snpdata, count_A1=False)  # Write data in Bed format

    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS)
    # There is also a unit test case in 'pysnptools\test.py' that calls this doc test
def run_fastlmm(args):
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis], 2, axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, transpose=args.transpose_x, snp_indices=sel_snps, std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' % test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)
    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)  # np.seed does not exist; the seed lives in np.random

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)
        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :], y=pheno, K0_train=K0, penalty=args.penalty, Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)
        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :], K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values

        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)
        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
def test_old(self):
    do_plot = False

    from fastlmm.feature_selection.feature_selection_two_kernel import FeatureSelectionInSample
    from pysnptools.util import intersect_apply

    logging.info("TestSingleSnpAllPlusSelect test_old")

    bed_fn = self.pythonpath + "/tests/datasets/synth/all.bed"
    pheno_fn = self.pythonpath + "/tests/datasets/synth/pheno_10_causals.txt"
    cov_fn = self.pythonpath + "/tests/datasets/synth/cov.txt"

    # load data ###################################################################
    snp_reader = Bed(bed_fn, count_A1=False)
    pheno = Pheno(pheno_fn)
    cov = Pheno(cov_fn)

    # intersect sample ids
    snp_reader, pheno, cov = intersect_apply([snp_reader, pheno, cov])

    # read in snps
    # partition snps on chr5 vs rest
    test_chr = 5
    G0 = snp_reader[:, snp_reader.pos[:, 0] != test_chr].read(order='C').standardize()
    test_snps = snp_reader[:, snp_reader.pos[:, 0] == test_chr].read(order='C').standardize()

    y = pheno.read().val[:, 0]
    y -= y.mean()
    y /= y.std()

    # load covariates
    X_cov = cov.read().val
    X_cov.flags.writeable = False

    # invoke feature selection to learn which SNPs to use to build G1
    logging.info("running feature selection conditioned on background kernel")
    # partition data into the first 50 SNPs on chr1 and all but chr1
    select = FeatureSelectionInSample(max_log_k=7, n_folds=7, order_by_lmm=True, measure="ll", random_state=42)
    best_k, feat_idx, best_mix, best_delta = select.run_select(G0.val, G0.val, y, cov=X_cov)

    # plot out of sample error
    if do_plot:
        select.plot_results(measure="ll")
        # select.plot_results(measure="mse")

    # print results
    logging.info("best_k:{0}".format(best_k))
    logging.info("best_mix:{0}".format(best_mix))
    logging.info("best_delta:{0}".format(best_delta))

    ###############################
    # use selected SNPs to build G1
    logging.info(feat_idx)
    G1 = G0[:, feat_idx]

    output_file_name = self.file_name("old")
    results_df = single_snp(test_snps,
                            pheno,
                            G0=G0,
                            G1=G1,
                            mixing=best_mix,
                            h2=None,
                            leave_out_one_chrom=False,
                            output_file_name=output_file_name,
                            count_A1=False)

    logging.info("results:")
    logging.info("#" * 40)
    logging.info(results_df.head())
    self.compare_files(results_df, "old")
    delim = ' '
if cols[0] == 'FID' and cols[1] == 'IID':
    pass
else:
    raise ValueError('First two columns of PGS must be FID, IID')
f.close()
ids = np.loadtxt(args.pgs, dtype='U', usecols=(0, 1), delimiter=delim, skiprows=1)
pgs_vals = np.loadtxt(args.pgs, usecols=tuple([x for x in range(2, cols.shape[0])]), delimiter=delim, skiprows=1)
pg = gtarray(pgs_vals.reshape((pgs_vals.shape[0], 1)), ids[:, 1], sid=cols[2:cols.shape[0]], fams=ids[:, 0])
print('Normalising PGS to have mean zero and variance 1')
pg.mean_normalise()
pg.scale()
# Read phenotype
print('Reading ' + str(args.phenofile))
pheno = Pheno(args.phenofile, missing=args.missing_char).read()
# pheno = Pheno('phenotypes/eduyears_resid.ped', missing='NA').read()
y = np.array(pheno.val)
pheno_ids = np.array(pheno.iid)[:, 1]
if y.ndim == 1:
    pass
elif y.ndim == 2:
    y = y[:, args.phen_index - 1]
else:
    raise ValueError('Incorrect dimensions of phenotype array')
# Remove y NAs
y_not_nan = np.logical_not(np.isnan(y))
if np.sum(y_not_nan) < y.shape[0]:
    y = y[y_not_nan]
    pheno_ids = pheno_ids[y_not_nan]
y = y - np.mean(y)
# Load FaST-LMM basic association test:
from fastlmm.association import single_snp
from pysnptools.snpreader import Ped
from pysnptools.snpreader import Pheno
from pysnptools.snpreader import wrap_plink_parser
import numpy as np
from sys import argv
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import fastlmm.util.util as flutil

script, inped_file, inpheno_file, results_dataframe, output_manhattan = argv

# Load snp data:
print("Loading variant data...")
ped_file = Ped(inped_file)
print("Loading phenotype data...")
pheno_fn = Pheno(inpheno_file)

# Run basic association test:
print("Running FaST-LMM single_snp test...")
results_df = single_snp(test_snps=ped_file, pheno=pheno_fn, leave_out_one_chrom=0, output_file_name=results_dataframe)

chromosome_starts = flutil.manhattan_plot(results_df.as_matrix(["Chr", "ChrPos", "PValue"]), pvalue_line=4.4e-7, xaxis_unit_bp=True)
plt.show()
# fig = plt.figure()
# fig.savefig(output_manhattan)