def load_data(snp_file, pheno_file, covar_file):
    """Load genotypes, phenotype and optional covariates, aligned on samples.

    Parameters
    ----------
    snp_file : path handed to ``Bed``.
    pheno_file : path handed to ``pysnptools.util.pheno.loadPhen``.
    covar_file : path handed to ``loadPhen``, or ``None`` for no covariates.

    Returns
    -------
    tuple ``(snp_data, pheno, covar, X, Y, K)`` where

    * ``snp_data`` is the result of ``read().standardize()`` on the
      intersected SNP reader,
    * ``pheno`` is the intersected phenotype dict,
    * ``covar`` is the covariates' ``'vals'`` entry, or ``None``,
    * ``X`` is ``snp_data.val`` rescaled by a single global factor,
    * ``Y`` is ``pheno['vals']`` centered and scaled per column,
    * ``K`` is ``X.dot(X.T)``.

    Raises
    ------
    AssertionError
        If ``pheno['iid']`` and ``snp_data.iid`` do not match element-wise.
    """
    genotypes = Bed(snp_file)
    pheno = pysnptools.util.pheno.loadPhen(pheno_file)

    # Restrict every input to the shared samples (via srutil.intersect_apply).
    if covar_file is None:
        genotypes, pheno = srutil.intersect_apply([genotypes, pheno])
        covar = None
    else:
        covar_table = pysnptools.util.pheno.loadPhen(covar_file)
        genotypes, pheno, covar_table = srutil.intersect_apply(
            [genotypes, pheno, covar_table])
        covar = covar_table['vals']

    snp_data = genotypes.read().standardize()

    # In-place on purpose: Y aliases pheno['vals'], so the returned pheno
    # carries the centered/scaled values as well.
    Y = pheno['vals']
    Y -= Y.mean(0)
    Y /= Y.std(0)

    # One global factor chosen so that sum(X**2) == iid_count, i.e. the
    # trace of K equals the number of individuals.
    scale = 1. / np.sqrt((snp_data.val ** 2).sum() / float(snp_data.iid_count))
    X = scale * snp_data.val
    K = X.dot(X.T)  # TODO use symmetric dot to speed this up

    assert np.all(pheno['iid'] == snp_data.iid), "the samples are not sorted"
    return snp_data, pheno, covar, X, Y, K
def loadData(bfile, extractSim, phenoFile, missingPhenotype='-9', loadSNPs=False, standardize=True):
    """Open a Bed file, optionally subset its SNPs and attach a phenotype.

    Parameters
    ----------
    bfile : path handed to ``Bed``.
    extractSim : path of a CSV file whose first column lists the SNP ids to
        keep, or ``None`` to keep every SNP. Blank rows are ignored.
    phenoFile : phenotype file passed to ``loadPheno``, or ``None``.
    missingPhenotype : forwarded unchanged to ``loadPheno``.
    loadSNPs : when true, ``read()`` is called on the reader before returning.
    standardize : when true *and* ``loadSNPs`` is true, ``standardize()`` is
        called on the read data. It has no effect if ``loadSNPs`` is false.

    Returns
    -------
    tuple ``(bed, phe)``; ``phe`` is ``None`` when ``phenoFile`` is ``None``.
    """
    bed = Bed(bfile)

    if extractSim is not None:
        # Context manager so the file is closed even if parsing fails.
        with open(extractSim) as f:
            # ``if row`` skips blank lines, which csv.reader yields as [].
            extractSnpsSet = {row[0] for row in csv.reader(f) if row}
        # enumerate() instead of xrange(): works on Python 2 and Python 3.
        keepSnpsInds = [i for i, sid in enumerate(bed.sid) if sid in extractSnpsSet]
        bed = bed[:, keepSnpsInds]

    phe = None
    if phenoFile is not None:
        bed, phe = loadPheno(bed, phenoFile, missingPhenotype)

    if loadSNPs:
        bed = bed.read()
        # Standardization is only applied to the result of read().
        if standardize:
            bed = bed.standardize()

    return bed, phe
def write_grm(K, out_file, num_snps=500000, K_index=None, bed_nold_file=None):
    """Write the lower triangle of a kinship matrix as a gzipped text table.

    One row is written per lower-triangle entry (diagonal included), with
    four tab-separated columns and no header: 1-based row index, 1-based
    column index, SNP count, and the kinship value (floats formatted with
    ``%0.6e``).
    NOTE(review): this looks like GCTA's ``.grm.gz`` layout -- confirm.

    Parameters
    ----------
    K : square 2-D array; only ``K.shape[0]`` and its lower triangle are used.
    out_file : destination path, written with gzip compression.
    num_snps : value used for the SNP-count column when ``bed_nold_file`` is
        not given.
    K_index : required when ``bed_nold_file`` is given. ``K_index[i]`` is the
        individual id of row ``i`` of ``K``; ids are matched against the
        integer-cast second column of the bed file's ``iid``.
    bed_nold_file : optional Bed file; when given, the SNP-count column is
        the number of SNPs that are non-missing in *both* individuals.

    Raises
    ------
    AssertionError
        If ``bed_nold_file`` is given without ``K_index``.
    KeyError
        If an id from ``K_index`` is absent from the bed file.
    """
    n = K.shape[0]
    rows, cols = np.tril_indices(n)

    # n*(n-1) is always even, so floor division is exact. True division
    # would hand np.zeros a float shape, which fails on Python 3.
    grm = np.zeros((n * (n - 1) // 2 + n, 4))
    grm[:, 0] = rows + 1
    grm[:, 1] = cols + 1
    grm[:, 2] = num_snps
    grm[:, 3] = K[rows, cols]

    # Replace the constant SNP count with per-pair non-missing counts.
    if bed_nold_file is not None:
        assert K_index is not None, 'K_index not provided'
        from pysnptools.snpreader.bed import Bed
        bed_nold = Bed(bed_nold_file, count_A1=True).read()

        # Builtin float/int: the np.float / np.int aliases no longer exist.
        not_nan = (~np.isnan(bed_nold.val)).astype(float)
        shared_counts = not_nan.dot(not_nan.T)

        id2ind = {
            ind: ind_i
            for ind_i, ind in enumerate(bed_nold.iid[:, 1].astype(int))
        }
        # Resolve each K row to its bed row once (n lookups) and fan out
        # with array indexing, rather than one lookup per triangle entry.
        nold_pos = np.array([id2ind[K_index[i]] for i in range(n)], dtype=np.intp)
        # Index with two arrays (a tuple index); a list of lists would be
        # read as a single array index by current NumPy.
        grm[:, 2] = shared_counts[nold_pos[rows], nold_pos[cols]]

    pd_grm = pd.DataFrame(grm, columns=['i', 'j', 'num_SNPs', 'K'])
    pd_grm['i'] = pd_grm['i'].astype(int)
    pd_grm['j'] = pd_grm['j'].astype(int)
    pd_grm.to_csv(out_file, compression='gzip', header=False, index=False,
                  sep='\t', float_format='%0.6e')
def _snps_fixup(snp_input, iid_if_none=None,count_A1=None): if isinstance(snp_input, str): return Bed(snp_input,count_A1=count_A1) if isinstance(snp_input, dict): return SnpData(iid=snp_input['iid'],sid=snp_input['header'],val=snp_input['vals']) if snp_input is None: assert iid_if_none is not None, "snp_input cannot be None here" return SnpData(iid_if_none, sid=np.empty((0),dtype='str'), val=np.empty((len(iid_if_none),0)),pos=np.empty((0,3)),name="") #todo: make a static factory method on SnpData return snp_input
def getChromosome(bfile, chrom):
    """Return the standardized, in-memory SNPs located on one chromosome.

    Opens ``bfile`` with ``Bed``, keeps the SNPs whose first ``pos`` column
    equals ``chrom``, then returns ``read().standardize()`` of that subset.

    NOTE(review): ``count_A1`` is not passed to ``Bed`` here, whereas
    ``getExcludedChromosome`` passes ``count_A1=True`` -- confirm whether the
    two are meant to use the same allele coding.
    """
    reader = Bed(bfile)
    on_chrom = reader.pos[:, 0] == chrom
    return reader[:, on_chrom].read().standardize()
def _fixupBed(bed): if isinstance(bed, str): return Bed(bed).read().standardize() else: return bed
import pandas as pd
import numpy as np
import leap.leapUtils as leapUtils
import leap.leapMain as leapMain
from pysnptools.snpreader.bed import Bed

# ---------------------------------------------------------------------------
# Analysis configuration
# ---------------------------------------------------------------------------
bfile = 'dataset1/dataset1'
phenoFile = bfile + '.phe'
chromosomes = range(1, 11)
prevalence = 0.001
# Output path for the per-chromosome eigendecomposition; it is the same on
# every iteration, so each chromosome overwrites the previous file.
eigenFile = 'temp_eigen.npz'

# ---------------------------------------------------------------------------
# Relatedness filtering: findRelated is run on the full standardized genotype
# data with a kinship cutoff of 0.05; its result is kept in indsToKeep.
# ---------------------------------------------------------------------------
bed = Bed(bfile).read().standardize()
indsToKeep = leapUtils.findRelated(bed, cutoff=0.05)

# ---------------------------------------------------------------------------
# Per-chromosome analysis
# ---------------------------------------------------------------------------
frame_list = []
for chrom in chromosomes:
    print()
    print('Analyzing chromosome', chrom, '...')

    # SNPs from every chromosome except the current one.
    bedExclude = leapUtils.getExcludedChromosome(bfile, chrom)

    # SNPs from the current chromosome only.
    bedTest = leapUtils.getChromosome(bfile, chrom)

    # Eigendecomposition is computed from the data that excludes the
    # chromosome under analysis.
    eigen = leapMain.eigenDecompose(bedExclude, outFile=eigenFile)
def getExcludedChromosome(bfile, chrom):
    """Return the standardized, in-memory SNPs from all but one chromosome.

    Opens ``bfile`` with ``Bed(..., count_A1=True)``, drops the SNPs whose
    first ``pos`` column equals ``chrom``, then returns
    ``read().standardize()`` of the remaining SNPs.
    """
    reader = Bed(bfile, count_A1=True)
    off_chrom = reader.pos[:, 0] != chrom
    return reader[:, off_chrom].read().standardize()