def __init__(self, x, y, sigbeta2, comm, rank, ncore, null='perm', maf=None):
    """Store the inputs and precompute the per-row genotype variance.

    Args:
        x: stored as ``self.gt`` (presumably the genotype matrix, one row
            per SNP -- confirm against the caller).
        y: stored as ``self.gx``.
        sigbeta2: stored unchanged as ``self.sigbeta2``.
        comm: communicator object, stored unchanged.
        rank: rank of this process, stored unchanged.
        ncore: number of cores; ``self.mpi`` is True when it exceeds 1.
        null: null model name.  ``'perm'`` sets ``self.sigx2`` to the
            variance of ``x`` along axis 1, ``'maf'`` sets it to a vector
            of ones of length ``x.shape[0]``.  Any other value leaves
            ``self.sigx2`` unset.
        maf: stored unchanged as ``self.maf``.
    """
    self.gt, self.gx = x, y
    self.sigbeta2 = sigbeta2
    self.comm, self.rank, self.ncore = comm, rank, ncore
    self.null = null
    self.maf = maf
    self.mpi = True if self.ncore > 1 else False

    # Results are filled in later by the compute step.
    self._pvals = self._qscores = None
    self._mu = self._sigma = None

    if self.null == 'maf':
        self.sigx2 = np.ones(self.gt.shape[0])
    elif self.null == 'perm':
        self.sigx2 = np.var(self.gt, axis=1)

    self.logger = MyLogger(__name__)
def __init__(self, x, y, comm, rank, ncore, masks = None):
    """Store the inputs for a Z-statistic calculation.

    Args:
        x: stored as ``self.gt``.
        y: stored as ``self.gx``.
        comm: communicator object, stored unchanged.
        rank: rank of this process, stored unchanged.
        ncore: number of cores; ``self.mpi`` is True when it exceeds 1.
        masks: optional masks; ``self.usemask`` records whether any were
            supplied.
    """
    self.gt, self.gx = x, y
    self._zstats = None
    self.rank, self.comm, self.ncore = rank, comm, ncore
    self.masks = masks
    self.usemask = masks is not None
    self.mpi = True if self.ncore > 1 else False
    self.logger = MyLogger(__name__)
def __init__(self, args):
    """Keep the parsed options and initialise every data slot to None.

    Args:
        args: options object, stored unchanged as ``self.args``.
    """
    self.logger = MyLogger(__name__)
    self.args = args

    # Genotype slots.
    self._gtcent = self._gtnorm = None
    # SNP / gene annotation and expression slots.
    self._snpinfo = self._geneinfo = self._expr = None
    # Cis-mask slots.
    self._cismaskcomp = self._cismasklist = None
    # Slots holding the data used for target-gene discovery.
    self._tgene_gtnorm = self._tgene_gtcent = self._tgene_expr = None
def __init__(self, x, y, comm, rank, ncore, outfile, niter=100000, seed=None):
    """Store the inputs for generating null JPA scores.

    Args:
        x: stored as ``self.gt``.
        y: stored as ``self.gx``.
        comm: communicator object, stored unchanged.
        rank: rank of this process, stored unchanged.
        ncore: number of cores; ``self.mpi`` is True when it exceeds 1.
        outfile: stored as ``self.outfile``.
        niter: stored as ``self._niter``.
        seed: when not None, seeds numpy's global random generator.
    """
    self.gt, self.gx = x, y
    self._niter = niter
    self.outfile = outfile
    self.rank, self.comm, self.ncore = rank, comm, ncore

    # Seeding is global: it affects every later np.random call in the process.
    if seed is not None:
        np.random.seed(seed)

    self.mpi = True if self.ncore > 1 else False
    self.logger = MyLogger(__name__)
def __init__(self, gtfile, samplefile, startsnp=0, endsnp=1e15, isdosage=True):
    """Remember the file locations and SNP window, then read the genotypes.

    Args:
        gtfile: stored as ``self._gtfile``.
        samplefile: stored as ``self._samplefile``.
        startsnp: stored as ``self._startsnp``.
        endsnp: stored as ``self._endsnp``.
        isdosage: stored as ``self._isdosage``; selects 6 meta columns when
            truthy and 5 otherwise.
    """
    self.logger = MyLogger(__name__)
    self._gtfile, self._samplefile = gtfile, samplefile
    self._startsnp, self._endsnp = startsnp, endsnp
    self._isdosage = isdosage
    self._meta_columns = 6 if self._isdosage else 5
    # Reading happens eagerly, as part of construction.
    self._read_genotypes()
def __init__(self, x, y, comm, rank, ncore, qcalc, masks, get_pvals=False, qnull_file=None, statmodel='zstat', target_fdr=None):
    """Store the inputs and options for a JPA calculation.

    Args:
        x: stored as ``self.gt``.
        y: stored as ``self.gx``.
        comm: communicator object, stored unchanged.
        rank: rank of this process, stored unchanged.
        ncore: number of cores; ``self.mpi`` is True when it exceeds 1.
        qcalc: stored unchanged as ``self.qcalc``.
        masks: masks or None; ``self.usemask`` records whether any were given.
        get_pvals: truthy to set ``self.get_empirical_pvals`` to True.
        qnull_file: stored unchanged as ``self.qnull_file``.
        statmodel: stored unchanged as ``self.statmodel``.
        target_fdr: stored unchanged; when not None, the ``adj_pvals`` and
            ``pass_fdr`` lists are created empty.
    """
    self.gt, self.gx = x, y
    self._pvals = self._qscores = self._jpa_pvals = None
    self.rank, self.comm, self.ncore = rank, comm, ncore
    self.mpi = True if self.ncore > 1 else False
    self.qcalc = qcalc
    self.masks = masks
    self.usemask = masks is not None
    self.get_empirical_pvals = True if get_pvals else False
    self.qnull_file = qnull_file
    self.statmodel = statmodel
    self.logger = MyLogger(__name__)
    self.target_fdr = target_fdr
    # The FDR bookkeeping lists only exist when an FDR target was requested.
    if self.target_fdr is not None:
        self.adj_pvals = []
        self.pass_fdr = []
def __init__(self, comm, rank):
    """Parse the command line on rank 0 and mirror the options on every rank.

    Args:
        comm: communicator used to broadcast the parsed options from rank 0.
        rank: rank of this process.
    """
    self.logger = MyLogger(__name__)
    self.rank = rank
    self.comm = comm

    # Only the root rank reads the command line; everyone receives the result.
    opts = self.parse_args() if self.rank == 0 else None
    opts = self.comm.bcast(opts, root=0)

    # Genotype inputs.
    self.vcf_file = opts.vcf_filename
    self.oxf_file = opts.oxf_filename
    self.isdosage = opts.isdosage
    self.fam_file = opts.fam_filename
    self.chrom = None if opts.chrom is None else int(opts.chrom)

    # Expression / annotation inputs and output prefix.
    self.gx_file = opts.gx_filename
    self.gxcorr_file = opts.gxcorr_filename
    self.gx_datafmt = opts.gx_datafmt
    self.gtf_file = opts.gtf_filename
    self.gxtrim = opts.gxtrim
    self.biotype = opts.biotype
    self.outprefix = opts.outprefix

    # SNP window: the user supplies a 1-based start, stored here 0-based.
    if opts.incsnps is None:
        self.startsnp = 0
        self.endsnp = 1e15  # an unusually high number to ensure all SNPs are read.
    else:
        self.startsnp = opts.incsnps[0] - 1
        self.endsnp = opts.incsnps[1]

    # Method and model options.
    self.jpa, self.rr = project.method_selector(opts.method)
    self.nullmodel = opts.nullmodel
    self.cismasking = opts.cismasking
    self.window = opts.window
    self.sigmabeta = opts.sigmabeta
    self.knn_nbr = opts.knn
    self.knncorr = False if opts.knn == 0 else True

    # Supplying a shuffle file implies shuffling.
    self.shuffle_file = opts.shuffle_file
    self.shuffle = True if self.shuffle_file is not None else opts.shuffle

    self.psnpcut = opts.psnpthres
    self.pgenecut = opts.pgenethres
    self.maf_file = opts.maf_filename
    self.jpanull_file = opts.qnullfile
    self.jpa_calc_null = project.need_new_jpanull_file(
        self.jpa, self.jpanull_file)
    self.jpanull_iter = opts.qnull_iter
    self.seed = opts.seed
    self.maketest = opts.maketest

    # Validation runs here, before the remaining attributes are assigned.
    self.check_inputs()

    self.crossmapfile = opts.crossmapfile
    self.target_fdr = opts.target_fdr
    self.usefdr = self.target_fdr is not None

    if self.rank == 0:
        self.logger.info('Method: {:s}'.format(opts.method))
        if self.rr:
            self.logger.info('Null Model: {:s}'.format(opts.nullmodel))
            self.logger.info('Sigma_beta: {:g}'.format(opts.sigmabeta))
class Args():
    """Command-line options, parsed on rank 0 and broadcast to every rank."""

    def __init__(self, comm, rank):
        """Parse the command line on rank 0 and mirror the options everywhere.

        Args:
            comm: communicator used to broadcast the parsed options.
            rank: rank of this process.
        """
        self.logger = MyLogger(__name__)
        self.rank = rank
        self.comm = comm

        args = None
        if self.rank == 0:
            args = self.parse_args()
        args = self.comm.bcast(args, root=0)

        self.vcf_file = args.vcf_filename
        self.oxf_file = args.oxf_filename
        self.isdosage = args.isdosage
        self.fam_file = args.fam_filename
        self.chrom = int(args.chrom) if args.chrom is not None else None

        self.gx_file = args.gx_filename
        self.gxcorr_file = args.gxcorr_filename
        self.gx_datafmt = args.gx_datafmt
        self.gtf_file = args.gtf_filename
        self.gxtrim = args.gxtrim
        self.biotype = args.biotype
        self.outprefix = args.outprefix

        if args.incsnps is not None:
            # The user supplies a 1-based start; it is stored 0-based.
            self.startsnp = args.incsnps[0] - 1
            self.endsnp = args.incsnps[1]
        else:
            self.startsnp = 0
            self.endsnp = 1e15  # an unusually high number to ensure all SNPs are read.

        self.jpa, self.rr = project.method_selector(args.method)
        self.nullmodel = args.nullmodel
        self.cismasking = args.cismasking
        self.window = args.window
        self.sigmabeta = args.sigmabeta
        self.knn_nbr = args.knn
        self.knncorr = True
        if args.knn == 0:
            self.knncorr = False

        self.shuffle = args.shuffle
        self.shuffle_file = args.shuffle_file
        if self.shuffle_file is not None:
            # Supplying a shuffle file implies shuffling.
            self.shuffle = True

        self.psnpcut = args.psnpthres
        self.pgenecut = args.pgenethres
        self.maf_file = args.maf_filename
        self.jpanull_file = args.qnullfile
        self.jpa_calc_null = project.need_new_jpanull_file(
            self.jpa, self.jpanull_file)
        self.jpanull_iter = args.qnull_iter
        self.seed = args.seed
        self.maketest = args.maketest

        self.check_inputs()

        self.crossmapfile = args.crossmapfile
        self.usefdr = False
        self.target_fdr = args.target_fdr
        if self.target_fdr is not None:
            self.usefdr = True

        if self.rank == 0:
            self.logger.info('Method: {:s}'.format(args.method))
            if self.rr:
                self.logger.info('Null Model: {:s}'.format(args.nullmodel))
                self.logger.info('Sigma_beta: {:g}'.format(args.sigmabeta))

    def parse_args(self):
        """Build the argument parser and parse ``sys.argv``.

        Returns:
            The ``argparse.Namespace`` produced by ``parser.parse_args()``.
        """
        self.logger.info('Running TEJAAS v{:s}'.format(project.version()))

        parser = argparse.ArgumentParser(
            description='Tejaas: Discover trans-eQTLs!')

        parser.add_argument('--vcf',
                            type=str,
                            dest='vcf_filename',
                            metavar='FILE',
                            help='Input VCF file in vcf.gz format')

        parser.add_argument('--oxf',
                            type=str,
                            dest='oxf_filename',
                            metavar='FILE',
                            help='Input Oxford file')

        parser.add_argument('--dosage',
                            dest='isdosage',
                            action='store_true',
                            help='Read dosages')

        parser.add_argument('--fam',
                            type=str,
                            dest='fam_filename',
                            metavar='FILE',
                            help='Input fam file')

        parser.add_argument('--chrom',
                            dest='chrom',
                            metavar='NUMBER',
                            help="Chromosome number of the genotype file")

        parser.add_argument(
            '--include-SNPs',
            type=snprange,
            dest='incsnps',
            metavar='START:END',
            help='Colon-separated index of SNPs to be included')

        parser.add_argument(
            '--gx',
            type=str,
            dest='gx_filename',
            metavar='FILE',
            help='input expression file for finding trans-eQTLs')

        parser.add_argument(
            '--gxcorr',
            type=str,
            dest='gxcorr_filename',
            metavar='FILE',
            help='input expression file for finding target genes')

        parser.add_argument(
            '--gxfmt',
            type=str,
            dest='gx_datafmt',
            metavar='GX_FORMAT',
            default='gtex',
            help=
            'Format of input gene expression file. Supported: gtex, cardiogencis and geuvadis'
        )

        parser.add_argument(
            '--biotype',
            nargs='*',
            type=biotype_fmt,
            dest='biotype',
            metavar='BIOTYPE_OPTIONS',
            default=['protein_coding', 'lncRNA'],
            help=
            'List of biotypes to be selected from the GENCODE annotation file. Supported options: protein_coding, lncRNA'
        )

        parser.add_argument('--gtf',
                            type=str,
                            dest='gtf_filename',
                            metavar='FILE',
                            help='input gtf file')

        parser.add_argument(
            '--trim',
            dest='gxtrim',
            action='store_true',
            help='Trim version number from GENCODE Ensembl IDs')

        parser.add_argument('--outprefix',
                            type=str,
                            dest='outprefix',
                            default='out',
                            metavar='STR',
                            help='prefix for all output files')

        parser.add_argument('--method',
                            default='rr',
                            type=method_strings,
                            dest='method',
                            metavar='STR',
                            help='which method to run: jpa / rr')

        parser.add_argument('--null',
                            default='perm',
                            type=null_strings,
                            dest='nullmodel',
                            metavar='STR',
                            help='which null model to use: perm / maf')

        parser.add_argument(
            '--cismask',
            dest='cismasking',
            action='store_true',
            help='Generate cismasks for the expression matrix for each SNP')

        # argparse applies ``type`` only to string defaults, so the default
        # must already be an int (the former ``1e6`` leaked through as float).
        parser.add_argument(
            '--window',
            type=int,
            default=1000000,
            dest='window',
            help='Window (number of base pairs) used for masking cis genes')

        parser.add_argument(
            '--prior-sigma',
            default=0.1,
            type=float,
            dest='sigmabeta',
            metavar='FLOAT',
            help=
            'standard deviation of the normal prior for reverse multiple linear regression'
        )

        parser.add_argument(
            '--knn',
            type=int,
            dest='knn',
            help=
            'Number of neighbours for KNN (use 0 if you do not want KNN correction)',
            default=0)

        parser.add_argument(
            '--psnpthres',
            default=0.0001,
            type=float,
            dest='psnpthres',
            metavar='PVAL',
            help=
            'target genes will be reported for trans-eQTLs, which are below this threshold p-value for RR/JPA statistics'
        )

        parser.add_argument(
            '--pgenethres',
            default=0.001,
            type=float,
            dest='pgenethres',
            metavar='PVAL',
            help=
            'target genes whose linear regression association with trans-eQTLs are below this threshold p-value will be reported'
        )

        parser.add_argument(
            '--jpanull',
            type=str,
            dest='qnullfile',
            help='Filename for storing / reading null JPA scores')

        parser.add_argument(
            '--jpanull-iter',
            default=100000,
            type=int,
            dest='qnull_iter',
            help='Number of iterations for creating null JPA scores')

        parser.add_argument(
            '--seed',
            default=None,
            type=int,
            dest='seed',
            help=
            'Seed the random generator for numpy, used for development purpose'
        )

        parser.add_argument(
            '--maf-file',
            type=str,
            dest='maf_filename',
            metavar='FILE',
            help='file name of the MAF, see Documentation for filetype')

        parser.add_argument('--shuffle',
                            dest='shuffle',
                            action='store_true',
                            help='Shuffle the genotypes randomly')

        parser.add_argument(
            '--shuffle-with',
            type=str,
            dest='shuffle_file',
            metavar='FILE',
            help='Shuffle the genotypes using the supplied donor IDs file')

        parser.add_argument('--test',
                            dest='maketest',
                            action='store_true',
                            help='whether to do test run')

        parser.add_argument('--crossmap',
                            type=str,
                            default=None,
                            dest='crossmapfile',
                            help='Crossmapability file (Saha, Battle 2018) ')

        parser.add_argument(
            '--fdrgenethres',
            default=None,
            type=float,
            dest='target_fdr',
            metavar='FDR',
            help=
            'enable FDR correction up to a certain cutoff for target gene discovery'
        )

        res = parser.parse_args()
        return res

    @staticmethod
    def _input_error(message):
        """Print *message* and abort validation with an ``AssertionError``."""
        print(message)
        raise AssertionError(message)

    def check_inputs(self):
        """Perform sanity checks on the input options (rank 0 only).

        The checks stop at the first failure.  Explicit ``raise`` statements
        are used instead of ``assert`` so the validation is not stripped when
        Python runs with ``-O``; the exception type is unchanged.

        Raises:
            AssertionError: a required option is missing, a given input file
                does not exist, or the output directory is not writable.
            OSError: the output directory could not be created.
        """
        if self.rank != 0:
            return

        # Check if any genotype file is specified.
        if self.vcf_file is None and self.oxf_file is None:
            self._input_error(
                'Input error: Specify either --vcf or --oxf. See --help for details.'
            )

        # An Oxford genotype file needs its accompanying sample file.
        if self.oxf_file is not None and self.fam_file is None:
            self._input_error(
                'Input error: Specify the sample file with --fam. See --help for details.'
            )

        # Check if gene expression and GTF files are specified.
        if self.gx_file is None:
            self._input_error(
                'Input error: Specify gene expression file. See --help for details'
            )
        if self.gtf_file is None:
            self._input_error(
                'Input error: Specify GENCODE file. See --help for details'
            )

        # Check if files exist.
        for filepath in [
                self.vcf_file, self.oxf_file, self.fam_file, self.gx_file,
                self.gtf_file, self.gxcorr_file
        ]:
            if filepath is not None and not os.path.isfile(filepath):
                self._input_error(
                    'File {:s} does not exist.'.format(filepath))

        # Check if output directory is writable / can be created.
        outdir = os.path.dirname(os.path.realpath(self.outprefix))
        try:
            if not os.path.exists(outdir):
                os.makedirs(outdir)
        except OSError as e:
            # Another process may have created the directory in the meantime.
            if e.errno != errno.EEXIST:
                print('Unable to create output directory: {:s}'.format(
                    outdir))
                raise
        if not (os.path.isdir(outdir)
                and os.access(outdir, os.W_OK | os.X_OK)):
            self._input_error(
                'Unable to create files in {:s}'.format(outdir))
class JPANULL:
    """Generate null JPA scores by sampling Z-statistics and write them out."""

    def __init__(self, x, y, comm, rank, ncore, outfile, niter=100000, seed=None):
        """Store the inputs for generating null JPA scores.

        Args:
            x: stored as ``self.gt``; passed on to ``ZSTATS``.
            y: stored as ``self.gx``; passed on to ``ZSTATS``.
            comm: communicator object.
            rank: rank of this process.
            ncore: number of cores; MPI mode is used when it exceeds 1.
            outfile: path the null scores are written to.
            niter: number of null scores to generate.
            seed: when not None, seeds numpy's global random generator.
        """
        self.gt = x
        self.gx = y
        self._niter = niter
        self.outfile = outfile
        self.rank = rank
        self.comm = comm
        self.ncore = ncore
        self.mpi = False
        if seed is not None:
            np.random.seed(seed)
        if self.ncore > 1:
            self.mpi = True
        self.logger = MyLogger(__name__)

    def write_qscores(self):
        """Write the null scores to ``self.outfile``, one per line.

        Every score equal to the maximum is left out of the file.
        """
        qmax = np.max(self._qscores)
        valid_qscores = self._qscores[np.where(self._qscores < qmax)]
        with open(self.outfile, 'w') as fout:
            for qnull in valid_qscores:
                fout.write(f"{qnull}\n")

    def jpascore(self, pvals):
        """Return the JPA score of a vector of p-values.

        NOTE: ``pvals`` is modified in place -- exact zeros are replaced by
        the smallest non-zero p-value so that the logarithm stays finite.

        Args:
            pvals: 1-D numpy array of p-values with at least one non-zero
                entry.

        Returns:
            The maximum cumulative sum computed over the (at most 100)
            smallest p-values.
        """
        min_nonzero = np.min(pvals[np.nonzero(pvals)])
        pvals[pvals == 0] = min_nonzero
        p = np.sort(pvals)
        n = p.shape[0]
        kmax = min(100, n)
        krange = [i + 1 for i in range(kmax)]
        digamma_n1 = special.digamma(n + 1)
        z = - ( np.log(p[:kmax]) - (special.digamma(krange) - digamma_n1) )
        zsum = np.cumsum(z)
        res = np.max(zsum)
        return res

    def slavejob(self, W, Q, Zmean, n):
        """Sample ``n`` null Z-vectors and compute their p-values and scores.

        Args:
            W: eigenvalues; ``W.shape[0]`` is taken as the number of genes.
            Q: eigenvector matrix matching ``W``.
            Zmean: vector added to every sampled Z-vector.
            n: number of null samples to draw.

        Returns:
            ``(pvals, qnull)``: the flat array of ``n * ngene`` two-sided
            p-values and the array of ``n`` JPA scores.
        """
        self.logger.debug('Rank {:d} calculating {:d} null JPA scores'.format(self.rank, n))
        ngene = W.shape[0]
        # Loop-invariant: the square root does not depend on the draw.
        sqrtW = np.sqrt(W)
        pvals = np.zeros(n * ngene)
        for i in range(n):
            zrand = np.random.normal(0, 1, size = ngene)
            znull = Zmean + np.einsum('ij, j, j', Q, sqrtW, zrand)
            pvals[i*ngene : (i+1)*ngene] = 2.0 * (1 - stats.norm.cdf(np.abs(znull)))
        # jpascore receives views, so zero p-values are also replaced in pvals.
        qnull = np.array([self.jpascore(pvals[i*ngene : (i+1)*ngene]) for i in range(n)])
        return pvals, qnull

    def mpicompute(self):
        """Distribute the iterations over all ranks and gather the results.

        On rank 0 this fills ``self._pvals`` and ``self._qscores``.
        """
        if self.rank == 0:
            # this is the master
            # create a list of N for sending to your slaves
            thisW = self._W
            thisQ = self._Q
            thisZmean = self._Zmean
            start, end = mpihelper.split_n(self._niter, self.ncore)
            nlist = [x - y for x, y in zip(end, start)]
        else:
            thisW = None
            thisQ = None
            thisZmean = None
            nlist = None

        slave_W = self.comm.bcast(thisW, root = 0)
        slave_Q = self.comm.bcast(thisQ, root = 0)
        slave_Zmean = self.comm.bcast(thisZmean, root = 0)
        slave_n = self.comm.scatter(nlist, root = 0)
        self.comm.barrier()

        if self.rank == 0:
            self.logger.debug("Broadcast W, Q and Zmean to the slave nodes")

        # ==================================
        # Data sent. Do the calculations
        # ==================================
        pvals, qscores = self.slavejob(slave_W, slave_Q, slave_Zmean, slave_n)

        # ==================================
        # Collect the results
        # ==================================
        recvbuf = None
        if self.rank == 0:
            self.logger.debug("Number of SNPs sent to each slave: " + ", ".join(["{:d}".format(x) for x in nlist]))
            ngene = self._W.shape[0]
            flat_sizes = np.array([n * ngene for n in nlist])
            recvbuf = np.zeros(sum(flat_sizes), dtype=np.float64)
        else:
            flat_sizes = None
        self.comm.Gatherv(sendbuf=pvals, recvbuf=(recvbuf, flat_sizes), root = 0)

        if self.rank == 0:
            self._pvals = recvbuf.reshape(sum(nlist), ngene)

        qscores = self.comm.gather(qscores, root = 0)

        if self.rank == 0:
            self._qscores = np.concatenate(qscores)
        else:
            assert qscores is None
            assert recvbuf is None
        return

    def WQ_mpiwrap(self):
        '''
        Populates self._W, self._Q and self._Zmean
        Learns the null Zstats from the ZSTATS class
        and performs eigendecomposition in the master node.
        '''
        self._W = None
        self._Q = None
        self._Zmean = None
        if self.rank == 0:
            self.logger.debug("Computing Z-stats")
        zstats = ZSTATS(self.gt, self.gx, self.comm, self.rank, self.ncore)
        zstats.compute()
        if self.rank == 0:
            self.logger.debug("Computing W and Q")
            zscores = zstats.scores
            C = np.cov(zscores.T)
            # Numpy gives imaginary eigenvalues, use eigh from scipy
            # for decomposition of real symmetric matrix
            W, Q = eigh(C)
            self.logger.debug("Eigendecomposition done")
            # Some eigenvalues can still come out negative; they are clamped
            # to zero so that the square root taken later is defined.
            W[np.where(W < 0)] = 0
            self.logger.debug("Forced negative eigenvalues to zero")
            Zmean = np.mean(zscores, axis = 0)
            self._W = W
            self._Q = Q
            self._Zmean = Zmean
        self.comm.barrier()

    def compute(self):
        """Compute the null scores and, on rank 0, write them to the file."""
        self.WQ_mpiwrap()
        if self.rank == 0:
            self.logger.debug("Start MPI calculation")
        if self.mpi:
            self.mpicompute()
        else:
            pvals, qscores = self.slavejob(self._W, self._Q, self._Zmean, self._niter)
            self._pvals = pvals.reshape(self._niter, self._W.shape[0])
            self._qscores = qscores
        if self.rank == 0:
            self.logger.debug("Null JPA-scores calculated. Writing to file.")
            self.write_qscores()
        return
class ZSTATS:
    """Compute linear-regression Z-statistics through a compiled C routine."""

    def __init__(self, x, y, comm, rank, ncore, masks = None):
        """Store the inputs; see ``clinreg`` for how ``x`` and ``y`` are read."""
        self.gt, self.gx = x, y
        self._zstats = None
        self.rank, self.comm, self.ncore = rank, comm, ncore
        self.masks = masks
        self.usemask = masks is not None
        self.mpi = True if self.ncore > 1 else False
        self.logger = MyLogger(__name__)

    @property
    def scores(self):
        """The computed Z-statistics, or None before ``compute`` has run."""
        return self._zstats

    def clinreg(self, geno, expr, nrow):
        """Call the C ``fit`` routine on the flattened inputs.

        Args:
            geno: 2-D array; ``shape[0]`` is passed as the number of SNPs and
                ``shape[1]`` as the number of samples.
            expr: array whose ``shape[0]`` is passed as the number of genes.
            nrow: accepted for interface compatibility; not used.

        Returns:
            Flat array of ``nsnps * ngene`` values filled in by the C routine.
            The routine's integer return code is not inspected.
        """
        here = os.path.dirname(__file__)
        clib = np.ctypeslib.load_library('../lib/linear_regression_zstat.so', here)

        # All three array arguments share the same pointer specification.
        dbl_vector = np.ctypeslib.ndpointer(ctypes.c_double, ndim=1, flags='C_CONTIGUOUS, ALIGNED')
        fit = clib.fit
        fit.restype = ctypes.c_int
        fit.argtypes = [dbl_vector, dbl_vector,
                        ctypes.c_int, ctypes.c_int, ctypes.c_int,
                        dbl_vector]

        flat_geno = geno.reshape(-1,)
        flat_expr = expr.reshape(-1,)
        nsnps, nsample = geno.shape[0], geno.shape[1]
        ngene = expr.shape[0]
        zstat = np.zeros(nsnps * ngene)
        fit(flat_geno, flat_expr, nsnps, ngene, nsample, zstat)
        return zstat

    def slavejob(self, geno, expr, nmax, offset):
        """Log the SNP range handled by this rank and compute its Z-statistics."""
        first, last = offset + 1, nmax + offset
        self.logger.debug('Rank {:d} calculating SNPs {:d} to {:d}'.format(self.rank, first, last))
        return self.clinreg(geno, expr, nmax)

    def mpicompute(self):
        """Scatter the genotypes, compute per rank and gather on rank 0."""
        is_master = self.rank == 0
        if is_master:
            # The master splits the genotype into one chunk per rank.
            geno, offset = mpihelper.split_genotype(self.gt, self.ncore)
            expr = self.gx
            nsnp = [chunk.shape[0] for chunk in geno]
        else:
            geno = expr = nsnp = offset = None

        slave_geno = self.comm.scatter(geno, root = 0)
        slave_expr = self.comm.bcast(expr, root = 0)
        slave_nsnp = self.comm.scatter(nsnp, root = 0)
        slave_offs = self.comm.scatter(offset, root = 0)
        self.comm.barrier()

        # ==================================
        # Data sent. Do the calculations
        # ==================================
        zstat = self.slavejob(slave_geno, slave_expr, slave_nsnp, slave_offs)

        # ==================================
        # Collect the results
        # ==================================
        recvbuf = None
        flat_sizes = None
        if is_master:
            self.logger.debug("Number of SNPs sent to each slave: " + ", ".join(["{:d}".format(x) for x in nsnp]))
            ngene = self.gx.shape[0]
            flat_sizes = np.array([n * ngene for n in nsnp])
            recvbuf = np.zeros(sum(flat_sizes), dtype=np.float64)
        self.comm.Gatherv(sendbuf=zstat, recvbuf=(recvbuf, flat_sizes), root = 0)

        if is_master:
            self._zstats = recvbuf.reshape(sum(nsnp), ngene)
        else:
            assert recvbuf is None
        return

    def compute(self):
        """Fill ``self._zstats``, via MPI when more than one core is in use."""
        if self.mpi:
            self.mpicompute()
            return
        nsnps = self.gt.shape[0]
        flat = self.slavejob(self.gt, self.gx, nsnps, 0)
        self._zstats = flat.reshape(self.gt.shape[0], self.gx.shape[0])
        return
class Data():
    """Load genotype and expression data, filter them and align the samples."""

    def __init__(self, args):
        """Keep the parsed options and initialise every data slot to None.

        Args:
            args: options object; ``load`` reads its file names and flags.
        """
        self.logger = MyLogger(__name__)
        self.args = args
        self._gtcent = None
        self._gtnorm = None
        self._snpinfo = None
        self._geneinfo = None
        self._expr = None
        self._cismaskcomp = None
        self._cismasklist = None
        self._tgene_gtnorm = None
        self._tgene_gtcent = None
        self._tgene_expr = None

    @property
    def geno_centered(self):
        return self._gtcent

    @property
    def geno_normed(self):
        return self._gtnorm

    @property
    def snpinfo(self):
        return self._snpinfo

    @property
    def geneinfo(self):
        return self._geneinfo

    @property
    def cismasks_comp(self):
        return self._cismaskcomp

    @property
    def cismasks_list(self):
        return self._cismasklist

    @property
    def expression(self):
        return self._expr

    @property
    def tgene_geno_normed(self):
        return self._tgene_gtnorm

    @property
    def tgene_geno_centered(self):
        return self._tgene_gtcent

    @property
    def tgene_expression(self):
        return self._tgene_expr

    @staticmethod
    def _first_index(items):
        """Map each distinct item to the position of its first occurrence."""
        positions = {}
        for i, item in enumerate(items):
            positions.setdefault(item, i)
        return positions

    def select_donors(self, vcf_donors, expr_donors):
        '''
        Make sure that donors are in the same order for both expression and genotype.

        Returns two index arrays (into ``vcf_donors`` and ``expr_donors``)
        that list the common donors in the order of ``vcf_donors``.
        '''
        vcf_pos = self._first_index(vcf_donors)
        expr_pos = self._first_index(expr_donors)
        common_donors = [x for x in vcf_donors if x in expr_pos]
        vcfmask = np.array([vcf_pos[x] for x in common_donors], dtype=int)
        exprmask = np.array([expr_pos[x] for x in common_donors], dtype=int)
        return vcfmask, exprmask

    def select_genes(self, info, names):
        '''
        Select genes which would be analyzed.
        Make sure the indices are not mixed up.

        Returns the entries of ``info`` whose ``ensembl_id`` occurs in
        ``names`` (in the order of ``info``) and, for each of them, its
        index in ``names``.
        '''
        name_pos = self._first_index(names)
        genes = [x for x in info if x.ensembl_id in name_pos]
        indices = [name_pos[x.ensembl_id] for x in genes]
        return genes, np.array(indices, dtype=int)

    def match_gx_indices(self, ref_gx, ref_donors, ref_gnames, gx, donors, gnames):
        '''Match the indices of gx with those of ref_gx.

        Both gx and ref_gx are of size G x N
        G = genes (gnames), N = donors

        Raises:
            RuntimeError: a reference gene or donor is missing from
                ``gnames`` / ``donors``.
        '''
        gene_pos = self._first_index(gnames)
        donor_pos = self._first_index(donors)
        gidx = np.array([gene_pos[x] for x in ref_gnames if x in gene_pos], dtype=int)
        didx = np.array([donor_pos[x] for x in ref_donors if x in donor_pos], dtype=int)
        if (gidx.shape[0] != len(ref_gnames)) or (didx.shape[0] != len(ref_donors)):
            message = "Gene expression files have different donors and / or gene names. Please check. Program cancelled!"
            self.logger.error(message)
            # A bare ``raise`` here had no active exception to re-raise and
            # surfaced as an uninformative RuntimeError; raise it explicitly.
            raise RuntimeError(message)
        return gx[:, didx][gidx, :]

    def HWEcheck(self, x):
        """Return the chi-square p-value (1 d.o.f.) of a Hardy-Weinberg test.

        Args:
            x: numpy array of genotypes coded as 0, 1 or 2.

        Returns:
            The p-value.  When a denominator of the statistic is zero the
            numpy division yields nan/inf instead of raising.
        """
        gt = x.tolist()
        f = np.array([0] * 3)
        f[0] = gt.count(0)
        f[1] = gt.count(1)
        f[2] = gt.count(2)
        n = sum(f)
        X2 = n * ((4 * f[0] * f[2] - f[1]**2) / ((2 * f[0] + f[1]) *
                                                 (2 * f[2] + f[1])))**2
        pval = 1 - ss.chi2.cdf(X2, 1)
        return pval

    def filter_snps(self, snpinfo, dosage, maf_limit=0.01, use_hwe=False):
        """Drop SNPs that fail a series of quality filters.

        Args:
            snpinfo: sequence of SNP records with ``ref_allele``,
                ``alt_allele``, ``varid`` and ``maf`` fields and a
                ``_replace`` method.
            dosage: per-SNP dosage rows, indexed like ``snpinfo``.
            maf_limit: SNPs whose MAF lies outside
                ``[maf_limit, 1 - maf_limit]`` are removed.
            use_hwe: also remove SNPs with a HWE p-value below 1e-6.

        Returns:
            ``(newsnps, newdosage)``: the retained SNP records, with ``maf``
            replaced by the MAF of the current samples, and their dosages as
            a numpy array.
        """
        # Predixcan style filtering of snps
        newsnps = list()
        newdosage = list()
        npoly = 0
        nambi = 0
        nunkn = 0
        nlowf = 0
        nlowf_actual = 0
        nhwep = 0
        nalle = 0
        # Bin edges used to convert dosages to integer genotypes 0, 1 or 2.
        bins = [0.66, 1.33]
        for i, snp in enumerate(snpinfo):
            refAllele = snp.ref_allele
            effectAllele = snp.alt_allele
            rsid = snp.varid
            maf = round(snp.maf, 3)
            # Actual MAF is lower / higher than population MAF because some samples have been removed
            maf_actual = sum(dosage[i]) / 2 / len(dosage[i])
            # Skip non-single letter polymorphisms
            if len(refAllele) > 1 or len(effectAllele) > 1:
                npoly += 1
                continue
            # Skip unknown alleles
            if refAllele not in SNP_COMPLEMENT or effectAllele not in SNP_COMPLEMENT:
                nalle += 1
                continue
            # Skip ambiguous strands
            if SNP_COMPLEMENT[refAllele] == effectAllele:
                nambi += 1
                continue
            # Skip unknown RSIDs
            if rsid == '.':
                nunkn += 1
                continue
            # Skip low MAF
            if not (maf >= maf_limit and maf <= (1 - maf_limit)):
                nlowf += 1
                continue
            # Skip low actual MAF
            if not (maf_actual >= maf_limit and maf_actual <= (1 - maf_limit)):
                nlowf_actual += 1
                continue
            # Check HWE
            if use_hwe:
                intdosage = np.digitize(dosage[i], bins)
                # Remove SNPs out of HWE
                hwep = self.HWEcheck(intdosage)
                if (hwep < 0.000001):
                    nhwep += 1
                    continue
            new_snp = snp._replace(maf=maf_actual)
            newsnps.append(new_snp)
            newdosage.append(dosage[i])
        self.logger.debug(
            "Removed {:d} SNPs because of non-single letter polymorphisms".
            format(npoly))
        self.logger.debug(
            "Removed {:d} SNPs because of unknown allele symbol".format(nalle))
        self.logger.debug(
            "Removed {:d} SNPs because of ambiguous strands".format(nambi))
        self.logger.debug(
            "Removed {:d} SNPs because of unknown RSIDs".format(nunkn))
        self.logger.debug("Removed {:d} SNPs because of low MAF < {:g}".format(
            nlowf, maf_limit))
        self.logger.debug(
            "Removed {:d} SNPs because of low MAF (current)".format(
                nlowf_actual))
        if use_hwe:
            self.logger.debug(
                "Removed {:d} SNPs because of deviation from HWE".format(
                    nhwep))
        return newsnps, np.array(newdosage)

    def normalize_and_center_dosage(self, dosage):
        """Return the MAF-normalised and the row-centred dosage.

        Uses the ``maf`` of each record in ``self._snpinfo``, one per row of
        ``dosage``.

        Returns:
            ``(gtnorm, gtcent)`` where ``gtnorm = (dosage - 2f) /
            sqrt(2f(1 - f))`` and ``gtcent`` is ``dosage`` minus its row mean.
        """
        f = [snp.maf for snp in self._snpinfo]
        f = np.array(f).reshape(-1, 1)
        gtnorm = (dosage - (2 * f)) / np.sqrt(2 * f * (1 - f))
        gtcent = dosage - np.mean(dosage, axis=1).reshape(-1, 1)
        return gtnorm, gtcent

    def load(self):
        """Read all inputs named in ``self.args`` and populate the data slots."""
        # Read Oxford File
        if self.args.oxf_file:
            oxf = ReadOxford(self.args.oxf_file,
                             self.args.fam_file,
                             self.args.startsnp,
                             self.args.endsnp,
                             isdosage=self.args.isdosage)
            dosage = oxf.dosage
            gt_donor_ids = oxf.samplenames
            snpinfo = oxf.snpinfo

        # Read VCF file (takes precedence when both genotype files are given)
        if self.args.vcf_file:
            vcf = ReadVCF(self.args.vcf_file,
                          self.args.startsnp,
                          self.args.endsnp,
                          samplefile=self.args.fam_file)
            dosage = vcf.dosage
            gt_donor_ids = vcf.donor_ids
            snpinfo = vcf.snpinfo

        # Read Gene Expression
        self.logger.debug("Reading expression levels for trans-eQTL discovery")
        rpkm = ReadRPKM(self.args.gx_file, self.args.gx_datafmt)
        expression = rpkm.expression
        expr_donors = rpkm.donor_ids
        gene_names = rpkm.gene_names

        # Read confounder corrected gene expression
        if self.args.gxcorr_file is not None:
            self.logger.debug(
                "Reading expression levels for target gene discovery")
            rpkm_corr = ReadRPKM(self.args.gxcorr_file, self.args.gx_datafmt)
            exprcorr = self.match_gx_indices(expression, expr_donors,
                                             gene_names, rpkm_corr.expression,
                                             rpkm_corr.donor_ids,
                                             rpkm_corr.gene_names)

        self.logger.debug("Found {:d} genes of {:d} samples".format(
            expression.shape[0], expression.shape[1]))
        self.logger.debug("Reading gencode file for gene information")

        gene_info = readgtf.gencode(self.args.gtf_file,
                                    trim=self.args.gxtrim,
                                    biotype=self.args.biotype)

        # reorder donors gt and expr
        self.logger.debug(
            "Selecting common samples of genotype and gene expression")
        self.logger.debug(
            "Before expression selection: {:d} genes from {:d} samples".format(
                expression.shape[0], expression.shape[1]))
        vcfmask, exprmask = self.select_donors(gt_donor_ids, expr_donors)
        genes, indices = self.select_genes(gene_info, gene_names)
        expression_selected = rpkm._normalize_expr(
            expression[:, exprmask][indices, :])
        if self.args.gxcorr_file is not None:
            exprcorr_selected = rpkm_corr._normalize_expr(
                exprcorr[:, exprmask][indices, :])
        self._geneinfo = genes

        dosage_masked = dosage[:, vcfmask]
        snpinfo_filtered, dosage_filtered_selected = self.filter_snps(
            snpinfo, dosage_masked)
        self.logger.debug("{:d} SNPs after filtering".format(
            len(snpinfo_filtered)))
        self._snpinfo = snpinfo_filtered

        self.logger.debug(
            "After expression selection: {:d} genes from {:d} samples".format(
                expression_selected.shape[0], expression_selected.shape[1]))
        self.logger.debug("Retained {:d} samples".format(vcfmask.shape[0]))

        ### Until here, all filters have been applied and geneinfo and snpinfo reflect current data ###

        self._tgene_gtnorm, self._tgene_gtcent = self.normalize_and_center_dosage(
            dosage_filtered_selected)
        if self.args.gxcorr_file is not None:
            self._tgene_expr = exprcorr_selected
        else:
            self._tgene_expr = expression_selected

        if self.args.cismasking:
            self.logger.debug("Generate cis-masks for GX matrix for each SNP")
            self._cismasklist = cismasking.get_cismasklist(
                self._snpinfo,
                self._geneinfo,
                self.args.chrom,
                window=self.args.window)
            self._cismaskcomp = cismasking.compress_cismasklist(
                self._cismasklist)
            if self.args.crossmapfile is not None:
                self._cismaskcomp = cismasking.extend_cismask(
                    self._geneinfo, self._cismaskcomp, self.args.crossmapfile)

        if self.args.knncorr:
            self.logger.debug(
                "Applying KNN correction on gene expression and genotype")
            gx_corr, gt_corr = knn.knn_correction(expression_selected.T,
                                                  dosage_filtered_selected,
                                                  self.args.knn_nbr)
            self._expr = rpkm._normalize_expr(gx_corr.T)
            self._gtnorm, self._gtcent = self.normalize_and_center_dosage(
                gt_corr)
        else:
            self.logger.debug("No KNN correction.")
            self._expr = expression_selected
            self._gtnorm = self._tgene_gtnorm.copy()
            self._gtcent = self._tgene_gtcent.copy()

        if self.args.shuffle:
            usedmask = [gt_donor_ids[i] for i in vcfmask]
            if self.args.shuffle_file is not None and os.path.isfile(
                    self.args.shuffle_file):
                self.logger.warn("Shuffling genotype using supplied donor IDs")
                # Context manager so the file handle is closed deterministically.
                with open(self.args.shuffle_file) as shuffle_handle:
                    rand_donor_ids = [line.strip() for line in shuffle_handle]
            else:
                self.logger.warn("Shuffling genotype randomly")
                rand_donor_ids = usedmask.copy()
                random.shuffle(rand_donor_ids)
            used_pos = self._first_index(usedmask)
            rand_index = np.array(
                [used_pos[x] for x in rand_donor_ids if x in used_pos])
            self._gtnorm = self._gtnorm[:, rand_index]
            self._gtcent = self._gtcent[:, rand_index]
class JPA:
    """Compute JPA-scores from SNP-gene linear-regression p-values.

    Every SNP (row of ``x``) is regressed against every gene (row of ``y``)
    by a compiled helper library, the p-values of each SNP are combined into
    one JPA-score, and optionally an empirical p-value per score and a
    Benjamini-Hochberg selection of SNP-gene pairs are produced.

    Parameters
    ----------
    x : genotype array; rows are SNPs, columns are samples.
    y : expression array; rows are genes, columns are samples.
    comm, rank, ncore : MPI communicator, rank of this process and number of
        processes. MPI is used when ``ncore > 1``.
    qcalc : if false, JPA-scores are not computed (zeros are returned).
    masks : per-SNP collection of gene indices to ignore, or None.
    get_pvals : compute empirical p-values for the JPA-scores.
    qnull_file : optional file with null Q-scores (first column is read).
    statmodel : ``'zstat'`` or ``'fstat'``.
    target_fdr : if not None, BH-select SNP-gene pairs at this FDR.
    """

    def __init__(self, x, y, comm, rank, ncore, qcalc, masks, get_pvals=False,
                 qnull_file=None, statmodel='zstat', target_fdr=None):
        self.gt = x
        self.gx = y
        self._pvals = None
        self._qscores = None
        self._jpa_pvals = None
        self.rank = rank
        self.comm = comm
        self.ncore = ncore
        self.mpi = False
        if self.ncore > 1:
            self.mpi = True
        self.qcalc = qcalc
        self.masks = masks
        self.usemask = masks is not None
        self.get_empirical_pvals = bool(get_pvals)
        self.qnull_file = qnull_file
        self.statmodel = statmodel
        self.logger = MyLogger(__name__)
        self.target_fdr = target_fdr
        if self.target_fdr is not None:
            self.adj_pvals = list()
            self.pass_fdr = list()

    @property
    def jpa_pvals(self):
        """Empirical p-values of the JPA-scores (None until computed)."""
        return self._jpa_pvals

    @property
    def pvals(self):
        """SNP x gene matrix of regression p-values (None until computed)."""
        return self._pvals

    @property
    def scores(self):
        """JPA-score of every SNP (None until computed)."""
        return self._qscores

    def masked_jpascore(self, pvals, mask):
        """Return the JPA-score of ``pvals`` after dropping indices in ``mask``.

        ``mask`` must be an array (its ``shape`` is read); an empty mask keeps
        every p-value.
        """
        # np.bool was removed in NumPy 1.24; the builtin bool is equivalent.
        usedgenes = np.ones_like(pvals, dtype=bool)
        if mask.shape[0] != 0:
            usedgenes[mask] = False
        return self.jpascore(pvals[usedgenes])

    def jpascore(self, pvals):
        """Return the JPA-score of a 1-D array of p-values.

        The (at most 100) smallest p-values are each turned into
        ``-(log(p_k) - (digamma(k) - digamma(n + 1)))`` and the maximum of
        the running sum of these terms is returned.
        """
        p = np.sort(pvals)
        n = p.shape[0]
        kmax = min(100, n)
        krange = np.arange(1, kmax + 1)
        digamma_n1 = special.digamma(n + 1)
        z = -(np.log(p[:kmax]) - (special.digamma(krange) - digamma_n1))
        return np.max(np.cumsum(z))

    def _clinreg_stat(self, libname, geno, expr):
        """Call ``fit`` of the compiled library and return (statistics, nsample).

        The statistics come back as a flat array of length nsnps * ngene.
        """
        _path = os.path.dirname(__file__)
        clib = np.ctypeslib.load_library(libname, _path)
        cfit = clib.fit
        cfit.restype = ctypes.c_int
        dbl_vec = np.ctypeslib.ndpointer(ctypes.c_double,
                                         ndim=1,
                                         flags='C_CONTIGUOUS, ALIGNED')
        cfit.argtypes = [
            dbl_vec, dbl_vec, ctypes.c_int, ctypes.c_int, ctypes.c_int, dbl_vec
        ]

        x = geno.reshape(-1, )
        y = expr.reshape(-1, )
        nsnps = geno.shape[0]
        nsample = geno.shape[1]
        ngene = expr.shape[0]
        stat = np.zeros(nsnps * ngene)
        # NOTE(review): the integer returned by fit() is not checked, as in
        # the original code; its meaning is not visible from this file.
        cfit(x, y, nsnps, ngene, nsample, stat)
        return stat, nsample

    def clinreg_fstat(self, geno, expr, nrow):
        """Return flat p-values from the F-statistic of every SNP-gene pair.

        ``nrow`` is accepted for interface compatibility and not used.
        """
        fstat, nsample = self._clinreg_stat('../lib/linear_regression.so',
                                            geno, expr)
        # sf() is 1 - cdf() without the cancellation that turns very small
        # p-values into exactly 0 (and then log(0) in jpascore).
        return stats.f.sf(fstat, 1, nsample - 2)

    def clinreg_zstat(self, geno, expr, nrow):
        """Return flat two-sided p-values from the Z-statistic of every pair.

        ``nrow`` is accepted for interface compatibility and not used.
        """
        zstat, _ = self._clinreg_stat('../lib/linear_regression_zstat.so',
                                      geno, expr)
        # See clinreg_fstat for why sf() is used instead of 1 - cdf().
        return 2.0 * stats.norm.sf(np.abs(zstat))

    def get_qecdf_fit(self, q, ntop):
        """Describe the null Q-scores ``q`` by an ECDF plus an exponential tail.

        Returns ``(qecdf, qcut, lam, prefact)``: the ECDF of ``q``, the
        smallest of the ``ntop`` largest scores, the mean excess of those
        scores over ``qcut``, and the fraction ``ntop / len(q)``.
        """
        qecdf = ECDF(q)
        qneg = np.sort(q)[-ntop:]
        qcut = qneg[0]
        lam = (1 / ntop) * np.sum(qneg - qcut)
        prefact = ntop / q.shape[0]
        return qecdf, qcut, lam, prefact

    def p_qscore(self, q, qecdf, qcut, lam, prefact):
        """Return the empirical p-value of score ``q`` under the fitted null."""
        if q < qcut:
            res = 1 - qecdf(q)
        else:
            res = prefact * np.exp(-(q - qcut) / lam)
        return res

    def get_qnull(self):
        '''
        This function reads the qnull file, if provided.
        Otherwise, generates null Q-scores from uniform random p-values.
        Must be called from master and broadcast to the slaves.

        Returns an empty array when empirical p-values are not requested;
        otherwise the finite null Q-scores.
        '''
        qmod = np.array([])
        if self.get_empirical_pvals:
            if self.qnull_file is not None and os.path.isfile(self.qnull_file):
                self.logger.debug("Read null Q-scores from {:s}".format(
                    self.qnull_file))
                qnull = list()
                with open(self.qnull_file, 'r') as instream:
                    for line in instream:
                        lsplit = line.split()
                        if not lsplit:
                            continue  # tolerate blank lines
                        qnull.append(float(lsplit[0]))
                qnull = np.array(qnull)
            else:
                self.logger.debug(
                    "Creating null Q-scores from uniform p-value distribution")
                ngene = 10000
                nsnps = 50000
                qnull = np.array([
                    self.jpascore(np.random.uniform(0, 1, size=ngene))
                    for i in range(nsnps)
                ])
            qmod = qnull[np.isfinite(qnull)]
            self.logger.debug("Obtained {:d} null Q-scores".format(
                qmod.shape[0]))
        return qmod

    def slavejob(self, geno, expr, nmax, offset, masks, qnull):
        '''
        Outputs.
            pvals: p-values from every SNP-gene pair linear regression.
                   Flat array of length I * G (SNP-major).
            qscores: JPA-score from the above p-values. Dimension I.
            p_jpa: p-values for the significance of JPA, calculated
                   empirically. Dimension I.

        Raises ValueError when self.statmodel is neither 'fstat' nor 'zstat'.
        '''
        self.logger.debug('Rank {:d} calculating SNPs {:d} to {:d}'.format(
            self.rank, offset + 1, nmax + offset))
        nsnps = geno.shape[0]
        ngene = expr.shape[0]

        # Simple linear regression calculating either f-statistic or
        # Z-statistic for every SNP-gene pair
        if self.statmodel == 'fstat':
            pvals = self.clinreg_fstat(geno, expr, nmax)
        elif self.statmodel == 'zstat':
            pvals = self.clinreg_zstat(geno, expr, nmax)
        else:
            raise ValueError("Unknown statmodel: {!r}".format(self.statmodel))

        # Calculate JPA-score
        if self.qcalc:
            # calculate JPA for each SNP (using ngene pvals)
            if self.usemask:
                qscores = np.array([
                    self.masked_jpascore(pvals[i * ngene:(i + 1) * ngene],
                                         masks[i]) for i in range(nsnps)
                ])
            else:
                qscores = np.array([
                    self.jpascore(pvals[i * ngene:(i + 1) * ngene])
                    for i in range(nsnps)
                ])
        else:
            qscores = np.zeros(nsnps)

        # Calculate empirical p-values for the JPA-scores
        if self.get_empirical_pvals:
            ntop = min(500, int(qnull.shape[0] / 10))
            qecdf, qcut, lam, prefact = self.get_qecdf_fit(qnull, ntop)
            p_jpa = np.array(
                [self.p_qscore(q, qecdf, qcut, lam, prefact) for q in qscores])
        else:
            p_jpa = np.array([1 for q in qscores])

        return pvals, qscores, p_jpa

    def mpicompute(self):
        """Split the SNPs over the MPI ranks, run slavejob and gather on rank 0."""
        if self.rank == 0:
            # this is the master
            # create a list of genotypes for sending to your slaves
            geno, offset = mpihelper.split_genotype(self.gt, self.ncore)
            expr = self.gx
            nsnp = [x.shape[0] for x in geno]
            gmasks = mpihelper.split_genemasks(self.masks, nsnp, offset)
            qnull = self.get_qnull()
        else:
            geno = None
            expr = None
            nsnp = None
            offset = None
            gmasks = None
            qnull = None

        slave_geno = self.comm.scatter(geno, root=0)
        slave_expr = self.comm.bcast(expr, root=0)
        slave_nsnp = self.comm.scatter(nsnp, root=0)
        slave_offs = self.comm.scatter(offset, root=0)
        if self.usemask:
            slave_gmasks = self.comm.scatter(gmasks, root=0)
        else:
            slave_gmasks = self.comm.bcast(gmasks, root=0)
        slave_qnull = self.comm.bcast(qnull, root=0)
        self.comm.barrier()

        # ==================================
        # Data sent. Do the calculations
        # ==================================
        pvals, qscores, p_jpa = self.slavejob(slave_geno, slave_expr,
                                              slave_nsnp, slave_offs,
                                              slave_gmasks, slave_qnull)

        # ==================================
        # Collect the results
        # ==================================
        recvbuf = None
        if self.rank == 0:
            self.logger.debug("Number of SNPs sent to each slave: " +
                              ", ".join(["{:d}".format(x) for x in nsnp]))
            ngene = self.gx.shape[0]
            flat_sizes = np.array([n * ngene for n in nsnp])
            recvbuf = np.zeros(sum(flat_sizes), dtype=np.float64)
        else:
            flat_sizes = None
        self.comm.Gatherv(sendbuf=pvals, recvbuf=(recvbuf, flat_sizes), root=0)

        if self.rank == 0:
            self._pvals = recvbuf.reshape(sum(nsnp), ngene)
            # include FDR correction
            if self.target_fdr is not None:
                self.get_fdr_each()

        qscores = self.comm.gather(qscores, root=0)
        p_jpa = self.comm.gather(p_jpa, root=0)

        if self.rank == 0:
            self._qscores = np.concatenate(qscores)
            if self.get_empirical_pvals:
                self._jpa_pvals = np.concatenate(p_jpa)
        else:
            # gather() hands the data to the root only
            assert qscores is None
            assert recvbuf is None
            assert p_jpa is None
        return

    def _unmasked_pairs(self, i, ngene):
        """Return ``(i, j, pval)`` for every gene j of SNP i that is not masked."""
        # A set makes the membership test O(1) instead of a scan per gene.
        masked = set(self.masks[i]) if self.usemask else ()
        return [(i, j, self._pvals[i, j]) for j in range(ngene)
                if j not in masked]

    def get_fdr_each(self):
        """Run the BH procedure separately on the unmasked genes of every SNP.

        Passing pairs and their adjusted p-values are appended to
        ``self.pass_fdr`` and ``self.adj_pvals``.
        """
        N_snp = self.gt.shape[0]
        N_gene = self.gx.shape[0]
        for i in range(N_snp):
            snp_gene_pval = self._unmasked_pairs(i, N_gene)
            nskip = N_gene - len(snp_gene_pval)
            if nskip:
                self.logger.debug(
                    "SNP {:d}: skipped {:d} masked genes".format(i, nskip))
            pass_fdr, adj_pvals = self.bh_procedure(snp_gene_pval,
                                                    self.target_fdr)
            self.pass_fdr.extend(pass_fdr)
            self.adj_pvals.extend(adj_pvals)
        return

    def get_fdr_all(self):
        """Run one BH procedure over the unmasked SNP-gene pairs of all SNPs."""
        N_snp = self.gt.shape[0]
        N_gene = self.gx.shape[0]
        snp_gene_pval = list()
        for i in range(N_snp):
            snp_gene_pval.extend(self._unmasked_pairs(i, N_gene))
        self.pass_fdr, self.adj_pvals = self.bh_procedure(
            snp_gene_pval, self.target_fdr)
        return

    def bh_procedure(self, snp_gene_pval, target_fdr):
        """Benjamini-Hochberg selection on ``(snp, gene, pval)`` tuples.

        Returns ``(pass_fdr, adj_pvals)``: the tuples with the smallest
        p-values up to the largest rank k whose p-value is not above
        ``(k / n_tests) * target_fdr``, and ``pval * n_tests / rank`` for each
        of them. Both lists are empty when nothing passes.
        """
        self.logger.debug(
            "Calculating FDR ... sorting {:d} SNP-gene pairs".format(
                len(snp_gene_pval)))
        sorted_pairs = sorted(snp_gene_pval, key=lambda item: item[2])
        # NOT equivalent to ntrans-eqtls * ngenes, because ntrans is filtered
        n_tests = len(sorted_pairs)

        # Walk from the largest p-value down to find the last passing rank.
        bh_index_limit = -1
        for i, snp_pval in enumerate(sorted_pairs[::-1]):
            bh_factor = ((n_tests - i) / n_tests) * target_fdr
            if snp_pval[2] <= bh_factor:
                bh_index_limit = n_tests - i - 1
                break

        if bh_index_limit < 0:
            self.logger.debug(
                "No significant SNP-gene pairs @ {:f} FDR for SNP".format(
                    target_fdr))
            return [], []

        pass_fdr = sorted_pairs[:bh_index_limit + 1]
        # equiv to report FDR
        adj_pvals = [
            pair[2] * (n_tests / (i + 1)) for i, pair in enumerate(pass_fdr)
        ]
        return pass_fdr, adj_pvals

    def compute(self):
        """Run the whole calculation, through MPI when more than one core."""
        if self.mpi:
            self.mpicompute()
        else:
            qnull = self.get_qnull()
            pvals, qscores, p_jpa = self.slavejob(self.gt, self.gx,
                                                  self.gt.shape[0], 0,
                                                  self.masks, qnull)
            self._pvals = pvals.reshape(self.gt.shape[0], self.gx.shape[0])
            # include FDR correction
            if self.target_fdr is not None:
                self.get_fdr_each()
            self._qscores = qscores
            if self.get_empirical_pvals:
                self._jpa_pvals = p_jpa
        return
import numpy as np
from utils.logs import MyLogger

logger = MyLogger(__name__)


def load(qnull_file):
    """Read null Q-scores from the first column of a text file.

    Each non-blank line is split on whitespace and its first field is parsed
    as a float. Blank lines are skipped instead of raising IndexError.

    Returns a NumPy array holding only the finite values that were read.
    Raises ValueError when a first field is not a number.
    """
    qnull = list()
    with open(qnull_file, 'r') as mfile:
        for line in mfile:
            fields = line.split()
            if not fields:
                continue
            qnull.append(float(fields[0]))
    qnull = np.array(qnull)
    qmod = qnull[np.isfinite(qnull)]
    logger.debug("Read {:d} null Q-scores".format(qmod.shape[0]))
    return qmod
class ReadOxford:
    """Read a gzipped genotype file and its sample file.

    Every genotype line is split on whitespace. The first ``_meta_columns``
    fields describe the SNP (chromosome, variant id, position, reference and
    alternate allele are read from fields 0-4) and the remaining fields hold
    either one dosage per sample or three genotype frequencies per sample.

    Parameters
    ----------
    gtfile : path of the gzipped genotype file.
    samplefile : path of the sample file (two header lines, then one sample
        per line; lines starting with ``#`` are ignored).
    startsnp, endsnp : half-open range of 0-based line numbers to read.
    isdosage : True when the file holds dosages (6 leading columns), False
        when it holds genotype frequencies (5 leading columns).
    """

    _read_samples_once = False
    _read_genotype_once = False
    _nloci = 0
    _nsample = 0

    # default is GTEx:
    #    - isdosage = True

    def __init__(self, gtfile, samplefile, startsnp=0, endsnp=1e15,
                 isdosage=True):
        self.logger = MyLogger(__name__)
        self._gtfile = gtfile
        self._samplefile = samplefile
        self._startsnp = startsnp
        self._endsnp = endsnp
        self._isdosage = isdosage
        if self._isdosage:
            self._meta_columns = 6
        else:
            self._meta_columns = 5
        self._read_genotypes()

    @property
    def nsample(self):
        """Number of samples listed in the sample file."""
        self._read_samples()
        return self._nsample

    @property
    def samplenames(self):
        """First field of every sample line, in file order."""
        self._read_samples()
        return self._samplenames

    @property
    def nloci(self):
        """Number of SNP lines that were read."""
        return self._nloci

    @property
    def snpinfo(self):
        """Tuple with one SnpInfo per SNP that was read."""
        self._read_genotypes()
        return tuple(self._snpinfo)

    @property
    def dosage(self):
        """Tuple with one dosage row per SNP that was read."""
        return tuple(self._dosage)

    # NOTE(review): _gtnorm and _gtcent are never assigned inside this class,
    # so the two properties below only work if something else sets them.
    @property
    def gtnorm(self):
        return tuple(self._gtnorm)

    @property
    def gtcent(self):
        return tuple(self._gtcent)

    def _read_samples(self):
        """Read the sample names once; later calls return immediately."""
        if self._read_samples_once:
            return
        samplenames = list()
        with open(self._samplefile, 'r') as samfile:
            # the first two lines are headers
            next(samfile)
            next(samfile)
            for line in samfile:
                if line.startswith('#'):
                    continue
                samplenames.append(line.strip().split()[0])
        self._nsample = len(samplenames)
        self._samplenames = samplenames
        # Set only after a successful read so that a failure can be retried.
        self._read_samples_once = True

    def _read_dosages(self):
        """Parse the genotype file.

        Returns ``(allsnps, dosage)``: a list of SnpInfo and a 2-D array with
        one row of dosages per SNP. Raises SAMPLE_NUMBER_ERROR or
        GT_FREQS_NUMBER_ERROR when a line has an unexpected number of columns.
        """
        dosage = list()
        allsnps = list()
        self.logger.info("Started reading genotype.")
        self._nloci = 0
        with gzip.open(self._gtfile, 'r') as filereader:
            for linenum, snpline in enumerate(filereader):
                if linenum >= self._endsnp:
                    break  # nothing beyond endsnp is used
                if linenum < self._startsnp:
                    continue
                self._nloci += 1
                mline = snpline.split()  # bytes fields, file is opened binary

                if self._isdosage:
                    ngenotypes = len(mline) - self._meta_columns
                else:
                    ngenotypes = (len(mline) - self._meta_columns) / 3

                if float(ngenotypes).is_integer():
                    if ngenotypes != self._nsample:
                        self.logger.error(
                            'Number of samples differ from genotypes')
                        raise SAMPLE_NUMBER_ERROR
                else:
                    self.logger.error(
                        'Number of columns in genotype frequencies not divisible by 3'
                    )
                    raise GT_FREQS_NUMBER_ERROR

                values = np.array(
                    [float(x) for x in mline[self._meta_columns:]])
                if self._isdosage:
                    snp_dosage = values
                else:
                    # columns come in triplets [AA, AB, BB] := [0, 1, 2]
                    snp_dosage = 2 * values[2::3] + values[1::3]

                maf = sum(snp_dosage) / 2 / len(snp_dosage)
                try:
                    chrom = int(mline[0])
                except ValueError:
                    # non-numeric chromosome label
                    chrom = -1
                this_snp = SnpInfo(chrom=chrom,
                                   bp_pos=int(mline[2]),
                                   varid=mline[1].decode("utf-8"),
                                   ref_allele=mline[3].decode("utf-8"),
                                   alt_allele=mline[4].decode("utf-8"),
                                   maf=maf)
                allsnps.append(this_snp)
                dosage.append(snp_dosage)
        return allsnps, np.array(dosage)

    def _read_genotypes(self):
        """Read samples and genotypes once; later calls return immediately."""
        if self._read_genotype_once:
            return
        self._read_samples()  # otherwise, self._nsample is not set
        allsnps, dosage = self._read_dosages()
        self.logger.info("Found {:d} SNPs of {:d} samples.".format(
            self._nloci, self._nsample))
        self._dosage = dosage
        self._snpinfo = allsnps
        # Set only after a successful read so that a failure can be retried.
        self._read_genotype_once = True
class RevReg:
    """Per-SNP statistics from ``crrstat``, optionally with gene masks and MPI.

    Parameters
    ----------
    x : genotype array; rows are SNPs.
    y : expression array; rows are genes.
    sigbeta2 : per-SNP array, indexed like the rows of ``x``.
    comm, rank, ncore : MPI communicator, rank of this process and number of
        processes. MPI is used when ``ncore > 1``.
    null : ``'perm'`` or ``'maf'``, selects the crrstat null model.
    maf : per-SNP array, only read when ``null == 'maf'``.
    masks : list of mask objects or None. Each mask provides ``apply2`` (SNP
        indices), ``rmv_id`` (array of gene indices to drop) and ``nsnp``.
    """

    def __init__(self, x, y, sigbeta2, comm, rank, ncore, null='perm',
                 maf=None, masks=None):
        self.gt = x
        self.gx = y
        self.sigbeta2 = sigbeta2
        self.comm = comm
        self.rank = rank
        self.ncore = ncore
        self.null = null
        self.maf = maf
        self.mpi = False
        self.masks = masks
        self.usemask = False
        if self.masks is not None:
            self.usemask = True
        if self.ncore > 1:
            self.mpi = True
        self._pvals = None
        self._qscores = None
        self._mu = None
        self._sigma = None
        self._betas = None

        if self.null == 'perm':
            self.sigx2 = np.var(self.gt, axis=1)
        elif self.null == 'maf':
            self.sigx2 = np.ones(self.gt.shape[0])

        self.logger = MyLogger(__name__)

    @property
    def sb2(self):
        """Alias of ``sigbeta2``."""
        return self.sigbeta2

    @sb2.setter
    def sb2(self, value):
        self.sigbeta2 = value

    @property
    def pvals(self):
        return self._pvals

    @property
    def scores(self):
        return self._qscores

    @property
    def null_mu(self):
        return self._mu

    @property
    def null_sigma(self):
        return self._sigma

    @property
    def betas(self):
        return self._betas

    def slavejob(self, gt, gx, sb2, sx2, maf, masks, start, end, usemask,
                 get_betas=False):
        """Compute ``(p, q, mu, sig, b)`` for the SNPs assigned to this rank.

        With ``usemask`` the SNPs come from ``masks`` (``start`` and ``end``
        are ignored); otherwise SNPs ``start`` .. ``end - 1`` are used. An
        empty mask list yields four empty lists and an empty array.
        """
        if usemask:
            if len(masks) == 0:
                return [], [], [], [], np.array([])
            startsnp = min([min(x.apply2) for x in masks])
            endsnp = max([max(x.apply2) for x in masks])
            totsnp = sum(x.nsnp for x in masks)
            self.logger.debug(
                "Rank {:d} using {:d} masks on {:d} SNPs [{:d} to {:d}]".
                format(self.rank, len(masks), totsnp, startsnp, endsnp))
            stime = time.time()
            p, q, mu, sig, b = self.maskjob(gt, gx, sb2, sx2, maf, masks,
                                            get_betas)
            self.logger.debug("Rank {:d} took {:g} seconds".format(
                self.rank,
                time.time() - stime))
        else:
            self.logger.debug(
                "Rank {:d}. Using {:d} SNPs [{:d} to {:d}]".format(
                    self.rank, end - start, start, end - 1))
            applyon = np.arange(start, end)
            p, q, mu, sig, b = self.basejob(gt, gx, sb2, sx2, maf, applyon,
                                            get_betas)
        return p, q, mu, sig, b

    @staticmethod
    def _concat_flat(parts):
        """Flatten and join ``parts`` into one 1-D array (empty if no parts).

        Gives the same result as repeatedly calling ``np.append`` on an
        initially empty array, without re-copying the result on every step.
        """
        return np.concatenate([np.array([])] + [np.ravel(x) for x in parts])

    def maskjob(self, gt, gx, sb2, sx2, maf, masks, get_betas):
        """Run basejob once per mask on the genes that the mask keeps.

        Results are concatenated in mask order. When ``get_betas`` is set the
        coefficients of removed genes are filled with zero so that every SNP
        contributes ``gx.shape[0]`` values.
        """
        p_parts, q_parts, mu_parts, sig_parts, b_parts = [], [], [], [], []
        for mask in masks:
            usegenes = np.ones(gx.shape[0], dtype=bool)
            if mask.rmv_id.shape[0] > 0:
                usegenes[mask.rmv_id] = False
            masked_gx = np.ascontiguousarray(gx[usegenes])
            _p, _q, _mu, _sig, _b = self.basejob(gt, masked_gx, sb2, sx2, maf,
                                                 np.array(mask.apply2),
                                                 get_betas)
            p_parts.append(_p)
            q_parts.append(_q)
            mu_parts.append(_mu)
            sig_parts.append(_sig)
            if get_betas:
                # set beta value for masked genes to zero
                b_parts.append(
                    self.reshape_masked_betas(_b, mask, gx.shape[0]))
        return (self._concat_flat(p_parts), self._concat_flat(q_parts),
                self._concat_flat(mu_parts), self._concat_flat(sig_parts),
                self._concat_flat(b_parts))

    def basejob(self, gt, gx, sb2, sx2, maf, applyon, get_betas):
        """Call crrstat for the SNP rows ``applyon`` against all rows of ``gx``.

        Returns ``(p, q, mu, sig, b)``; ``b`` is an empty list unless
        ``get_betas`` is set. Raises ValueError for an unknown null model.
        """
        slv_gt = np.ascontiguousarray(gt[applyon, :])
        slv_gx = gx
        slv_sb2 = sb2[applyon]
        slv_sx2 = sx2[applyon]
        b = []
        if self.null == 'perm':
            p, q, mu, sig = crrstat.perm_null(slv_gt, slv_gx, slv_sb2, slv_sx2)
        elif self.null == 'maf':
            slv_maf = maf[applyon]
            p, q, mu, sig = crrstat.maf_null(slv_gt, slv_gx, slv_sb2, slv_sx2,
                                             slv_maf)
        else:
            raise ValueError("Unknown null model: {!r}".format(self.null))
        if get_betas:
            b = crrstat.crrbetas(slv_gt, slv_gx, slv_sb2)
        return p, q, mu, sig, b

    def reshape_masked_betas(self, b, mask, ngenes):
        """Expand the betas of one mask to ``ngenes`` columns per SNP.

        ``b`` holds ``len(mask.apply2) * (ngenes - len(mask.rmv_id))`` values;
        the columns listed in ``mask.rmv_id`` are filled with zero and the
        result is returned flattened.
        """
        self.logger.debug(
            "Rank {:d}: reshaping {:d} betas into ({:d},{:d}) with {:d} masked genes out of {:d}"
            .format(self.rank, len(b), len(mask.apply2),
                    (ngenes - len(mask.rmv_id)), len(mask.rmv_id), ngenes))
        _b = b.reshape(len(mask.apply2), ngenes - len(mask.rmv_id))
        paddedBeta = np.zeros((len(mask.apply2), ngenes))
        inv_ind = np.delete(np.arange(ngenes), mask.rmv_id)
        paddedBeta[:, inv_ind] = _b
        return paddedBeta.reshape(-1)

    def mpicompute(self, get_betas=False):
        """Distribute the work over the MPI ranks and collect results on rank 0."""
        gmasks = None
        pstart = None
        pend = None
        geno = None
        expr = None
        sb2 = None
        sx2 = None
        maf = None
        if self.rank == 0:
            # this is the master
            # create a list of index for sending to your slaves
            if self.usemask:
                self.logger.debug(
                    "Masks on: " +
                    ", ".join(['{:d}'.format(x.nsnp) for x in self.masks]))
                gmasks = mpihelper.split_maskcomp(self.masks, self.ncore)
            else:
                pstart, pend = mpihelper.split_n(self.gt.shape[0], self.ncore)
            self.logger.debug("Get betas set to " + str(get_betas))

            geno = self.gt
            expr = self.gx
            sb2 = self.sigbeta2
            sx2 = self.sigx2
            maf = self.maf

        sb2 = self.comm.bcast(sb2, root=0)
        sx2 = self.comm.bcast(sx2, root=0)
        maf = self.comm.bcast(maf, root=0)
        expr = self.comm.bcast(expr, root=0)
        geno = self.comm.bcast(geno, root=0)
        if self.usemask:
            gmasks = self.comm.scatter(gmasks, root=0)
        else:
            pstart = self.comm.scatter(pstart, root=0)
            pend = self.comm.scatter(pend, root=0)
        self.comm.barrier()

        # ==================================
        # Data sent. Now do the calculations
        # ==================================
        pvals, qscores, mu, sigma, betas = self.slavejob(geno,
                                                         expr,
                                                         sb2,
                                                         sx2,
                                                         maf,
                                                         gmasks,
                                                         pstart,
                                                         pend,
                                                         self.usemask,
                                                         get_betas=get_betas)

        pvals = self.comm.gather(pvals, root=0)
        qscores = self.comm.gather(qscores, root=0)
        mu = self.comm.gather(mu, root=0)
        sigma = self.comm.gather(sigma, root=0)

        if get_betas:
            recvbuf = None
            betalength = len(betas)
            self.comm.barrier()  # is it necessary?
            received_counts = self.comm.gather(betalength, root=0)
            if self.rank == 0:
                self.logger.debug(
                    "Number of coefficients from each node: {:s}".format(
                        ", ".join(['{:d}'.format(x)
                                   for x in received_counts])))
                recvbuf = np.zeros(np.sum(received_counts), dtype=np.float64)
            self.comm.Gatherv(sendbuf=betas,
                              recvbuf=(recvbuf, received_counts),
                              root=0)

        if self.rank == 0:
            self._pvals = np.concatenate(pvals)
            self._qscores = np.concatenate(qscores)
            self._mu = np.concatenate(mu)
            self._sigma = np.concatenate(sigma)
            if get_betas:
                self._betas = recvbuf.reshape(self.gt.shape[0],
                                              self.gx.shape[0])
                self.logger.debug(
                    "All nodes computed a total of {:d} pvalues and {:s} betas"
                    .format(len(self._pvals), str(self._betas.shape)))
        else:
            # gather() hands the data to the root only
            assert qscores is None
            assert pvals is None
            assert mu is None
            assert sigma is None
        return

    def compute(self, get_betas=False):
        """Run the whole calculation, through MPI when more than one core."""
        if self.mpi:
            self.mpicompute(get_betas)
        else:
            start = 0
            end = self.gt.shape[0]
            self._pvals, self._qscores, self._mu, self._sigma, self._betas = self.slavejob(
                self.gt, self.gx, self.sigbeta2, self.sigx2, self.maf,
                self.masks, start, end, self.usemask, get_betas)
            if get_betas:
                self._betas = self._betas.reshape(self.gt.shape[0],
                                                  self.gx.shape[0])
        return
from iotools import readqnull # ================================== # Start MPI calculation # ================================== MPI.Init() comm = MPI.COMM_WORLD rank = comm.Get_rank() ncore = comm.Get_size() if rank == 0: start_time = time.time() # ================================== # Input Processing # ================================== args = Args(comm, rank) logger = MyLogger(__name__) # List of variables that are broadcast over all slave nodes gtcent = None # Centered genotype (x - mu). Note: Not scaled (divided) by standard deviation. Dimension I x N. gtnorm = None # Centered and scaled genotype (x - mu) / sigma. Dimension I x N. expr = None # Expression matrix. Dimension G x N. tgene_gtnorm = None # Centered and scaled genotype without KNN correction. Dimension I x N. tgene_expr = None # Expression matrix for finding target genes after RR-score. Dimension G x N. masklist = None # List of gene indices masked for each SNP. List of length I. Each element contains a list of indices for the cis-genes of a candidate SNP. maskcomp = None # List of CisMasks. Each element is a collection of (a) gene indices to be masked, and (b) all SNP indices which require this mask. maf = None # List of MAF of each SNP as observed in the sample (or read separately from the population if file is provided). Length I. if rank == 0: logger.debug("Using {:d} cores".format(ncore)) data = Data(args) data.load() gtcent = data.geno_centered
class RevReg:
    """Per-SNP statistics from ``crrstat``, computed serially or over MPI.

    ``x`` is the genotype array (rows are SNPs) and ``y`` the expression
    array. ``sigbeta2`` and ``maf`` are indexed per SNP. ``null`` selects the
    crrstat null model, ``'perm'`` or ``'maf'``. MPI is used when
    ``ncore > 1``.
    """

    def __init__(self, x, y, sigbeta2, comm, rank, ncore, null='perm',
                 maf=None):
        # inputs
        self.gt, self.gx = x, y
        self.sigbeta2 = sigbeta2
        self.null = null
        self.maf = maf
        # MPI context
        self.comm, self.rank, self.ncore = comm, rank, ncore
        self.mpi = bool(self.ncore > 1)
        # results, filled in by compute()
        self._pvals = self._qscores = self._mu = self._sigma = None

        # per-SNP genotype variance: measured for 'perm', fixed to 1 for 'maf'
        if self.null == 'perm':
            self.sigx2 = np.var(self.gt, axis=1)
        elif self.null == 'maf':
            self.sigx2 = np.ones(self.gt.shape[0])

        self.logger = MyLogger(__name__)

    @property
    def pvals(self):
        return self._pvals

    @property
    def scores(self):
        return self._qscores

    @property
    def null_mu(self):
        return self._mu

    @property
    def null_sigma(self):
        return self._sigma

    def slavejob(self, gt, gx, sb2, sx2, maf, start, end):
        """Return ``(p, q, mu, sig)`` from crrstat for SNP rows start..end-1."""
        rows = slice(start, end)
        geno_part = gt[rows, :]
        sb2_part = sb2[rows]
        sx2_part = sx2[rows]
        if self.null == 'perm':
            p, q, mu, sig = crrstat.perm_null(geno_part, gx, sb2_part,
                                              sx2_part)
        elif self.null == 'maf':
            maf_part = maf[rows]
            p, q, mu, sig = crrstat.maf_null(geno_part, gx, sb2_part,
                                             sx2_part, maf_part)
        return p, q, mu, sig

    def mpicompute(self):
        """Hand a contiguous SNP range to every rank and gather on rank 0."""
        is_master = self.rank == 0

        if is_master:
            # Workers 1..ncore-1 each get `chunk` SNPs; the master keeps the
            # remainder that follows the last worker's range.
            chunk = int(self.gt.shape[0] / self.ncore)
            first = 0
            for worker in range(1, self.ncore):
                tag_base = 10 + 3 * worker
                self.comm.send(first, dest=worker, tag=tag_base - 2)
                self.comm.send(first + chunk, dest=worker, tag=tag_base - 1)
                first += chunk
            start, end = first, self.gt.shape[0]
        else:
            tag_base = 10 + self.rank * 3
            start = self.comm.recv(source=0, tag=tag_base - 2)
            end = self.comm.recv(source=0, tag=tag_base - 1)

        # Every rank receives the full data set from the master.
        sb2 = self.comm.bcast(self.sigbeta2 if is_master else None, root=0)
        sx2 = self.comm.bcast(self.sigx2 if is_master else None, root=0)
        maf = self.comm.bcast(self.maf if is_master else None, root=0)
        expr = self.comm.bcast(self.gx if is_master else None, root=0)
        geno = self.comm.bcast(self.gt if is_master else None, root=0)
        self.comm.barrier()

        # ==================================
        # Data sent. Now do the calculations
        # ==================================
        self.logger.debug(
            "Reporting from node {:d}. Start: {:d} and End: {:d}".format(
                self.rank, start, end))
        pvals, qscores, mu, sigma = self.slavejob(geno, expr, sb2, sx2, maf,
                                                  start, end)

        pvals = self.comm.gather(pvals, root=0)
        qscores = self.comm.gather(qscores, root=0)
        mu = self.comm.gather(mu, root=0)
        sigma = self.comm.gather(sigma, root=0)

        if not is_master:
            # gather() hands the data to the root only
            assert qscores is None
            assert pvals is None
            assert mu is None
            assert sigma is None
            return

        self._pvals = np.concatenate(pvals)
        self._qscores = np.concatenate(qscores)
        self._mu = np.concatenate(mu)
        self._sigma = np.concatenate(sigma)
        return

    def compute(self):
        """Run the calculation, through MPI when more than one core."""
        if self.mpi:
            self.mpicompute()
            return
        nsnps = self.gt.shape[0]
        results = self.slavejob(self.gt, self.gx, self.sigbeta2, self.sigx2,
                                self.maf, 0, nsnps)
        self._pvals, self._qscores, self._mu, self._sigma = results
        return