Example #1
0
    def __init__(self,
                 x,
                 y,
                 sigbeta2,
                 comm,
                 rank,
                 ncore,
                 null='perm',
                 maf=None):
        """Store the inputs, the MPI context and the per-row ``sigx2`` vector.

        Args:
            x: Stored as ``self.gt``; must support ``shape[0]`` and, for the
                'perm' null, ``np.var(..., axis=1)``.
            y: Stored as ``self.gx``.
            sigbeta2: Stored unchanged as ``self.sigbeta2``.
            comm: Communicator object, stored unchanged.
            rank: Rank of this process, stored unchanged.
            ncore: Number of cores; more than one switches ``self.mpi`` on.
            null: Null model name. 'perm' uses the row-wise variance of ``x``
                as ``sigx2``, 'maf' uses a vector of ones. Any other value
                leaves ``sigx2`` undefined.
            maf: Stored unchanged as ``self.maf``.
        """
        # Data and model parameters.
        self.gt, self.gx = x, y
        self.sigbeta2 = sigbeta2

        # Parallel execution context.
        self.comm, self.rank, self.ncore = comm, rank, ncore

        self.null = null
        self.maf = maf
        self.mpi = True if self.ncore > 1 else False

        # Results are filled in later; start empty.
        self._pvals = self._qscores = None
        self._mu = self._sigma = None

        if self.null == 'maf':
            self.sigx2 = np.ones(self.gt.shape[0])
        elif self.null == 'perm':
            self.sigx2 = np.var(self.gt, axis=1)

        self.logger = MyLogger(__name__)
Example #2
0
 def __init__(self, x, y, comm, rank, ncore, masks = None):
     """Store the inputs and the MPI context; no computation happens here.

     Args:
         x: Stored as ``self.gt``.
         y: Stored as ``self.gx``.
         comm: Communicator object, stored unchanged.
         rank: Rank of this process, stored unchanged.
         ncore: Number of cores; more than one switches ``self.mpi`` on.
         masks: Optional masks; ``self.usemask`` records whether any were
             supplied.
     """
     self.gt, self.gx = x, y
     self.comm, self.rank, self.ncore = comm, rank, ncore
     self._zstats = None  # no scores computed yet
     self.masks = masks
     self.usemask = masks is not None
     self.mpi = True if self.ncore > 1 else False
     self.logger = MyLogger(__name__)
Example #3
0
 def __init__(self, args):
     """Keep a reference to ``args`` and reset every cached dataset to None.

     Args:
         args: Options object, stored unchanged as ``self.args``.
     """
     self.logger = MyLogger(__name__)
     self.args = args
     # Every private data slot starts out empty.
     for slot in ('_gtcent', '_gtnorm', '_snpinfo', '_geneinfo', '_expr',
                  '_cismaskcomp', '_cismasklist', '_tgene_gtnorm',
                  '_tgene_gtcent', '_tgene_expr'):
         setattr(self, slot, None)
Example #4
0
 def __init__(self, x, y, comm, rank, ncore, outfile, niter=100000, seed=None):
     """Store the inputs and the MPI context, and optionally seed NumPy's RNG.

     Args:
         x: Stored as ``self.gt``.
         y: Stored as ``self.gx``.
         comm: Communicator object, stored unchanged.
         rank: Rank of this process.
         ncore: Number of cores; more than one switches ``self.mpi`` on.
         outfile: Output path, stored unchanged.
         niter: Iteration count, stored as ``self._niter``.
         seed: Optional integer seed for NumPy's global random generator.
             With a single core the seed is used as given; with several
             cores each rank uses ``seed + rank``.
     """
     self.gt = x
     self.gx = y
     self._niter = niter
     self.outfile = outfile
     self.rank = rank
     self.comm = comm
     self.ncore = ncore
     self.mpi = True if self.ncore > 1 else False
     if seed is not None:
         # Seeding every rank with the same value makes all ranks draw the
         # same random stream, so their results would be duplicates of
         # each other. Offset by rank to keep runs reproducible but
         # independent across ranks.
         np.random.seed(seed + self.rank if self.mpi else seed)
     self.logger = MyLogger(__name__)
Example #5
0
 def __init__(self,
              gtfile,
              samplefile,
              startsnp=0,
              endsnp=1e15,
              isdosage=True):
     """Record the file locations and SNP range, then read the genotypes.

     Args:
         gtfile: Genotype file path, stored as ``self._gtfile``.
         samplefile: Sample file path, stored as ``self._samplefile``.
         startsnp: Stored as ``self._startsnp``.
         endsnp: Stored as ``self._endsnp``.
         isdosage: When true ``self._meta_columns`` is 6, otherwise 5.
     """
     self.logger = MyLogger(__name__)
     self._gtfile, self._samplefile = gtfile, samplefile
     self._startsnp, self._endsnp = startsnp, endsnp
     self._isdosage = isdosage
     # NOTE(review): presumably the number of leading non-genotype columns
     # per record -- confirm against _read_genotypes.
     self._meta_columns = 6 if self._isdosage else 5
     self._read_genotypes()
Example #6
0
    def __init__(self,
                 x,
                 y,
                 comm,
                 rank,
                 ncore,
                 qcalc,
                 masks,
                 get_pvals=False,
                 qnull_file=None,
                 statmodel='zstat',
                 target_fdr=None):
        """Store the inputs, MPI context and run options.

        Args:
            x: Stored as ``self.gt``.
            y: Stored as ``self.gx``.
            comm: Communicator object, stored unchanged.
            rank: Rank of this process, stored unchanged.
            ncore: Number of cores; more than one switches ``self.mpi`` on.
            qcalc: Stored unchanged as ``self.qcalc``.
            masks: Masks or None; ``self.usemask`` records whether any were
                supplied.
            get_pvals: Truthy to set ``self.get_empirical_pvals``.
            qnull_file: Stored unchanged as ``self.qnull_file``.
            statmodel: Stored unchanged as ``self.statmodel``.
            target_fdr: Stored unchanged; when not None, the empty lists
                ``adj_pvals`` and ``pass_fdr`` are created as well.
        """
        self.gt, self.gx = x, y
        self.comm, self.rank, self.ncore = comm, rank, ncore
        self.mpi = True if self.ncore > 1 else False

        # Results are filled in later; start empty.
        self._pvals = self._qscores = self._jpa_pvals = None

        self.qcalc = qcalc
        self.masks = masks
        self.usemask = masks is not None
        self.get_empirical_pvals = True if get_pvals else False
        self.qnull_file = qnull_file
        self.statmodel = statmodel
        self.logger = MyLogger(__name__)

        self.target_fdr = target_fdr
        if self.target_fdr is not None:
            # Only allocated when FDR correction is requested.
            self.adj_pvals = []
            self.pass_fdr = []
Example #7
0
    def __init__(self, comm, rank):
        """Parse the command line on rank 0, broadcast it and unpack it.

        Args:
            comm: Communicator providing ``bcast``.
            rank: Rank of this process; only rank 0 parses and logs.
        """
        self.logger = MyLogger(__name__)
        self.rank = rank
        self.comm = comm

        # Only the root rank parses; every rank receives the result.
        parsed = self.parse_args() if self.rank == 0 else None
        opts = self.comm.bcast(parsed, root=0)

        # Genotype inputs.
        self.vcf_file = opts.vcf_filename
        self.oxf_file = opts.oxf_filename
        self.isdosage = opts.isdosage
        self.fam_file = opts.fam_filename
        self.chrom = int(opts.chrom) if opts.chrom is not None else None

        # Expression / annotation inputs and output prefix.
        self.gx_file = opts.gx_filename
        self.gxcorr_file = opts.gxcorr_filename
        self.gx_datafmt = opts.gx_datafmt
        self.gtf_file = opts.gtf_filename
        self.gxtrim = opts.gxtrim
        self.biotype = opts.biotype
        self.outprefix = opts.outprefix

        if opts.incsnps is None:
            self.startsnp = 0
            self.endsnp = 1e15  # an unusually high number to ensure all SNPs are read.
        else:
            # The first bound is shifted down by one; the second is kept.
            self.startsnp = opts.incsnps[0] - 1
            self.endsnp = opts.incsnps[1]

        # Method selection and model parameters.
        self.jpa, self.rr = project.method_selector(opts.method)
        self.nullmodel = opts.nullmodel
        self.cismasking = opts.cismasking
        self.window = opts.window
        self.sigmabeta = opts.sigmabeta
        self.knn_nbr = opts.knn
        self.knncorr = not (opts.knn == 0)

        # Supplying a shuffle file implies shuffling.
        self.shuffle_file = opts.shuffle_file
        self.shuffle = True if self.shuffle_file is not None else opts.shuffle

        self.psnpcut = opts.psnpthres
        self.pgenecut = opts.pgenethres
        self.maf_file = opts.maf_filename
        self.jpanull_file = opts.qnullfile
        self.jpa_calc_null = project.need_new_jpanull_file(
            self.jpa, self.jpanull_file)
        self.jpanull_iter = opts.qnull_iter
        self.seed = opts.seed

        self.maketest = opts.maketest

        self.check_inputs()
        self.crossmapfile = opts.crossmapfile
        self.target_fdr = opts.target_fdr
        self.usefdr = self.target_fdr is not None

        if self.rank == 0:
            self.logger.info('Method: {:s}'.format(opts.method))
            if self.rr:
                self.logger.info('Null Model: {:s}'.format(opts.nullmodel))
                self.logger.info('Sigma_beta: {:g}'.format(opts.sigmabeta))
Example #8
0
class Args():
    """Command-line configuration shared across MPI ranks.

    Rank 0 parses the command line, the parsed namespace is broadcast to
    every rank, copied onto plain attributes, and sanity-checked on rank 0.
    """

    def __init__(self, comm, rank):
        """Parse the command line on rank 0, broadcast it and unpack it.

        Args:
            comm: Communicator providing ``bcast``.
            rank: Rank of this process; only rank 0 parses, validates
                and logs.

        Raises:
            AssertionError: From ``check_inputs`` on rank 0 when a required
                input is missing or unusable.
        """
        self.logger = MyLogger(__name__)
        self.rank = rank
        self.comm = comm

        args = None
        if self.rank == 0:
            args = self.parse_args()
        args = self.comm.bcast(args, root=0)

        self.vcf_file = args.vcf_filename
        self.oxf_file = args.oxf_filename
        self.isdosage = args.isdosage
        self.fam_file = args.fam_filename
        if args.chrom is not None:
            self.chrom = int(args.chrom)
        else:
            self.chrom = None
        self.gx_file = args.gx_filename
        self.gxcorr_file = args.gxcorr_filename
        self.gx_datafmt = args.gx_datafmt
        self.gtf_file = args.gtf_filename
        self.gxtrim = args.gxtrim
        self.biotype = args.biotype
        self.outprefix = args.outprefix
        if args.incsnps is not None:
            # The first bound is shifted down by one; the second is kept.
            self.startsnp = args.incsnps[0] - 1
            self.endsnp = args.incsnps[1]
        else:
            self.startsnp = 0
            self.endsnp = 1e15  # an unusually high number to ensure all SNPs are read.

        self.jpa, self.rr = project.method_selector(args.method)
        self.nullmodel = args.nullmodel
        self.cismasking = args.cismasking
        self.window = args.window
        self.sigmabeta = args.sigmabeta
        self.knn_nbr = args.knn
        self.knncorr = True
        if args.knn == 0:
            self.knncorr = False

        self.shuffle = args.shuffle
        self.shuffle_file = args.shuffle_file
        if self.shuffle_file is not None:
            # Supplying a shuffle file implies shuffling.
            self.shuffle = True

        self.psnpcut = args.psnpthres
        self.pgenecut = args.pgenethres
        self.maf_file = args.maf_filename
        self.jpanull_file = args.qnullfile
        self.jpa_calc_null = project.need_new_jpanull_file(
            self.jpa, self.jpanull_file)
        self.jpanull_iter = args.qnull_iter
        self.seed = args.seed

        self.maketest = args.maketest

        self.check_inputs()
        self.crossmapfile = args.crossmapfile
        self.usefdr = False
        self.target_fdr = args.target_fdr
        if self.target_fdr is not None:
            self.usefdr = True

        if self.rank == 0:
            self.logger.info('Method: {:s}'.format(args.method))
            if self.rr:
                self.logger.info('Null Model: {:s}'.format(args.nullmodel))
                self.logger.info('Sigma_beta: {:g}'.format(args.sigmabeta))

    def parse_args(self):
        """Build the argument parser and parse the process command line.

        Returns:
            argparse.Namespace: The parsed options.
        """

        self.logger.info('Running TEJAAS v{:s}'.format(project.version()))

        parser = argparse.ArgumentParser(
            description='Tejaas: Discover trans-eQTLs!')

        parser.add_argument('--vcf',
                            type=str,
                            dest='vcf_filename',
                            metavar='FILE',
                            help='Input VCF file in vcf.gz format')

        parser.add_argument('--oxf',
                            type=str,
                            dest='oxf_filename',
                            metavar='FILE',
                            help='Input Oxford file')

        parser.add_argument('--dosage',
                            dest='isdosage',
                            action='store_true',
                            help='Read dosages')

        parser.add_argument('--fam',
                            type=str,
                            dest='fam_filename',
                            metavar='FILE',
                            help='Input fam file')

        parser.add_argument('--chrom',
                            dest='chrom',
                            metavar='NUMBER',
                            help="Chromosome number of the genotype file")

        parser.add_argument(
            '--include-SNPs',
            type=snprange,
            dest='incsnps',
            metavar='START:END',
            help='Colon-separated index of SNPs to be included')

        parser.add_argument(
            '--gx',
            type=str,
            dest='gx_filename',
            metavar='FILE',
            help='input expression file for finding trans-eQTLs')

        parser.add_argument(
            '--gxcorr',
            type=str,
            dest='gxcorr_filename',
            metavar='FILE',
            help='input expression file for finding target genes')

        parser.add_argument(
            '--gxfmt',
            type=str,
            dest='gx_datafmt',
            metavar='GX_FORMAT',
            default='gtex',
            help=
            'Format of input gene expression file. Supported: gtex, cardiogencis and geuvadis'
        )

        parser.add_argument(
            '--biotype',
            nargs='*',
            type=biotype_fmt,
            dest='biotype',
            metavar='BIOTYPE_OPTIONS',
            default=['protein_coding', 'lncRNA'],
            help=
            'List of biotypes to be selected from the GENCODE annotation file. Supported options: protein_coding, lncRNA'
        )

        parser.add_argument('--gtf',
                            type=str,
                            dest='gtf_filename',
                            metavar='FILE',
                            help='input gtf file')

        parser.add_argument(
            '--trim',
            dest='gxtrim',
            action='store_true',
            help='Trim version number from GENCODE Ensembl IDs')

        parser.add_argument('--outprefix',
                            type=str,
                            dest='outprefix',
                            default='out',
                            metavar='STR',
                            help='prefix for all output files')

        parser.add_argument('--method',
                            default='rr',
                            type=method_strings,
                            dest='method',
                            metavar='STR',
                            help='which method to run: jpa / rr')

        parser.add_argument('--null',
                            default='perm',
                            type=null_strings,
                            dest='nullmodel',
                            metavar='STR',
                            help='which null model to use: perm / maf')

        parser.add_argument(
            '--cismask',
            dest='cismasking',
            action='store_true',
            help='Generate cismasks for the expression matrix for each SNP')

        # argparse applies ``type`` only to string defaults, so the default
        # must already be an int (it used to be the float 1e6).
        parser.add_argument(
            '--window',
            type=int,
            default=1000000,
            dest='window',
            help='Window (number of base pairs) used for masking cis genes')

        parser.add_argument(
            '--prior-sigma',
            default=0.1,
            type=float,
            dest='sigmabeta',
            metavar='FLOAT',
            help=
            'standard deviation of the normal prior for reverse multiple linear regression'
        )

        parser.add_argument(
            '--knn',
            type=int,
            dest='knn',
            help=
            'Number of neighbours for KNN (use 0 if you do not want KNN correction)',
            default=0)

        parser.add_argument(
            '--psnpthres',
            default=0.0001,
            type=float,
            dest='psnpthres',
            metavar='PVAL',
            help=
            'target genes will be reported for trans-eQTLs, which are below this threshold p-value for RR/JPA statistics'
        )

        parser.add_argument(
            '--pgenethres',
            default=0.001,
            type=float,
            dest='pgenethres',
            metavar='PVAL',
            help=
            'target genes whose linear regression association with trans-eQTLs are below this threshold p-value will be reported'
        )

        parser.add_argument(
            '--jpanull',
            type=str,
            dest='qnullfile',
            help='Filename for storing / reading null JPA scores')

        parser.add_argument(
            '--jpanull-iter',
            default=100000,
            type=int,
            dest='qnull_iter',
            help='Number of iterations for creating null JPA scores')

        parser.add_argument(
            '--seed',
            default=None,
            type=int,
            dest='seed',
            help=
            'Seed the random generator for numpy, used for development purpose'
        )

        parser.add_argument(
            '--maf-file',
            type=str,
            dest='maf_filename',
            metavar='FILE',
            help='file name of the MAF, see Documentation for filetype')

        parser.add_argument('--shuffle',
                            dest='shuffle',
                            action='store_true',
                            help='Shuffle the genotypes randomly')

        parser.add_argument(
            '--shuffle-with',
            type=str,
            dest='shuffle_file',
            metavar='FILE',
            help='Shuffle the genotypes using the supplied donor IDs file')

        parser.add_argument('--test',
                            dest='maketest',
                            action='store_true',
                            help='whether to do test run')

        parser.add_argument('--crossmap',
                            type=str,
                            default=None,
                            dest='crossmapfile',
                            help='Crossmapability file (Saha, Battle 2018) ')

        parser.add_argument(
            '--fdrgenethres',
            default=None,
            type=float,
            dest='target_fdr',
            metavar='FDR',
            help=
            'enable FDR correction up to a certain cutoff for target gene discovery'
        )

        res = parser.parse_args()
        return res

    def _fail(self, message):
        """Print ``message`` and abort validation with an AssertionError."""
        print(message)
        raise AssertionError(message)

    def check_inputs(self):
        '''
        Perform sanity checks on the input options.

        Only rank 0 performs the checks; other ranks return immediately.
        The checks are explicit conditionals rather than ``assert``
        statements so that they still run under ``python -O``. The
        exception type stays AssertionError for existing callers.

        Raises:
            AssertionError: If no genotype file is given, if an Oxford file
                is given without a fam file, if the expression or GTF file
                is not specified, if any specified input file does not
                exist, or if the output directory is not a writable
                directory.
            OSError: If the output directory cannot be created.
        '''
        if self.rank != 0:
            return

        # Check if any genotype file is specified.
        if self.vcf_file is None and self.oxf_file is None:
            self._fail(
                'Input error: Specify either --vcf or --oxf. See --help for details.'
            )

        # An Oxford file additionally requires the sample file.
        if self.oxf_file is not None and self.fam_file is None:
            self._fail(
                'Input error: Specify the sample file with --fam. See --help for details.'
            )

        # Check if gene expression and GTF files are specified.
        if self.gx_file is None:
            self._fail(
                'Input error: Specify gene expression file. See --help for details'
            )
        if self.gtf_file is None:
            self._fail(
                'Input error: Specify GENCODE file. See --help for details'
            )

        # Check if files exist.
        for filepath in [
                self.vcf_file, self.oxf_file, self.fam_file, self.gx_file,
                self.gtf_file, self.gxcorr_file
        ]:
            if filepath is not None and not os.path.isfile(filepath):
                self._fail('File {:s} does not exist.'.format(filepath))

        # Check if output directory is writable / can be created.
        outdir = os.path.dirname(os.path.realpath(self.outprefix))
        try:
            if not os.path.exists(outdir):
                os.makedirs(outdir)
        except OSError as e:
            # EEXIST is tolerated here; the directory check below decides.
            if e.errno != errno.EEXIST:
                print('Unable to create output directory: {:s}'.format(
                    outdir))
                raise

        if not (os.path.isdir(outdir)
                and os.access(outdir, os.W_OK | os.X_OK)):
            self._fail('Unable to create files in {:s}'.format(outdir))
Example #9
0
class JPANULL:
    """Simulate null JPA scores from the covariance of observed Z-statistics.

    ``compute`` learns the mean and eigendecomposition of the Z-score
    covariance on rank 0, draws ``niter`` random Z vectors (split across
    ranks when running under MPI), converts them to two-sided p-values,
    scores each draw with ``jpascore`` and writes the scores to ``outfile``.
    """

    def __init__(self, x, y, comm, rank, ncore, outfile, niter=100000, seed=None):
        """Store the inputs and the MPI context, and optionally seed NumPy's RNG.

        Args:
            x: Stored as ``self.gt`` and passed to ``ZSTATS``.
            y: Stored as ``self.gx`` and passed to ``ZSTATS``.
            comm: Communicator used for bcast / scatter / gather.
            rank: Rank of this process.
            ncore: Number of cores; more than one switches ``self.mpi`` on.
            outfile: Path the null scores are written to.
            niter: Total number of null draws.
            seed: Optional integer seed for NumPy's global random generator.
                With a single core the seed is used as given; with several
                cores each rank uses ``seed + rank``.
        """
        self.gt = x
        self.gx = y
        self._niter = niter
        self.outfile = outfile
        self.rank = rank
        self.comm = comm
        self.ncore = ncore
        self.mpi = False
        if self.ncore > 1:
            self.mpi = True
        if seed is not None:
            # Seeding every rank with the same value makes all ranks draw the
            # same random vectors, so the gathered null scores would be
            # duplicates of each other. Offset by rank to keep runs
            # reproducible but independent across ranks.
            np.random.seed(seed + self.rank if self.mpi else seed)
        self.logger = MyLogger(__name__)


    def write_qscores(self):
        """Write the null scores to ``self.outfile``, one per line.

        Entries equal to the maximum score are excluded from the output.
        """
        qmax = np.max(self._qscores)
        valid_qscores = self._qscores[np.where(self._qscores < qmax)]
        with open(self.outfile, 'w') as fout:
            for qnull in valid_qscores:
                fout.write(f"{qnull}\n")


    def jpascore(self, pvals):
        """Return the JPA score of one vector of p-values.

        The p-values are sorted, the smallest ``min(100, n)`` are turned
        into ``-(log p_k - (digamma(k) - digamma(n + 1)))`` and the maximum
        of the running sum is returned.

        Note:
            Exact zeros in ``pvals`` are replaced IN PLACE by the smallest
            non-zero entry, so the caller's array is modified. A vector
            with no non-zero entry makes ``np.min`` raise ValueError.
        """
        min_nonzero = np.min(pvals[np.nonzero(pvals)])
        pvals[pvals == 0] = min_nonzero
        p = np.sort(pvals)
        n = p.shape[0]
        kmax = min(100, n)
        krange = [i + 1 for i in range(kmax)]
        digamma_n1 = special.digamma(n + 1)
        z = - ( np.log(p[:kmax]) - (special.digamma(krange) - digamma_n1) )
        zsum = np.cumsum(z)
        res = np.max(zsum)
        return res


    def slavejob(self, W, Q, Zmean, n):
        """Draw ``n`` null Z vectors and score each of them.

        Args:
            W: 1-D array of eigenvalues (its length is the gene count).
            Q: Matrix of eigenvectors matching ``W``.
            Zmean: Mean Z-score vector added to every draw.
            n: Number of draws to generate.

        Returns:
            tuple: ``(pvals, qnull)`` where ``pvals`` is the flat array of
            ``n * ngene`` two-sided p-values and ``qnull`` holds the ``n``
            JPA scores.
        """
        self.logger.debug('Rank {:d} calculating {:d} null JPA scores'.format(self.rank, n))
        ngene = W.shape[0]
        sqrt_w = np.sqrt(W)  # loop-invariant scaling of the eigenvectors
        pvals = np.zeros(n * ngene)
        for i in range(n):
            zrand = np.random.normal(0, 1, size = ngene)
            znull = Zmean + np.einsum('ij, j, j', Q, sqrt_w, zrand)
            # Use the survival function: 1 - cdf(|z|) cancels to exactly 0
            # for |z| above about 8.3, which would produce zero p-values.
            pvals[i*ngene : (i+1)*ngene] = 2.0 * stats.norm.sf(np.abs(znull))
        qnull = np.array([self.jpascore(pvals[i*ngene : (i+1)*ngene]) for i in range(n)])
        return pvals, qnull


    def mpicompute(self):
        """Split the null draws across ranks and gather the results on rank 0.

        Rank 0 broadcasts W, Q and Zmean and scatters the per-rank draw
        counts. Every rank runs ``slavejob``. The flat p-values are
        collected with ``Gatherv`` and the scores with ``gather``; rank 0
        stores them in ``self._pvals`` and ``self._qscores``.
        """
        if self.rank == 0:
            # this is the master
            # create a list of N for sending to your slaves
            thisW = self._W
            thisQ = self._Q
            thisZmean = self._Zmean
            start, end = mpihelper.split_n(self._niter, self.ncore)
            nlist = [x - y for x, y in zip(end, start)]
        else:
            thisW = None
            thisQ = None
            thisZmean = None
            nlist = None
            slave_W = None
            slave_Q = None
            slave_Zmean = None
            slave_n = None

        slave_W = self.comm.bcast(thisW, root = 0)
        slave_Q = self.comm.bcast(thisQ, root = 0)
        slave_Zmean = self.comm.bcast(thisZmean, root = 0)
        slave_n = self.comm.scatter(nlist, root = 0)
        self.comm.barrier()

        if self.rank == 0: self.logger.debug("Broadcast W, Q and Zmean to the slave nodes")

        # ==================================
        # Data sent. Do the calculations
        # ==================================
        pvals, qscores = self.slavejob(slave_W, slave_Q, slave_Zmean, slave_n)

        # ==================================
        # Collect the results
        # ==================================
        recvbuf = None
        if self.rank == 0:
            self.logger.debug("Number of SNPs sent to each slave: " + ", ".join(["{:d}".format(x) for x in nlist]))
            ngene = self._W.shape[0]
            # Per-rank element counts of the flattened p-value buffers.
            flat_sizes = np.array([n * ngene for n in nlist])
            recvbuf = np.zeros(sum(flat_sizes), dtype=np.float64)
        else:
            flat_sizes = None
        self.comm.Gatherv(sendbuf=pvals, recvbuf=(recvbuf, flat_sizes), root = 0)

        if self.rank == 0:
            self._pvals = recvbuf.reshape(sum(nlist), ngene)

        qscores = self.comm.gather(qscores, root = 0)

        if self.rank == 0:
            self._qscores = np.concatenate(qscores)
        else:
            assert qscores is None
            assert recvbuf is None
        return


    def WQ_mpiwrap(self):
        '''
        Populates self._W, self._Q and self._Zmean
        Learns the null Zstats from the ZSTATS class
        and performs eigendecomposition in the master node.
        All three stay None on every other rank.
        '''

        self._W = None
        self._Q = None
        self._Zmean = None

        if self.rank == 0: self.logger.debug("Computing Z-stats")
        zstats = ZSTATS(self.gt, self.gx, self.comm, self.rank, self.ncore)
        zstats.compute()

        if self.rank == 0:
            self.logger.debug("Computing W and Q")
            zscores = zstats.scores
            C = np.cov(zscores.T)
            # Numpy gives imaginary eigenvalues, use eigh from scipy
            # for decomposition of real symmetric matrix
            W, Q = eigh(C)
            self.logger.debug("Eigendecomposition done")
            # Some eigenvalues can still come out negative; clamp them to
            # zero so that the square root taken in slavejob is defined.
            W[np.where(W < 0)] = 0
            self.logger.debug("Forced negative eigenvalues to zero")
            Zmean = np.mean(zscores, axis = 0)
            self._W = W
            self._Q = Q
            self._Zmean = Zmean
        self.comm.barrier()


    def compute(self):
        """Run the whole pipeline and write the null scores on rank 0."""
        self.WQ_mpiwrap()
        if self.rank == 0: self.logger.debug("Start MPI calculation")
        if self.mpi:
            self.mpicompute()
        else:
            pvals, qscores = self.slavejob(self._W, self._Q, self._Zmean, self._niter)
            self._pvals = pvals.reshape(self._niter, self._W.shape[0])
            self._qscores = qscores
        if self.rank == 0:
            self.logger.debug("Null JPA-scores calculated. Writing to file.")
            self.write_qscores()
        return
Example #10
0
class ZSTATS:
    """Z-statistics of a linear regression, computed by a compiled library.

    ``compute`` fills ``self._zstats`` (exposed as ``scores``), either
    directly or by splitting the genotype rows across MPI ranks.
    """

    def __init__(self, x, y, comm, rank, ncore, masks = None):
        """Store the inputs and the MPI context; no computation happens here.

        Args:
            x: Stored as ``self.gt``.
            y: Stored as ``self.gx``.
            comm: Communicator used for scatter / bcast / Gatherv.
            rank: Rank of this process.
            ncore: Number of cores; more than one switches ``self.mpi`` on.
            masks: Optional masks; ``self.usemask`` records whether any
                were supplied.
        """
        self.gt, self.gx = x, y
        self.comm, self.rank, self.ncore = comm, rank, ncore
        self._zstats = None  # no scores computed yet
        self.masks = masks
        self.usemask = masks is not None
        self.mpi = True if self.ncore > 1 else False
        self.logger = MyLogger(__name__)


    @property
    def scores(self):
        """The computed Z-statistics, or None before ``compute`` has run."""
        return self._zstats


    def clinreg(self, geno, expr, nrow):
        """Call the compiled ``fit`` routine and return its flat output.

        Args:
            geno: 2-D array; ``shape[0]`` and ``shape[1]`` are passed to the
                library as the SNP and sample counts.
            expr: Array whose ``shape[0]`` is passed as the gene count.
            nrow: Unused.

        Returns:
            1-D float array of length ``geno.shape[0] * expr.shape[0]``
            filled in by the library.
        """
        here = os.path.dirname(__file__)
        lib = np.ctypeslib.load_library('../lib/linear_regression_zstat.so', here)

        # Flat, contiguous, aligned array of C doubles.
        dbl_vec = np.ctypeslib.ndpointer(ctypes.c_double, ndim=1, flags='C_CONTIGUOUS, ALIGNED')
        fit = lib.fit
        fit.restype = ctypes.c_int
        fit.argtypes = [dbl_vec, dbl_vec,
                        ctypes.c_int, ctypes.c_int, ctypes.c_int,
                        dbl_vec]

        flat_geno = geno.reshape(-1,)
        flat_expr = expr.reshape(-1,)
        n_snp = geno.shape[0]
        n_sample = geno.shape[1]
        n_gene = expr.shape[0]
        out = np.zeros(n_snp * n_gene)
        # The integer status returned by the library is not inspected.
        fit(flat_geno, flat_expr, n_snp, n_gene, n_sample, out)
        return out


    def slavejob(self, geno, expr, nmax, offset):
        """Log the SNP range handled by this rank and run the regression."""
        first, last = offset + 1, nmax + offset
        self.logger.debug('Rank {:d} calculating SNPs {:d} to {:d}'.format(self.rank, first, last))
        return self.clinreg(geno, expr, nmax)


    def mpicompute(self):
        """Scatter genotype chunks, compute per rank, gather on rank 0."""
        is_root = self.rank == 0

        # Only the root holds real payloads; other ranks pass None into
        # the collectives.
        geno = expr = nsnp = offset = None
        if is_root:
            geno, offset = mpihelper.split_genotype(self.gt, self.ncore)
            expr = self.gx
            nsnp = [chunk.shape[0] for chunk in geno]

        my_geno = self.comm.scatter(geno, root = 0)
        my_expr = self.comm.bcast(expr, root = 0)
        my_nsnp = self.comm.scatter(nsnp, root = 0)
        my_offs = self.comm.scatter(offset, root = 0)
        self.comm.barrier()

        # Every rank works on its own chunk.
        zstat = self.slavejob(my_geno, my_expr, my_nsnp, my_offs)

        # Collect the flat per-rank results on the root.
        recvbuf = None
        flat_sizes = None
        if is_root:
            self.logger.debug("Number of SNPs sent to each slave: " + ", ".join("{:d}".format(count) for count in nsnp))
            ngene = self.gx.shape[0]
            flat_sizes = np.array([count * ngene for count in nsnp])
            recvbuf = np.zeros(sum(flat_sizes), dtype=np.float64)
        self.comm.Gatherv(sendbuf=zstat, recvbuf=(recvbuf, flat_sizes), root = 0)

        if is_root:
            self._zstats = recvbuf.reshape(sum(nsnp), ngene)
        else:
            assert recvbuf is None

        return


    def compute(self):
        """Fill ``self._zstats`` using MPI when enabled, else in-process."""
        if self.mpi:
            self.mpicompute()
            return
        n_snp = self.gt.shape[0]
        flat = self.slavejob(self.gt, self.gx, n_snp, 0)
        self._zstats = flat.reshape(n_snp, self.gx.shape[0])
        return
Example #11
0
class Data():
    '''Container for the genotype, expression and annotation data of one run.

    All data attributes start as None and are filled in by `load()`; they are
    exposed read-only through the properties of this class.
    '''

    def __init__(self, args):
        '''Store the parsed options; no file is read here.

        args: option object whose attributes (file names, flags) are read
              by `load()`.
        '''
        self.logger = MyLogger(__name__)
        self.args = args
        # Genotype used for discovery (centered / normalized dosage)
        self._gtcent = None
        self._gtnorm = None
        # Annotation of the SNPs and genes that survive filtering
        self._snpinfo = None
        self._geneinfo = None
        # Expression used for discovery
        self._expr = None
        # Cis-masks (only filled when args.cismasking is set)
        self._cismaskcomp = None
        self._cismasklist = None
        # Genotype / expression kept without KNN correction or shuffling
        self._tgene_gtnorm = None
        self._tgene_gtcent = None
        self._tgene_expr = None

    @property
    def geno_centered(self):
        '''Row-mean-centered dosage set by `load()` (KNN-corrected and / or
        donor-shuffled when those options are enabled); None before `load()`.'''
        return self._gtcent

    @property
    def geno_normed(self):
        '''MAF-normalized dosage set by `load()` (KNN-corrected and / or
        donor-shuffled when those options are enabled); None before `load()`.'''
        return self._gtnorm

    @property
    def snpinfo(self):
        '''SNP records that passed `filter_snps()`; None before `load()`.'''
        return self._snpinfo

    @property
    def geneinfo(self):
        '''Gene records selected by `select_genes()`; None before `load()`.'''
        return self._geneinfo

    @property
    def cismasks_comp(self):
        '''Compressed cis-mask list; stays None unless cis-masking is enabled.'''
        return self._cismaskcomp

    @property
    def cismasks_list(self):
        '''Per-SNP cis-mask list; stays None unless cis-masking is enabled.'''
        return self._cismasklist

    @property
    def expression(self):
        '''Normalized expression of the selected genes and donors
        (KNN-corrected when enabled); None before `load()`.'''
        return self._expr

    @property
    def tgene_geno_normed(self):
        '''MAF-normalized dosage computed from the filtered dosage, without
        KNN correction or shuffling; None before `load()`.'''
        return self._tgene_gtnorm

    @property
    def tgene_geno_centered(self):
        '''Row-mean-centered dosage computed from the filtered dosage, without
        KNN correction or shuffling; None before `load()`.'''
        return self._tgene_gtcent

    @property
    def tgene_expression(self):
        '''Expression kept alongside the tgene genotype: the confounder-corrected
        expression when args.gxcorr_file is given, the regular one otherwise.'''
        return self._tgene_expr

    def select_donors(self, vcf_donors, expr_donors):
        ''' Make sure that donors are in the same order for both expression and genotype

        Returns two integer index arrays (vcfmask, exprmask) of equal length.
        Entry k of each array points at the same donor in `vcf_donors` and
        `expr_donors` respectively; donors are listed in genotype order and
        donors missing from either input are dropped. For a repeated donor
        ID the first position is used.
        '''
        # Position of the first occurrence of every donor; dictionaries make
        # each lookup O(1) instead of a list scan per donor.
        vcf_pos = dict()
        for pos, donor in enumerate(vcf_donors):
            vcf_pos.setdefault(donor, pos)
        expr_pos = dict()
        for pos, donor in enumerate(expr_donors):
            expr_pos.setdefault(donor, pos)

        common_donors = [x for x in vcf_donors if x in expr_pos]
        # dtype=int keeps the masks usable as index arrays even when no donor
        # is shared (an empty default array would be float).
        vcfmask = np.array([vcf_pos[x] for x in common_donors], dtype=int)
        exprmask = np.array([expr_pos[x] for x in common_donors], dtype=int)
        return vcfmask, exprmask

    def select_genes(self, info, names):
        ''' Select genes which would be analyzed. 
            Make sure the indices are not mixed up

        info:  gene records exposing an `ensembl_id` attribute.
        names: gene IDs in the order of the expression rows.

        Returns (genes, indices): the records of `info` whose ID occurs in
        `names` (in `info` order) and an integer array with the position of
        each of them in `names` (first occurrence).
        '''
        # First position of every name, for O(1) lookups.
        name_pos = dict()
        for pos, name in enumerate(names):
            name_pos.setdefault(name, pos)

        genes = [x for x in info if x.ensembl_id in name_pos]
        indices = [name_pos[x.ensembl_id] for x in genes]
        # dtype=int so that an empty selection is still a valid index array.
        return genes, np.array(indices, dtype=int)

    def match_gx_indices(self, ref_gx, ref_donors, ref_gnames, gx, donors,
                         gnames):
        '''Match the indices of gx with those of ref_gx
           Both gx and ref_gx are of size G x N
           G = genes (gnames), N = donors
        '''
        gidx = np.array([gnames.index(x) for x in ref_gnames if x in gnames])
        didx = np.array([donors.index(x) for x in ref_donors if x in donors])
        if (gidx.shape[0] != len(ref_gnames)) or (didx.shape[0] !=
                                                  len(ref_donors)):
            self.logger.error(
                "Gene expression files have different donors and / or gene names. Please check. Program cancelled!"
            )
            raise
        return gx[:, didx][gidx, :]

    def HWEcheck(self, x):
        '''Chi-square test (1 degree of freedom) for Hardy-Weinberg equilibrium.

        x: genotypes coded 0, 1 or 2 (other values are ignored).

        Returns the p-value. When the statistic is undefined (no counted
        genotype, or only one homozygous class observed) 1.0 is returned,
        i.e. no evidence of deviation.
        '''
        gt = np.asarray(x)
        n_aa = int(np.count_nonzero(gt == 0))
        n_ab = int(np.count_nonzero(gt == 1))
        n_bb = int(np.count_nonzero(gt == 2))
        n = n_aa + n_ab + n_bb
        denom = (2 * n_aa + n_ab) * (2 * n_bb + n_ab)
        if denom == 0:
            # Monomorphic or empty input: the statistic would be 0/0 (NaN).
            return 1.0
        X2 = n * ((4 * n_aa * n_bb - n_ab**2) / denom)**2
        # The survival function keeps precision for very small p-values,
        # where 1 - cdf rounds to exactly 0.
        pval = ss.chi2.sf(X2, 1)
        return pval

    def filter_snps(self, snpinfo, dosage, maf_limit=0.01, use_hwe=False):
        '''Predixcan style filtering of SNPs.

        A SNP is dropped by the first rule it violates, tested in this order:
        multi-letter alleles, allele symbols missing from SNP_COMPLEMENT,
        strand-ambiguous allele pairs, variant ID '.', reported MAF outside
        [maf_limit, 1 - maf_limit], sample MAF outside the same range and,
        when `use_hwe` is set, an HWE p-value below 1e-6.

        Returns (kept SNP records, np.array of their dosage rows); the `maf`
        field of every kept record is replaced by the MAF of the given samples.
        '''
        kept_snps = list()
        kept_dosage = list()
        # One counter per rejection reason, reported once at the end.
        n_multiletter = 0
        n_bad_symbol = 0
        n_ambiguous = 0
        n_no_rsid = 0
        n_low_maf = 0
        n_low_maf_sample = 0
        n_hwe = 0
        upper_limit = 1 - maf_limit

        for idx, snp in enumerate(snpinfo):
            ref = snp.ref_allele
            alt = snp.alt_allele
            reported_maf = round(snp.maf, 3)
            row = dosage[idx]
            # The sample MAF differs from the reported one because some
            # samples have been removed.
            sample_maf = sum(row) / 2 / len(row)

            if len(ref) > 1 or len(alt) > 1:
                n_multiletter += 1
            elif ref not in SNP_COMPLEMENT or alt not in SNP_COMPLEMENT:
                n_bad_symbol += 1
            elif SNP_COMPLEMENT[ref] == alt:
                n_ambiguous += 1
            elif snp.varid == '.':
                n_no_rsid += 1
            elif not (maf_limit <= reported_maf <= upper_limit):
                n_low_maf += 1
            elif not (maf_limit <= sample_maf <= upper_limit):
                n_low_maf_sample += 1
            elif use_hwe and self.HWEcheck(np.digitize(
                    row, [0.66, 1.33])) < 0.000001:
                # The dosage was binned to integer genotypes 0, 1 or 2
                # before the HWE test.
                n_hwe += 1
            else:
                kept_snps.append(snp._replace(maf=sample_maf))
                kept_dosage.append(row)

        report = self.logger.debug
        report("Removed {:d} SNPs because of non-single letter polymorphisms".
               format(n_multiletter))
        report("Removed {:d} SNPs because of unknown allele symbol".format(
            n_bad_symbol))
        report("Removed {:d} SNPs because of ambiguous strands".format(
            n_ambiguous))
        report("Removed {:d} SNPs because of unknown RSIDs".format(n_no_rsid))
        report("Removed {:d} SNPs because of low MAF < {:g}".format(
            n_low_maf, maf_limit))
        report("Removed {:d} SNPs because of low MAF (current)".format(
            n_low_maf_sample))
        if use_hwe:
            report("Removed {:d} SNPs because of deviation from HWE".format(
                n_hwe))
        return kept_snps, np.array(kept_dosage)

    def normalize_and_center_dosage(self, dosage):
        f = [snp.maf for snp in self._snpinfo]
        f = np.array(f).reshape(-1, 1)
        gtnorm = (dosage - (2 * f)) / np.sqrt(2 * f * (1 - f))
        gtcent = dosage - np.mean(dosage, axis=1).reshape(-1, 1)
        return gtnorm, gtcent

    def load(self):
        '''Read all inputs named in `self.args` and fill the data attributes.

        Steps: read the genotype (Oxford and / or VCF; VCF wins when both are
        given), read the expression (plus the optional confounder-corrected
        expression), read the gene annotation, restrict to common donors and
        annotated genes, filter SNPs, normalize / center the dosage, then
        optionally build cis-masks, apply KNN correction and shuffle the
        donors of the discovery genotype.

        Raises ValueError when neither an Oxford nor a VCF file is given.
        '''
        dosage = None
        gt_donor_ids = None
        snpinfo = None

        ## Read Oxford File
        if self.args.oxf_file:
            oxf = ReadOxford(self.args.oxf_file,
                             self.args.fam_file,
                             self.args.startsnp,
                             self.args.endsnp,
                             isdosage=self.args.isdosage)
            dosage = oxf.dosage
            gt_donor_ids = oxf.samplenames
            snpinfo = oxf.snpinfo

        # Read VCF file
        if self.args.vcf_file:
            vcf = ReadVCF(self.args.vcf_file,
                          self.args.startsnp,
                          self.args.endsnp,
                          samplefile=self.args.fam_file)
            dosage = vcf.dosage
            gt_donor_ids = vcf.donor_ids
            snpinfo = vcf.snpinfo

        if dosage is None:
            # Fail here with a clear message instead of a NameError further
            # down, after the expression files have already been read.
            raise ValueError(
                "No genotype input: an Oxford or a VCF file is required")

        # Read Gene Expression
        self.logger.debug("Reading expression levels for trans-eQTL discovery")
        rpkm = ReadRPKM(self.args.gx_file, self.args.gx_datafmt)
        expression = rpkm.expression
        expr_donors = rpkm.donor_ids
        gene_names = rpkm.gene_names

        # Read confounder corrected gene expression
        if self.args.gxcorr_file is not None:
            self.logger.debug(
                "Reading expression levels for target gene discovery")
            rpkm_corr = ReadRPKM(self.args.gxcorr_file, self.args.gx_datafmt)
            exprcorr = self.match_gx_indices(expression, expr_donors,
                                             gene_names, rpkm_corr.expression,
                                             rpkm_corr.donor_ids,
                                             rpkm_corr.gene_names)

        self.logger.debug("Found {:d} genes of {:d} samples".format(
            expression.shape[0], expression.shape[1]))
        self.logger.debug("Reading gencode file for gene information")

        gene_info = readgtf.gencode(self.args.gtf_file,
                                    trim=self.args.gxtrim,
                                    biotype=self.args.biotype)

        # reorder donors gt and expr
        self.logger.debug(
            "Selecting common samples of genotype and gene expression")
        self.logger.debug(
            "Before expression selection: {:d} genes from {:d} samples".format(
                expression.shape[0], expression.shape[1]))
        vcfmask, exprmask = self.select_donors(gt_donor_ids, expr_donors)
        genes, indices = self.select_genes(gene_info, gene_names)
        expression_selected = rpkm._normalize_expr(
            expression[:, exprmask][indices, :])
        if self.args.gxcorr_file is not None:
            exprcorr_selected = rpkm_corr._normalize_expr(
                exprcorr[:, exprmask][indices, :])
        self._geneinfo = genes

        # The reader may hand the dosage over as a tuple of rows; a tuple
        # cannot be indexed with [:, mask], so convert to a 2-D array first
        # (a no-op for arrays).
        dosage_masked = np.asarray(dosage)[:, vcfmask]
        snpinfo_filtered, dosage_filtered_selected = self.filter_snps(
            snpinfo, dosage_masked)
        self.logger.debug("{:d} SNPs after filtering".format(
            len(snpinfo_filtered)))
        self._snpinfo = snpinfo_filtered

        self.logger.debug(
            "After expression selection: {:d} genes from {:d} samples".format(
                expression_selected.shape[0], expression_selected.shape[1]))
        self.logger.debug("Retained {:d} samples".format(vcfmask.shape[0]))

        ### Until here, all filters have been applied and geneinfo and snpinfo reflect current data ###

        self._tgene_gtnorm, self._tgene_gtcent = self.normalize_and_center_dosage(
            dosage_filtered_selected)
        if self.args.gxcorr_file is not None:
            self._tgene_expr = exprcorr_selected
        else:
            self._tgene_expr = expression_selected

        if self.args.cismasking:
            self.logger.debug("Generate cis-masks for GX matrix for each SNP")
            self._cismasklist = cismasking.get_cismasklist(
                self._snpinfo,
                self._geneinfo,
                self.args.chrom,
                window=self.args.window)
            self._cismaskcomp = cismasking.compress_cismasklist(
                self._cismasklist)
            if self.args.crossmapfile is not None:
                self._cismaskcomp = cismasking.extend_cismask(
                    self._geneinfo, self._cismaskcomp, self.args.crossmapfile)

        if self.args.knncorr:
            self.logger.debug(
                "Applying KNN correction on gene expression and genotype")
            gx_corr, gt_corr = knn.knn_correction(expression_selected.T,
                                                  dosage_filtered_selected,
                                                  self.args.knn_nbr)
            self._expr = rpkm._normalize_expr(gx_corr.T)
            self._gtnorm, self._gtcent = self.normalize_and_center_dosage(
                gt_corr)
        else:
            self.logger.debug("No KNN correction.")
            self._expr = expression_selected
            self._gtnorm = self._tgene_gtnorm.copy()
            self._gtcent = self._tgene_gtcent.copy()

        if self.args.shuffle:
            usedmask = [gt_donor_ids[i] for i in vcfmask]
            if self.args.shuffle_file is not None and os.path.isfile(
                    self.args.shuffle_file):
                self.logger.warn("Shuffling genotype using supplied donor IDs")
                # Context manager so the file handle is closed right away.
                with open(self.args.shuffle_file) as shuffle_stream:
                    rand_donor_ids = [
                        line.strip() for line in shuffle_stream
                    ]
            else:
                self.logger.warn("Shuffling genotype randomly")
                rand_donor_ids = usedmask.copy()
                random.shuffle(rand_donor_ids)
            # Only the discovery genotype is permuted; the tgene copies keep
            # the original donor order.
            rand_index = np.array(
                [usedmask.index(x) for x in rand_donor_ids if x in usedmask])
            self._gtnorm = self._gtnorm[:, rand_index]
            self._gtcent = self._gtcent[:, rand_index]
Example #12
0
class JPA:
    '''Per-SNP joint analysis of SNP-gene association p-values.

    For every SNP a p-value is computed against every gene (linear
    regression done by a C library); optionally these are summarized into
    one JPA-score per SNP and the score is given an empirical p-value.
    '''

    def __init__(self,
                 x,
                 y,
                 comm,
                 rank,
                 ncore,
                 qcalc,
                 masks,
                 get_pvals=False,
                 qnull_file=None,
                 statmodel='zstat',
                 target_fdr=None):
        '''
        x, y:        genotype (SNPs x samples) and expression (genes x samples).
        comm, rank:  MPI communicator and the rank of this process.
        ncore:       number of processes; MPI is used when it exceeds 1.
        qcalc:       compute JPA-scores when true, otherwise scores are zeros.
        masks:       per-SNP gene masks, or None to use all genes.
        get_pvals:   compute empirical p-values for the JPA-scores.
        qnull_file:  optional file with null JPA-scores.
        statmodel:   'fstat' or 'zstat', selects the regression statistic.
        target_fdr:  when given, per-SNP FDR control is run on the p-values.
        '''
        self.gt = x
        self.gx = y
        self._pvals = None
        self._qscores = None
        self._jpa_pvals = None
        self.rank = rank
        self.comm = comm
        self.ncore = ncore
        self.mpi = False
        if self.ncore > 1:
            self.mpi = True

        self.qcalc = qcalc
        self.masks = masks
        self.usemask = True if masks is not None else False
        self.get_empirical_pvals = False
        if get_pvals:
            self.get_empirical_pvals = True
        self.qnull_file = qnull_file
        self.statmodel = statmodel
        self.logger = MyLogger(__name__)
        self.target_fdr = target_fdr
        # The FDR result lists only exist when FDR control was requested.
        if self.target_fdr is not None:
            self.adj_pvals = list()
            self.pass_fdr = list()
    @property
    def jpa_pvals(self):
        '''Empirical p-value of every JPA-score; None until computed, and
        only computed when empirical p-values were requested.'''
        return self._jpa_pvals

    @property
    def pvals(self):
        '''SNP x gene matrix of regression p-values; None until computed.'''
        return self._pvals

    @property
    def scores(self):
        '''One JPA-score per SNP; None until computed.'''
        return self._qscores

    def masked_jpascore(self, pvals, mask):
        usedgenes = np.ones_like(pvals, dtype=np.bool)
        if mask.shape[0] != 0: usedgenes[mask] = False
        res = self.jpascore(pvals[usedgenes])
        return res

    def jpascore(self, pvals):
        '''JPA-score of a set of p-values.

        The (at most 100) smallest p-values are compared on the log scale
        with digamma(k) - digamma(n + 1) for rank k out of n; the score is
        the maximum of the running sum of these differences.
        '''
        ranked = np.sort(pvals)
        ntot = ranked.shape[0]
        ntop = min(100, ntot)
        ranks = list(range(1, ntop + 1))
        reference = special.digamma(ranks) - special.digamma(ntot + 1)
        surprise = -(np.log(ranked[:ntop]) - reference)
        return np.max(np.cumsum(surprise))

    def clinreg_fstat(self, geno, expr, nrow):
        '''P-values of the F-statistic for every SNP-gene pair.

        The statistics are filled in by the `fit` function of
        ../lib/linear_regression.so (relative to this file) and converted
        with an F(1, nsample - 2) distribution. Returns a flat array of
        length nsnps * ngene. `nrow` is not used.
        '''
        libdir = os.path.dirname(__file__)
        lib = np.ctypeslib.load_library('../lib/linear_regression.so', libdir)

        # All array arguments are flat, C-contiguous, aligned double arrays.
        flat_double = np.ctypeslib.ndpointer(ctypes.c_double,
                                             ndim=1,
                                             flags='C_CONTIGUOUS, ALIGNED')
        fit = lib.fit
        fit.restype = ctypes.c_int
        fit.argtypes = [
            flat_double, flat_double, ctypes.c_int, ctypes.c_int,
            ctypes.c_int, flat_double
        ]

        nsnps, nsample = geno.shape[0], geno.shape[1]
        ngene = expr.shape[0]
        fstat = np.zeros(nsnps * ngene)
        # NOTE(review): the integer returned by the C function is not checked.
        fit(geno.reshape(-1, ), expr.reshape(-1, ), nsnps, ngene, nsample,
            fstat)
        return 1 - stats.f.cdf(fstat, 1, nsample - 2)

    def clinreg_zstat(self, geno, expr, nrow):
        '''Two-sided p-values of the Z-statistic for every SNP-gene pair.

        The statistics are filled in by the `fit` function of
        ../lib/linear_regression_zstat.so (relative to this file) and
        converted with a standard normal distribution. Returns a flat array
        of length nsnps * ngene. `nrow` is not used.
        '''
        libdir = os.path.dirname(__file__)
        lib = np.ctypeslib.load_library('../lib/linear_regression_zstat.so',
                                        libdir)

        # All array arguments are flat, C-contiguous, aligned double arrays.
        flat_double = np.ctypeslib.ndpointer(ctypes.c_double,
                                             ndim=1,
                                             flags='C_CONTIGUOUS, ALIGNED')
        fit = lib.fit
        fit.restype = ctypes.c_int
        fit.argtypes = [
            flat_double, flat_double, ctypes.c_int, ctypes.c_int,
            ctypes.c_int, flat_double
        ]

        nsnps, nsample = geno.shape[0], geno.shape[1]
        ngene = expr.shape[0]
        zstat = np.zeros(nsnps * ngene)
        # NOTE(review): the integer returned by the C function is not checked.
        fit(geno.reshape(-1, ), expr.reshape(-1, ), nsnps, ngene, nsample,
            zstat)
        return 2.0 * (1 - stats.norm.cdf(np.abs(zstat)))

    def get_qecdf_fit(self, q, ntop):
        '''Describe the null scores `q` by their ECDF plus a tail fit.

        Returns (qecdf, qcut, lam, prefact):
          qecdf   - ECDF of all null scores,
          qcut    - smallest of the `ntop` largest scores,
          lam     - mean excess of those `ntop` scores over qcut,
          prefact - fraction of the null scores that lie in that tail.
        '''
        qecdf = ECDF(q)
        order = np.argsort(q)
        tail = q[order[-ntop:]]
        qcut = tail[0]
        excess_total = 0
        for qval in tail:
            excess_total += qval - qcut
        lam = (1 / ntop) * excess_total
        prefact = ntop / q.shape[0]
        return qecdf, qcut, lam, prefact

    def p_qscore(self, q, qecdf, qcut, lam, prefact):
        '''Empirical p-value of one score `q`.

        Below `qcut` it is 1 - ECDF(q); from `qcut` upwards it is the
        exponential tail prefact * exp(-(q - qcut) / lam).
        '''
        if q < qcut:
            return 1 - qecdf(q)
        return prefact * np.exp(-(q - qcut) / lam)

    def get_qnull(self):
        ''' 
        This function reads the qnull file, if provided. Otherwise, generates uniform random number.
        Must be called from master and broadcast to the slaves.
        '''
        qmod = np.array([])
        if self.get_empirical_pvals:
            if self.qnull_file is not None and os.path.isfile(self.qnull_file):
                #qnull = self.read_qnull(self.qnull_file)
                self.logger.debug("Read null Q-scores from {:s}".format(
                    self.qnull_file))
                qnull = list()
                with open(self.qnull_file, 'r') as instream:
                    for line in instream:
                        lsplit = line.strip().split()
                        q = float(lsplit[0].strip())
                        qnull.append(q)
                qnull = np.array(qnull)
            else:
                self.logger.debug(
                    "Creating null Q-scores from uniform p-value distribution")
                ngene = 10000
                nsnps = 50000
                qnull = np.array([
                    self.jpascore(np.random.uniform(0, 1, size=ngene))
                    for i in range(nsnps)
                ])
            qmod = qnull[np.isfinite(qnull)]
            self.logger.debug("Obtained {:d} null Q-scores".format(
                qmod.shape[0]))
        return qmod

    def slavejob(self, geno, expr, nmax, offset, masks, qnull):
        '''
        Outputs.
            pvals: p-values from every SNP-gene pair linear regression. Dimension I x G.
            qscores: JPA-score from the above p-values. Dimension I.
            p_jpa: p-values for the significance of JPA, calculated empirically. Dimension I.
        '''
        self.logger.debug('Rank {:d} calculating SNPs {:d} to {:d}'.format(
            self.rank, offset + 1, nmax + offset))
        nsnps = geno.shape[0]
        ngene = expr.shape[0]

        # Simple linear regression calculating either f-statistic or Z-statistic for every SNP-gene pair
        if self.statmodel == 'fstat':
            pvals = self.clinreg_fstat(geno, expr, nmax)
        elif self.statmodel == 'zstat':
            pvals = self.clinreg_zstat(geno, expr, nmax)

        # Calculate JPA-score
        if self.qcalc:
            # calculate JPA for each SNP (using ngene pvals)
            if self.usemask:
                qscores = np.array([
                    self.masked_jpascore(pvals[i * ngene:(i + 1) * ngene],
                                         masks[i]) for i in range(nsnps)
                ])
            else:
                qscores = np.array([
                    self.jpascore(pvals[i * ngene:(i + 1) * ngene])
                    for i in range(nsnps)
                ])
                # nzpvals = pvals.copy()
                # zero_mask = nzpvals == 0
                # nzpmin = np.min(nzpvals[~zero_mask])
                # nzpvals[zero_mask] = nzpmin
                # qscores = np.array([self.jpascore(nzpvals[i*ngene : (i+1)*ngene]) for i in range(nsnps)])
        else:
            qscores = np.zeros(nsnps)

        # Calculate empirical p-values for the JPA-scores
        if self.get_empirical_pvals:
            ntop = min(500, int(qnull.shape[0] / 10))
            qecdf, qcut, lam, prefact = self.get_qecdf_fit(qnull, ntop)
            p_jpa = np.array(
                [self.p_qscore(q, qecdf, qcut, lam, prefact) for q in qscores])
        else:
            p_jpa = np.array([1 for q in qscores])

        return pvals, qscores, p_jpa

    def mpicompute(self):
        '''Split the SNPs over the MPI ranks, run `slavejob` on every rank and
        collect the results on rank 0.

        Rank 0 partitions the genotype and the gene masks and obtains the null
        scores. The expression and the null scores are broadcast; the
        partitions are scattered. Afterwards rank 0 holds `_pvals`
        (SNPs x genes), `_qscores` and, when requested, `_jpa_pvals`, and
        has run the per-SNP FDR control if `target_fdr` is set.
        '''
        on_master = self.rank == 0

        # Only the master owns the inputs; every other rank passes None
        # into the collective calls below.
        geno = expr = nsnp = offset = gmasks = qnull = None
        if on_master:
            geno, offset = mpihelper.split_genotype(self.gt, self.ncore)
            expr = self.gx
            nsnp = [chunk.shape[0] for chunk in geno]
            gmasks = mpihelper.split_genemasks(self.masks, nsnp, offset)
            qnull = self.get_qnull()

        slave_geno = self.comm.scatter(geno, root=0)
        slave_expr = self.comm.bcast(expr, root=0)
        slave_nsnp = self.comm.scatter(nsnp, root=0)
        slave_offs = self.comm.scatter(offset, root=0)
        # Masks are scattered when in use and broadcast otherwise.
        send_masks = self.comm.scatter if self.usemask else self.comm.bcast
        slave_gmasks = send_masks(gmasks, root=0)
        slave_qnull = self.comm.bcast(qnull, root=0)
        self.comm.barrier()

        # Every rank (master included) works on its own share.
        pvals, qscores, p_jpa = self.slavejob(slave_geno, slave_expr,
                                              slave_nsnp, slave_offs,
                                              slave_gmasks, slave_qnull)

        # The receive buffer and the per-rank element counts exist on the
        # master only.
        recvbuf = None
        flat_sizes = None
        if on_master:
            self.logger.debug("Number of SNPs sent to each slave: " +
                              ", ".join(["{:d}".format(x) for x in nsnp]))
            ngene = self.gx.shape[0]
            flat_sizes = np.array([n * ngene for n in nsnp])
            recvbuf = np.zeros(sum(flat_sizes), dtype=np.float64)
        self.comm.Gatherv(sendbuf=pvals, recvbuf=(recvbuf, flat_sizes), root=0)

        if on_master:
            self._pvals = recvbuf.reshape(sum(nsnp), ngene)
            # FDR correction runs on the complete p-value matrix.
            if self.target_fdr is not None:
                self.get_fdr_each()

        qscores = self.comm.gather(qscores, root=0)
        p_jpa = self.comm.gather(p_jpa, root=0)

        if on_master:
            self._qscores = np.concatenate(qscores)
            if self.get_empirical_pvals:
                self._jpa_pvals = np.concatenate(p_jpa)
        else:
            assert qscores is None
            assert recvbuf is None
            assert p_jpa is None

        return

    def get_fdr_each(self):
        N_snp = self.gt.shape[0]
        N_gene = self.gx.shape[0]
        for i in range(N_snp):
            snp_gene_pval = list()
            for j in range(N_gene):
                if self.usemask and j in self.masks[i]:
                    print(f"for snp {i} skipped gene {j}")
                    continue  # skip pair, gene is masked
                else:
                    snp_gene_pval.append((i, j, self._pvals[i, j]))
            pass_fdr, adj_pvals = self.bh_procedure(snp_gene_pval,
                                                    self.target_fdr)
            self.pass_fdr = self.pass_fdr + pass_fdr
            self.adj_pvals = self.adj_pvals + adj_pvals
        return

    def get_fdr_all(self):
        N_snp = self.gt.shape[0]
        N_gene = self.gx.shape[0]
        snp_gene_pval = list()
        for i in range(N_snp):
            for j in range(N_gene):
                if self.usemask and j in self.masks[i]:
                    continue  # skip pair, gene is masked
                else:
                    snp_gene_pval.append((i, j, self._pvals[i, j]))
        self.pass_fdr, self.adj_pvals = self.bh_procedure(
            snp_gene_pval, self.target_fdr)
        return

    def bh_procedure(self, snp_gene_pval, target_fdr):
        self.logger.debug(
            "Calculating FDR ... sorting {:d} SNP-gene pairs".format(
                len(snp_gene_pval)))
        sorted_pairs = sorted(snp_gene_pval, key=lambda item: item[2])
        n_tests = len(
            sorted_pairs
        )  # NOT equivalent to ntrans-eqtls * ngenes, because ntrans is filtered
        pass_snps = list()
        bh_index_limit = -1
        for i, snp_pval in enumerate(sorted_pairs[::-1]):
            bh_factor = ((n_tests - i) / n_tests) * target_fdr
            if snp_pval[2] > bh_factor:
                continue
            else:
                bh_index_limit = n_tests - i - 1
                break
        if bh_index_limit < 0:
            self.logger.debug(
                "No significant SNP-gene pairs @ {:f} FDR for SNP".format(
                    target_fdr))
            return [], []
        else:
            pass_fdr = [sorted_pairs[i] for i in range(bh_index_limit + 1)]
            adj_pvals = [
                sorted_pairs[i][2] * (n_tests / (i + 1))
                for i in range(n_tests)
            ]  # equiv to report FDR
            return pass_fdr, adj_pvals[:len(pass_fdr)]

    def compute(self):
        if self.mpi:
            self.mpicompute()
        else:
            qnull = self.get_qnull()
            pvals, qscores, p_jpa = self.slavejob(self.gt, self.gx,
                                                  self.gt.shape[0], 0,
                                                  self.masks, qnull)
            self._pvals = pvals.reshape(self.gt.shape[0], self.gx.shape[0])
            # include FDR correction
            if self.target_fdr is not None:
                self.get_fdr_each()
            self._qscores = qscores
            if self.get_empirical_pvals:
                self._jpa_pvals = p_jpa
        return
Example #13
0
import numpy as np
from utils.logs import MyLogger

logger = MyLogger(__name__)


def load(qnull_file):
    """Read null Q-scores from a whitespace-delimited text file.

    Only the first column of every line is used; blank lines are skipped
    and non-finite values (inf, nan) are dropped.

    Returns a NumPy array with the finite scores.
    """
    qnull = list()
    with open(qnull_file, 'r') as mfile:
        for line in mfile:
            fields = line.split()
            if not fields:
                # A blank (e.g. trailing) line carries no score.
                continue
            qnull.append(float(fields[0]))
    qnull = np.array(qnull)
    qmod = qnull[np.isfinite(qnull)]
    logger.debug("Read {:d} null Q-scores".format(qmod.shape[0]))
    return qmod
Example #14
0
class ReadOxford:
    """Reader for a gzipped Oxford-format genotype file and its sample file.

    The genotype is read once, during construction; the sample file is read
    once, on first need.
    """

    # Guards that make the two read steps run only once per instance
    _read_samples_once = False
    _read_genotype_once = False
    _nloci = 0
    _nsample = 0

    # default is GTEx:
    #     - isdosage = True

    def __init__(self,
                 gtfile,
                 samplefile,
                 startsnp=0,
                 endsnp=1e15,
                 isdosage=True):
        """
        gtfile:     gzipped genotype file.
        samplefile: sample file (two header lines, one sample per line).
        startsnp, endsnp: lines with startsnp <= line number < endsnp are
                    read (line numbers start at 0).
        isdosage:   True when every sample has one dosage column, False when
                    it has three genotype frequency columns.
        """
        self.logger = MyLogger(__name__)
        self._gtfile = gtfile
        self._samplefile = samplefile
        self._startsnp = startsnp
        self._endsnp = endsnp
        self._isdosage = isdosage
        # Number of leading columns per line that precede the genotype values
        if self._isdosage:
            self._meta_columns = 6
        else:
            self._meta_columns = 5
        self._read_genotypes()

    @property
    def nsample(self):
        """Number of samples in the sample file (read on first access)."""
        self._read_samples()
        return self._nsample

    @property
    def samplenames(self):
        """First column of every sample line (read on first access)."""
        self._read_samples()
        return self._samplenames

    @property
    def nloci(self):
        """Number of SNP lines that were read."""
        return self._nloci

    @property
    def snpinfo(self):
        """Tuple with one SNP record per line that was read."""
        self._read_genotypes()
        return tuple(self._snpinfo)

    @property
    def dosage(self):
        """Tuple with one dosage row (array over samples) per SNP."""
        return tuple(self._dosage)

    @property
    def gtnorm(self):
        # NOTE(review): _gtnorm is not assigned anywhere in the visible part
        # of this class - confirm this property is still usable.
        return tuple(self._gtnorm)

    @property
    def gtcent(self):
        # NOTE(review): _gtcent is not assigned anywhere in the visible part
        # of this class - confirm this property is still usable.
        return tuple(self._gtcent)

    def _read_samples(self):
        if self._read_samples_once:
            return
        self._read_samples_once = True
        with open(self._samplefile, 'r') as samfile:
            sample = 0
            samplenames = list()
            next(samfile)
            next(samfile)
            for line in samfile:
                if re.search('^#', line):
                    continue
                sample += 1
                samplenames.append(line.strip().split()[0])
        self._nsample = sample
        self._samplenames = samplenames

    def _read_dosages(self):
        """Read the SNP lines in [startsnp, endsnp) from the genotype file.

        Each line holds `_meta_columns` leading columns followed by either
        one dosage per sample or three genotype frequencies (AA, AB, BB) per
        sample; in the latter case the dosage is 2 * BB + AB.

        Returns (list of SnpInfo records, 2-D array of dosages) and updates
        `self._nloci`. Raises the module's sample-number / genotype-frequency
        errors when a line has an unexpected number of columns.
        """
        dosage = list()
        allsnps = list()
        self.logger.info("Started reading genotype.")
        self._nloci = 0
        meta = self._meta_columns
        if not self._isdosage:
            # Positions of the AB and BB frequencies of every sample; they
            # are the same for every line, so build them once.
            inds_ab = np.arange(0, self._nsample) * 3 + 1
            inds_bb = inds_ab + 1

        with gzip.open(self._gtfile, 'r') as filereader:
            for linenum, snpline in enumerate(filereader):
                if linenum >= self._endsnp:
                    # Nothing past the requested window is used.
                    break
                if linenum < self._startsnp:
                    continue
                self._nloci += 1
                mline = snpline.split()

                ncols = len(mline) - meta
                if self._isdosage:
                    ngenotypes = ncols
                else:
                    ngenotypes, leftover = divmod(ncols, 3)
                    if leftover != 0:
                        self.logger.error(
                            'Number of columns in genotype frequencies not divisible by 3'
                        )
                        raise GT_FREQS_NUMBER_ERROR
                if ngenotypes != self._nsample:
                    self.logger.error(
                        'Number of samples differ from genotypes')
                    raise SAMPLE_NUMBER_ERROR

                values = np.array([float(x) for x in mline[meta:]])
                if self._isdosage:
                    snp_dosage = values
                else:
                    # [AA, AB, BB] := [0, 1, 2]
                    snp_dosage = 2 * values[inds_bb] + values[inds_ab]

                maf = sum(snp_dosage) / 2 / len(snp_dosage)
                # The file is opened in binary mode, so fields are bytes.
                try:
                    chrom = int(mline[0])
                except ValueError:
                    # Non-numeric chromosome label
                    chrom = -1
                this_snp = SnpInfo(chrom=chrom,
                                   bp_pos=int(mline[2]),
                                   varid=mline[1].decode("utf-8"),
                                   ref_allele=mline[3].decode("utf-8"),
                                   alt_allele=mline[4].decode("utf-8"),
                                   maf=maf)
                allsnps.append(this_snp)
                dosage.append(snp_dosage)
        return allsnps, np.array(dosage)

    def _read_genotypes(self):
        if self._read_genotype_once:
            return
        self._read_genotype_once = True
        self._read_samples()  # otherwise, self._nsample is not set
        allsnps, dosage = self._read_dosages()
        self.logger.info("Found {:d} SNPs of {:d} samples.".format(
            self._nloci, self._nsample))
        self._dosage = dosage
        self._snpinfo = allsnps
Example #15
0
class RevReg:
    """Driver for the reverse-regression statistics, with optional gene masks.

    The numerical work is delegated to ``crrstat`` (``perm_null`` /
    ``maf_null`` for the null-model statistics, ``crrbetas`` for the
    coefficients).  This class decides which SNP rows each process handles,
    optionally removes masked genes from the expression matrix before each
    call, and collects the results either serially or through the supplied
    MPI communicator.

    Parameters
    ----------
    x : array
        Genotype matrix; rows are SNPs (``x[snp_indices, :]`` is what gets
        passed on).  Stored as ``self.gt``.
    y : array
        Expression matrix; rows are genes.  Stored as ``self.gx``.
    sigbeta2 : array
        One value per SNP (indexed with the same SNP indices as ``x``).
    comm, rank, ncore
        MPI communicator, rank of this process and number of processes.
        MPI is used whenever ``ncore > 1``.
    null : {'perm', 'maf'}
        Null model.  ``'perm'`` uses the per-SNP variance of ``x`` as
        ``sigx2``; ``'maf'`` uses ones and additionally needs ``maf``.
    maf : array, optional
        Per-SNP values, only indexed when ``null == 'maf'``.
    masks : sequence, optional
        Mask objects exposing ``apply2`` (SNP indices), ``rmv_id`` (numpy
        array of gene indices to drop) and ``nsnp``.  ``None`` disables
        masking.

    Raises
    ------
    ValueError
        If ``null`` is not one of the supported null models.
    """

    _NULL_MODELS = ('perm', 'maf')

    def __init__(self,
                 x,
                 y,
                 sigbeta2,
                 comm,
                 rank,
                 ncore,
                 null='perm',
                 maf=None,
                 masks=None):
        # Fail early: an unknown model would otherwise leave self.sigx2 unset
        # and only surface later as an unrelated error inside compute().
        if null not in self._NULL_MODELS:
            raise ValueError(
                "Unknown null model {!r}; expected one of {}".format(
                    null, ", ".join(repr(m) for m in self._NULL_MODELS)))
        self.gt = x
        self.gx = y
        self.sigbeta2 = sigbeta2
        self.comm = comm
        self.rank = rank
        self.ncore = ncore
        self.null = null
        self.maf = maf
        self.masks = masks
        self.usemask = self.masks is not None
        self.mpi = self.ncore > 1
        self._pvals = None
        self._qscores = None
        self._mu = None
        self._sigma = None
        self._betas = None

        if self.null == 'perm':
            self.sigx2 = np.var(self.gt, axis=1)
        else:  # 'maf'
            self.sigx2 = np.ones(self.gt.shape[0])

        self.logger = MyLogger(__name__)

    @property
    def sb2(self):
        """Alias for ``sigbeta2``."""
        return self.sigbeta2

    @sb2.setter
    def sb2(self, value):
        self.sigbeta2 = value

    @property
    def pvals(self):
        """First output of the null-model routine (``None`` before compute)."""
        return self._pvals

    @property
    def scores(self):
        """Second output of the null-model routine (``None`` before compute)."""
        return self._qscores

    @property
    def null_mu(self):
        """Third output of the null-model routine (``None`` before compute)."""
        return self._mu

    @property
    def null_sigma(self):
        """Fourth output of the null-model routine (``None`` before compute)."""
        return self._sigma

    @property
    def betas(self):
        """Coefficients from ``crrstat.crrbetas`` (only filled on request)."""
        return self._betas

    def slavejob(self,
                 gt,
                 gx,
                 sb2,
                 sx2,
                 maf,
                 masks,
                 start,
                 end,
                 usemask,
                 get_betas=False):
        """Run the share of the work assigned to this process.

        With ``usemask`` the SNPs are taken from ``masks`` (via ``maskjob``);
        otherwise the contiguous SNP range ``[start, end)`` is processed.

        Returns
        -------
        tuple
            ``(p, q, mu, sig, b)``.  When ``usemask`` is set and ``masks`` is
            empty, four empty lists and an empty array are returned so the
            caller can still concatenate / gather the pieces.
        """
        if usemask:
            if len(masks) == 0:
                return [], [], [], [], np.array([])
            startsnp = min([min(x.apply2) for x in masks])
            endsnp = max([max(x.apply2) for x in masks])
            totsnp = sum(x.nsnp for x in masks)
            self.logger.debug(
                "Rank {:d} using {:d} masks on {:d} SNPs [{:d} to {:d}]".
                format(self.rank, len(masks), totsnp, startsnp, endsnp))
            stime = time.time()
            p, q, mu, sig, b = self.maskjob(gt, gx, sb2, sx2, maf, masks,
                                            get_betas)
            self.logger.debug("Rank {:d} took {:g} seconds".format(
                self.rank,
                time.time() - stime))
        else:
            self.logger.debug(
                "Rank {:d}. Using {:d} SNPs [{:d} to {:d}]".format(
                    self.rank, end - start, start, end - 1))
            applyon = np.arange(start, end)
            p, q, mu, sig, b = self.basejob(gt, gx, sb2, sx2, maf, applyon,
                                            get_betas)
        return p, q, mu, sig, b

    def maskjob(self, gt, gx, sb2, sx2, maf, masks, get_betas):
        """Run ``basejob`` once per mask and concatenate the results.

        For every mask the genes listed in ``mask.rmv_id`` are removed from
        ``gx`` and the statistics are computed for the SNPs in
        ``mask.apply2``.  When ``get_betas`` is set, the coefficients are
        padded back to the full gene count (zeros for removed genes) before
        being appended.

        Returns
        -------
        tuple of 1-D arrays
            ``(p, q, mu, sig, b)`` in mask order; ``b`` is empty unless
            ``get_betas`` is true.
        """
        ngenes = gx.shape[0]
        # Every list starts with an empty float array so the final result has
        # the same dtype promotion as repeatedly appending to np.array([]).
        p_parts = [np.array([])]
        q_parts = [np.array([])]
        mu_parts = [np.array([])]
        sig_parts = [np.array([])]
        b_parts = [np.array([])]
        for mask in masks:
            usegenes = np.ones(ngenes, dtype=bool)
            if mask.rmv_id.shape[0] > 0:
                usegenes[mask.rmv_id] = False
            masked_gx = np.ascontiguousarray(gx[usegenes])
            _p, _q, _mu, _sig, _b = self.basejob(gt, masked_gx, sb2, sx2, maf,
                                                 np.array(mask.apply2),
                                                 get_betas)
            p_parts.append(np.ravel(_p))
            q_parts.append(np.ravel(_q))
            mu_parts.append(np.ravel(_mu))
            sig_parts.append(np.ravel(_sig))

            if get_betas:
                # set beta value for masked genes to zero
                b_parts.append(self.reshape_masked_betas(_b, mask, ngenes))

        # Concatenate once instead of re-copying the arrays for every mask.
        p = np.concatenate(p_parts)
        q = np.concatenate(q_parts)
        mu = np.concatenate(mu_parts)
        sig = np.concatenate(sig_parts)
        b = np.concatenate(b_parts)
        return p, q, mu, sig, b

    def basejob(self, gt, gx, sb2, sx2, maf, applyon, get_betas):
        """Compute the statistics for the SNP indices in ``applyon``.

        Returns
        -------
        tuple
            ``(p, q, mu, sig, b)`` where the first four come from the
            ``crrstat`` null-model routine and ``b`` is the output of
            ``crrstat.crrbetas`` (or an empty list if ``get_betas`` is false).

        Raises
        ------
        ValueError
            If ``self.null`` is not a supported null model.
        """
        slv_gt = np.ascontiguousarray(gt[applyon, :])
        slv_sb2 = sb2[applyon]
        slv_sx2 = sx2[applyon]
        if self.null == 'perm':
            p, q, mu, sig = crrstat.perm_null(slv_gt, gx, slv_sb2, slv_sx2)
        elif self.null == 'maf':
            slv_maf = maf[applyon]
            p, q, mu, sig = crrstat.maf_null(slv_gt, gx, slv_sb2, slv_sx2,
                                             slv_maf)
        else:
            raise ValueError("Unknown null model {!r}".format(self.null))
        b = crrstat.crrbetas(slv_gt, gx, slv_sb2) if get_betas else []
        return p, q, mu, sig, b

    def reshape_masked_betas(self, b, mask, ngenes):
        """Pad the coefficients of a masked run back to ``ngenes`` columns.

        ``b`` holds one row per SNP in ``mask.apply2`` and one column per
        gene that was *not* removed.  The result has zeros in the columns of
        the removed genes and is returned flattened (row-major).
        """
        nsnps = len(mask.apply2)
        nmasked = len(mask.rmv_id)
        self.logger.debug(
            "Rank {:d}: reshaping {:d} betas into ({:d},{:d}) with {:d} masked genes out of {:d}"
            .format(self.rank, len(b), nsnps, (ngenes - nmasked), nmasked,
                    ngenes))
        _b = b.reshape(nsnps, ngenes - nmasked)
        paddedBeta = np.zeros((nsnps, ngenes))
        # Column indices of the genes that were kept.
        inv_ind = np.delete(np.arange(ngenes), mask.rmv_id)
        paddedBeta[:, inv_ind] = _b
        return paddedBeta.reshape(-1)

    def mpicompute(self, get_betas=False):
        """Distribute the computation over all MPI ranks.

        Rank 0 splits the work (masks via ``mpihelper.split_maskcomp`` or SNP
        ranges via ``mpihelper.split_n``), the inputs are broadcast from rank
        0, every rank runs ``slavejob`` on its share, and the results are
        gathered on rank 0 where they are stored on the instance.  Other
        ranks store nothing.
        """
        gmasks = None
        pstart = None
        pend = None
        geno = None
        expr = None
        sb2 = None
        sx2 = None
        maf = None
        if self.rank == 0:
            # this is the master
            # create a list of index for sending to your slaves
            if self.usemask:
                self.logger.debug(
                    "Masks on: " +
                    ", ".join(['{:d}'.format(x.nsnp) for x in self.masks]))
                gmasks = mpihelper.split_maskcomp(self.masks, self.ncore)
            else:
                pstart, pend = mpihelper.split_n(self.gt.shape[0], self.ncore)

            self.logger.debug("Get betas set to " + str(get_betas))
            geno = self.gt
            expr = self.gx
            sb2 = self.sigbeta2
            sx2 = self.sigx2
            maf = self.maf

        sb2 = self.comm.bcast(sb2, root=0)
        sx2 = self.comm.bcast(sx2, root=0)
        maf = self.comm.bcast(maf, root=0)
        expr = self.comm.bcast(expr, root=0)
        geno = self.comm.bcast(geno, root=0)
        if self.usemask:
            gmasks = self.comm.scatter(gmasks, root=0)
        else:
            pstart = self.comm.scatter(pstart, root=0)
            pend = self.comm.scatter(pend, root=0)
        self.comm.barrier()

        # ==================================
        # Data sent. Now do the calculations
        # ==================================
        pvals, qscores, mu, sigma, betas = self.slavejob(geno,
                                                         expr,
                                                         sb2,
                                                         sx2,
                                                         maf,
                                                         gmasks,
                                                         pstart,
                                                         pend,
                                                         self.usemask,
                                                         get_betas=get_betas)

        pvals = self.comm.gather(pvals, root=0)
        qscores = self.comm.gather(qscores, root=0)
        mu = self.comm.gather(mu, root=0)
        sigma = self.comm.gather(sigma, root=0)

        if get_betas:
            # Each rank may contribute a different number of coefficients, so
            # the counts are gathered first and used to size the Gatherv.
            recvbuf = None
            betalength = len(betas)
            self.comm.barrier()  # is it necessary?
            received_counts = self.comm.gather(betalength, root=0)
            if self.rank == 0:
                self.logger.debug(
                    "Number of coefficients from each node: {:s}".format(
                        ", ".join(['{:d}'.format(x)
                                   for x in received_counts])))
                recvbuf = np.zeros(np.sum(received_counts), dtype=np.float64)
            self.comm.Gatherv(sendbuf=betas,
                              recvbuf=(recvbuf, received_counts),
                              root=0)

        if self.rank == 0:
            self._pvals = np.concatenate(pvals)
            self._qscores = np.concatenate(qscores)
            self._mu = np.concatenate(mu)
            self._sigma = np.concatenate(sigma)
            if get_betas:
                self._betas = recvbuf.reshape(self.gt.shape[0],
                                              self.gx.shape[0])
                self.logger.debug(
                    "All nodes computed a total of {:d} pvalues and {:s} betas"
                    .format(len(self._pvals), str(self._betas.shape)))
        else:
            # gather() only delivers data to the root rank.
            assert qscores is None
            assert pvals is None
            assert mu is None
            assert sigma is None
        return

    def compute(self, get_betas=False):
        """Compute the statistics, via MPI when more than one core is used.

        Results are stored on the instance (see the read-only properties).
        With ``get_betas`` the coefficients are reshaped to
        ``(number of SNPs, number of genes)``.
        """
        if self.mpi:
            self.mpicompute(get_betas)
        else:
            start = 0
            end = self.gt.shape[0]
            self._pvals, self._qscores, self._mu, self._sigma, self._betas = self.slavejob(
                self.gt, self.gx, self.sigbeta2, self.sigx2, self.maf,
                self.masks, start, end, self.usemask, get_betas)
            if get_betas:
                self._betas = self._betas.reshape(self.gt.shape[0],
                                                  self.gx.shape[0])
        return
Example #16
0
from iotools import readqnull

# ----------------------------------
# Bring up MPI and find out who we are
# ----------------------------------
MPI.Init()
comm = MPI.COMM_WORLD
rank, ncore = comm.Get_rank(), comm.Get_size()
if rank == 0:
    # Only the master rank records the start time.
    start_time = time.time()

# ----------------------------------
# Command-line arguments and logging
# ----------------------------------
args = Args(comm, rank)
logger = MyLogger(__name__)

# Variables that are broadcast over all slave nodes.  They start out as None
# on every rank; the master fills them in below.
#   gtcent       Centered genotype (x - mu), not divided by the standard
#                deviation. Dimension I x N.
#   gtnorm       Centered and scaled genotype (x - mu) / sigma. Dimension I x N.
#   expr         Expression matrix. Dimension G x N.
#   tgene_gtnorm Centered and scaled genotype without KNN correction.
#                Dimension I x N.
#   tgene_expr   Expression matrix for finding target genes after RR-score.
#                Dimension G x N.
#   masklist     List of length I; each element lists the indices of the
#                cis-genes masked for a candidate SNP.
#   maskcomp     List of CisMasks; each element collects (a) gene indices to
#                be masked and (b) all SNP indices which require this mask.
#   maf          MAF of each SNP as observed in the sample (or read separately
#                from the population if a file is provided). Length I.
gtcent = gtnorm = expr = None
tgene_gtnorm = tgene_expr = None
masklist = maskcomp = None
maf = None

if rank == 0:
    logger.debug("Using {:d} cores".format(ncore))
    data = Data(args)
    data.load()
    gtcent = data.geno_centered
Example #17
0
class RevReg:
    """Driver for the reverse-regression statistics (no gene masking).

    The numerical work is delegated to ``crrstat.perm_null`` /
    ``crrstat.maf_null``.  This class only decides which contiguous range of
    SNP rows each process handles and collects the results, either serially
    or through the supplied MPI communicator.

    Parameters
    ----------
    x : array
        Genotype matrix; rows are split across processes.  Stored as
        ``self.gt``.
    y : array
        Expression matrix, passed on unchanged.  Stored as ``self.gx``.
    sigbeta2 : array
        One value per row of ``x``.
    comm, rank, ncore
        MPI communicator, rank of this process and number of processes.
        MPI is used whenever ``ncore > 1``.
    null : {'perm', 'maf'}
        Null model.  ``'perm'`` uses the per-row variance of ``x`` as
        ``sigx2``; ``'maf'`` uses ones and additionally needs ``maf``.
    maf : array, optional
        Per-row values, only sliced when ``null == 'maf'``.

    Raises
    ------
    ValueError
        If ``null`` is not one of the supported null models.
    """

    _NULL_MODELS = ('perm', 'maf')

    def __init__(self,
                 x,
                 y,
                 sigbeta2,
                 comm,
                 rank,
                 ncore,
                 null='perm',
                 maf=None):
        # Fail early: an unknown model would otherwise leave self.sigx2 unset
        # and only surface later as an unrelated error inside compute().
        if null not in self._NULL_MODELS:
            raise ValueError(
                "Unknown null model {!r}; expected one of {}".format(
                    null, ", ".join(repr(m) for m in self._NULL_MODELS)))
        self.gt = x
        self.gx = y
        self.sigbeta2 = sigbeta2
        self.comm = comm
        self.rank = rank
        self.ncore = ncore
        self.null = null
        self.maf = maf
        self.mpi = self.ncore > 1
        self._pvals = None
        self._qscores = None
        self._mu = None
        self._sigma = None

        if self.null == 'perm':
            self.sigx2 = np.var(self.gt, axis=1)
        else:  # 'maf'
            self.sigx2 = np.ones(self.gt.shape[0])

        self.logger = MyLogger(__name__)

    @property
    def pvals(self):
        """First output of the null-model routine (``None`` before compute)."""
        return self._pvals

    @property
    def scores(self):
        """Second output of the null-model routine (``None`` before compute)."""
        return self._qscores

    @property
    def null_mu(self):
        """Third output of the null-model routine (``None`` before compute)."""
        return self._mu

    @property
    def null_sigma(self):
        """Fourth output of the null-model routine (``None`` before compute)."""
        return self._sigma

    def slavejob(self, gt, gx, sb2, sx2, maf, start, end):
        """Compute the statistics for genotype rows ``[start, end)``.

        Returns
        -------
        tuple
            ``(p, q, mu, sig)`` as returned by the ``crrstat`` routine that
            matches ``self.null``.

        Raises
        ------
        ValueError
            If ``self.null`` is not a supported null model.
        """
        rows = slice(start, end)
        slv_gt = gt[rows, :]
        slv_sb2 = sb2[rows]
        slv_sx2 = sx2[rows]
        if self.null == 'perm':
            p, q, mu, sig = crrstat.perm_null(slv_gt, gx, slv_sb2, slv_sx2)
        elif self.null == 'maf':
            slv_maf = maf[rows]
            p, q, mu, sig = crrstat.maf_null(slv_gt, gx, slv_sb2, slv_sx2,
                                             slv_maf)
        else:
            raise ValueError("Unknown null model {!r}".format(self.null))
        return p, q, mu, sig

    def mpicompute(self):
        """Distribute the computation over all MPI ranks.

        Rank 0 sends every other rank an equally sized ``[start, end)`` range
        and keeps the last range (including the remainder) for itself.  The
        inputs are then broadcast from rank 0, every rank runs ``slavejob``
        and the results are gathered and stored on rank 0 only.
        """
        if self.rank == 0:
            # this is the master
            # create a list of index for sending to your slaves
            nmax = self.gt.shape[0] // self.ncore
            offset = 0
            for i in range(1, self.ncore):
                start = offset
                end = offset + nmax
                # Tags must match the ones used by the receiving rank below.
                self.comm.send(start, dest=i, tag=10 + 3 * i - 2)
                self.comm.send(end, dest=i, tag=10 + 3 * i - 1)
                offset += nmax
            # The master takes whatever is left after the equal chunks.
            start = offset
            end = self.gt.shape[0]
        else:
            start = self.comm.recv(source=0, tag=10 + self.rank * 3 - 2)
            end = self.comm.recv(source=0, tag=10 + self.rank * 3 - 1)

        if self.rank == 0:
            geno = self.gt
            expr = self.gx
            sb2 = self.sigbeta2
            sx2 = self.sigx2
            maf = self.maf
        else:
            geno = None
            expr = None
            sb2 = None
            sx2 = None
            maf = None

        sb2 = self.comm.bcast(sb2, root=0)
        sx2 = self.comm.bcast(sx2, root=0)
        maf = self.comm.bcast(maf, root=0)
        expr = self.comm.bcast(expr, root=0)
        geno = self.comm.bcast(geno, root=0)
        self.comm.barrier()

        # ==================================
        # Data sent. Now do the calculations
        # ==================================
        self.logger.debug(
            "Reporting from node {:d}. Start: {:d} and End: {:d}".format(
                self.rank, start, end))
        pvals, qscores, mu, sigma = self.slavejob(geno, expr, sb2, sx2, maf,
                                                  start, end)

        pvals = self.comm.gather(pvals, root=0)
        qscores = self.comm.gather(qscores, root=0)
        mu = self.comm.gather(mu, root=0)
        sigma = self.comm.gather(sigma, root=0)

        if self.rank == 0:
            self._pvals = np.concatenate(pvals)
            self._qscores = np.concatenate(qscores)
            self._mu = np.concatenate(mu)
            self._sigma = np.concatenate(sigma)
        else:
            # gather() only delivers data to the root rank.
            assert qscores is None
            assert pvals is None
            assert mu is None
            assert sigma is None
        return

    def compute(self):
        """Compute the statistics, via MPI when more than one core is used.

        Results are stored on the instance (see the read-only properties).
        """
        if self.mpi:
            self.mpicompute()
        else:
            start = 0
            end = self.gt.shape[0]
            pvals, qscores, mu, sigma = self.slavejob(self.gt, self.gx,
                                                      self.sigbeta2,
                                                      self.sigx2, self.maf,
                                                      start, end)
            self._pvals = pvals
            self._qscores = qscores
            self._mu = mu
            self._sigma = sigma
        return