def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG, dmatrix1):
    """Fit a parametric trend ``a / mean + b`` to the raw dispersions.

    A Gamma GLM with identity link is fitted on the rows whose raw
    dispersion lies strictly between the 1st and 99th percentile of the
    distinct raw dispersions of the converged rows.  The fitted trend is
    then evaluated for every row whose raw dispersion is not NaN.

    Parameters
    ----------
    counts : array, indexed by row; divided element-wise by ``sf``.
    disp_raw : array of raw dispersion estimates, one entry per row of
        ``counts`` (presumably a column vector -- confirm with caller).
    disp_conv : boolean array marking rows whose raw estimate converged.
    sf : divisor applied to ``counts`` (presumably per-sample size
        factors -- confirm with caller).
    CFG : dict; the keys ``'diagnose_plots'`` and ``'plot_dir'`` are read.
    dmatrix1 : only forwarded to the diagnostic plot.

    Returns
    -------
    tuple
        ``(disp_fitted, Lambda, idx)``: the fitted dispersions (a copy of
        ``disp_raw`` with non-NaN rows overwritten), the two GLM
        coefficients, and the row indices that were used for the fit.
    """
    # Row-wise mean of the scaled counts, kept as a column vector.
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]

    # Trimming bounds are derived from converged rows only ...
    converged_rows = sp.where(disp_conv)[0]
    distinct_disp = sp.unique(disp_raw[converged_rows])
    lowerBound = sp.percentile(distinct_disp, 1)
    upperBound = sp.percentile(distinct_disp, 99)

    # ... but are applied to all rows (NaN entries compare False and drop out).
    within_bounds = (disp_raw > lowerBound) & (disp_raw < upperBound)
    idx = sp.where(within_bounds)[0]

    # Design matrix with columns [1 / mean_count, 1].
    design = sp.ones((idx.shape[0], 2), dtype='float')
    design[:, 0] /= mean_count[idx].ravel()

    gamma_family = sm.families.Gamma(sm.families.links.identity)
    Lambda = sm.GLM(disp_raw[idx], design, family=gamma_family).fit().params

    # Evaluate the trend wherever a raw estimate exists; NaN rows stay NaN.
    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print("Found dispersion fit")

    if CFG['diagnose_plots']:
        plot_file = os.path.join(CFG['plot_dir'], 'dispersion_fitted.pdf')
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=plot_file,
                                CFG=CFG)

    return (disp_fitted, Lambda, idx)
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Estimate adjusted dispersions for every row of ``counts``.

    The prior variance is obtained from ``calculate_varPrior``; the per-row
    work is delegated to ``adjust_dispersion_chunk``, either in-process or
    spread over a ``multiprocessing`` pool when ``CFG['parallel'] > 1``.

    Parameters
    ----------
    counts : 2-D array; rows are processed in chunks of 30 in parallel mode.
    dmatrix1 : design matrix; ``shape[0] - shape[1]`` is used as the
        residual degrees of freedom.
    disp_raw, disp_fitted : per-row dispersion arrays, sliced per chunk.
    idx : forwarded to ``calculate_varPrior``.
    sf : forwarded unchanged to ``adjust_dispersion_chunk``.
    CFG : dict; the keys ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'`` are read here.

    Returns
    -------
    tuple
        ``(disp_adj, disp_adj_conv)``.  In parallel mode these are an
        ``(n_rows, 1)`` float array (NaN where nothing was written) and a
        boolean array of the same shape; in sequential mode they are the
        first two values returned by ``adjust_dispersion_chunk``.

    Notes
    -----
    On ``KeyboardInterrupt`` in parallel mode the process exits with
    status 1.  The worker pool is shut down on every exit path.
    """
    if CFG['verbose']:
        print('Start to estimate adjusted dispersions.')

    # (number of samples - number of coefficients) / 2.  The float divisor
    # forces true division: integer division would truncate odd degrees of
    # freedom and turn df == 1 into polygamma(1, 0) == inf.
    varLogDispSamp = polygamma(1, (dmatrix1.shape[0] - dmatrix1.shape[1]) / 2.0)
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)

    n_rows = counts.shape[0]

    if CFG['parallel'] > 1:
        disp_adj = sp.empty((n_rows, 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')

        # Workers ignore SIGINT so that only the parent handles Ctrl-C.
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, n_rows)) for x in range(0, n_rows, binsize)]

        try:
            result = [pool.apply_async(adjust_dispersion_chunk, args=(counts[cidx, :], dmatrix1, disp_raw[cidx], disp_fitted[cidx], varPrior, sf, CFG, cidx,)) for cidx in idx_chunks]
            res_cnt = 0
            # Collect in submission order; each result is
            # (values, convergence flags, row indices of the chunk).
            for pending in result:
                tmp = pending.get()
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # Runs on success, on Ctrl-C and on worker errors alike, so the
            # pool processes are never left behind.
            pool.terminate()
            pool.join()
    else:
        (disp_adj, disp_adj_conv, _) = adjust_dispersion_chunk(counts, dmatrix1, disp_raw, disp_fitted, varPrior, sf, CFG, sp.arange(n_rows), log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_adj,
                                matrix=dmatrix1,
                                figtitle='Adjusted Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_adjusted.pdf'),
                                CFG=CFG)

    return (disp_adj, disp_adj_conv)
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Estimate raw dispersions for every row of ``gene_counts``.

    The per-row work is delegated to ``estimate_dispersion_chunk``, either
    in-process or spread over a ``multiprocessing`` pool when
    ``CFG['parallel'] > 1``.

    Parameters
    ----------
    gene_counts : 2-D array; rows are processed in chunks of 30 in
        parallel mode.
    matrix : forwarded unchanged to ``estimate_dispersion_chunk`` and to
        the diagnostic plot.
    sf : forwarded unchanged to ``estimate_dispersion_chunk``.
    CFG : dict; the keys ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'`` are read here.

    Returns
    -------
    tuple
        ``(disp_raw, disp_raw_conv)``.  In parallel mode these are an
        ``(n_rows, 1)`` float array (NaN where nothing was written) and an
        ``(n_rows, 1)`` boolean array; in sequential mode they are the
        first two values returned by ``estimate_dispersion_chunk``.

    Notes
    -----
    On ``KeyboardInterrupt`` in parallel mode the process exits with
    status 1.  The worker pool is shut down on every exit path.
    """
    if CFG['verbose']:
        print('Estimating raw dispersions')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        disp_raw = sp.empty((n_rows, 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

        # Workers ignore SIGINT so that only the parent handles Ctrl-C.
        pool = mp.Pool(processes=CFG['parallel'], initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [sp.arange(x, min(x + binsize, n_rows)) for x in range(0, n_rows, binsize)]

        try:
            result = [pool.apply_async(estimate_dispersion_chunk, args=(gene_counts[idx, :], matrix, sf, CFG, idx,)) for idx in idx_chunks]
            res_cnt = 0
            # Collect in submission order; each result is
            # (values, convergence flags, row indices of the chunk).
            for pending in result:
                tmp = pending.get()
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # Runs on success, on Ctrl-C and on worker errors alike, so the
            # pool processes are never left behind.
            pool.terminate()
            pool.join()
    else:
        (disp_raw, disp_raw_conv, _) = estimate_dispersion_chunk(gene_counts, matrix, sf, CFG, sp.arange(n_rows), log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts,
                                disp=disp_raw,
                                matrix=matrix,
                                figtitle='Raw Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'], 'dispersion_raw.pdf'),
                                CFG=CFG)

    return (disp_raw, disp_raw_conv)
Exemple #4
0
def fit_dispersion(counts, disp_raw, disp_conv, sf, CFG, dmatrix1):
    """Fit a parametric trend ``a / mean + b`` to the raw dispersions.

    A Gamma GLM with identity link is fitted on the rows whose raw
    dispersion lies strictly between the 1st and 99th percentile of the
    distinct raw dispersions of the converged rows.  The fitted trend is
    then evaluated for every row whose raw dispersion is not NaN.

    Parameters
    ----------
    counts : array, indexed by row; divided element-wise by ``sf``.
    disp_raw : array of raw dispersion estimates, one entry per row of
        ``counts`` (presumably a column vector -- confirm with caller).
    disp_conv : boolean array marking rows whose raw estimate converged.
    sf : divisor applied to ``counts`` (presumably per-sample size
        factors -- confirm with caller).
    CFG : dict; the keys ``'diagnose_plots'`` and ``'plot_dir'`` are read.
    dmatrix1 : only forwarded to the diagnostic plot.

    Returns
    -------
    tuple
        ``(disp_fitted, Lambda, idx)``: the fitted dispersions (a copy of
        ``disp_raw`` with non-NaN rows overwritten), the two GLM
        coefficients, and the row indices that were used for the fit.
    """
    # Row-wise mean of the scaled counts, kept as a column vector.
    mean_count = sp.mean(counts / sf, axis=1)[:, sp.newaxis]

    # Trimming bounds are derived from converged rows only ...
    converged_rows = sp.where(disp_conv)[0]
    distinct_disp = sp.unique(disp_raw[converged_rows])
    lowerBound = sp.percentile(distinct_disp, 1)
    upperBound = sp.percentile(distinct_disp, 99)

    # ... but are applied to all rows (NaN entries compare False and
    # drop out).
    within_bounds = (disp_raw > lowerBound) & (disp_raw < upperBound)
    idx = sp.where(within_bounds)[0]

    # Design matrix with columns [1 / mean_count, 1].
    design = sp.ones((idx.shape[0], 2), dtype='float')
    design[:, 0] /= mean_count[idx].ravel()

    gamma_family = sm.families.Gamma(sm.families.links.identity)
    fit_result = sm.GLM(disp_raw[idx], design, family=gamma_family).fit()
    Lambda = fit_result.params

    # Evaluate the trend wherever a raw estimate exists; NaN rows stay NaN.
    disp_fitted = disp_raw.copy()
    ok_idx = sp.where(~sp.isnan(disp_fitted))[0]
    disp_fitted[ok_idx] = Lambda[0] / mean_count[ok_idx] + Lambda[1]

    if sp.sum(disp_fitted > 0) > 0:
        print("Found dispersion fit")

    if CFG['diagnose_plots']:
        plot_file = os.path.join(CFG['plot_dir'], 'dispersion_fitted.pdf')
        plot.mean_variance_plot(counts=counts,
                                disp=disp_fitted,
                                matrix=dmatrix1,
                                figtitle='Fitted Dispersion Estimate',
                                filename=plot_file,
                                CFG=CFG)

    return (disp_fitted, Lambda, idx)
Exemple #5
0
def adjust_dispersion(counts, dmatrix1, disp_raw, disp_fitted, idx, sf, CFG):
    """Estimate adjusted dispersions for every row of ``counts``.

    The prior variance is obtained from ``calculate_varPrior``; the per-row
    work is delegated to ``adjust_dispersion_chunk``, either in-process or
    spread over a ``multiprocessing`` pool when ``CFG['parallel'] > 1``.

    Parameters
    ----------
    counts : 2-D array; rows are processed in chunks of 30 in parallel mode.
    dmatrix1 : design matrix; ``shape[0] - shape[1]`` is used as the
        residual degrees of freedom.
    disp_raw, disp_fitted : per-row dispersion arrays, sliced per chunk.
    idx : forwarded to ``calculate_varPrior``.
    sf : forwarded unchanged to ``adjust_dispersion_chunk``.
    CFG : dict; the keys ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'`` are read here.

    Returns
    -------
    tuple
        ``(disp_adj, disp_adj_conv)``.  In parallel mode these are an
        ``(n_rows, 1)`` float array (NaN where nothing was written) and a
        boolean array of the same shape; in sequential mode they are the
        first two values returned by ``adjust_dispersion_chunk``.

    Notes
    -----
    On ``KeyboardInterrupt`` in parallel mode the process exits with
    status 1.  The worker pool is shut down on every exit path.
    """
    if CFG['verbose']:
        print('Start to estimate adjusted dispersions.')

    # (number of samples - number of coefficients) / 2.  The float divisor
    # forces true division: integer division would truncate odd degrees of
    # freedom and turn df == 1 into polygamma(1, 0) == inf.
    residual_df = dmatrix1.shape[0] - dmatrix1.shape[1]
    varLogDispSamp = polygamma(1, residual_df / 2.0)
    varPrior = calculate_varPrior(disp_raw, disp_fitted, idx, varLogDispSamp)

    n_rows = counts.shape[0]

    if CFG['parallel'] > 1:
        disp_adj = sp.empty((n_rows, 1))
        disp_adj.fill(sp.nan)
        disp_adj_conv = sp.zeros_like(disp_adj, dtype='bool')

        # Workers ignore SIGINT so that only the parent handles Ctrl-C.
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, n_rows))
            for x in range(0, n_rows, binsize)
        ]

        try:
            result = [
                pool.apply_async(adjust_dispersion_chunk,
                                 args=(
                                     counts[cidx, :],
                                     dmatrix1,
                                     disp_raw[cidx],
                                     disp_fitted[cidx],
                                     varPrior,
                                     sf,
                                     CFG,
                                     cidx,
                                 )) for cidx in idx_chunks
            ]
            res_cnt = 0
            # Collect in submission order; each result is
            # (values, convergence flags, row indices of the chunk).
            for pending in result:
                tmp = pending.get()
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_adj[j] = tmp[0][i]
                    disp_adj_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # Runs on success, on Ctrl-C and on worker errors alike, so the
            # pool processes are never left behind.
            pool.terminate()
            pool.join()
    else:
        (disp_adj, disp_adj_conv,
         _) = adjust_dispersion_chunk(counts,
                                      dmatrix1,
                                      disp_raw,
                                      disp_fitted,
                                      varPrior,
                                      sf,
                                      CFG,
                                      sp.arange(n_rows),
                                      log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=counts,
                                disp=disp_adj,
                                matrix=dmatrix1,
                                figtitle='Adjusted Dispersion Estimate',
                                filename=os.path.join(
                                    CFG['plot_dir'],
                                    'dispersion_adjusted.pdf'),
                                CFG=CFG)

    return (disp_adj, disp_adj_conv)
Exemple #6
0
def estimate_dispersion(gene_counts, matrix, sf, CFG):
    """Estimate raw dispersions for every row of ``gene_counts``.

    The per-row work is delegated to ``estimate_dispersion_chunk``, either
    in-process or spread over a ``multiprocessing`` pool when
    ``CFG['parallel'] > 1``.

    Parameters
    ----------
    gene_counts : 2-D array; rows are processed in chunks of 30 in
        parallel mode.
    matrix : forwarded unchanged to ``estimate_dispersion_chunk`` and to
        the diagnostic plot.
    sf : forwarded unchanged to ``estimate_dispersion_chunk``.
    CFG : dict; the keys ``'verbose'``, ``'parallel'``, ``'diagnose_plots'``
        and ``'plot_dir'`` are read here.

    Returns
    -------
    tuple
        ``(disp_raw, disp_raw_conv)``.  In parallel mode these are an
        ``(n_rows, 1)`` float array (NaN where nothing was written) and an
        ``(n_rows, 1)`` boolean array; in sequential mode they are the
        first two values returned by ``estimate_dispersion_chunk``.

    Notes
    -----
    On ``KeyboardInterrupt`` in parallel mode the process exits with
    status 1.  The worker pool is shut down on every exit path.
    """
    if CFG['verbose']:
        print('Estimating raw dispersions')

    n_rows = gene_counts.shape[0]

    if CFG['parallel'] > 1:
        disp_raw = sp.empty((n_rows, 1), dtype='float')
        disp_raw.fill(sp.nan)
        disp_raw_conv = sp.zeros((n_rows, 1), dtype='bool')

        # Workers ignore SIGINT so that only the parent handles Ctrl-C.
        pool = mp.Pool(processes=CFG['parallel'],
                       initializer=lambda: sig.signal(sig.SIGINT, sig.SIG_IGN))
        binsize = 30
        idx_chunks = [
            sp.arange(x, min(x + binsize, n_rows))
            for x in range(0, n_rows, binsize)
        ]

        try:
            result = [
                pool.apply_async(estimate_dispersion_chunk,
                                 args=(
                                     gene_counts[idx, :],
                                     matrix,
                                     sf,
                                     CFG,
                                     idx,
                                 )) for idx in idx_chunks
            ]
            res_cnt = 0
            # Collect in submission order; each result is
            # (values, convergence flags, row indices of the chunk).
            for pending in result:
                tmp = pending.get()
                for i, j in enumerate(tmp[2]):
                    if CFG['verbose']:
                        log_progress(res_cnt, n_rows)
                        res_cnt += 1
                    disp_raw[j] = tmp[0][i]
                    disp_raw_conv[j] = tmp[1][i]
            if CFG['verbose']:
                log_progress(n_rows, n_rows)
                print('')
        except KeyboardInterrupt:
            sys.stderr.write('Keyboard Interrupt - exiting\n')
            sys.exit(1)
        finally:
            # Runs on success, on Ctrl-C and on worker errors alike, so the
            # pool processes are never left behind.
            pool.terminate()
            pool.join()
    else:
        (disp_raw, disp_raw_conv,
         _) = estimate_dispersion_chunk(gene_counts,
                                        matrix,
                                        sf,
                                        CFG,
                                        sp.arange(n_rows),
                                        log=CFG['verbose'])

    if CFG['diagnose_plots']:
        plot.mean_variance_plot(counts=gene_counts,
                                disp=disp_raw,
                                matrix=matrix,
                                figtitle='Raw Dispersion Estimate',
                                filename=os.path.join(CFG['plot_dir'],
                                                      'dispersion_raw.pdf'),
                                CFG=CFG)

    return (disp_raw, disp_raw_conv)