Example #1
def compareDensity(data1, data2):
    ''' Compare two multi-dimensional arrays by the
    Wasserstein metric (https://en.wikipedia.org/wiki/Wasserstein_metric).
    The input data should have outliers removed before applying this function.
    The multi-dimensional input data is projected onto multiple random
    directions, and the Wasserstein metric is computed on each projected
    result. This function returns the averaged metric and its standard error.

    Parameters
    ----------
        data1: the first multi-dimensional dataset. Each row is
                an observation. Each column is a covariate.
        data2: the second multi-dimensional dataset.

    Outputs
    -------
        mu, sigma: the average discrepancy measure and its standard error.

    '''
    K = 2000  # number of trial random projections (fixed here)
    result = np.zeros(K)
    pCovariate = data1.shape[1]
    for i in pbar(range(K)):
        # random projection onto one dimension
        transMat = np.random.normal(size=(pCovariate, 1))
        transMat = transMat / np.linalg.norm(transMat, 'fro')
        data1_proj = data1 @ transMat
        data2_proj = data2 @ transMat
        # record the discrepancy on the projected dimension
        # between the two datasets.
        result[i] = wass1dim(data1_proj, data2_proj)
    return result.mean(), result.std() / np.sqrt(K)
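The snippet above assumes a helper `wass1dim` that returns the one-dimensional Wasserstein distance between the two projected samples. A minimal sketch of such a helper, built on `scipy.stats.wasserstein_distance` (an assumption; the original helper is not shown):

import numpy as np
from scipy.stats import wasserstein_distance

def wass1dim(x, y):
    """1-D Wasserstein distance between two projected samples."""
    # flatten the (n, 1) projections into plain 1-D arrays
    return wasserstein_distance(np.ravel(x), np.ravel(y))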
Example #2
    def plot_MLE_mu_distributions(self):
        sample_sizes = [50, 100, 500, 1000, 5000]
        n = 400
        estimates = [[self.get_loglikelihood_estimate_for_mu(self.sample(sample_size))
                      for _ in range(n)]
                     for sample_size in pbar(sample_sizes)]

        plt.figure(figsize=(12, 4))
        plt.subplot(121)
        r = np.r_[[[np.mean(i), np.std(i)] for i in estimates]]
        plt.plot(r[:, 0], lw=3, label=r"mean MLE $\mu$")
        plt.fill_between(range(len(r)),
                         r[:, 0] + r[:, 1],
                         r[:, 0] - r[:, 1],
                         color="blue",
                         alpha=.1,
                         label=r"$\pm$1 std MLE $\mu$")
        plt.axhline(self.mu,
                    color="red",
                    alpha=.4,
                    label=r"true $\mu=%.2f$" % self.mu)
        plt.grid()
        plt.xticks(range(len(sample_sizes)), sample_sizes)
        plt.xlabel("data size")
        plt.ylabel("MLE $\mu$")
        plt.legend()

        plt.subplot(122)
        for size, i in zip(sample_sizes, estimates):
            plt.plot(*kdensity_smoothed_histogram(i),
                     label="data size = %d" % size)
        plt.grid()
        plt.legend()
        plt.title("smoothed histograms for estimated $\mu$")
        plt.xlabel("MLE $\mu$")
        plt.xlim(0, self.mu * 2)
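Example #2 assumes a `kdensity_smoothed_histogram` helper that returns an `(x, density)` pair for plotting. A plausible minimal version using a Gaussian KDE (an assumption; the original implementation is not shown):

import numpy as np
from scipy.stats import gaussian_kde

def kdensity_smoothed_histogram(values, n_points=200):
    """Return (x, density) for a KDE-smoothed histogram of `values`."""
    values = np.asarray(values, dtype=float)
    kde = gaussian_kde(values)
    x = np.linspace(values.min(), values.max(), n_points)
    return x, kde(x)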
Example #3
 def rvs(self, n_samples):
     """
     samples the histograms distribution: n = s + b
     """
     nl, sl, bl = [], [], []
     be = self.bin_edges
     iterator = pbar(
         range(n_samples)) if n_samples > 1 else range(n_samples)
     for _ in iterator:
         sample_s = self.s.rvs(self.stot)
         sample_b = self.b.rvs(self.btot)
         ks = np.r_[[np.sum((sample_s > be[i]) & (sample_s < be[i + 1]))
                     for i in range(len(be) - 1)]]
         kb = np.r_[[np.sum((sample_b > be[i]) & (sample_b < be[i + 1]))
                     for i in range(len(be) - 1)]]
         nl.append(self.mu * ks + kb)
         sl.append(ks)
         bl.append(kb)
     nl = pd.DataFrame(np.r_[nl])
     nl.index.name = "sample_nb"
     nl.columns.name = "bin_nb"
     sl = pd.DataFrame(np.r_[sl])
     sl.index.name = "sample_nb"
     sl.columns.name = "bin_nb"
     bl = pd.DataFrame(np.r_[bl])
     bl.index.name = "sample_nb"
     bl.columns.name = "bin_nb"
     return nl, sl, bl
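The per-bin counting above can be written more compactly with `np.histogram`, which uses half-open bins [be[i], be[i+1]) instead of the strict inequalities in the loop; a small self-contained comparison:

import numpy as np

be = np.linspace(0.0, 1.0, 6)                    # example bin edges
sample_s = np.random.default_rng(0).random(100)  # toy sample

# np.histogram includes values that fall exactly on a bin's left edge,
# which the strict comparisons above would drop
ks, _ = np.histogram(sample_s, bins=be)
print(ks, ks.sum())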
Example #4
import numpy as np
from scipy.interpolate import griddata

def radialfilter(xp, yp, zp, xb, yb, r=1.0, method='linear'):
    n = len(zp)
    D = np.zeros((n, n))
    pb = pbar(maxval=n * (n - 1) // 2)
    pb.start()
    j = 0
    for i in range(n):
        for k in range(i):
            pb.update(j)
            j += 1
            dik = np.sqrt((xp[i]-xp[k])**2+(yp[i]-yp[k])**2)
            D[i, k] = D[k, i] = dik

    pb.finish()
    # keep points whose z-value lies at or below the local median
    # within radius r (only where there are more than 5 neighbours)
    t = np.zeros(n, dtype='bool')
    for i in range(n):
        di = D[i, :]
        b = di <= r
        if len(zp[b]) > 5:
            m = np.median(zp[b])
            t |= b & (zp <= m)

    # grid center
    dbx, dby = abs(xb[0]-xb[1]), abs(yb[0]-yb[1])
    x, y = xb[:-1] + dbx/2, yb[:-1] + dby/2
    X, Y = np.meshgrid(x, y)

    return griddata((xp[t], yp[t]), zp[t], (X, Y), method = method)
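A hedged usage sketch for `radialfilter` on synthetic scattered points, assuming the snippet's `pbar` is the legacy `progressbar.ProgressBar` (which accepts the `maxval` keyword the function uses):

import numpy as np
from progressbar import ProgressBar as pbar  # supports the legacy maxval keyword

rng = np.random.default_rng(42)
xp = rng.uniform(0, 10, 500)
yp = rng.uniform(0, 10, 500)
zp = np.sin(xp) + 0.1 * rng.standard_normal(500)

xb = np.linspace(0, 10, 21)  # grid bin edges
yb = np.linspace(0, 10, 21)
Z = radialfilter(xp, yp, zp, xb, yb, r=1.5)
print(Z.shape)  # (20, 20); NaN where the grid lies outside the kept points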
Example #5
    def _compute_null_dist(self, iterations=500):
        '''Compute the permutation null distribution of MMD2u.
        '''
        mmd2u_null = np.zeros(iterations)
        for i in pbar(range(iterations)):
            idx = np.random.permutation(self._n1 + self._n2)
            XY_i = self._XY[idx, :]
            mmd2u_null[i] = self._MMD2ufast(XY_i[:self._n1, :], XY_i[self._n1:, :], self._scale)

        return mmd2u_null
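With the null distribution in hand, the two-sample test p-value is typically the fraction of null statistics at least as large as the observed MMD2u. A minimal sketch, assuming `mmd2u_obs` holds the statistic computed on the original (unpermuted) split:

import numpy as np

def mmd2u_p_value(mmd2u_obs, mmd2u_null):
    """Right-tailed permutation p-value with the usual +1 correction."""
    return (1.0 + np.sum(mmd2u_null >= mmd2u_obs)) / (1.0 + len(mmd2u_null))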
Example #6
 def fit(self, x, n_steps=5000, use_pbar=False):
     assert self.use_tf, "must set use_tf=True"
     self.history_lastbatch = []
     it = range(n_steps)
     it = pbar(it) if use_pbar else it
     for epoch in it:
         loss_value = self.train_step(x)
         if np.isnan(loss_value):
             break
         self.history_lastbatch.append(loss_value.numpy())
         self.history.append(loss_value.numpy())
Example #7
 def parse_content_from_db(self):
     for ayat in pbar(Ayat.objects.all()):
         soup = get_soup(ayat.html)
          ayat.content = "".join(self.get_content(soup)).replace(
              # FIXME: fix the text cleanup
              "Ссылки на богословские первоисточники и комментарий:",
              "")
         ayat.arab_text = self.get_arab_text(soup)
         ayat.trans = self.get_transcription(soup)
         logger.debug(f"{ayat.arab_text=}")
         ayat.save()
Example #8
    def plot_curvature_for_2D_input_flat_manifold(self, x_range=(-1, 1), y_range=(-1, 1), n_points=1000, batch_size=20, remove_percentile=3):
        """
        plots the curvature of the output manifold of transformed 2D data.
        the output manifold can have any dimension, but curvature will be plotted
        in the flat input 2D space.
        """
        assert self.model.input.shape[1] == 2, "model must have 2D input only"
        from sklearn.preprocessing import MinMaxScaler
        # copy the ranges so the padding below cannot mutate the caller's
        # (or the default) argument values
        x_range, y_range = list(x_range), list(y_range)
        x_range[0] -= np.abs(x_range[0]) * .2
        x_range[1] += np.abs(x_range[1]) * .2
        y_range[0] -= np.abs(y_range[0]) * .2
        y_range[1] += np.abs(y_range[1]) * .2
        z = []
        xy = []
        for i in pbar(range(n_points//batch_size)):
            data = (np.random.random(size=(batch_size, 2))-.5)*10
            data[:,0] = MinMaxScaler(feature_range=x_range).fit_transform(data[:,0].reshape(-1,1))[:,0]
            data[:,1] = MinMaxScaler(feature_range=y_range).fit_transform(data[:,1].reshape(-1,1))[:,0]
            tu = tf.Variable(data, dtype=np.float32)
            try:
                r = self.get_ricci_scalar(tu).numpy()
                z.append(r)
                xy.append(tu.numpy())
            except Exception:
                # first call might fail in TF
                pass
        z = np.r_[z].flatten()

        # remove x% on top and bottom of distribution since outliers distort graphs
        zmin,zmax = np.percentile(z, [remove_percentile,100-remove_percentile])
        keep_idxs = (z>=zmin)&(z<=zmax)
        z = z[keep_idxs]

        xy = np.vstack(xy)[keep_idxs]
        x,y = xy[:,0], xy[:,1]


        fig = plt.figure(figsize=(13,3.5))
        
        ax1 = plt.subplot(121)
        ax1.tricontour(x, y, z, levels=14, linewidths=0.5, colors='k')
        cntr1 = ax1.tricontourf(x, y, z, levels=14, cmap="RdBu_r")
        fig.colorbar(cntr1, ax=ax1)
        ax1.plot(x, y, 'ko', ms=3, alpha=.1)
        ax1.set_title("Ricci curvature in transformed data space")

        ax2 = plt.subplot(122)
        ax2.hist(z, bins=30)
        ax2.set_title("distribution of Ricci curvature")
        return ax1
Example #9
def fit_all_splines(expr, pool=None, progress=False):
    xs = get_xs(expr)
    is_good = (expr.isnull().sum() == 0)

    out = {}
    if progress:
        pb = pbar()
    else:
        pb = lambda x: x

    if pool is True:
        close = True
        pool = Pool()
    elif pool is None:
        for gene in pb(expr.index):
            # centered 3-point rolling mean; pd.rolling_mean and .ix were
            # removed from pandas, so use the modern equivalents
            expr_smooth = expr.loc[gene].rolling(3, center=True,
                                                 min_periods=1).mean()
            is_good = ~expr_smooth.isnull()
            out[gene] = interpolate.UnivariateSpline(xs[is_good],
                                                     expr_smooth[is_good])
        return out
    else:
        close = False

    asyncs = {}
    for gene in expr.index:
        expr_smooth = expr.loc[gene].rolling(3, center=True,
                                             min_periods=1).mean()
        is_good = ~expr_smooth.isnull()
        asyncs[gene] = pool.apply_async(interpolate.UnivariateSpline,
                                        (xs[is_good], expr_smooth[is_good]))

    for gene in pb(asyncs):
        res = asyncs[gene]
        out[gene] = res.get()
    if close:
        pool.close()
    return out
Example #10
    def plot_bins_distributions(self, dataset_size, n_datasets, bin_edges):
        k = [
            self.get_bins_counts(self.sample(dataset_size), bin_edges)
            for _ in pbar(range(n_datasets))
        ]
        k = pd.DataFrame(k)
        k.index.name = "dataset_nb"
        k.columns.name = "bin_nb"

        plt.figure(figsize=(2.5 * len(k.columns), 2.5))
        for i, col in enumerate(k.columns):
            plt.subplot(1, len(k.columns), i + 1)
            plot_kdensity_smoothed_histogram(k[col].values,
                                             plot_equivalent_poisson=True)
            plt.title("bin %d\n$M_{\gamma \gamma} \in [%.2f, %.2f]$" %
                      (col, bin_edges[i], bin_edges[i + 1]))
            plt.yticks([])
            plt.xlabel("nb events")
            if i == len(k.columns) // 2:
                plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.25))
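`plot_kdensity_smoothed_histogram` with `plot_equivalent_poisson=True` presumably overlays the Poisson pmf whose mean matches the bin counts; a minimal sketch of such a helper (an assumption; the original is not shown):

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde, poisson

def plot_kdensity_smoothed_histogram(values, plot_equivalent_poisson=False):
    """Plot a KDE-smoothed histogram, optionally with a matched Poisson pmf."""
    values = np.asarray(values, dtype=float)
    x = np.linspace(values.min(), values.max(), 200)
    plt.plot(x, gaussian_kde(values)(x), label="smoothed histogram")
    if plot_equivalent_poisson:
        k = np.arange(int(values.min()), int(values.max()) + 1)
        plt.plot(k, poisson.pmf(k, values.mean()), label="equivalent Poisson")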
Example #11
    def build_ts_distribution(self, n_events, n_experiments, show_pbar=True):
        from progressbar import progressbar as pbar

        pbar = pbar if show_pbar else list
        s = [self.rvs(n_events) for _ in range(n_experiments)]
        self.ts = np.r_[[-self.likelihood(i) for i in pbar(s)]]
Example #12
                                     'melXsim_cyc14C_rep2',
                                     'melXsim_cyc14C_rep3',
                                     'simXmel_cyc14C_rep1',
                                     'simXmel_cyc14C_rep2'])


    xs = np.linspace(0, 1, 20, endpoint=True)
    avgs = pd.DataFrame(index=hyb_spatial_difference.index,
                        columns=['avg_sl{}'.format(i+1) for i in range(20)])

    if args.multi:
        with Pool() as p:
            results = (p.apply_async(get_diffs, (expr.ix[gene], mel_splines[gene],
                                                 sim_splines[gene], avgs.columns))
                      for gene in hyb_spatial_difference.index)
            for gene in pbar()(hyb_spatial_difference.index):

                res = next(results).get()
                #res = get_diffs(expr.ix[gene], mel_splines[gene], sim_splines[gene],
                                #avgs.columns)
                (hyb_hyb_diffs[gene],
                 parental_diffs[gene],
                 mel_hyb_diffs[gene],
                 sim_hyb_diffs[gene],
                 avgs.ix[gene],
                 avg_hyb_diffs[gene],
                 avg_levels[gene], hyb_levels.ix[gene],
                 within_diffs_mXs[gene],
                 within_diffs_sXm[gene],
                ) = res
                hyb_spatial_difference[gene] = (avg_hyb_diffs[gene]
Example #13
                                 set(all_changes[target_gene].keys()))
                    comb = comb + ('const', )
                    if 'bcdP' in comb and 'bcdP2' not in comb:
                        comb = comb + ('bcdP2', )

                    X_tmp = (atlas_expr.ix[in_central, comb, time_point]
                             .T.copy()
                             .dropna(how='all', axis=1)
                            )
                    comb = tuple(X_tmp.columns)
                    if comb in pool: continue
                    pool[comb] = p.apply_async(fit_model, (X_tmp, Y_tmp, co))
                outs = {}
                pr2 = pd.Series(index=pool.keys(), data=np.nan)
                llrs = pd.Series(index=pool.keys(), data=np.nan)
                for comb in pbar()(pool):
                    outs[comb] = pool[comb].get()
                    pr2[comb] = outs[comb].prsquared
                    llrs[comb] = outs[comb].llr
            best_tfs = pr2.sort_values().index[-1]
            best_model = outs[best_tfs]

        print(best_model.summary().as_text())
        best_X = atlas_expr.ix[:, best_tfs, time_point].T

        small_atlas['in_central'] = in_central
        small_atlas['color'] = [
            'b' if not ic else 'k' if yy > co else 'w'
            for ic, yy in zip(small_atlas.in_central, small_atlas.c)
        ]
        for tf in best_tfs:
Example #14
    lib_size = get_lib_size(args.reads)
    ase_vals = {}
    if False and args.max_jobs != 1:
        # Early experiments suggest this doesn't actually make things faster, so
        # the `if False` automatically skips this branch.  But if someone later
        # wants to put it back in, it should be easy...
        with Pool(args.max_jobs or cpu_count()) as pool:
            for gene in gene_coords:
                ase_vals[gene] = pool.apply_async(get_ase_by_coords, (
                    gene_coords[gene][0],
                    gene_coords[gene][1],
                    reads,
                    snp_dict,
                ))

            prog = pbar()
            for gene in prog(ase_vals):
                ase_vals[gene] = ase_vals[gene].get()
            if 'finish' in dir(prog):
                prog.finish()
    else:
        prog = pbar()
        for gene in prog(gene_coords):
            if gene_coords[gene][0] not in snp_dict.keys():
                continue
            ase_vals[gene] = get_ase_by_coords(gene_coords[gene][0],
                                               gene_coords[gene][1], reads,
                                               snp_dict)
        if 'finish' in dir(prog):
            prog.finish()
    print("# Library size: " + str(lib_size), file=args.outfile, end='\n')
 def _parse_prayer_times_for_city(self):
     self._get_csv_file()
     csv_reader = csv.reader(self.csv_file.splitlines(), delimiter=";")
     for row in pbar(csv_reader):
         self._set_prayers_to_city(row)
Example #16
crops_coordinates_sequential = crops_coordinates.reshape(-1, 3, 2, order='F')  

logger.debug(f"{crops_coordinates_sequential.shape=}\n{crops_coordinates_sequential[0]=} ")


# # Crops (if `debug__materialize_crops`)

# In[ ]:


if debug__materialize_crops:
    logger.info("Materializing crops")
    
    crops_sequential = np.array([
        data_volume[tuple(slice(*coords_) for coords_ in coords)]
        for coords in pbar(crops_coordinates_sequential, max_value=crops_coordinates_sequential.shape[0])
    ])
    logger.debug(f"{crops_sequential.shape=}")

    crops_target_shape = list(crops_coordinates.shape[:3]) + list(crop_shape)
    logger.debug(f"{crops_target_shape=}")

    # 'F' reshapes with x varying fastest and z slowest
    # this option is necessary because `crops_coordinates` was reshaped with it
    crops = crops_sequential.reshape(crops_target_shape, order="F")
    del crops_sequential
    logger.debug(f"{crops.shape=}")
    
    if debug__save_processed_crops:
        fname = estimation_volume.debug__crops_coordinates_path
        logger.info(f"Saving crops coordinates at {fname=}")
Example #17
    # [model]
    modelin_target_shape = (batch_size_, crop_shape[0], crop_shape[1], crop_shape[2], 1)  # adjust nb. channels
    batch_probas = model.predict(
        batch_data.reshape(modelin_target_shape), 
        batch_size=batch_size_,
        steps=1,
    ).astype(args.probabilities_dtype)

    for slice_, crop_proba in zip(batch_slices, batch_probas):
        proba_volume[slice_] += crop_proba.reshape(crop_probas_target_shape)
        redundancies_count[slice_] += np.ones(crop_shape, dtype=int)
        
logger.debug("Predicting and summing up the crops' probabilities.")
for batch_idx in pbar(
    range(niterations), 
    prefix="predict-and-sum-probas", 
    max_value=niterations
):
    batch_start = batch_idx * batch_size
    process_batch(batch_start, batch_size)

if last_batch_size > 0:
    logger.info("Segmenting the last batch")
    batch_start = niterations * batch_size
    process_batch(batch_start, last_batch_size)


# In[ ]:


del data_volume
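Overlapping crops are predicted several times, so after the loop the accumulated probabilities presumably get divided by the redundancy counts. A toy sketch of that normalization step (the array names mirror the snippet; the shapes here are stand-ins):

import numpy as np

proba_volume = np.random.rand(4, 4, 4).astype(np.float32)
redundancies_count = np.random.randint(0, 3, size=(4, 4, 4))

# average the overlapping predictions; leave uncovered voxels untouched
covered = redundancies_count > 0
proba_volume[covered] /= redundancies_count[covered]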
Example #18
                                  'melXsim_cyc14C_rep1', 'melXsim_cyc14C_rep2',
                                  'melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep1',
                                  'simXmel_cyc14C_rep2'
                              ])

    xs = np.linspace(0, 1, 20, endpoint=True)
    avgs = pd.DataFrame(index=hyb_spatial_difference.index,
                        columns=['avg_sl{}'.format(i + 1) for i in range(20)])

    if args.multi:
        with Pool() as p:
            results = (p.apply_async(get_diffs,
                                     (expr.ix[gene], mel_splines[gene],
                                      sim_splines[gene], avgs.columns))
                       for gene in hyb_spatial_difference.index)
            for gene in pbar()(hyb_spatial_difference.index):

                res = next(results).get()
                #res = get_diffs(expr.ix[gene], mel_splines[gene], sim_splines[gene],
                #avgs.columns)
                (
                    hyb_hyb_diffs[gene],
                    parental_diffs[gene],
                    mel_hyb_diffs[gene],
                    sim_hyb_diffs[gene],
                    avgs.ix[gene],
                    avg_hyb_diffs[gene],
                    avg_levels[gene],
                    hyb_levels.ix[gene],
                    within_diffs_mXs[gene],
                    within_diffs_sXm[gene],
                ) = res
Example #19
    best_r2s = pd.Series.from_csv('analysis/results/svase_best', sep='\t')
    expr = pd.read_table('godot/summary.tsv', **ut.pd_kwargs).drop('---', axis=1)
    mel_tss, sim_tss = get_ortholog_TSS_data()

    has_svase = (ut.true_index(best_r2s.sort_values(ascending=False) > .25)
                 .intersection(mel_tss.keys()))
    no_svase = (best_r2s.index[best_r2s < .01]
                .intersection(expr.index)
                .intersection(mel_tss.keys()))
    median_expr = expr.T.median()

    num_sim_expr = pd.Series(index=has_svase, data=-1)
    already_used = set()
    best_match = pd.DataFrame(index=has_svase, data={'gene': '', 'emd': 1.0})
    p = Pool()
    for gene in pbar()(has_svase):
        similar_expr = ut.true_index((.5 * median_expr[gene] < median_expr[no_svase])
                                     & (median_expr[no_svase] < 2 * median_expr[gene]))
        similar_expr = similar_expr.difference(already_used)
        diff_jobs = {
            target: p.apply_async(dd.earth_mover_multi,
                                  (expr.loc[gene], expr.loc[target]))
            for target in similar_expr
        }
        pattern_diffs = pd.Series({target: diff_jobs[target].get()
                                   for target in similar_expr}).sort_values()
        best_match.loc[gene, 'gene'] = pattern_diffs.index[0]
        best_match.loc[gene, 'emd'] = pattern_diffs[0]
        already_used.add(pattern_diffs.index[0])

    best_match.index.name = 'svase_gene'
Example #20
    simXmel_ase = ase.select(**sel_startswith('simXmel'))
    melXsim_is_expr = (melXsim_expr > EXPR_MIN)
    simXmel_is_expr = (simXmel_expr > EXPR_MIN)
    all_is_expr = expr > EXPR_MIN

    min_per_crossdir = 10
    expr_both = (
        (melXsim_ase.T.count() > min_per_crossdir)
        & (simXmel_ase.T.count() > min_per_crossdir)
        & (melXsim_is_expr.T.sum() > min_per_crossdir)
        & (simXmel_is_expr.T.sum() > min_per_crossdir)
    )
    ase_expr = ase.ix[expr_both]
    print("Found {} good genes".format(len(ase_expr)))
    n_reps = 1000
    mel_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
    sim_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
    with Pool() as p:
        results = [None for i in range(n_reps)]
        for i in range(n_reps):
            results[i] = p.apply_async(get_randomized_scores, (ase_expr, ))

        for i, res in pbar(max_value=n_reps)(enumerate(results)):
            mel_bias, sim_bias = res.get()
            mel_biases.ix[:, i] = mel_bias
            sim_biases.ix[:, i] = sim_bias
Example #21
                  n_good_slices=np.nan,
                  r2=np.nan,
                  rmsdiff=np.nan),
        index=mel.index,
    )

    all_pred_ase_nan = pd.DataFrame(data=np.nan,
                                    index=ase.index,
                                    columns=ase.columns)

    all_mel_pred = all_pred_ase_nan.copy()
    all_sim_pred = all_pred_ase_nan.copy()

    all_pred_ase = []

    prog = pbar(maxval=len(ase_avgs.index))
    #prog = lambda x: x
    render_pool = Pool()
    renders = []
    for gene in prog(ase_avgs.index):
        if not locals().get('redraw', True):
            break
        good_ase = np.isfinite(ase.ix[gene]) & ~(ase.ix[gene] == ase_maternals)
        xg = ase_xs[good_ase]
        ase_avgs.ix[gene, 'n_good_slices'] = len(xg)
        ase_avgs.ix[gene, 'actual'] = ase.ix[gene].mean()
        sim_pred = pd.Series(sim_splines[gene](ase_xs).clip(1e-3, 1e10),
                             name='predicted_sim_' + gene,
                             index=ase_xs.index)
        mel_pred = pd.Series(mel_splines[gene](ase_xs).clip(1e-3, 1e10),
                             name='predicted_mel_' + gene,
Example #22
    expr = pd.read_table('godot/summary.tsv', **ut.pd_kwargs).drop('---',
                                                                   axis=1)
    mel_tss, sim_tss = get_ortholog_TSS_data()

    has_svase = (ut.true_index(
        best_r2s.sort_values(ascending=False) > .25).intersection(
            mel_tss.keys()))
    no_svase = (best_r2s.index[best_r2s < .01].intersection(
        expr.index).intersection(mel_tss.keys()))
    median_expr = expr.T.median()

    num_sim_expr = pd.Series(index=has_svase, data=-1)
    already_used = set()
    best_match = pd.DataFrame(index=has_svase, data={'gene': '', 'emd': 1.0})
    p = Pool()
    for gene in pbar()(has_svase):
        similar_expr = ut.true_index(
            (.5 * median_expr[gene] < median_expr[no_svase])
            & (median_expr[no_svase] < 2 * median_expr[gene]))
        similar_expr = similar_expr.difference(already_used)
        diff_jobs = {
            target: p.apply_async(dd.earth_mover_multi,
                                  (expr.loc[gene], expr.loc[target]))
            for target in similar_expr
        }
        pattern_diffs = pd.Series(
            {target: diff_jobs[target].get()
             for target in similar_expr}).sort_values()
        best_match.loc[gene, 'gene'] = pattern_diffs.index[0]
        best_match.loc[gene, 'emd'] = pattern_diffs[0]
        already_used.add(pattern_diffs.index[0])
Example #23
        sys.stderr.flush()
        for i in range(n_perms):
            print(i, end=' ')
            sys.stdout.flush()
            new_xs = pd.Series(index=xs.index, data=np.random.permutation(xs))
            waiting_jobs.put(Job(fit_and_eval,
                                 args=(ase, logistic, new_xs, colnames),
                                 kwargs={'pool': cluster_args['cpus']},
                                 suffix='_{}_{:04}'.format(func.__name__, i),
                                 **cluster_args
                                ))
            if i < cluster_joblimit:
                activate_job(waiting_jobs, active_jobs)

        sleep(60)
        for i in pbar(max_value=n_perms)(range(n_perms)):
            r2s.extend(active_jobs.get().get())
            if not waiting_jobs.empty():
                activate_job(waiting_jobs, active_jobs)

            dump({'logist': logist_r2s, 'peak': peak_r2s,
                  'last':'_{}_{:04}'.format(func.__name__, i), },
                 open('analysis/results/{prefix}fdr_{suffix}.pkl'
                      .format(prefix=args.prefix, suffix=args.suffix),
                      'wb'))

        np.save('analysis/results/{prefix}fdr_{name}{suffix}.numpy'
                             .format(prefix=args.prefix, name=func.__name__,
                                     suffix=args.suffix),
                np.array(r2s))
Example #24
        is_male = [col.startswith(males) for col in ase.columns]
        ase.ix[on_x, is_male] = np.nan

    melXsim_expr = expr.select(**sel_startswith('melXsim'))
    simXmel_expr = expr.select(**sel_startswith('simXmel'))
    melXsim_ase = ase.select(**sel_startswith('melXsim'))
    simXmel_ase = ase.select(**sel_startswith('simXmel'))
    melXsim_is_expr = (melXsim_expr > EXPR_MIN)
    simXmel_is_expr = (simXmel_expr > EXPR_MIN)
    all_is_expr = expr > EXPR_MIN

    min_per_crossdir = 10
    expr_both = ((melXsim_ase.T.count() > min_per_crossdir)
                 & (simXmel_ase.T.count() > min_per_crossdir)
                 & (melXsim_is_expr.T.sum() > min_per_crossdir)
                 & (simXmel_is_expr.T.sum() > min_per_crossdir))
    ase_expr = ase.ix[expr_both]
    print("Found {} good genes".format(len(ase_expr)))
    n_reps = 1000
    mel_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
    sim_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
    with Pool() as p:
        results = [None for i in range(n_reps)]
        for i in range(n_reps):
            results[i] = p.apply_async(get_randomized_scores, (ase_expr, ))

        for i, res in pbar(max_value=n_reps)(enumerate(results)):
            mel_bias, sim_bias = res.get()
            mel_biases.ix[:, i] = mel_bias
            sim_biases.ix[:, i] = sim_bias
Example #25
                    comb = tuple(
                        set(comb) | set(all_changes[target_gene].keys()))
                    comb = comb + ('const', )
                    if 'bcdP' in comb and 'bcdP2' not in comb:
                        comb = comb + ('bcdP2', )

                    X_tmp = (atlas_expr.ix[in_central, comb,
                                           time_point].T.copy().dropna(
                                               how='all', axis=1))
                    comb = tuple(X_tmp.columns)
                    if comb in pool: continue
                    pool[comb] = p.apply_async(fit_model, (X_tmp, Y_tmp, co))
                outs = {}
                pr2 = pd.Series(index=pool.keys(), data=np.nan)
                llrs = pd.Series(index=pool.keys(), data=np.nan)
                for comb in pbar()(pool):
                    outs[comb] = pool[comb].get()
                    pr2[comb] = outs[comb].prsquared
                    llrs[comb] = outs[comb].llr
            best_tfs = pr2.sort_values().index[-1]
            best_model = outs[best_tfs]

        print(best_model.summary().as_text())
        best_X = atlas_expr.ix[:, best_tfs, time_point].T

        small_atlas['in_central'] = in_central
        small_atlas['color'] = [
            'b' if not ic else 'k' if yy > co else 'w'
            for ic, yy in zip(small_atlas.in_central, small_atlas.c)
        ]
        for tf in best_tfs:
Example #26
def dicomloaddir(files, filenamepattern='*.dcm', maxtoread=None, phasemode=None,\
    desiredinplansize=None, dformat='float'):
    '''
    dicomloaddir(files, filenamepattern='*.dcm', maxtoread=None, phasemode=None,\
        desiredinplansize=None, dformat='float'):

    load multiple dicom files from one or multiple directories

    Input:
        <files>: can be:
            (1) a string of a directory
            (2) a list of (1)
            (3) an rzpath object
            (4) a list of (3) objects
        <filenamepattern>: str, the wildcard for the dicom file in a directory
        <maxtoread>: int, maximum number of dicom files to read
        <phasemode>: ...implement later, ignore for now...
        <desiredinplansize>: ...implement later, ignore for now...,
            a 1x2 array, the desired in-plane size; if dicom files
            do not follow this size, we resize them.
        <dformat>: ...implement later, ignore for now...
            the data format to read in
    Output:
        <vollist>: a list of volume arrays for multiple runs, if just one run,
            we return the array
        <dicominfolist>: a list of dicom info dict for multiple runs, if just one run,
            we return the dicom info dict

    Note:
        1. This function currently works with Siemens Prisma 3T and Magnetom 7T; not sure
        about other scanners like GE. For Siemens, we focus on these attributes (can update this):
            (0018, 0050) Slice Thickness
            (0028, 0030) Pixel Spacing
            (0051, 100b) AcquisitionMatrixText
            (0019, 100a) NumberOfImagesInMosaic, 1 if anatomical data
            (0018, 0080) Repetition Time (TR)
            (0018, 0081) Echo Time (TE)
            (0018, 1312) Inplane Phase Encoding Direction
            (0019, 1029) MosaicRefAcqTimes (slicetimeorder), None if anatomical data
            (0051, 1016) a str, check mosaic, read from the dicom file
            (0051, 100c) FOV
            We also add keys:
            'ismosaic': boolean, whether this is a mosaic image
            'voxelsize': 1x3 list, based on Slice Thickness and Pixel Spacing
            'AcquisitionMatrix': [phase, frequency] matrix, derived from AcquisitionMatrixText. The phase step
                may have no meaning for structural data.

            'FovSize':[phase_len, frequency_len] mm, derived from FOV
            'epireadouttime': calculated from rz.mri.dicom_readout_msec, only valid for epi, None if other files

        2. Note that all these keys are scanner specific. Most of them should work for
            Siemens scanners but might not work for GE or Philips scanners.


    Example:


    Todo:
        1. figure out how to read phase data
        2. check whether some of the fields do not exist
        3. resize images to accommodate the desired in-plane size
        4. save all metadata using pickle

    History:
        20180720 <files> now can accept path-like objects
        20180626 RZ fixed the bug for reading the anatomical files
        20180605 RZ use nibabel.nicom.csareader.get_csa_header() function to read
            csa file and get the [BandWidthPerPixelPhaseEncode]
        20180422 RZ change the stack images in the last step so user can see
            report while waiting for image stack
        20180420 RZ created this function

    '''

    from pydicom import dcmread
    from RZutilpy.rzio import matchfiles
    from RZutilpy.array import split2d
    from RZutilpy.mri import dicom_readout_msec
    from RZutilpy.system import rzpath
    from numpy import stack
    from progressbar import progressbar as pbar
    import re
    import time

    # deal with input
    files = [files] if not isinstance(files, list) else files
    # convert it to path-like object
    files = [rzpath(p) if not isinstance(p, rzpath) else p for p in files]

    # start to load
    dicominfolist = []
    vollist = []
    for iDir, filedir in enumerate(files):  # loop directory
        filepattern = filedir / filenamepattern
        dcmnames = matchfiles(filepattern.str)
        if len(dcmnames) == 0:
            print(
                'This {} does not appear to be a directory containing {} files, so skipping.\n'
                .format(filedir, filenamepattern))
            continue  # skip this directory and move on to the next one
        else:
            print(
                'This {} appears to be a directory containing {} files, so loading.\n'
                .format(filedir, filenamepattern))

        dcmnames = dcmnames[:maxtoread]  # keep at most <maxtoread> dicom files

        # ====== deal with dicom info, save a customized dicominfo dict =======
        ds = dcmread(dcmnames[0].str)  # read 1st vol for info purpose
        # note current we assume this dicom have all fields below!! And we save
        # the very raw dicom info here
        dcminfothisrun = dict()
        dcminfothisrun['SliceThickness'] = ds.SliceThickness
        dcminfothisrun['PixelSpacing'] = ds.PixelSpacing
        dcminfothisrun['AcquisitionMatrixText'] = ds.AcquisitionMatrixText
        dcminfothisrun['RepetitionTime'] = ds.RepetitionTime
        dcminfothisrun['EchoTime'] = ds.EchoTime
        dcminfothisrun[
            'InPlanePhaseEncodingDirection'] = ds.InPlanePhaseEncodingDirection
        dcminfothisrun['FOV'] = ds[int('0051', 16), int('100c', 16)].value
        dcminfothisrun['checkmosaic'] = ds[int('0051', 16),
                                           int('1016', 16)].value

        # figure out whether it is mosaic image
        if dcminfothisrun['checkmosaic'].find('MOSAIC') >= 0:
            dcminfothisrun['ismosaic'] = True  # indicate this is a epi file
            print(
                'We are loading some mosaic images, need to convert each mosaic image to 3d; '
                'this directory might contain epi data ...\n')
        else:
            dcminfothisrun[
                'ismosaic'] = False  # indicate this is not a epi file
        if [int('0019', 16), int('100a', 16)] in ds:  # Siemens
            dcminfothisrun['NumberOfImagesInMosaic'] = ds[
                int('0019', 16),
                int('100a', 16)].value if dcminfothisrun['ismosaic'] else 1
        elif [int('0021', 16), int('104f', 16)] in ds:  # GE
            dcminfothisrun['NumberOfImagesInMosaic'] = ds[
                int('0021', 16),
                int('104f', 16)].value if dcminfothisrun['ismosaic'] else 1

        dcminfothisrun['MosaicRefAcqTimes'] = ds[
            int('0019', 16),
            int('1029', 16)].value if dcminfothisrun['ismosaic'] else None
        dcminfothisrun['epireadouttime'] = dicom_readout_msec(
            ds)[0] if dcminfothisrun['ismosaic'] else None

        # save voxel size
        dcminfothisrun['voxelsize'] = list(dcminfothisrun['PixelSpacing']) + [
            dcminfothisrun['SliceThickness']
        ]

        # figure out the in-plane acquisition matrix; note that this regular
        # expression might fail for normal-resolution imaging

        p = re.compile(r'^(\d{1,4}).?\*(\d{1,4}).?$')
        matchgroup = p.match(dcminfothisrun['AcquisitionMatrixText'])
        if matchgroup:
            plines = int(
                matchgroup.group(1))  # step in phase encoding direction
            flines = int(
                matchgroup.group(2))  # step in frequency encoding direction
            dcminfothisrun['AcquisitionMatrix'] = [plines, flines]
        else:
            raise ValueError('can not find the phase encoding direction!')

        # figure out the field-of-view size from the FOV string
        p = re.compile(r'^FoV (\d{1,6})\*(\d{1,6})$')
        matchgroup = p.match(dcminfothisrun['FOV'])
        p_len = int(matchgroup.group(1))  # step in phase encoding direction
        f_len = int(
            matchgroup.group(2))  # step in frequency encoding direction
        dcminfothisrun['FovSize'] = [
            p_len / 10, f_len / 10
        ] if dcminfothisrun['ismosaic'] else [p_len, f_len]
        # have to divide this number by 10 for epidata, not sure why....

        # save dicom info in this run
        dicominfolist.append(dcminfothisrun)
        # show some information
        print(dcminfothisrun)

        # ================  deal with the volumes ====================
        print('\nReading in dicoms ......')
        vol = [dcmread(i.str).pixel_array
               for i in pbar(dcmnames)]  # read pixel data
        # split mosaic images
        if dcminfothisrun['ismosaic']:
            # Note that we assume plines and flines exactly divide the mosaic image;
            # this is typically true
            vol = [split2d(i, plines, flines)
                   for i in vol]  # split each 2d mosaic image to 3d image
            # only keep acquired slices, the last several images are sometimes black
            vol = [
                i[:, :, :dcminfothisrun['NumberOfImagesInMosaic']] for i in vol
            ]

        # stack images, take a while
        print('\n\nStack images ......\n')
        vol = stack(vol, axis=-1)  # stack to a 3d/4d file
        if vol.ndim == 3:  # expand to 4d if only 3d
            vol = vol[..., None]
        vollist.append(vol)

        # report info
        print(
            'The 3D dimensions of the final returned volume are {}.\n'.format(
                vol.shape[:3]))
        print('There are {} volumes in the fourth dimension.\n'.format(
            vol.shape[-1]))
        if dcminfothisrun['ismosaic']:
            print('These are mosaic images, might be epi data.\n')
        else:
            print('These are not mosaic images, might not be epi data.\n')
        print('The voxel size (mm) of the final returned volume is {}.\n'.format\
            (dcminfothisrun['voxelsize']))
        print('The in-plane matrix size (PE x FE) appears to be {}.\n'.format\
            (dcminfothisrun['AcquisitionMatrix']))
        print('The field-of-view (mm) of the final returned volume is {}.\n'.format\
            (dcminfothisrun['FovSize']))
        print('The TR is {} ms.\n\n\n\n'.format(
            dcminfothisrun['RepetitionTime']))

    if len(vollist) == 1:
        vollist = vollist[0]
    if len(dicominfolist) == 1:
        dicominfolist = dicominfolist[0]
    return vollist, dicominfolist
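A hedged usage sketch (the directory paths are hypothetical; `dicomloaddir` resolves the `*.dcm` wildcard itself):

# load every *.dcm file from two hypothetical run directories
vols, infos = dicomloaddir(['/data/run01', '/data/run02'])
for vol, info in zip(vols, infos):
    print(vol.shape, info['voxelsize'], info['RepetitionTime'])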