def compareDensity(data1, data2, K=2000):
    '''
    Compare two multi-dimensional arrays by the Wasserstein metric
    (https://en.wikipedia.org/wiki/Wasserstein_metric).

    The input data should have outliers removed before applying this
    function. The multidimensional input data is projected onto multiple
    random directions. The one-dimensional discrepancy returned by
    ``wass1dim`` is computed on each projected result. This function returns
    the averaged metric and its standard error.

    Parameters
    ----------
    data1: the first multi-dimensional dataset. Each row is an observation.
        Each column is a covariate.
    data2: the second multi-dimensional dataset. It is multiplied by the same
        projection vector as ``data1``, so it must have the same number of
        columns.
    K: the number of trial random projections (default 2000, the value that
        used to be hard-coded).

    Outputs
    -------
    mu, sigma: the average discrepancy measure and its standard error
        (standard deviation over the K projections divided by sqrt(K)).

    Raises
    ------
    ValueError: if ``K`` is smaller than 1 (the mean of zero projections is
        undefined).
    '''
    if K < 1:
        raise ValueError("K must be a positive number of projections")

    result = np.zeros(K)
    pCovariate = data1.shape[1]
    for i in pbar(range(K)):
        # random projection onto one dimension: draw a Gaussian direction
        # and normalise it to unit length
        transMat = np.random.normal(size=(pCovariate, 1))
        transMat = transMat / np.linalg.norm(transMat, 'fro')
        data1_proj = data1 @ transMat
        data2_proj = data2 @ transMat
        # record the discrepancy on the projected dimension
        # between the two datasets.
        result[i] = wass1dim(data1_proj, data2_proj)
    return result.mean(), result.std() / np.sqrt(K)
def plot_MLE_mu_distributions(self):
    """
    Plot how the estimate returned by ``get_loglikelihood_estimate_for_mu``
    behaves as the dataset size grows.

    For each dataset size, 400 datasets are drawn with ``self.sample`` and an
    estimate is computed for each one. The left panel shows the mean
    estimate with a +-1 standard deviation band and the true ``self.mu``;
    the right panel shows one smoothed histogram of the estimates per
    dataset size.
    """
    sample_sizes = [50, 100, 500, 1000, 5000]
    n = 400
    estimates = [
        [self.get_loglikelihood_estimate_for_mu(self.sample(sample_size))
         for _ in range(n)]
        for sample_size in pbar(sample_sizes)
    ]

    # Labels are raw strings: "\m" is not a valid escape sequence, so the
    # non-raw form triggers a warning on recent Python versions. The text
    # handed to matplotlib is unchanged.
    plt.figure(figsize=(12, 4))
    plt.subplot(121)
    # one row per dataset size: [mean, std] of the estimates
    r = np.r_[[[np.mean(i), np.std(i)] for i in estimates]]
    plt.plot(r[:, 0], lw=3, label=r"mean MLE $\mu$")
    plt.fill_between(range(len(r)), r[:, 0] + r[:, 1], r[:, 0] - r[:, 1],
                     color="blue", alpha=.1, label=r"+-1 std MLE $\mu$")
    plt.axhline(self.mu, color="red", alpha=.4,
                label=r"true $\mu=%.2f$" % self.mu)
    plt.grid()
    plt.xticks(range(len(sample_sizes)), sample_sizes)
    plt.xlabel("data size")
    plt.ylabel(r"MLE $\mu$")
    plt.legend()

    plt.subplot(122)
    for size, i in zip(sample_sizes, estimates):
        plt.plot(*kdensity_smoothed_histogram(i), label="data size = %d" % size)
    plt.grid()
    plt.legend()
    plt.title(r"smoothed histograms for estimated $\mu$")
    plt.xlabel(r"MLE $\mu$")
    plt.xlim(0, self.mu * 2)
def rvs(self, n_samples):
    """
    Draw ``n_samples`` binned pseudo-experiments.

    For every experiment, ``self.stot`` values are drawn from ``self.s`` and
    ``self.btot`` values from ``self.b``; both are counted per bin of
    ``self.bin_edges`` (values exactly on an edge are not counted, since both
    comparisons are strict). The combined count is ``self.mu * s + b``.

    Returns three DataFrames (combined, s-only, b-only), one row per
    experiment and one column per bin. A progress bar is shown only when
    more than one experiment is requested.
    """
    edges = self.bin_edges
    n_bins = len(edges) - 1

    def bin_counts(values):
        # number of values strictly inside each consecutive pair of edges
        return np.r_[[
            np.sum((values > edges[j]) & (values < edges[j + 1]))
            for j in range(n_bins)
        ]]

    def as_frame(rows):
        # stack the per-experiment rows and label both axes
        frame = pd.DataFrame(np.r_[rows])
        frame.index.name = "sample_nb"
        frame.columns.name = "bin_nb"
        return frame

    draws = range(n_samples)
    if n_samples > 1:
        draws = pbar(draws)

    combined_rows, s_rows, b_rows = [], [], []
    for _ in draws:
        s_counts = bin_counts(self.s.rvs(self.stot))
        b_counts = bin_counts(self.b.rvs(self.btot))
        combined_rows.append(self.mu * s_counts + b_counts)
        s_rows.append(s_counts)
        b_rows.append(b_counts)

    return as_frame(combined_rows), as_frame(s_rows), as_frame(b_rows)
def radialfilter(xp, yp, zp, xb, yb, r = 1.0, method = 'linear'):
    """
    Keep the lower half (by ``zp``) of each point's neighbourhood and
    interpolate the kept points onto a grid.

    All pairwise Euclidean distances between the ``(xp, yp)`` points are
    computed. For every point whose radius-``r`` neighbourhood holds more
    than 5 points, the neighbours with ``zp`` at or below the neighbourhood
    median are flagged. The flagged points are passed to ``griddata`` and
    evaluated at the bin centres derived from the edges ``xb`` and ``yb``
    (the bin width is taken from the first two edges only).

    Returns whatever ``griddata`` returns for the chosen ``method``.
    """
    n = len(zp)

    # symmetric pairwise distance matrix, filled one lower-triangle entry at a time
    D = np.zeros((n, n))
    progress = pbar(maxval = n*(n-1)/2)
    progress.start()
    done = 0
    for i in range(n):
        for k in range(i):
            progress.update(done)
            done += 1
            D[i, k] = D[k, i] = np.sqrt((xp[i]-xp[k])**2+(yp[i]-yp[k])**2)
    progress.finish()

    # accumulate (logical or) the points that fall in the lower half of
    # any sufficiently populated neighbourhood
    keep = np.zeros(n, dtype = 'bool')
    for i in range(n):
        nearby = D[i, :] <= r
        nearby_z = zp[nearby]
        if len(nearby_z) > 5:
            keep += nearby * (zp <= np.median(nearby_z))

    # grid center
    half_dx = abs(xb[0]-xb[1])/2
    half_dy = abs(yb[0]-yb[1])/2
    X, Y = np.meshgrid(xb[:-1] + half_dx, yb[:-1] + half_dy)
    return griddata((xp[keep], yp[keep]), zp[keep], (X, Y), method = method)
def _compute_null_dist(self,iterations=500):
    '''Compute the null distribution of MMD2u by row permutation.

    On each iteration the rows of ``self._XY`` are shuffled, split into a
    first group of ``self._n1`` rows and the remainder, and passed to
    ``self._MMD2ufast`` together with ``self._scale``.

    Returns an array holding one statistic per iteration.
    '''
    n_rows = self._n1 + self._n2
    null_stats = np.zeros(iterations)
    for k in pbar(range(iterations)):
        shuffled = self._XY[np.random.permutation(n_rows), :]
        first_group = shuffled[:self._n1, :]
        second_group = shuffled[self._n1:]
        null_stats[k] = self._MMD2ufast(first_group, second_group, self._scale)
    return null_stats
def fit(self, x, n_steps=5000, use_pbar=False):
    """
    Run up to ``n_steps`` calls of ``self.train_step(x)``.

    ``self.history_lastbatch`` is reset at the start; after every step the
    loss (converted with ``.numpy()``) is appended to both
    ``self.history_lastbatch`` and ``self.history``. Training stops early,
    without recording, as soon as a loss is NaN. The loop is wrapped in a
    progress bar when ``use_pbar`` is true.
    """
    assert self.use_tf, "must set use_tf=True"
    self.history_lastbatch = []

    steps = range(n_steps)
    if use_pbar:
        steps = pbar(steps)

    for _ in steps:
        loss_value = self.train_step(x)
        if np.isnan(loss_value):
            break
        # record in the per-call history first, then in the cumulative one
        for record in (self.history_lastbatch, self.history):
            record.append(loss_value.numpy())
def parse_content_from_db(self):
    """
    Re-parse the stored HTML of every ``Ayat`` row and save the results.

    For each row the content, Arabic text and transcription are extracted
    from the parsed HTML with this object's ``get_*`` helpers, written to
    the row's ``content``, ``arab_text`` and ``trans`` fields, and saved.
    """
    footer_caption = "Ссылки на богословские первоисточники и комментарий:"
    for ayat in pbar(Ayat.objects.all()):
        soup = get_soup(ayat.html)
        # FIXME: fix the text cleanup
        joined_content = "".join(self.get_content(soup))
        ayat.content = joined_content.replace(footer_caption, "")
        ayat.arab_text = self.get_arab_text(soup)
        ayat.trans = self.get_transcription(soup)
        logger.debug(f"{ayat.arab_text=}")
        ayat.save()
def plot_curvature_for_2D_input_flat_manifold(self, x_range=(-1, 1), y_range=(-1, 1), n_points=1000, batch_size=20, remove_percentile=3):
    """
    plots the curvature of the output manifold of transformed 2D data.
    the output manifold can have any dimension, but curvature will be
    plotted in the flat input 2D space.

    Parameters
    ----------
    x_range, y_range: (low, high) pairs for the sampled input region. Each
        bound is pushed outwards by 20% of its absolute value before
        sampling. The arguments are never modified (the previous
        implementation mutated them in place, so the shared defaults grew on
        every call and caller lists were clobbered).
    n_points: approximate number of sampled points; ``n_points // batch_size``
        batches are drawn.
    batch_size: number of points per call to ``self.get_ricci_scalar``.
    remove_percentile: percentage trimmed from each tail of the curvature
        distribution before plotting, since outliers distort graphs.

    Returns
    -------
    The axes holding the contour plot.

    Raises
    ------
    RuntimeError: if no batch could be evaluated.
    """
    assert self.model.input.shape[1]==2, "model must have 2D input only"
    from sklearn.preprocessing import MinMaxScaler

    def _widen(bounds):
        # work on copies and hand MinMaxScaler the tuple it documents
        low, high = bounds
        return (low - np.abs(low)*.2, high + np.abs(high)*.2)

    x_bounds = _widen(x_range)
    y_bounds = _widen(y_range)

    z = []
    xy = []
    for _ in pbar(range(n_points//batch_size)):
        data = (np.random.random(size=(batch_size, 2))-.5)*10
        # rescale each column so the batch spans the requested range
        data[:,0] = MinMaxScaler(feature_range=x_bounds).fit_transform(data[:,0].reshape(-1,1))[:,0]
        data[:,1] = MinMaxScaler(feature_range=y_bounds).fit_transform(data[:,1].reshape(-1,1))[:,0]
        tu = tf.Variable(data, dtype=np.float32)
        try:
            r = self.get_ricci_scalar(tu).numpy()
        except Exception:
            # first call might fail in TF; skip the batch (best effort) but
            # no longer swallow KeyboardInterrupt/SystemExit
            continue
        z.append(r)
        xy.append(tu.numpy())

    if not z:
        raise RuntimeError("curvature could not be computed for any batch")

    z = np.r_[z].flatten()

    # remove x% on top and bottom of distribution since outliers distort graphs
    zmin,zmax = np.percentile(z, [remove_percentile,100-remove_percentile])
    keep_idxs = (z>=zmin)&(z<=zmax)
    z = z[keep_idxs]
    xy = np.vstack(xy)[keep_idxs]
    x,y = xy[:,0], xy[:,1]

    fig = plt.figure(figsize=(13,3.5))
    ax1 = plt.subplot(121)
    ax1.tricontour(x, y, z, levels=14, linewidths=0.5, colors='k')
    cntr1 = ax1.tricontourf(x, y, z, levels=14, cmap="RdBu_r")
    fig.colorbar(cntr1, ax=ax1)
    ax1.plot(x, y, 'ko', ms=3, alpha=.1)
    ax1.set_title("Ricci curvature in transformed data space")

    ax2 = plt.subplot(122)
    ax2.hist(z, bins=30)
    ax2.set_title("distribution of Ricci curvature")
    return ax1
def fit_all_splines(expr, pool=None, progress=False):
    """
    Fit one ``interpolate.UnivariateSpline`` per row of ``expr``.

    Each row is smoothed with a centred 3-point rolling mean, positions where
    the smoothed value is null are dropped from both the x positions
    (``get_xs(expr)``) and the values, and a spline is fitted to the rest.

    Parameters
    ----------
    expr: expression table; one spline is fitted per index entry.
    pool: ``None`` fits serially in this process; ``True`` creates a
        ``Pool()`` that is closed before returning; any other value is used
        as an existing pool and left open for the caller.
    progress: wrap the iteration in a progress bar when true.

    Returns
    -------
    dict mapping each index entry of ``expr`` to its fitted spline.
    """
    xs = get_xs(expr)
    pb = pbar() if progress else (lambda x: x)

    def spline_inputs(gene):
        # NOTE(review): pd.rolling_mean and .ix are kept because the rest of
        # this file targets the same legacy pandas API.
        expr_smooth = pd.rolling_mean(expr.ix[gene], 3, center=True,
                                      min_periods=1)
        is_good = ~expr_smooth.isnull()
        # Filter x and y with the same mask. The pooled path used to pass
        # the unfiltered y values, giving the spline mismatched lengths.
        return xs[is_good], expr_smooth[is_good]

    if pool is None:
        out = {}
        for gene in pb(expr.index):
            out[gene] = interpolate.UnivariateSpline(*spline_inputs(gene))
        return out

    close = pool is True
    if close:
        pool = Pool()
    try:
        asyncs = {}
        for gene in expr.index:
            asyncs[gene] = pool.apply_async(interpolate.UnivariateSpline,
                                            spline_inputs(gene))
        out = {}
        for gene in pb(asyncs):
            out[gene] = asyncs[gene].get()
    finally:
        # only shut down a pool this function created, even on failure
        if close:
            pool.close()
            pool.join()
    return out
def plot_bins_distributions(self, dataset_size, n_datasets, bin_edges):
    """
    Plot, for every bin, the distribution of its count across datasets.

    ``n_datasets`` datasets of ``dataset_size`` events are drawn with
    ``self.sample`` and binned with ``self.get_bins_counts``. One subplot per
    bin shows the smoothed histogram of that bin's counts (drawn by
    ``plot_kdensity_smoothed_histogram`` with ``plot_equivalent_poisson=True``).
    The legend is attached to the middle subplot only.
    """
    k = [
        self.get_bins_counts(self.sample(dataset_size), bin_edges)
        for _ in pbar(range(n_datasets))
    ]
    k = pd.DataFrame(k)
    k.index.name = "dataset_nb"
    k.columns.name = "bin_nb"

    n_bins = len(k.columns)
    plt.figure(figsize=(2.5 * n_bins, 2.5))
    for i, col in enumerate(k.columns):
        plt.subplot(1, n_bins, i + 1)
        plot_kdensity_smoothed_histogram(k[col].values,
                                         plot_equivalent_poisson=True)
        # Backslashes of the LaTeX commands are escaped explicitly: "\g" and
        # "\i" are invalid escape sequences, while "\n" must stay a newline.
        plt.title("bin %d\n$M_{\\gamma \\gamma} \\in [%.2f, %.2f]$" %
                  (col, bin_edges[i], bin_edges[i + 1]))
        plt.yticks([])
        plt.xlabel("nb events")
        if i == n_bins // 2:
            plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.25))
def build_ts_distribution(self, n_events, n_experiments, show_pbar=True):
    """
    Build the distribution of the test statistic over pseudo-experiments.

    ``n_experiments`` samples are drawn with ``self.rvs(n_events)``; the
    negated ``self.likelihood`` of each one is stored, in order, in
    ``self.ts``.

    Parameters
    ----------
    n_events: argument forwarded to ``self.rvs`` for every experiment.
    n_experiments: number of samples to draw.
    show_pbar: show a progress bar while evaluating the likelihoods. The
        ``progressbar`` package is only imported when this is true, so it is
        not required otherwise.
    """
    if show_pbar:
        from progressbar import progressbar as wrap
    else:
        wrap = list

    samples = [self.rvs(n_events) for _ in range(n_experiments)]
    self.ts = np.r_[[-self.likelihood(sample) for sample in wrap(samples)]]
'melXsim_cyc14C_rep2', 'melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep1', 'simXmel_cyc14C_rep2']) xs = np.linspace(0, 1, 20, endpoint=True) avgs = pd.DataFrame(index=hyb_spatial_difference.index, columns=['avg_sl{}'.format(i+1) for i in range(20)]) if args.multi: with Pool() as p: results = (p.apply_async(get_diffs, (expr.ix[gene], mel_splines[gene], sim_splines[gene], avgs.columns)) for gene in hyb_spatial_difference.index) for gene in pbar()(hyb_spatial_difference.index): res = next(results).get() #res = get_diffs(expr.ix[gene], mel_splines[gene], sim_splines[gene], #avgs.columns) (hyb_hyb_diffs[gene], parental_diffs[gene], mel_hyb_diffs[gene], sim_hyb_diffs[gene], avgs.ix[gene], avg_hyb_diffs[gene], avg_levels[gene], hyb_levels.ix[gene], within_diffs_mXs[gene], within_diffs_sXm[gene], ) = res hyb_spatial_difference[gene] = (avg_hyb_diffs[gene]
set(all_changes[target_gene].keys())) comb = comb + ('const', ) if 'bcdP' in comb and 'bcdP2' not in comb: comb = comb + ('bcdP2', ) X_tmp = (atlas_expr.ix[in_central, comb, time_point] .T.copy() .dropna(how='all', axis=1) ) comb = tuple(X_tmp.columns) if comb in pool: continue pool[comb] = p.apply_async(fit_model, (X_tmp, Y_tmp, co)) outs = {} pr2 = pd.Series(index=pool.keys(), data=np.nan) llrs =pd.Series(index=pool.keys(), data=np.nan) for comb in pbar()(pool): outs[comb] = pool[comb].get() pr2[comb] = outs[comb].prsquared llrs[comb] = outs[comb].llr best_tfs = pr2.sort_values().index[-1] best_model = outs[best_tfs] print(best_model.summary().as_text()) best_X = atlas_expr.ix[:, best_tfs, time_point].T small_atlas['in_central'] = in_central small_atlas['color'] = [ 'b' if not ic else 'k' if yy > co else 'w' for ic, yy in zip(small_atlas.in_central, small_atlas.c) ] for tf in best_tfs:
# Compute per-gene values with get_ase_by_coords and report the library size.
# NOTE(review): this is a fragment of a larger script; `args`, `reads`,
# `snp_dict` and `gene_coords` are defined outside the visible code.
lib_size = get_lib_size(args.reads)
ase_vals = {}
if False and args.max_jobs != 1:
    # Early experiments suggest this doesn't actually make things faster, so
    # the if False automatically skips this branch. But if someone later
    # wants to put it back in, it should be easy...
    with Pool(args.max_jobs or cpu_count()) as pool:
        # submit one asynchronous job per gene ...
        for gene in gene_coords:
            ase_vals[gene] = pool.apply_async(get_ase_by_coords, (
                gene_coords[gene][0],
                gene_coords[gene][1],
                reads,
                snp_dict,
            ))
        # ... then replace every async handle with its result
        prog = pbar()
        for gene in prog(ase_vals):
            ase_vals[gene] = ase_vals[gene].get()
        # only call finish() when the progress object provides it
        if 'finish' in dir(prog):
            prog.finish()
else:
    prog = pbar()
    for gene in prog(gene_coords):
        # skip genes whose first coordinate entry is not a key of snp_dict
        if gene_coords[gene][0] not in snp_dict.keys():
            continue
        ase_vals[gene] = get_ase_by_coords(gene_coords[gene][0],
                                           gene_coords[gene][1],
                                           reads, snp_dict)
    # only call finish() when the progress object provides it
    if 'finish' in dir(prog):
        prog.finish()
print("# Library size: " + str(lib_size), file=args.outfile, end='\n')
def _parse_prayer_times_for_city(self):
    """
    Load the CSV via ``_get_csv_file`` and hand every row to
    ``_set_prayers_to_city``.

    ``self.csv_file`` is split into lines and parsed as semicolon-delimited
    CSV; rows are processed in file order behind a progress bar.
    """
    self._get_csv_file()
    csv_lines = self.csv_file.splitlines()
    rows = csv.reader(csv_lines, delimiter=";")
    for record in pbar(rows):
        self._set_prayers_to_city(record)
crops_coordinates_sequential = crops_coordinates.reshape(-1, 3, 2, order='F') logger.debug(f"{crops_coordinates_sequential.shape=}\n{crops_coordinates_sequential[0]=} ") # # Crops (if `debug__materialize_crops`) # In[ ]: if debug__materialize_crops: logger.info("Materializing crops") crops_sequential = np.array([ data_volume[tuple(slice(*coords_) for coords_ in coords)] for coords in pbar(crops_coordinates_sequential, max_value=crops_coordinates_sequential.shape[0]) ]) logger.debug(f"{crops_sequential.shape=}") crops_target_shape = list(crops_coordinates.shape[:3]) + list(crop_shape) logger.debug(f"{crops_target_shape=}") # 'F' reshapes with x varying fastest and z slowest # this option is necessary because `crops_coordinates` was reshaped with it crops = crops_sequential.reshape(crops_target_shape, order="F") del crops_sequential logger.debug(f"{crops.shape=}") if debug__save_processed_crops: fname = estimation_volume.debug__crops_coordinates_path logger.info(f"Saving crops coordinates at {fname=}")
# [model] modelin_target_shape = (batch_size_, crop_shape[0], crop_shape[1], crop_shape[2], 1) # adjust nb. channels batch_probas = model.predict( batch_data.reshape(modelin_target_shape), batch_size=batch_size_, steps=1, ).astype(args.probabilities_dtype) for slice_, crop_proba in zip(batch_slices, batch_probas): proba_volume[slice_] += crop_proba.reshape(crop_probas_target_shape) redundancies_count[slice_] += np.ones(crop_shape, dtype=np.int) logger.debug("Predicting and summing up the crops' probabilities.") for batch_idx in pbar( range(niterations), prefix="predict-and-sum-probas", max_value=niterations ): batch_start = batch_idx * batch_size process_batch(batch_start, batch_size) if last_batch_size > 0: logger.info("Segmenting the last batch") batch_start = niterations * batch_size process_batch(batch_start, last_batch_size) # In[ ]: del data_volume
'melXsim_cyc14C_rep1', 'melXsim_cyc14C_rep2', 'melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep1', 'simXmel_cyc14C_rep2' ]) xs = np.linspace(0, 1, 20, endpoint=True) avgs = pd.DataFrame(index=hyb_spatial_difference.index, columns=['avg_sl{}'.format(i + 1) for i in range(20)]) if args.multi: with Pool() as p: results = (p.apply_async(get_diffs, (expr.ix[gene], mel_splines[gene], sim_splines[gene], avgs.columns)) for gene in hyb_spatial_difference.index) for gene in pbar()(hyb_spatial_difference.index): res = next(results).get() #res = get_diffs(expr.ix[gene], mel_splines[gene], sim_splines[gene], #avgs.columns) ( hyb_hyb_diffs[gene], parental_diffs[gene], mel_hyb_diffs[gene], sim_hyb_diffs[gene], avgs.ix[gene], avg_hyb_diffs[gene], avg_levels[gene], hyb_levels.ix[gene], within_diffs_mXs[gene], within_diffs_sXm[gene],
# For every gene in `has_svase`, pick the not-yet-used `no_svase` gene with a
# similar median expression and the smallest dd.earth_mover_multi value.
# NOTE(review): fragment of a larger script; the enclosing scope is not visible.
best_r2s = pd.Series.from_csv('analysis/results/svase_best', sep='\t')
expr = pd.read_table('godot/summary.tsv', **ut.pd_kwargs).drop('---', axis=1)
mel_tss, sim_tss = get_ortholog_TSS_data()
# genes whose best r2 exceeds .25 and that have an entry in mel_tss
has_svase = (ut.true_index(best_r2s.sort_values(ascending=False) > .25)
             .intersection(mel_tss.keys()))
# genes whose best r2 is below .01, present in expr and in mel_tss
no_svase = (best_r2s.index[best_r2s < .01]
            .intersection(expr.index)
            .intersection(mel_tss.keys()))
median_expr = expr.T.median()
num_sim_expr = pd.Series(index=has_svase, data=-1)
already_used = set()
best_match = pd.DataFrame(index=has_svase, data={'gene': '', 'emd': 1.0})
# NOTE(review): this pool is not closed in the visible code — confirm it is
# closed further down.
p = Pool()
for gene in pbar()(has_svase):
    # candidates: median expression strictly between half and twice this gene's
    similar_expr = ut.true_index((.5 * median_expr[gene] < median_expr[no_svase])
                                 & (median_expr[no_svase] < 2 * median_expr[gene]))
    # each candidate can be matched to only one gene
    similar_expr = similar_expr.difference(already_used)
    diff_jobs = {
        target: p.apply_async(dd.earth_mover_multi,
                              (expr.loc[gene], expr.loc[target]))
        for target in similar_expr
    }
    # ascending sort puts the smallest value first
    pattern_diffs = pd.Series({target: diff_jobs[target].get()
                               for target in similar_expr}).sort_values()
    # NOTE(review): index[0] raises if similar_expr is empty; `[0]` on a
    # label-indexed Series relies on positional fallback — consider .iloc[0].
    best_match.loc[gene, 'gene'] = pattern_diffs.index[0]
    best_match.loc[gene, 'emd'] = pattern_diffs[0]
    already_used.add(pattern_diffs.index[0])
best_match.index.name='svase_gene'
# Select genes with enough data in both cross directions, then collect
# n_reps results of get_randomized_scores for them.
# NOTE(review): fragment of a larger script; `ase`, `expr`, `melXsim_expr`,
# `simXmel_expr` and `melXsim_ase` are defined outside the visible code.
simXmel_ase = ase.select(**sel_startswith('simXmel'))

melXsim_is_expr = (melXsim_expr > EXPR_MIN)
simXmel_is_expr = (simXmel_expr > EXPR_MIN)
all_is_expr = expr > EXPR_MIN

# a gene is kept when, in each cross direction, more than this many columns
# have a non-null ASE value and more than this many exceed EXPR_MIN
min_per_crossdir = 10
expr_both = (
    (melXsim_ase.T.count() > min_per_crossdir)
    & (simXmel_ase.T.count() > min_per_crossdir)
    & (melXsim_is_expr.T.sum() > min_per_crossdir)
    & (simXmel_is_expr.T.sum() > min_per_crossdir)
)
ase_expr = ase.ix[expr_both]
print("Found {} good genes".format(len(ase_expr)))

n_reps = 1000
# one column per replicate, one row per kept gene
mel_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
sim_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
with Pool() as p:
    # submit every replicate first ...
    results = [None for i in range(n_reps)]
    for i in range(n_reps):
        results[i] = p.apply_async(get_randomized_scores, (ase_expr, ))
    # ... then collect them in submission order while the pool is still open
    for i, res in pbar(max_value=n_reps)(enumerate(results)):
        mel_bias, sim_bias = res.get()
        mel_biases.ix[:, i] = mel_bias
        sim_biases.ix[:, i] = sim_bias
n_good_slices=np.nan, r2=np.nan, rmsdiff=np.nan), index=mel.index, ) all_pred_ase_nan = pd.DataFrame(data=np.nan, index=ase.index, columns=ase.columns) all_mel_pred = all_pred_ase_nan.copy() all_sim_pred = all_pred_ase_nan.copy() all_pred_ase = [] prog = pbar(maxval=len(ase_avgs.index)) #prog = lambda x: x render_pool = Pool() renders = [] for gene in prog(ase_avgs.index): if not locals().get('redraw', True): break good_ase = np.isfinite(ase.ix[gene]) & ~(ase.ix[gene] == ase_maternals) xg = ase_xs[good_ase] ase_avgs.ix[gene, 'n_good_slices'] = len(xg) ase_avgs.ix[gene, 'actual'] = ase.ix[gene].mean() sim_pred = pd.Series(sim_splines[gene](ase_xs).clip(1e-3, 1e10), name='predicted_sim_' + gene, index=ase_xs.index) mel_pred = pd.Series(mel_splines[gene](ase_xs).clip(1e-3, 1e10), name='predicted_mel_' + gene,
# For every gene in `has_svase`, pick the not-yet-used `no_svase` gene with a
# similar median expression and the smallest dd.earth_mover_multi value.
# NOTE(review): fragment of a larger script; `best_r2s` is defined outside
# the visible code.
expr = pd.read_table('godot/summary.tsv', **ut.pd_kwargs).drop('---', axis=1)
mel_tss, sim_tss = get_ortholog_TSS_data()
# genes whose best r2 exceeds .25 and that have an entry in mel_tss
has_svase = (ut.true_index(
    best_r2s.sort_values(ascending=False) > .25).intersection(
        mel_tss.keys()))
# genes whose best r2 is below .01, present in expr and in mel_tss
no_svase = (best_r2s.index[best_r2s < .01].intersection(
    expr.index).intersection(mel_tss.keys()))
median_expr = expr.T.median()
num_sim_expr = pd.Series(index=has_svase, data=-1)
already_used = set()
best_match = pd.DataFrame(index=has_svase, data={'gene': '', 'emd': 1.0})
# NOTE(review): this pool is not closed in the visible code — confirm it is
# closed further down.
p = Pool()
for gene in pbar()(has_svase):
    # candidates: median expression strictly between half and twice this gene's
    similar_expr = ut.true_index(
        (.5 * median_expr[gene] < median_expr[no_svase])
        & (median_expr[no_svase] < 2 * median_expr[gene]))
    # each candidate can be matched to only one gene
    similar_expr = similar_expr.difference(already_used)
    diff_jobs = {
        target: p.apply_async(dd.earth_mover_multi,
                              (expr.loc[gene], expr.loc[target]))
        for target in similar_expr
    }
    # ascending sort puts the smallest value first
    pattern_diffs = pd.Series(
        {target: diff_jobs[target].get()
         for target in similar_expr}).sort_values()
    # NOTE(review): index[0] raises if similar_expr is empty; `[0]` on a
    # label-indexed Series relies on positional fallback — consider .iloc[0].
    best_match.loc[gene, 'gene'] = pattern_diffs.index[0]
    best_match.loc[gene, 'emd'] = pattern_diffs[0]
    already_used.add(pattern_diffs.index[0])
sys.stderr.flush() for i in range(n_perms): print(i, end=' ') sys.stdout.flush() new_xs = pd.Series(index=xs.index, data=np.random.permutation(xs)) waiting_jobs.put(Job(fit_and_eval, args=(ase, logistic, new_xs, colnames), kwargs={'pool': cluster_args['cpus']}, suffix='_{}_{:04}'.format(func.__name__, i), **cluster_args )) if i < cluster_joblimit: activate_job(waiting_jobs, active_jobs) sleep(60) for i in pbar(max_value=n_perms)(range(n_perms)): r2s.extend(active_jobs.get().get()) if not waiting_jobs.empty(): activate_job(waiting_jobs, active_jobs) dump({'logist': logist_r2s, 'peak': peak_r2s, 'last':'_{}_{:04}'.format(func.__name__, i), }, open('analysis/results/{prefix}fdr_{suffix}.pkl' .format(prefix=args.prefix, suffix=args.suffix), 'wb')) np.save('analysis/results/{prefix}fdr_{name}{suffix}.numpy' .format(prefix=args.prefix, name=func.__name__, suffix=args.suffix), np.array(r2s))
# Blank out the `on_x` rows in male columns, select genes with enough data in
# both cross directions, then collect n_reps results of get_randomized_scores.
# NOTE(review): fragment of a larger script; `ase`, `expr`, `males` and
# `on_x` are defined outside the visible code.
is_male = [col.startswith(males) for col in ase.columns]
ase.ix[on_x, is_male] = np.nan

melXsim_expr = expr.select(**sel_startswith('melXsim'))
simXmel_expr = expr.select(**sel_startswith('simXmel'))
melXsim_ase = ase.select(**sel_startswith('melXsim'))
simXmel_ase = ase.select(**sel_startswith('simXmel'))

melXsim_is_expr = (melXsim_expr > EXPR_MIN)
simXmel_is_expr = (simXmel_expr > EXPR_MIN)
all_is_expr = expr > EXPR_MIN

# a gene is kept when, in each cross direction, more than this many columns
# have a non-null ASE value and more than this many exceed EXPR_MIN
min_per_crossdir = 10
expr_both = ((melXsim_ase.T.count() > min_per_crossdir)
             & (simXmel_ase.T.count() > min_per_crossdir)
             & (melXsim_is_expr.T.sum() > min_per_crossdir)
             & (simXmel_is_expr.T.sum() > min_per_crossdir))
ase_expr = ase.ix[expr_both]
print("Found {} good genes".format(len(ase_expr)))

n_reps = 1000
# one column per replicate, one row per kept gene
mel_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
sim_biases = pd.DataFrame(index=ase_expr.index, columns=range(n_reps))
with Pool() as p:
    # submit every replicate first ...
    results = [None for i in range(n_reps)]
    for i in range(n_reps):
        results[i] = p.apply_async(get_randomized_scores, (ase_expr, ))
    # ... then collect them in submission order while the pool is still open
    for i, res in pbar(max_value=n_reps)(enumerate(results)):
        mel_bias, sim_bias = res.get()
        mel_biases.ix[:, i] = mel_bias
        sim_biases.ix[:, i] = sim_bias
comb = tuple( set(comb) | set(all_changes[target_gene].keys())) comb = comb + ('const', ) if 'bcdP' in comb and 'bcdP2' not in comb: comb = comb + ('bcdP2', ) X_tmp = (atlas_expr.ix[in_central, comb, time_point].T.copy().dropna( how='all', axis=1)) comb = tuple(X_tmp.columns) if comb in pool: continue pool[comb] = p.apply_async(fit_model, (X_tmp, Y_tmp, co)) outs = {} pr2 = pd.Series(index=pool.keys(), data=np.nan) llrs = pd.Series(index=pool.keys(), data=np.nan) for comb in pbar()(pool): outs[comb] = pool[comb].get() pr2[comb] = outs[comb].prsquared llrs[comb] = outs[comb].llr best_tfs = pr2.sort_values().index[-1] best_model = outs[best_tfs] print(best_model.summary().as_text()) best_X = atlas_expr.ix[:, best_tfs, time_point].T small_atlas['in_central'] = in_central small_atlas['color'] = [ 'b' if not ic else 'k' if yy > co else 'w' for ic, yy in zip(small_atlas.in_central, small_atlas.c) ] for tf in best_tfs:
def dicomloaddir(files, filenamepattern='*.dcm', maxtoread=None, phasemode=None,
                 desiredinplansize=None, dformat='float'):
    '''
    dicomloaddir(files, filenamepattern='*.dcm', maxtoread=None, phasemode=None,
        desiredinplansize=None, dformat='float'):

    Load multiple dicom files from one or several directories.

    Input:
        <files>: can be:
            (1) a string of a directory
            (2) a list of (1)
            (3) a rzpath object
            (4) a list of (3) objects
        <filenamepattern>: str, the wildcard for the dicom files in a directory
        <maxtoread>: int, maximum number of dicom files to read
        <phasemode>: ...implement later, ignore for now...
        <desiredinplansize>: ...implement later, ignore for now..., a 1x2
            array, desired in-plane size; if dicom files do not follow this
            size, we resize them.
        <dformat>: ...implement later, ignore for now..., read-in data format
    Output:
        <vollist>: a list of volume arrays for multiple runs; if just one run,
            we return the array
        <dicominfolist>: a list of dicom info dicts for multiple runs; if just
            one run, we return the dicom info dict
    Raises:
        ValueError: if the acquisition matrix text or the FOV text of a run
            cannot be parsed, or if a mosaic run carries no
            number-of-images-in-mosaic tag.

    Note:
        1. This function currently works with Siemens Prisma 3T and Magnetom
            7T, not sure about other scanners like GE. For Siemens, we focus
            on these attributes (can update this):
            (0018, 0050) Slice Thickness
            (0028, 0030) Pixel Spacing
            (0051, 100b) AcquisitionMatrixText
            (0019, 100a) NumberOfImagesInMosaic, 1 if anatomical data
            (0018, 0080) Repetition Time (TR)
            (0018, 0081) Echo Time (TE)
            (0018, 1312) Inplane Phase Encoding Direction
            (0019, 1029) MosaicRefAcqTimes (slicetimeorder), None if
                anatomical data
            (0051, 1016) a str, check mosaic, read from the dicom file
            (0051, 100c) FOV
            We also add keys:
            'ismosaic': boolean, whether this is a mosaic image
            'voxelsize': 1x3 list, based on Slice Thickness and Pixel Spacing
            'AcquisitionMatrix': [phase, frequency] matrix, derived from
                AcquisitionMatrixText. Phase step has no meaning if data is
                structural?
            'FovSize': [phase_len, frequency_len] mm, derived from FOV
            'epireadouttime': calculated from rz.mri.dicom_readout_msec, only
                valid for epi, None for other files
        2. Note that all these keys are scanner specific. Most of these should
            work for Siemens scanners but might not work for GE or Philips
            scanners.
        3. A directory with no matching files is skipped; the remaining
            directories are still loaded.

    Todo:
        1. figure out how to read phase data
        2. check if some of the fields do not exist
        3. resize image to accommodate desired in-plane size
        4. save all metafiles using pickle

    History:
        20180720 <files> now can accept path-like objects
        20180626 RZ fixed the bug for reading the anatomical files
        20180605 RZ use nibabel.nicom.csareader.get_csa_header() function to
            read csa file and get the [BandWidthPerPixelPhaseEncode]
        20180422 RZ change the stack images in the last step so user can see
            report while waiting for image stack
        20180420 RZ created this function
    '''
    from pydicom import dcmread
    from RZutilpy.rzio import matchfiles
    from RZutilpy.array import split2d
    from RZutilpy.mri import dicom_readout_msec
    from RZutilpy.system import rzpath
    from numpy import stack
    from progressbar import progressbar as pbar
    import re

    # private (group, element) tags read below
    tag_fov = (0x0051, 0x100c)
    tag_checkmosaic = (0x0051, 0x1016)
    tag_nimages_siemens = (0x0019, 0x100a)
    tag_nimages_ge = (0x0021, 0x104f)
    tag_mosaic_acqtimes = (0x0019, 0x1029)

    # compiled once instead of once per directory
    # note this regular expression might fail in normal resolution imaging
    acqmatrix_re = re.compile(r'^(\d{1,4}).?\*(\d{1,4}).?$')
    fov_re = re.compile(r'^FoV (\d{1,6})\*(\d{1,6})$')

    # deal with input
    files = [files] if not isinstance(files, list) else files
    # convert it to path-like object
    files = [rzpath(p) if not isinstance(p, rzpath) else p for p in files]

    # start to load
    dicominfolist = []
    vollist = []
    for filedir in files:  # loop directory
        filepattern = filedir / filenamepattern
        dcmnames = matchfiles(filepattern.str)
        if len(dcmnames) == 0:
            print(
                'This {} does not appear to be a directory containing {} files, so skipping.\n'
                .format(filedir, filenamepattern))
            # skip only this directory; `break` used to abandon every
            # remaining directory as well, contradicting the message
            continue
        print(
            'This {} appear to be a directory containing {} files, so loading.\n'
            .format(filedir, filenamepattern))
        dcmnames = dcmnames[:maxtoread]  # remove last couple of dcm files

        # ====== deal with dicom info, save a customized dicominfo dict =======
        ds = dcmread(dcmnames[0].str)  # read 1st vol for info purpose
        # note currently we assume this dicom has all fields below!! And we
        # save the very raw dicom info here
        dcminfothisrun = dict()
        dcminfothisrun['SliceThickness'] = ds.SliceThickness
        dcminfothisrun['PixelSpacing'] = ds.PixelSpacing
        dcminfothisrun['AcquisitionMatrixText'] = ds.AcquisitionMatrixText
        dcminfothisrun['RepetitionTime'] = ds.RepetitionTime
        dcminfothisrun['EchoTime'] = ds.EchoTime
        dcminfothisrun[
            'InPlanePhaseEncodingDirection'] = ds.InPlanePhaseEncodingDirection
        dcminfothisrun['FOV'] = ds[tag_fov].value
        dcminfothisrun['checkmosaic'] = ds[tag_checkmosaic].value

        # figure out whether it is mosaic image
        ismosaic = dcminfothisrun['checkmosaic'].find('MOSAIC') >= 0
        dcminfothisrun['ismosaic'] = ismosaic  # True indicates an epi file
        if ismosaic:
            print('We are loading some mosaic images, need to convert a '
                  'mosaic image to 3d, this directory might contain epi '
                  'data ...\n')

        if not ismosaic:
            dcminfothisrun['NumberOfImagesInMosaic'] = 1
        elif tag_nimages_siemens in ds:  # Siemens
            dcminfothisrun['NumberOfImagesInMosaic'] = ds[
                tag_nimages_siemens].value
        elif tag_nimages_ge in ds:  # GE
            dcminfothisrun['NumberOfImagesInMosaic'] = ds[tag_nimages_ge].value
        else:
            # fail here with a clear message instead of a KeyError later
            raise ValueError(
                'can not find the number of images in the mosaic!')

        dcminfothisrun['MosaicRefAcqTimes'] = ds[
            tag_mosaic_acqtimes].value if ismosaic else None
        dcminfothisrun['epireadouttime'] = dicom_readout_msec(
            ds)[0] if ismosaic else None

        # save voxel size
        dcminfothisrun['voxelsize'] = list(dcminfothisrun['PixelSpacing']) + [
            dcminfothisrun['SliceThickness']
        ]

        # figure out inplane matrix
        matchgroup = acqmatrix_re.match(dcminfothisrun['AcquisitionMatrixText'])
        if not matchgroup:
            # the exception used to be constructed but never raised, which
            # left plines/flines undefined for the code below
            raise ValueError('can not find the phase encoding direction!')
        plines = int(matchgroup.group(1))  # step in phase encoding direction
        flines = int(matchgroup.group(2))  # step in frequency encoding direction
        dcminfothisrun['AcquisitionMatrix'] = [plines, flines]

        # figure out the field of view
        matchgroup = fov_re.match(dcminfothisrun['FOV'])
        if not matchgroup:
            raise ValueError('can not parse the field of view: {!r}'.format(
                dcminfothisrun['FOV']))
        p_len = int(matchgroup.group(1))  # length in phase encoding direction
        f_len = int(matchgroup.group(2))  # length in frequency encoding direction
        # have to divide this number by 10 for epidata, not sure why....
        dcminfothisrun['FovSize'] = [
            p_len / 10, f_len / 10
        ] if ismosaic else [p_len, f_len]

        # save dicom info in this run
        dicominfolist.append(dcminfothisrun)
        # show some information
        print(dcminfothisrun)

        # ================ deal with the volumes ====================
        print('\nReading in dicoms ......')
        vol = [dcmread(i.str).pixel_array
               for i in pbar(dcmnames)]  # read pixel data

        # split mosaic images
        if ismosaic:
            # Note that we assume plines and flines will be exact divided by
            # the image, this is typically true
            vol = [split2d(i, plines, flines)
                   for i in vol]  # split each 2d mosaic image to 3d image
            # only keep acquired slices, the last several images are
            # sometimes black
            vol = [
                i[:, :, :dcminfothisrun['NumberOfImagesInMosaic']] for i in vol
            ]

        # stack images, take a while
        print('\n\nStack images ......\n')
        vol = stack(vol, axis=-1)  # stack to a 3d/4d file
        if vol.ndim == 3:  # expand to 4d if only 3d
            vol = vol[..., None]
        vollist.append(vol)

        # report info
        print(
            'The 3D dimensions of the final returned volume are {}.\n'.format(
                vol.shape[:3]))
        print('There are {} volumes in the fourth dimension.\n'.format(
            vol.shape[-1]))
        if ismosaic:
            print('These are mosaic images, might be epi data.\n')
        else:
            print('These are not mosaic images, might not be epi data.\n')
        print('The voxel size (mm) of the final returned volume is {}.\n'.format(
            dcminfothisrun['voxelsize']))
        print('The in-plane matrix size (PE x FE) appears to be {}.\n'.format(
            dcminfothisrun['AcquisitionMatrix']))
        print('The field-of-view (mm) of the final returned volume is {}.\n'.format(
            dcminfothisrun['FovSize']))
        print('The TR is {} ms.\n\n\n\n'.format(
            dcminfothisrun['RepetitionTime']))

    # a single run is returned bare rather than wrapped in a list
    if len(vollist) == 1:
        vollist = vollist[0]
    if len(dicominfolist) == 1:
        dicominfolist = dicominfolist[0]

    return vollist, dicominfolist