def radiometry_cy_ims(cy_ims, locs, reg_psf_samples, peak_mea):
    """
    Compute radiometry on the stack of cycle images for one field on one channel

    Returns:
        output_radmat: ndarray(n_peaks, n_cycles, (sig, noi, bg_med, bg_std))
    """
    with context(
        cy_ims=cy_ims, locs=locs, reg_psf_samples=reg_psf_samples, peak_mea=peak_mea
    ) as ctx:
        check.array_t(locs, ndim=2, dtype=np.float64)
        n_peaks = locs.shape[0]
        if n_peaks > 0:
            batches = zap.make_batch_slices(n_rows=locs.shape[0], _batch_size=100)
            with zap.Context(trap_exceptions=False, mode="thread"):
                zap.work_orders(
                    [
                        dict(
                            fn=_do_radiometry_field_stack_peak_batch,
                            ctx=ctx,
                            peak_start_i=batch[0],
                            peak_stop_i=batch[1],
                        )
                        for batch in batches
                    ]
                )

        return ctx._out_radiometry

def _run(radmat, radmat_filter_mask, dyemat, dyepeps, n_channels):
    with c_nn_v2.context(
        train_dyemat=dyemat,
        train_dyepeps=dyepeps,
        radmat=radmat,
        radmat_filter_mask=radmat_filter_mask,
        priors=params.priors,
        n_channels=n_channels,
        n_neighbors=params.n_neighbors,
        run_row_k_fit=params.run_row_k_fit,
        run_against_all_dyetracks=params.run_against_all_dyetracks,
        scoring_verbose=params.scoring_verbose,
        scoring_verbose_cc=params.scoring_verbose_cc,
        row_k_score_factor=params.row_k_score_factor,
    ) as nn_v2_context:
        # _nn_v2.c chokes if a batch is larger than 1024*16
        batches = zap.make_batch_slices(n_rows=radmat.shape[0], _batch_size=_batch_size)

        # This must be thread mode because it operates on the context in shared memory.
        with zap.Context(mode="thread", trap_exceptions=False, progress=progress):
            zap.work_orders(
                [
                    dict(
                        fn=c_nn_v2.do_classify_radrows,
                        radrow_start_i=batch[0],
                        n_radrows=batch[1] - batch[0],
                        nn_v2_context=nn_v2_context,
                    )
                    for batch in batches
                ]
            )

        return nn_v2_context

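# Hedged sketch (not part of the source): radiometry_cy_ims and _run above share
# one pattern -- zap.make_batch_slices to cut row ranges, then thread-mode
# zap.work_orders so every worker mutates a shared in-memory context/array.
# The toy worker, arrays, and batch size below are illustrative assumptions;
# `zap` is assumed to be the same module imported by the surrounding code
# (its import path is not shown in this file).
import numpy as np

shared_in = np.arange(10_000, dtype=np.float64)
shared_out = np.zeros_like(shared_in)

def _do_square_batch(start_i, stop_i):
    # Thread mode lets every worker write into the same shared_out in place.
    shared_out[start_i:stop_i] = shared_in[start_i:stop_i] ** 2

example_batches = zap.make_batch_slices(n_rows=shared_in.shape[0], _batch_size=1024)
with zap.Context(mode="thread", trap_exceptions=False):
    zap.work_orders(
        [dict(fn=_do_square_batch, start_i=b[0], stop_i=b[1]) for b in example_batches]
    )
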
def it_bubbles_exceptions():
    with zest.mock(zap._show_work_order_exception) as m_ex:
        with zest.raises(ValueError):
            work_orders[0].fn = test2
            zap.work_orders(
                work_orders,
                _process_mode=True,
                _trap_exceptions=False,
            )
    assert m_ex.called_once()

def it_calls_progress():
    progress = MockFunction()
    work_orders[0].fn = test2
    zap.work_orders(
        work_orders,
        _debug_mode=True,
        _progress=progress,
    )
    assert progress.calls == [
        ((1, 2, False), {}),
        ((2, 2, False), {}),
    ]

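# Hedged sketch: the asserted calls above imply the progress callback receives
# (n_complete, n_total, retry) positionally. A minimal callback compatible with
# that signature might look like this (the parameter names and the print target
# are inferred, not taken from the source).
def print_progress(n_complete, n_total, retry):
    suffix = " (retry)" if retry else ""
    print(f"{n_complete}/{n_total}{suffix}")

# e.g. zap.work_orders(work_orders, _progress=print_progress)
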
def classify(self, test_X, keep_all_class_scores, progress=None):
    # TASK: There's some work to be done here to optimize the size
    # of this split to dial the memory usage
    n_rows = test_X.shape[0]
    if n_rows < 100:
        pred_y, scores, all_class_scores = _do_predict(
            classifier=self.classifier, X=test_X
        )
    else:
        n_work_orders = n_rows // 100

        results = zap.work_orders(
            [
                Munch(classifier=self.classifier, X=X, fn=_do_predict)
                for X in np.array_split(test_X, n_work_orders, axis=0)
            ],
            _trap_exceptions=False,
            _progress=progress,
        )

        pred_y = utils.listi(results, 0)
        scores = utils.listi(results, 1)
        all_class_scores = utils.listi(results, 2)
        pred_y = np.concatenate(pred_y)
        scores = np.concatenate(scores)
        if keep_all_class_scores:
            all_class_scores = np.concatenate(all_class_scores)

    if not keep_all_class_scores:
        all_class_scores = None

    return pred_y, scores, all_class_scores

def classify(self, X, progress=None):
    check.array_t(X, ndim=2)
    n_rows = X.shape[0]
    if n_rows < 100:
        winner_y, winner_scores, runnerup_y, runnerup_scores = _do_predict(
            classifier=self.classifier, X=X
        )
    else:
        n_work_orders = n_rows // 100

        with zap.Context(progress=progress, trap_exceptions=False):
            results = zap.work_orders(
                [
                    Munch(classifier=self.classifier, X=X, fn=_do_predict)
                    for X in np.array_split(X, n_work_orders, axis=0)
                ]
            )

        winner_y = utils.listi(results, 0)
        winner_scores = utils.listi(results, 1)
        runnerup_y = utils.listi(results, 2)
        runnerup_scores = utils.listi(results, 3)
        winner_y = np.concatenate(winner_y)
        winner_scores = np.concatenate(winner_scores)
        runnerup_y = np.concatenate(runnerup_y)
        runnerup_scores = np.concatenate(runnerup_scores)

    return winner_y, winner_scores, runnerup_y, runnerup_scores

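# Hedged sketch: both classify() variants above split the rows into ~100-row
# chunks, predict each chunk independently, and np.concatenate the pieces back
# into full-length outputs. The shape bookkeeping can be checked with plain
# numpy and a stand-in for the per-chunk prediction (no classifier needed):
import numpy as np

toy_X = np.random.rand(350, 8)
n_chunks = toy_X.shape[0] // 100                     # 3 chunks for 350 rows
chunks = np.array_split(toy_X, n_chunks, axis=0)
chunk_preds = [c.sum(axis=1) for c in chunks]        # stand-in for _do_predict per chunk
assert np.concatenate(chunk_preds).shape[0] == toy_X.shape[0]
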
def it_traps_exceptions():
    work_orders[0].fn = test2
    results = zap.work_orders(
        work_orders,
        _process_mode=True,
        _trap_exceptions=True,
    )
    assert isinstance(results[0], ValueError)
    assert results[1] == 3 + 4 + 5

def it_retries():
    progress = MockFunction()
    with zest.mock(zap._mock_BrokenProcessPool_exception) as m:
        m.exceptions(BrokenProcessPool)
        results = zap.work_orders(work_orders, _process_mode=True, _progress=progress)
    assert progress.calls == [
        ((1, 2, True), {}),
        ((2, 2, True), {}),
    ]

def sigproc(sigproc_params, ims_import_result, progress=None):
    # CACHE n_channel, n_cycles, dim into sigproc_params by loading one field
    ims = ims_import_result.field_chcy_ims(field_i=0)
    n_inchannels, n_cycles, h, w = ims.shape
    assert h == w
    n_outchannels = sigproc_params.n_output_channels
    sigproc_params._outchannels_inchannels_cycles_dim = (
        n_outchannels,
        n_inchannels,
        n_cycles,
        h,
    )

    if not sigproc_params.channel_indices_for_alignment:
        sigproc_params.channel_indices_for_alignment = list(range(n_inchannels))

    sigproc_result = SigprocV1Result(
        params=sigproc_params,
        n_input_channels=n_inchannels,
        n_channels=n_outchannels,
        n_cycles=n_cycles,
    )

    n_fields = ims_import_result.n_fields
    n_fields_limit = sigproc_params.n_fields_limit
    if n_fields_limit is not None and n_fields_limit < n_fields:
        n_fields = n_fields_limit

    # TASK: I think this would be nicer with the parallel array map
    results = zap.work_orders(
        [
            Munch(
                fn=_do_field,
                field_i=field_i,
                sigproc_params=sigproc_params,
                ims_import_result=ims_import_result,
                sigproc_result=sigproc_result,
            )
            for field_i in range(n_fields)
        ],
        _process_mode=True,
        _trap_exceptions=False,
        _progress=progress,
    )

    # SET the result n_channels (possibly different from input n_channels)
    n_inchannels = np.array(results)
    assert np.all(n_inchannels == n_inchannels[0])
    sigproc_result.n_channels = int(n_inchannels[0])

    return sigproc_result

def sigproc(sigproc_params, ims_import_result, progress=None):
    """
    Analyze all fields
    """
    calib = Calibration(sigproc_params.calibration)
    assert not calib.is_empty()

    channel_weights = _compute_channel_weights(sigproc_params)

    sigproc_result = SigprocV2Result(
        params=sigproc_params,
        n_input_channels=ims_import_result.n_channels,
        n_channels=sigproc_params.n_output_channels,
        n_cycles=ims_import_result.n_cycles,
        channel_weights=channel_weights,
    )

    n_fields = ims_import_result.n_fields
    n_fields_limit = sigproc_params.n_fields_limit
    if n_fields_limit is not None and n_fields_limit < n_fields:
        n_fields = n_fields_limit

    zap.work_orders(
        [
            Munch(
                fn=_do_sigproc_field,
                ims_import_result=ims_import_result,
                sigproc_params=sigproc_params,
                field_i=field_i,
                sigproc_result=sigproc_result,
            )
            for field_i in range(n_fields)
        ],
        _trap_exceptions=False,
        _progress=progress,
    )

    return sigproc_result

def all_dfs(self, fn, parallel=False):
    """
    Run fn on every run, assert that each returns a DataFrame, and then
    pd.concat all the results into one DataFrame, adding run_i and
    run_name columns.

    Example:
        df = job.all_dfs(lambda run: run.prep.pros())
    """
    df_list = []
    if parallel:

        def wrap_fn(run, run_i):
            res_df = fn(run)
            assert isinstance(res_df, pd.DataFrame)
            res_df["run_i"] = run_i
            res_df["run_name"] = run.manifest.run_name
            return res_df

        work_orders = [
            {"fn": wrap_fn, "args": [run, run_i]}
            for run_i, run in enumerate(self._run_results.values())
        ]

        # TODO: it would be nice to integrate this progress stuff into zap as an optional argument
        progress = tqdm(total=len(work_orders))

        def progress_callback(i, j, retry):
            if not retry:
                progress.update()

        with zap.Context(trap_exceptions=False, progress=progress_callback):
            df_list = zap.work_orders(work_orders)

        progress.close()
    else:
        for run_i, run in enumerate(self._run_results.values()):
            res_df = fn(run)
            assert isinstance(res_df, pd.DataFrame)
            res_df["run_i"] = run_i
            res_df["run_name"] = run.manifest.run_name
            df_list += [res_df]

    return pd.concat(df_list).reset_index(drop=True)

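# Hedged usage sketch: the docstring example run in parallel, which exercises
# the tqdm bar and the zap.Context path above. `job` and run.prep.pros() are
# taken from the docstring example, not defined here.
# df = job.all_dfs(lambda run: run.prep.pros(), parallel=True)
# df[["run_i", "run_name"]].drop_duplicates()  # one row per run
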
def pmap_runstore(fn, work_orders, _clear_cache=False, **kws):
    """
    Parallel run fn over the work_orders.

    Arguments:
        work_orders: a list of dicts. Each work_order dict MUST contain
            'run', 'key', and 'args' entries; they are used to update the
            appropriate run's store under that key.
    """
    work_orders = [
        dict(
            **wo,
            fn=_do_store_get_cache_or_execute,
            inner_fn=fn,
            _clear_cache=_clear_cache,
        )
        for wo in work_orders
    ]
    p = zap.work_orders(work_orders, **kws)

    # UPDATE stores. This is done in the master process to avoid sync issues
    for wo, result in zip(work_orders, p.results):
        from_cache, result = result
        if not from_cache:
            wo["run"].store[wo["key"]] = result

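# Hedged usage sketch: per the docstring above, each work order names the run,
# the store key to fill, and the args passed to fn. `runs` and compute_stats
# are illustrative assumptions, not names from this codebase.
# pmap_runstore(
#     compute_stats,
#     [dict(run=run, key="stats", args=[run]) for run in runs],
#     _clear_cache=False,
# )
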
def it_runs_serially():
    results = zap.work_orders(work_orders, _process_mode=True)
    assert results[0] == 1 + 2 + 3
    assert results[1] == 3 + 4 + 5

def pr_curve_by_pep(self, return_auc=False, pep_iz=None, force_compute=False, progress=None):
    """
    Obtain pr_curves for every peptide.

    If all params are default, this may return cached information computed
    during the run.

    Returns:
        A (potentially HUGE) df of every P/R for every peptide
        A smaller df with just the pep_i and the Area-Under-Curve

    This uses the work_order system (as opposed to the higher-level
    array_split_map()) because _do_pep_pr_curve returns three same-length
    arrays AND one scalar; array_split_map() doesn't like that.
    """
    # The PR for all peptides is computed during the run (no auc).
    if not return_auc and not force_compute and self._cached_pr is not None:
        df = self._cached_pr
        if pep_iz is not None:
            df = df[df.pep_i.isin(pep_iz)]
        return df.copy()

    if pep_iz is None:
        pep_iz = self._prep_result.peps().pep_i.values

    if isinstance(pep_iz, np.ndarray):
        pep_iz = pep_iz.tolist()

    check.list_t(pep_iz, int)

    with zap.Context(mode="thread", trap_exceptions=False, progress=progress):
        results = zap.work_orders(
            [
                Munch(
                    fn=_do_pep_pr_curve,
                    pep_i=pep_i,
                    bag=self,
                )
                for pep_i in pep_iz
            ]
        )

    df_per_pep = [
        pd.DataFrame(
            dict(
                pep_i=np.repeat(np.array([pep_i]), prec.shape[0]),
                prec=prec,
                recall=recall,
                score=score,
            )
        )
        for pep_i, (prec, recall, score, _) in results
    ]

    if len(df_per_pep) > 0:
        pr_df = pd.concat(df_per_pep, axis=0)
    else:
        pr_df = None

    auc_df = pd.DataFrame(
        [(pep_i, auc) for pep_i, (_, _, _, auc) in results],
        columns=["pep_i", "auc"],
    )

    if return_auc:
        return pr_df, auc_df
    else:
        return pr_df

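# Hedged usage sketch: `bag` stands for whatever object exposes pr_curve_by_pep
# (an assumption); the pep_iz values are illustrative.
# pr_df = bag.pr_curve_by_pep(pep_iz=[1, 2, 3])
# pr_df, auc_df = bag.pr_curve_by_pep(return_auc=True, force_compute=True)
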
def ims_import(src_dir: Path, ims_import_params: ImsImportParams, progress=None, pipeline=None):
    reference_nd2_file_for_metadata = None

    scan_result = _scan_files(src_dir)
    if len(scan_result.nd2_paths) > 0:
        reference_nd2_file_for_metadata = scan_result.nd2_paths[0]

    target_mea = max(scan_result.dim[0], scan_result.dim[1])
    if not utils.is_power_of_2(target_mea):
        new_dim = utils.next_power_of_2(target_mea)
        _convert_message(target_mea, new_dim)
        target_mea = new_dim

    def clamp_fields(n_fields_true: int) -> Tuple[int, int]:
        n_fields = n_fields_true
        n_fields_limit = ims_import_params.get("n_fields_limit")
        if n_fields_limit is not None:
            n_fields = n_fields_limit
        start_field = ims_import_params.get("start_field", 0)
        if start_field + n_fields > n_fields_true:
            n_fields = n_fields_true - start_field
        return start_field, n_fields

    def clamp_cycles(n_cycles_true: int) -> Tuple[int, int]:
        n_cycles = n_cycles_true
        n_cycles_limit = ims_import_params.get("n_cycles_limit")
        if n_cycles_limit is not None:
            n_cycles = n_cycles_limit
        start_cycle = ims_import_params.get("start_cycle", 0)
        if start_cycle is None:
            start_cycle = 0
        if start_cycle + n_cycles > n_cycles_true:
            n_cycles = n_cycles_true - start_cycle
        return start_cycle, n_cycles

    tsv_data = tsv.load_tsv_for_folder(src_dir)

    # ALLOCATE the ImsImportResult
    ims_import_result = ImsImportResult(params=ims_import_params, tsv_data=Munch(tsv_data))

    dst_ch_i_to_src_ch_i = ims_import_params.dst_ch_i_to_src_ch_i
    if dst_ch_i_to_src_ch_i is None:
        dst_ch_i_to_src_ch_i = [i for i in range(scan_result.n_channels)]
    n_out_channels = len(dst_ch_i_to_src_ch_i)

    # Sanity check that we didn't end up with any src_channels outside of the channel range
    assert all([
        0 <= src_ch_i < scan_result.n_channels for src_ch_i in dst_ch_i_to_src_ch_i
    ])

    if ims_import_params.is_z_stack_single_file:
        field_iz, n_cycles_found = _z_stack_import(
            scan_result.nd2_paths[0],
            target_mea,
            ims_import_result,
            dst_ch_i_to_src_ch_i,
            ims_import_params.z_stack_n_slices_per_field,
        )
        n_cycles = ims_import_params.z_stack_n_slices_per_field

    elif ims_import_params.is_movie:
        if scan_result.mode == ScanFileMode.nd2:
            # "Movie mode" means that there aren't any chemical cycles, but rather
            # we are using "cycles" to represent different images in a zstack
            start_field, n_fields = clamp_fields(len(scan_result.nd2_paths))

            # In movie mode, the n_fields from the .nd2 file is becoming n_cycles
            scan_result.n_cycles = scan_result.n_fields
            start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)

            with zap.Context(progress=progress):
                field_iz, n_cycles_found = zap.arrays(
                    _do_movie_import_nd2,
                    dict(
                        input_field_i=list(range(start_field, start_field + n_fields)),
                        output_field_i=list(range(n_fields)),
                    ),
                    _stack=True,
                    scan_result=scan_result,
                    start_cycle=start_cycle,
                    n_cycles=n_cycles,
                    target_mea=target_mea,
                    import_result=ims_import_result,
                    dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
                )
        elif scan_result.mode == ScanFileMode.npy:
            start_field, n_fields = clamp_fields(scan_result.n_fields)
            start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)
            with zap.Context(progress=progress):
                field_iz, n_cycles_found = zap.arrays(
                    _do_movie_import_npy,
                    dict(
                        input_field_i=list(range(start_field, start_field + n_fields)),
                        output_field_i=list(range(n_fields)),
                    ),
                    _stack=True,
                    scan_result=scan_result,
                    start_cycle=start_cycle,
                    n_cycles=n_cycles,
                    target_mea=target_mea,
                    import_result=ims_import_result,
                    dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
                )
        else:
            raise NotImplementedError()
    else:
        start_field, n_fields = clamp_fields(scan_result.n_fields)

        if pipeline:
            pipeline.set_phase(0, 2)

        if scan_result.mode == ScanFileMode.nd2:
            scan_result.n_cycles = len(scan_result.nd2_paths)

            # SCATTER
            with zap.Context(mode="thread", progress=progress):
                zap.arrays(
                    _do_nd2_scatter,
                    dict(
                        cycle_i=list(range(len(scan_result.nd2_paths))),
                        src_path=scan_result.nd2_paths,
                    ),
                    _stack=True,
                    start_field=start_field,
                    n_fields=n_fields,
                    n_channels=scan_result.n_channels,
                    target_mea=target_mea,
                )
        elif scan_result.mode == ScanFileMode.tif:
            # SCATTER
            work_orders = [
                Munch(field_i=k[0], channel_i=k[1], cycle_i=k[2], path=path)
                for k, path in scan_result.tif_paths_by_field_channel_cycle.items()
            ]
            with zap.Context(trap_exceptions=False):
                results = zap.work_orders(_do_tif_scatter, work_orders)

            # CHECK that every file exists
            for f in range(n_fields):
                for ch in range(scan_result.n_channels):
                    for cy in range(scan_result.n_cycles):
                        expected = f"__{f:03d}-{ch:02d}-{cy:02d}.npy"
                        if expected not in results:
                            raise FileNotFoundError(
                                f"File is missing in tif pattern: {expected}"
                            )
        elif scan_result.mode == ScanFileMode.npy:
            # In npy mode there's no scatter as the files are already fully scattered
            pass
        else:
            raise ValueError(f"Unknown im import mode {scan_result.mode}")

        if pipeline:
            pipeline.set_phase(1, 2)

        # GATHER
        start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)
        with zap.Context(progress=progress):
            field_iz = zap.arrays(
                _do_gather,
                dict(
                    input_field_i=list(range(start_field, start_field + n_fields)),
                    output_field_i=list(range(0, n_fields)),
                ),
                _stack=True,
                start_cycle=start_cycle,
                n_cycles=n_cycles,
                dim=target_mea,
                import_result=ims_import_result,
                mode=scan_result.mode,
                npy_paths_by_field_channel_cycle=scan_result.npy_paths_by_field_channel_cycle,
                dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
            )

    if reference_nd2_file_for_metadata:
        with _nd2(reference_nd2_file_for_metadata) as nd2:
            if hasattr(nd2, "metadata"):
                full = Munch(
                    metadata=nd2.metadata,
                    metadata_seq=nd2.metadata_seq,
                )
                ims_import_result._nd2_metadata_full = full

                def me(block_name, default=None):
                    return utils.block_search(full.metadata.SLxExperiment, block_name, default)

                def mp(block_name, default=None):
                    return utils.block_search(
                        full.metadata_seq.SLxPictureMetadata, block_name, default
                    )

                n_channels = mp("sPicturePlanes.uiSampleCount", 1)

                ims_import_result._nd2_metadata = Munch(
                    calibrated_pixel_size=mp("dCalibration"),
                    experiment_type="movie" if me("eType") == 1 else "edman",
                    n_cycles=me("uLoopPars.uiCount"),
                    cmd_before=me("wsCommandBeforeCapture"),
                    cmd_after=me("wsCommandAfterCapture"),
                    n_channels=n_channels,
                )

                per_channel = []
                for ch_i in range(n_channels):
                    laser_wavelength = None
                    laser_power = None
                    n_lasers = mp(
                        f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_uiMultiLaserLines0",
                        0,
                    )
                    for i in range(n_lasers):
                        is_used = mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_bMultiLaserLineUsed0-{i:02d}",
                            0,
                        )
                        if is_used == 1:
                            laser_wavelength = mp(
                                f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_uiMultiLaserLineWavelength0-{i:02d}",
                                0,
                            )
                            laser_power = mp(
                                f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dMultiLaserLinePower0-{i:02d}",
                                0,
                            )

                    ch_munch = Munch(
                        laser_wavelength=laser_wavelength,
                        laser_power=laser_power,
                        camera_name=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.CameraUniqueName"),
                        sensor_pixels_x=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorPixels.cx"),
                        sensor_pixels_y=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorPixels.cy"),
                        sensor_microns_x=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorMicrons.cx"),
                        sensor_microns_y=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorMicrons.cy"),
                        bin_x=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.dBinningX"),
                        bin_y=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.dBinningY"),
                        format=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.wszFormatDesc"),
                        roi_l=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.left"),
                        roi_r=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.right"),
                        roi_t=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.top"),
                        roi_b=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.bottom"),
                        averaging=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.PropertiesQuality.Average"),
                        integration=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.PropertiesQuality.Integrate"),
                        name=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.Metadata.Channels.Channel_0.Name"),
                        dichroic_filter=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_sFilterName0"),
                        emission_filter=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_sFilterName1"),
                        optivar=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dZoomPosition"),
                        tirf_focus=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dTIRFPositionFocus"),
                        tirf_align_x=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dTIRFPositionX"),
                        tirf_align_y=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dTIRFPositionY"),
                        objective_mag=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pObjectiveSetting.dObjectiveMag"),
                        objective_na=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pObjectiveSetting.dObjectiveNA"),
                        objective_refractive_index=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.pObjectiveSetting.dRefractIndex"),
                        settings_name=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.sOpticalConfigs.\x02.sOpticalConfigName"),
                        readout_mode=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Readout Mode"),
                        readout_rate=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Readout Rate"),
                        noise_filter=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Noise Filter"),
                        temperature=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Temperature"),
                        exposure=mp(f"sPicturePlanes.sSampleSetting.a{ch_i}.dExposureTime"),
                    )
                    per_channel += [ch_munch]

                ims_import_result._nd2_metadata.update(**Munch(per_channel=per_channel))

                if me("eType") == 1:
                    # Movie mode
                    ims_import_result._nd2_metadata.update(**Munch(
                        movie_start=me("dStart"),
                        movie_period=me("dPeriod"),
                        movie_duration=me("dDuration"),
                        movie_duration_pref=me("bDurationPref"),
                        movie_max_period_diff=me("dMaxPeriodDiff"),
                        movie_min_period_diff=me("dMinPeriodDiff"),
                        movie_avg_period_diff=me("dAvgPeriodDiff"),
                    ))

    ims_import_result.n_fields = len(field_iz)
    ims_import_result.n_channels = n_out_channels
    ims_import_result.n_cycles = n_cycles
    ims_import_result.dim = target_mea
    ims_import_result.dtype = np.dtype(OUTPUT_NP_TYPE).name
    ims_import_result.src_dir = src_dir

    # CLEAN
    for file in local.cwd // "__*":
        file.delete()

    return ims_import_result

def _step_1_create_neighbors_lookup_multiprocess(dyemat, output_dt_mat):
    """
    The dyemat may have many duplicate rows, each from some number of peps.
    These duplicate rows are consolidated so that each coordinate in dyemat
    space is given a unique "dye_i".

    The unique (sorted) dyetracks are written to output_dt_mat, which is
    expected to be large enough to hold them.

    In this multiprocess version, all the cores are used to break the set
    into separate unique sets which are then combined. This tends to be at
    least twice as fast.

    Returns:
        dyetracks_df: DF(dye_i, weight). Where weight is the sum of all rows
            that pointed to this dyetrack
        dt_pep_sources_df: DF(dye_i, pep_i, n_rows). Records how many times
            each peptide generated dye_i where count > 0.
        dye_to_best_pep_df: DF mapping each dye_i to its best pep_i
        flann: A fast Approximate Nearest Neighbors lookup using PYFLANN.
        n_dts: Number of actual unique dts
    """
    check.array_t(dyemat, ndim=4)  # (n_peps, n_samples, n_channels, n_cycles): uint8

    # A multiprocess version of uniqueification.
    # The idea is to divide the list into blocks, unique each block,
    # then unique this much smaller combined set.
    # This is tricky because we have to keep track of the counts and inverse.

    n_peps, n_samples, n_channels, n_cycles = dyemat.shape
    true_pep_iz = np.repeat(np.arange(n_peps), n_samples)

    n_rows = n_peps * n_samples
    n_cols = n_channels * n_cycles
    flat_dyemat = dyemat.reshape((n_rows, n_cols))

    n_batches = _cpu_count()
    batch_size = max(1, (n_rows // n_batches) + 1)
    batch_slices = []
    for batch_i in range(n_batches):
        start = batch_i * batch_size
        stop = min((batch_i + 1) * batch_size, n_rows)
        if stop > start:
            batch_slices += [slice(start, stop)]

    result_batches = zap.work_orders(
        [
            Munch(fn=_do_batch_unique, rng=batch_slice, dyemat=flat_dyemat)
            for batch_slice in batch_slices
        ],
        _process_mode=True,
        _trap_exceptions=False,
    )

    # At this point result_batches holds the unique results from each batch
    # and now we need to merge them.

    # First we concatenate them all into a new array
    cat_dts = np.concatenate([batch[0] for batch in result_batches])

    # Stack all the true_dt_iz (which come from the inverse of unique),
    # but each of these has to be offset so it indexes into the
    # concatenated stack
    i = 0
    cat_true_dt_iz = []
    for batch in result_batches:
        true_dt_iz = batch[1]
        cat_true_dt_iz += [true_dt_iz + i]
        i += batch[0].shape[0]
    cat_true_dt_iz = np.concatenate(cat_true_dt_iz)

    # Stack all the counts
    cat_dt_counts = np.concatenate([batch[2] for batch in result_batches])

    # Unique across the concatenated per-batch uniques
    dt_mat, true_dt_iz, dt_counts = np.unique(
        cat_dts, return_inverse=True, return_counts=True, axis=0
    )

    dt_counts = np.array([
        cat_dt_counts[np.argwhere(true_dt_iz == i)].sum()
        for i in range(dt_mat.shape[0])
    ])

    true_dt_iz = true_dt_iz[cat_true_dt_iz]

    n_dts = dt_mat.shape[0]
    output_dt_mat[0:n_dts] = dt_mat.reshape((n_dts, n_channels, n_cycles))

    flann = _create_flann(dt_mat)

    dyetracks_df, dt_pep_sources_df, dye_to_best_pep_df = _setup_pep_source_dfs(
        true_dt_iz, true_pep_iz, dt_counts
    )

    return dyetracks_df, dt_pep_sources_df, dye_to_best_pep_df, flann, n_dts

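# Hedged sketch (pure numpy, no zap): the merge logic above in miniature.
# Each batch is uniqued independently, the per-batch uniques are concatenated,
# and a second np.unique merges them while the per-batch counts are summed.
# The toy rows and batch split are illustrative.
import numpy as np

rows = np.array([[0, 1], [0, 1], [2, 3], [2, 3], [2, 3], [4, 5]], dtype=np.uint8)
toy_batches = [rows[:3], rows[3:]]

uniq_per_batch = [np.unique(b, return_counts=True, axis=0) for b in toy_batches]
cat_rows = np.concatenate([u for u, _ in uniq_per_batch])
cat_counts = np.concatenate([c for _, c in uniq_per_batch])

merged, inverse = np.unique(cat_rows, return_inverse=True, axis=0)

# Sum the per-batch counts that map to the same merged row
merged_counts = np.zeros(merged.shape[0], dtype=int)
np.add.at(merged_counts, inverse, cat_counts)

assert merged_counts.tolist() == [2, 3, 1]  # [0,1] x2, [2,3] x3, [4,5] x1
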
def ims_import(src_dir, ims_import_params, progress=None, pipeline=None):
    (
        mode,
        nd2_paths,
        tif_paths_by_field_channel_cycle,
        npy_paths_by_field_channel_cycle,
        n_fields_true,
        n_channels,
        n_cycles_true,
        dim,
    ) = _scan_files(src_dir)

    target_dim = max(dim[0], dim[1])
    if not utils.is_power_of_2(target_dim):
        new_dim = utils.next_power_of_2(target_dim)
        _convert_message(target_dim, new_dim)
        target_dim = new_dim

    src_channels = list(range(n_channels))

    def clamp_fields(n_fields_true):
        n_fields = n_fields_true
        n_fields_limit = ims_import_params.get("n_fields_limit")
        if n_fields_limit is not None:
            n_fields = n_fields_limit
        start_field = ims_import_params.get("start_field", 0)
        if start_field + n_fields > n_fields_true:
            n_fields = n_fields_true - start_field
        return start_field, n_fields

    def clamp_cycles(n_cycles_true):
        n_cycles = n_cycles_true
        n_cycles_limit = ims_import_params.get("n_cycles_limit")
        if n_cycles_limit is not None:
            n_cycles = n_cycles_limit
        start_cycle = ims_import_params.get("start_cycle", 0)
        if start_cycle + n_cycles > n_cycles_true:
            n_cycles = n_cycles_true - start_cycle
        return start_cycle, n_cycles

    tsv_data = tsv.load_tsv_for_folder(src_dir)

    ims_import_result = ImsImportResult(params=ims_import_params, tsv_data=Munch(tsv_data))

    if ims_import_params.is_movie:
        start_field, n_fields = clamp_fields(len(nd2_paths))

        # In movie mode, the n_fields from the .nd2 file is becoming n_cycles
        n_cycles_true = n_fields_true
        start_cycle, n_cycles = clamp_cycles(n_cycles_true)

        field_iz, n_cycles_found = zap.arrays(
            _do_movie_import,
            dict(
                nd2_path=nd2_paths[start_field : start_field + n_fields],
                output_field_i=list(range(n_fields)),
            ),
            _process_mode=True,
            _progress=progress,
            _stack=True,
            start_cycle=start_cycle,
            n_cycles=n_cycles,
            target_dim=target_dim,
            nd2_import_result=ims_import_result,
        )
    else:
        start_field, n_fields = clamp_fields(n_fields_true)

        if pipeline:
            pipeline.set_phase(0, 2)

        if mode == "nd2":
            n_cycles_true = len(nd2_paths)

            # SCATTER
            zap.arrays(
                _do_nd2_scatter,
                dict(cycle_i=list(range(len(nd2_paths))), src_path=nd2_paths),
                _process_mode=True,
                _progress=progress,
                _stack=True,
                start_field=start_field,
                n_fields=n_fields,
                n_channels=n_channels,
                target_dim=target_dim,
            )
        elif mode == "tif":
            # SCATTER
            work_orders = [
                Munch(field_i=k[0], channel_i=k[1], cycle_i=k[2], path=path)
                for k, path in tif_paths_by_field_channel_cycle.items()
            ]
            results = zap.work_orders(_do_tif_scatter, work_orders, _trap_exceptions=False)

            # CHECK that every file exists
            for f in range(n_fields):
                for ch in range(n_channels):
                    for cy in range(n_cycles_true):
                        expected = f"__{f:03d}-{ch:02d}-{cy:02d}.npy"
                        if expected not in results:
                            raise FileNotFoundError(
                                f"File is missing in tif pattern: {expected}"
                            )
        elif mode == "npy":
            # In npy mode there's no scatter as the files are already fully scattered
            pass
        else:
            raise ValueError(f"Unknown im import mode {mode}")

        if pipeline:
            pipeline.set_phase(1, 2)

        # GATHER
        start_cycle, n_cycles = clamp_cycles(n_cycles_true)
        field_iz = zap.arrays(
            _do_gather,
            dict(
                input_field_i=list(range(start_field, start_field + n_fields)),
                output_field_i=list(range(0, n_fields)),
            ),
            _process_mode=True,
            _progress=progress,
            _stack=True,
            src_channels=src_channels,
            start_cycle=start_cycle,
            n_cycles=n_cycles,
            dim=target_dim,
            nd2_import_result=ims_import_result,
            mode=mode,
            npy_paths_by_field_channel_cycle=npy_paths_by_field_channel_cycle,
        )

    ims_import_result.n_fields = len(field_iz)
    ims_import_result.n_channels = n_channels
    ims_import_result.n_cycles = n_cycles
    ims_import_result.dim = target_dim

    # CLEAN
    for file in local.cwd // "__*":
        file.delete()

    return ims_import_result