def it_limits_slices_with_int():
    res_a, res_b = zap.arrays(
        test6,
        dict(a=np.arange(10), b=np.arange(10)),
        c=3,
        _batch_size=2,
        _limit_slice=3,
    )
    assert len(res_a) == 3 and len(res_b) == 3
def it_limits_slices():
    res_a, res_b = zap.arrays(
        test4,
        dict(a=np.arange(10), b=np.arange(10)),
        c=3,
        _batch_size=2,
        _limit_slice=slice(3, 6),
    )
    assert len(res_a) == 3 and len(res_b) == 3
def it_stacks_one_field():
    res = zap.arrays(test5, dict(a=[1, 2], b=[3, 4]), c=3, _batch_size=2, _stack=True)
    assert isinstance(res, np.ndarray)
    assert np.all(res == np.array([[2 * 1, 2 * 3, 2 * 3], [2 * 2, 2 * 4, 2 * 3]]))
def it_maintains_returned_tuples():
    res = zap.arrays(
        test4,
        dict(a=[1, 2], b=[3, 4]),
        c=3,
        _batch_size=2,
    )
    assert isinstance(res, tuple)
    assert res == ([1 + 1, 2 + 1], [3 + 2, 4 + 2])
def false_rates_all_peps(self, at_prec, n_false=4):
    pep_iz = self._prep_result.peps().pep_i.values
    return pd.concat(
        zap.arrays(
            _do_false_rates_by_pep,
            dict(pep_i=pep_iz),
            bag=self,
            at_prec=at_prec,
            n_false=n_false,
        )
    ).reset_index(drop=True)
def it_maintains_array_returns():
    res = zap.arrays(
        test5,
        dict(a=[1, 2], b=[3, 4]),
        c=3,
        _batch_size=2,
    )
    assert isinstance(res, list)
    assert np.all(res[0] == np.array([2 * 1, 2 * 3, 2 * 3]))
    assert np.all(res[1] == np.array([2 * 2, 2 * 4, 2 * 3]))
def it_stacks_all_fields():
    res = zap.arrays(test4, dict(a=[1, 2], b=[3, 4]), c=3, _batch_size=2, _stack=True)
    assert isinstance(res, tuple)
    assert isinstance(res[0], np.ndarray)
    assert isinstance(res[1], np.ndarray)
    assert np.all(res[0] == np.array([[1 + 1, 2 + 1]]))
    assert np.all(res[1] == np.array([[3 + 2, 4 + 2]]))
def it_stacks_some_fields():
    res = zap.arrays(
        test6, dict(a=[1, 2], b=[3, 4]), c=3, _batch_size=2, _stack=[True, False]
    )
    assert isinstance(res, tuple)
    assert isinstance(res[0], np.ndarray)
    assert isinstance(res[1], list)
    assert np.all(res[0] == np.array([[1 * 2, 3 * 2, 3 * 2], [2 * 2, 4 * 2, 3 * 2]]))
    assert res[1] == ["foo", "foo"]
def it_eliminates_batch_lists():
    res = zap.arrays(
        test3,
        dict(a=[1, 2], b=[3, 4]),
        c=3,
        _batch_size=2,
    )
    assert isinstance(res, list)
    assert res == [
        [2 * 1, 2 * 3, 2 * 3],
        [2 * 2, 2 * 4, 2 * 3],
    ]
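# The workers test3..test6 are defined elsewhere in this test module. A
# minimal sketch of what they plausibly look like, inferred from the
# assertions above and assuming zap.arrays invokes the worker once per
# element of the input arrays (hypothetical reconstructions, not the
# originals):

def test3(a, b, c):
    # One plain list per element; without _stack, zap flattens the
    # per-batch lists into a single list of per-element results.
    return [2 * a, 2 * b, 2 * c]

def test4(a, b, c):
    # A tuple of scalars per element; zap regroups these into one
    # list per returned field.
    return a + 1, b + 2

def test5(a, b, c):
    # A single ndarray field per element.
    return np.array([2 * a, 2 * b, 2 * c])

def test6(a, b, c):
    # Two fields per element: an ndarray and a constant string.
    return np.array([2 * a, 2 * b, 2 * c]), "foo"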
def _step_4_gmm_classify(
    radmat,
    dyemat,
    dt_mat,
    dt_inv_var_mat,
    dt_weights,
    flann,
    n_neighbors,
    dt_score_mode,
    dt_filter_threshold,
    dt_score_metric,
    dt_score_bias,
    penalty_coefs,
    rare_penalty,
    radius,
    progress,
):
    """
    The dyemat is passed so that we can get the true_dt_iz for debugging.
    """
    check.array_t(radmat, ndim=3)

    true_dt_iz, pred_dt_iz, scores, vdists = zap.arrays(
        _do_nn_and_gmm,
        dict(unit_radrow=radmat, dyerow=dyemat),
        dt_mat=dt_mat,
        dt_inv_var_mat=dt_inv_var_mat,
        dt_weights=dt_weights,
        flann=flann,
        n_neighbors=n_neighbors,
        dt_score_mode=dt_score_mode,
        dt_filter_threshold=dt_filter_threshold,
        dt_score_metric=dt_score_metric,
        dt_score_bias=dt_score_bias,
        penalty_coefs=penalty_coefs,
        rare_penalty=rare_penalty,
        radius=radius,
        _progress=progress,
        _stack=True,
    )

    # I use the dt counts as a weighting factor on the PDFs,
    # which means that the scores can be > 1.0.
    # To ensure that all rows get an equal treatment in
    # normalization I simply divide them through by the
    # max value to put them into 0-1 range.
    scores = scores.flatten()
    scores /= np.max(scores)

    return true_dt_iz.flatten(), pred_dt_iz.flatten(), scores, vdists
def psf_fields_one_channel(
    ims_import_result, sigproc_v2_params, field_iz, channel_i, progress=None
) -> priors.RegPSFPrior:
    """
    Build up a regional PSF for one channel on the RAW images.

    Implemented as a parallel zap over every field; the per-field results
    are then combined into a single RegPSF which stores:
    (divs, divs, peak_mea, peak_mea)
    """
    if ims_import_result.n_fields == 0:
        return None

    with zap.Context(progress=progress):
        region_to_psf_per_field = zap.arrays(
            _do_psf_one_field_one_channel,
            dict(field_i=field_iz),
            _stack=True,
            peak_mea=sigproc_v2_params.peak_mea,
            divs=sigproc_v2_params.divs,
            bandpass_kwargs=dict(
                low_inflection=sigproc_v2_params.low_inflection,
                low_sharpness=sigproc_v2_params.low_sharpness,
                high_inflection=sigproc_v2_params.high_inflection,
                high_sharpness=sigproc_v2_params.high_sharpness,
            ),
            ims_import_result=ims_import_result,
            channel_i=channel_i,
            n_cycles_limit=sigproc_v2_params.n_cycles_limit,
        )

    # SUM over fields
    psf_ims = np.sum(region_to_psf_per_field, axis=0)
    psf_ims = psf_normalize(psf_ims)

    # At this point psf_ims is a pixel image of the PSF at each reg div,
    # ie, 4 dimensional: (divs_y, divs_x, n_pixels_h, n_pixels_w).
    # Now we convert it to Gaussian parameters by fitting so we don't have
    # to store the pixels anymore: just the 3 critical shape parameters:
    # sigma_x, sigma_y, and rho.

    # Use one frame of ims_import_result to sort out dimensions
    im = ims_import_result.ims[0, 0, 0]
    check.array_t(im, is_square=True)
    reg_psf = priors.RegPSFPrior.from_psf_ims(im.shape[-1], psf_ims)

    return reg_psf
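# A hedged sketch of the per-field worker contract assumed by the zap call
# above (hypothetical stub; the real _do_psf_one_field_one_channel lives
# elsewhere in this package). With _stack=True the per-field returns are
# stacked along axis 0, which is why np.sum(..., axis=0) reduces over fields:
#
# def _do_psf_one_field_one_channel(
#     field_i, peak_mea, divs, bandpass_kwargs,
#     ims_import_result, channel_i, n_cycles_limit,
# ):
#     # Must return one (divs, divs, peak_mea, peak_mea) array per field so
#     # that the stacked result is (n_fields, divs, divs, peak_mea, peak_mea).
#     ...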
def _step_2_create_pros_and_pro_seqs_dfs(pro_spec_df):
    """
    Create pros_df and pro_seqs_df.
    Converts the sequence as a string into normalized DataFrames.
    """

    # Sort proteins such that the protein(s) being 'reported' are at the top,
    # which means the most interesting peptides start at pep_i == 1.
    _pro_spec_df = pro_spec_df.sort_values(by=["report", "name"], ascending=False)

    # pro_lists = parallel_array_split_map(
    #     aa_str_to_list, dict(seqstr=_pro_spec_df.sequence.values)
    # )
    pro_lists = zap.arrays(aa_str_to_list, dict(seqstr=_pro_spec_df.sequence.values))

    # Make a full df with columns "aa", "pro_i", "pro_name", "pro_ptm_locs",
    # and "pro_report", then split this into the two fully normalized dfs.
    df = pd.DataFrame(
        [
            (i, pro_i + 1, pro_name, pro_ptm_locs, pro_report)
            for pro_i, (pro, pro_name, pro_ptm_locs, pro_report) in enumerate(
                zip(
                    pro_lists,
                    _pro_spec_df.name,
                    _pro_spec_df.ptm_locs,
                    _pro_spec_df.report,
                )
            )
            for i in pro
        ],
        columns=["aa", "pro_i", "pro_name", "pro_ptm_locs", "pro_report"],
    )

    # ADD reserved nul row
    nul = pd.DataFrame(
        [dict(aa=".", pro_i=0, pro_name="nul", pro_ptm_locs="", pro_report=0)]
    )
    df = pd.concat((nul, df))

    pros_df = (
        df[["pro_i", "pro_name", "pro_ptm_locs", "pro_report"]]
        .drop_duplicates()
        .reset_index(drop=True)
        .rename(columns=dict(pro_name="pro_id"))
    )
    pros_df["pro_is_decoy"] = False

    pro_seqs_df = df[["pro_i", "aa"]].reset_index(drop=True)

    return pros_df, pro_seqs_df
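# A hedged usage sketch (hypothetical input; assumes aa_str_to_list("AB")
# yields ["A", "B"], consistent with how pro_lists is iterated above):
#
#   pro_spec_df = pd.DataFrame(
#       dict(name=["P1"], sequence=["AB"], ptm_locs=[""], report=[1])
#   )
#   pros_df, pro_seqs_df = _step_2_create_pros_and_pro_seqs_dfs(pro_spec_df)
#
#   # pros_df:      pro_i  pro_id  pro_ptm_locs  pro_report  pro_is_decoy
#   #               0      nul     ""            0           False
#   #               1      P1      ""            1           False
#   # pro_seqs_df:  pro_i  aa
#   #               0      "."
#   #               1      "A"
#   #               1      "B"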
def ims_import(
    src_dir: Path, ims_import_params: ImsImportParams, progress=None, pipeline=None
):
    reference_nd2_file_for_metadata = None

    scan_result = _scan_files(src_dir)

    if len(scan_result.nd2_paths) > 0:
        reference_nd2_file_for_metadata = scan_result.nd2_paths[0]

    target_mea = max(scan_result.dim[0], scan_result.dim[1])
    if not utils.is_power_of_2(target_mea):
        new_dim = utils.next_power_of_2(target_mea)
        _convert_message(target_mea, new_dim)
        target_mea = new_dim

    def clamp_fields(n_fields_true: int) -> Tuple[int, int]:
        n_fields = n_fields_true
        n_fields_limit = ims_import_params.get("n_fields_limit")
        if n_fields_limit is not None:
            n_fields = n_fields_limit

        start_field = ims_import_params.get("start_field", 0)
        if start_field + n_fields > n_fields_true:
            n_fields = n_fields_true - start_field

        return start_field, n_fields

    def clamp_cycles(n_cycles_true: int) -> Tuple[int, int]:
        n_cycles = n_cycles_true
        n_cycles_limit = ims_import_params.get("n_cycles_limit")
        if n_cycles_limit is not None:
            n_cycles = n_cycles_limit

        start_cycle = ims_import_params.get("start_cycle", 0)
        if start_cycle is None:
            start_cycle = 0
        if start_cycle + n_cycles > n_cycles_true:
            n_cycles = n_cycles_true - start_cycle

        return start_cycle, n_cycles

    tsv_data = tsv.load_tsv_for_folder(src_dir)

    # ALLOCATE the ImsImportResult
    ims_import_result = ImsImportResult(
        params=ims_import_params, tsv_data=Munch(tsv_data)
    )

    dst_ch_i_to_src_ch_i = ims_import_params.dst_ch_i_to_src_ch_i
    if dst_ch_i_to_src_ch_i is None:
        dst_ch_i_to_src_ch_i = [i for i in range(scan_result.n_channels)]
    n_out_channels = len(dst_ch_i_to_src_ch_i)

    # Sanity check that we didn't end up with any src channels outside of the channel range
    assert all(
        [0 <= src_ch_i < scan_result.n_channels for src_ch_i in dst_ch_i_to_src_ch_i]
    )

    if ims_import_params.is_z_stack_single_file:
        field_iz, n_cycles_found = _z_stack_import(
            scan_result.nd2_paths[0],
            target_mea,
            ims_import_result,
            dst_ch_i_to_src_ch_i,
            ims_import_params.z_stack_n_slices_per_field,
        )
        n_cycles = ims_import_params.z_stack_n_slices_per_field

    elif ims_import_params.is_movie:
        if scan_result.mode == ScanFileMode.nd2:
            # "Movie mode" means that there aren't any chemical cycles; rather,
            # "cycles" are used to represent different images in a z-stack.
            start_field, n_fields = clamp_fields(len(scan_result.nd2_paths))

            # In movie mode, the n_fields from the .nd2 file becomes n_cycles
            scan_result.n_cycles = scan_result.n_fields
            start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)

            with zap.Context(progress=progress):
                field_iz, n_cycles_found = zap.arrays(
                    _do_movie_import_nd2,
                    dict(
                        input_field_i=list(range(start_field, start_field + n_fields)),
                        output_field_i=list(range(n_fields)),
                    ),
                    _stack=True,
                    scan_result=scan_result,
                    start_cycle=start_cycle,
                    n_cycles=n_cycles,
                    target_mea=target_mea,
                    import_result=ims_import_result,
                    dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
                )
        elif scan_result.mode == ScanFileMode.npy:
            start_field, n_fields = clamp_fields(scan_result.n_fields)
            start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)

            with zap.Context(progress=progress):
                field_iz, n_cycles_found = zap.arrays(
                    _do_movie_import_npy,
                    dict(
                        input_field_i=list(range(start_field, start_field + n_fields)),
                        output_field_i=list(range(n_fields)),
                    ),
                    _stack=True,
                    scan_result=scan_result,
                    start_cycle=start_cycle,
                    n_cycles=n_cycles,
                    target_mea=target_mea,
                    import_result=ims_import_result,
                    dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
                )
        else:
            raise NotImplementedError()

    else:
        start_field, n_fields = clamp_fields(scan_result.n_fields)

        if pipeline:
            pipeline.set_phase(0, 2)

        if scan_result.mode == ScanFileMode.nd2:
            scan_result.n_cycles = len(scan_result.nd2_paths)

            # SCATTER
            with zap.Context(mode="thread", progress=progress):
                zap.arrays(
                    _do_nd2_scatter,
                    dict(
                        cycle_i=list(range(len(scan_result.nd2_paths))),
                        src_path=scan_result.nd2_paths,
                    ),
                    _stack=True,
                    start_field=start_field,
                    n_fields=n_fields,
                    n_channels=scan_result.n_channels,
                    target_mea=target_mea,
                )

        elif scan_result.mode == ScanFileMode.tif:
            # SCATTER
            work_orders = [
                Munch(field_i=k[0], channel_i=k[1], cycle_i=k[2], path=path)
                for k, path in scan_result.tif_paths_by_field_channel_cycle.items()
            ]

            with zap.Context(trap_exceptions=False):
                results = zap.work_orders(_do_tif_scatter, work_orders)

            # CHECK that every file exists
            for f in range(n_fields):
                for ch in range(scan_result.n_channels):
                    for cy in range(scan_result.n_cycles):
                        expected = f"__{f:03d}-{ch:02d}-{cy:02d}.npy"
                        if expected not in results:
                            raise FileNotFoundError(
                                f"File is missing in tif pattern: {expected}"
                            )

        elif scan_result.mode == ScanFileMode.npy:
            # In npy mode there's no scatter as the files are already fully scattered
            pass

        else:
            raise ValueError(f"Unknown im import mode {scan_result.mode}")

        if pipeline:
            pipeline.set_phase(1, 2)

        # GATHER
        start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)
        with zap.Context(progress=progress):
            field_iz = zap.arrays(
                _do_gather,
                dict(
                    input_field_i=list(range(start_field, start_field + n_fields)),
                    output_field_i=list(range(0, n_fields)),
                ),
                _stack=True,
                start_cycle=start_cycle,
                n_cycles=n_cycles,
                dim=target_mea,
                import_result=ims_import_result,
                mode=scan_result.mode,
                npy_paths_by_field_channel_cycle=scan_result.npy_paths_by_field_channel_cycle,
                dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
            )

    if reference_nd2_file_for_metadata:
        with _nd2(reference_nd2_file_for_metadata) as nd2:
            if hasattr(nd2, "metadata"):
                full = Munch(
                    metadata=nd2.metadata,
                    metadata_seq=nd2.metadata_seq,
                )
                ims_import_result._nd2_metadata_full = full

                def me(block_name, default=None):
                    return utils.block_search(
                        full.metadata.SLxExperiment, block_name, default
                    )

                def mp(block_name, default=None):
                    return utils.block_search(
                        full.metadata_seq.SLxPictureMetadata, block_name, default
                    )

                n_channels = mp("sPicturePlanes.uiSampleCount", 1)

                ims_import_result._nd2_metadata = Munch(
                    calibrated_pixel_size=mp("dCalibration"),
                    experiment_type="movie" if me("eType") == 1 else "edman",
                    n_cycles=me("uLoopPars.uiCount"),
                    cmd_before=me("wsCommandBeforeCapture"),
                    cmd_after=me("wsCommandAfterCapture"),
                    n_channels=n_channels,
                )

                per_channel = []
                for ch_i in range(n_channels):
                    laser_wavelength = None
                    laser_power = None
                    n_lasers = mp(
                        f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_uiMultiLaserLines0",
                        0,
                    )
                    for i in range(n_lasers):
                        is_used = mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_bMultiLaserLineUsed0-{i:02d}",
                            0,
                        )
                        if is_used == 1:
                            laser_wavelength = mp(
                                f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_uiMultiLaserLineWavelength0-{i:02d}",
                                0,
                            )
                            laser_power = mp(
                                f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dMultiLaserLinePower0-{i:02d}",
                                0,
                            )

                    ch_munch = Munch(
                        laser_wavelength=laser_wavelength,
                        laser_power=laser_power,
                        camera_name=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.CameraUniqueName"
                        ),
                        sensor_pixels_x=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorPixels.cx"
                        ),
                        sensor_pixels_y=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorPixels.cy"
                        ),
                        sensor_microns_x=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorMicrons.cx"
                        ),
                        sensor_microns_y=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.sizeSensorMicrons.cy"
                        ),
                        bin_x=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.dBinningX"
                        ),
                        bin_y=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.dBinningY"
                        ),
                        format=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.fmtDesc.wszFormatDesc"
                        ),
                        roi_l=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.left"
                        ),
                        roi_r=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.right"
                        ),
                        roi_t=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.top"
                        ),
                        roi_b=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.FormatQuality.rectSensorUser.bottom"
                        ),
                        averaging=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.PropertiesQuality.Average"
                        ),
                        integration=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.PropertiesQuality.Integrate"
                        ),
                        name=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pCameraSetting.Metadata.Channels.Channel_0.Name"
                        ),
                        dichroic_filter=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_sFilterName0"
                        ),
                        emission_filter=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_sFilterName1"
                        ),
                        optivar=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dZoomPosition"
                        ),
                        tirf_focus=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dTIRFPositionFocus"
                        ),
                        tirf_align_x=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dTIRFPositionX"
                        ),
                        tirf_align_y=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pDeviceSetting.m_dTIRFPositionY"
                        ),
                        objective_mag=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pObjectiveSetting.dObjectiveMag"
                        ),
                        objective_na=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pObjectiveSetting.dObjectiveNA"
                        ),
                        objective_refractive_index=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.pObjectiveSetting.dRefractIndex"
                        ),
                        settings_name=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.sOpticalConfigs.\x02.sOpticalConfigName"
                        ),
                        readout_mode=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Readout Mode"
                        ),
                        readout_rate=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Readout Rate"
                        ),
                        noise_filter=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Noise Filter"
                        ),
                        temperature=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.sSpecSettings.Temperature"
                        ),
                        exposure=mp(
                            f"sPicturePlanes.sSampleSetting.a{ch_i}.dExposureTime"
                        ),
                    )
                    per_channel += [ch_munch]

                ims_import_result._nd2_metadata.update(
                    **Munch(per_channel=per_channel)
                )

                if me("eType") == 1:
                    # Movie mode
                    ims_import_result._nd2_metadata.update(
                        **Munch(
                            movie_start=me("dStart"),
                            movie_period=me("dPeriod"),
                            movie_duration=me("dDuration"),
                            movie_duration_pref=me("bDurationPref"),
                            movie_max_period_diff=me("dMaxPeriodDiff"),
                            movie_min_period_diff=me("dMinPeriodDiff"),
                            movie_avg_period_diff=me("dAvgPeriodDiff"),
                        )
                    )

    ims_import_result.n_fields = len(field_iz)
    ims_import_result.n_channels = n_out_channels
    ims_import_result.n_cycles = n_cycles
    ims_import_result.dim = target_mea
    ims_import_result.dtype = np.dtype(OUTPUT_NP_TYPE).name
    ims_import_result.src_dir = src_dir

    # CLEAN
    for file in local.cwd // "__*":
        file.delete()

    return ims_import_result
def nn(nn_params, sim_result, radmat, true_dyemat=None, progress=None):
    """
    Main entrypoint for nearest_neighbors.

    Arguments:
        nn_params: TestNNParams
        sim_result: SimResult -- Uses the train_* values
        radmat: The radmat to classify.
        true_dyemat: Optional for debugging -- the dyemat of the radmat,
            ie. the dyerow that corresponds to each radrow.
        progress: Optional progress callback

    Returns:
        Munch with dt_mat, dyetracks_df, dt_pep_sources_df, true_dt_iz,
        pred_dt_iz, dt_scores, scores, and pred_pep_iz.

    This is composed of the following steps:
        1. Create a unit radmat.
        2. Create a unique dyetrack mat (dt_mat); these are the "neighbors"
           that will be searched.
        3. Create inverse variance for each row of dt_mat; inv_var_dt_mat.
        4. Classify each row of the unit radmat with the Gaussian Mixture Model.
    """

    # Allocate the dt_mat as large as it COULD possibly be
    # and then, after populating it with the unique values,
    # resize it down using dt_mat.base.resize(n_bytes).
    # The max size is the (extremely unlikely) value of
    # n_peps * n_samples.
    check.array_t(radmat, ndim=3, dtype=RadType)
    check.array_t(sim_result.train_dyemat, ndim=4)
    shape = sim_result.train_dyemat.shape
    n_dts_max = shape[0] * shape[1]
    n_channels, n_cycles = shape[2:]
    dt_mat = ArrayResult(
        "dt_mat", DyeType, shape=(n_dts_max, n_channels, n_cycles), mode="w+"
    )

    # prof()
    _step_1_create_neighbors_lookup = _step_1_create_neighbors_lookup_multiprocess
    # _step_1_create_neighbors_lookup = _step_1_create_neighbors_lookup_singleprocess
    (
        dyetracks_df,
        dt_pep_sources_df,
        dye_to_best_pep_df,
        flann,
        n_dts,
    ) = _step_1_create_neighbors_lookup(
        sim_result.train_dyemat,
        output_dt_mat=dt_mat.arr(),
    )
    # prof("create neighbors")

    # dyetracks_df: (dye_i, weight)
    # dt_pep_sources_df: (dye_i, pep_i, n_rows)
    assert n_dts <= n_dts_max and n_dts == dyetracks_df.dye_i.max() + 1

    # Collapse the dt_mat to the actual number of rows.
    # This will cause the memmap file to truncate in size.
    dt_mat.reshape((n_dts, n_channels, n_cycles))

    # dt_mat is the dyetrack mat of the TARGETS as built by the training set,
    # not to be confused with dyemat, which is the dyemat of the test points.
    # There is no guarantee that the dyerow of a test point is even *in*
    # the training set.

    dt_inv_var_mat = _step_2_create_inverse_variances(
        dt_mat.arr(), np.array(sim_result.params.channel_i_to_vpd)
    )

    dt_weights = dyetracks_df.reindex(np.arange(n_dts), fill_value=0).weight.values

    channel_i_to_gain_inv = (
        1.0 / np.array(sim_result.params.channel_i_to_gain)
    ).astype(RadType)

    # Now classify each radrow
    check.array_t(radmat, ndim=3)
    n_rows = radmat.shape[0]

    if true_dyemat is not None:
        assert true_dyemat.shape == radmat.shape

    pred_dt_scores = ArrayResult("pred_dt_scores", ScoreType, (n_rows,), mode="w+")
    pred_scores = ArrayResult("pred_scores", ScoreType, (n_rows,), mode="w+")
    pred_pep_iz = ArrayResult("pred_pep_iz", IndexType, (n_rows,), mode="w+")
    pred_dt_iz = ArrayResult("pred_dt_iz", IndexType, (n_rows,), mode="w+")
    true_dt_iz = ArrayResult("true_dt_iz", IndexType, (n_rows,), mode="w+")

    # Score normalization requires knowing about the distribution of
    # scores, but I do not want to make two full passes over the dataset.
    # To avoid this, I randomly sample a fraction of the dataset
    # to collect the score distribution and then I pass a normalization
    # term into the second pass.
    if nn_params.random_seed is None:
        nn_params.random_seed = int(time.time())

    # prof()
    zap.arrays(
        _do_nn,
        dict(i=np.arange(n_rows)),
        nn_params=nn_params,
        radmat=radmat,
        dt_mat=dt_mat.arr(),
        dt_inv_var_mat=dt_inv_var_mat,
        dt_weights=dt_weights,
        flann=flann,
        channel_i_to_gain_inv=channel_i_to_gain_inv,
        score_normalization=1.0,
        dye_to_best_pep_df=dye_to_best_pep_df,
        output_pred_dt_scores=pred_dt_scores.arr(),
        output_pred_scores=pred_scores.arr(),
        output_pred_pep_iz=pred_pep_iz.arr(),
        output_pred_dt_iz=pred_dt_iz.arr(),
        output_true_dt_iz=true_dt_iz.arr(),
        true_dyemat=true_dyemat,
        _progress=progress,
    )

    return Munch(
        dt_mat=dt_mat,
        dyetracks_df=dyetracks_df,
        dt_pep_sources_df=dt_pep_sources_df,
        true_dt_iz=true_dt_iz,
        pred_dt_iz=pred_dt_iz,
        dt_scores=pred_dt_scores,
        scores=pred_scores,
        pred_pep_iz=pred_pep_iz,
    )
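# A hedged sketch of the worker contract assumed by the zap.arrays call in
# nn() above (hypothetical reconstruction; the real _do_nn lives elsewhere).
# The output_* arguments are preallocated memmap-backed arrays shared with
# the workers, so each call writes its result in place at row i rather than
# returning it:
#
# def _do_nn(i, nn_params, radmat, dt_mat, ..., output_pred_scores,
#            output_pred_pep_iz, **kw):
#     # ...classify radmat[i] against dt_mat...
#     output_pred_scores[i] = score
#     output_pred_pep_iz[i] = pep_i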
def ims_import(src_dir, ims_import_params, progress=None, pipeline=None):
    (
        mode,
        nd2_paths,
        tif_paths_by_field_channel_cycle,
        npy_paths_by_field_channel_cycle,
        n_fields_true,
        n_channels,
        n_cycles_true,
        dim,
    ) = _scan_files(src_dir)

    target_dim = max(dim[0], dim[1])
    if not utils.is_power_of_2(target_dim):
        new_dim = utils.next_power_of_2(target_dim)
        _convert_message(target_dim, new_dim)
        target_dim = new_dim

    src_channels = list(range(n_channels))

    def clamp_fields(n_fields_true):
        n_fields = n_fields_true
        n_fields_limit = ims_import_params.get("n_fields_limit")
        if n_fields_limit is not None:
            n_fields = n_fields_limit

        start_field = ims_import_params.get("start_field", 0)
        if start_field + n_fields > n_fields_true:
            n_fields = n_fields_true - start_field

        return start_field, n_fields

    def clamp_cycles(n_cycles_true):
        n_cycles = n_cycles_true
        n_cycles_limit = ims_import_params.get("n_cycles_limit")
        if n_cycles_limit is not None:
            n_cycles = n_cycles_limit

        start_cycle = ims_import_params.get("start_cycle", 0)
        if start_cycle + n_cycles > n_cycles_true:
            n_cycles = n_cycles_true - start_cycle

        return start_cycle, n_cycles

    tsv_data = tsv.load_tsv_for_folder(src_dir)

    ims_import_result = ImsImportResult(
        params=ims_import_params, tsv_data=Munch(tsv_data)
    )

    if ims_import_params.is_movie:
        start_field, n_fields = clamp_fields(len(nd2_paths))

        # In movie mode, the n_fields from the .nd2 file becomes n_cycles
        n_cycles_true = n_fields_true
        start_cycle, n_cycles = clamp_cycles(n_cycles_true)

        field_iz, n_cycles_found = zap.arrays(
            _do_movie_import,
            dict(
                nd2_path=nd2_paths[start_field : start_field + n_fields],
                output_field_i=list(range(n_fields)),
            ),
            _process_mode=True,
            _progress=progress,
            _stack=True,
            start_cycle=start_cycle,
            n_cycles=n_cycles,
            target_dim=target_dim,
            nd2_import_result=ims_import_result,
        )
    else:
        start_field, n_fields = clamp_fields(n_fields_true)

        if pipeline:
            pipeline.set_phase(0, 2)

        if mode == "nd2":
            n_cycles_true = len(nd2_paths)

            # SCATTER
            zap.arrays(
                _do_nd2_scatter,
                dict(cycle_i=list(range(len(nd2_paths))), src_path=nd2_paths),
                _process_mode=True,
                _progress=progress,
                _stack=True,
                start_field=start_field,
                n_fields=n_fields,
                n_channels=n_channels,
                target_dim=target_dim,
            )
        elif mode == "tif":
            # SCATTER
            work_orders = [
                Munch(field_i=k[0], channel_i=k[1], cycle_i=k[2], path=path)
                for k, path in tif_paths_by_field_channel_cycle.items()
            ]
            results = zap.work_orders(
                _do_tif_scatter, work_orders, _trap_exceptions=False
            )

            # CHECK that every file exists
            for f in range(n_fields):
                for ch in range(n_channels):
                    for cy in range(n_cycles_true):
                        expected = f"__{f:03d}-{ch:02d}-{cy:02d}.npy"
                        if expected not in results:
                            raise FileNotFoundError(
                                f"File is missing in tif pattern: {expected}"
                            )
        elif mode == "npy":
            # In npy mode there's no scatter as the files are already fully scattered
            pass
        else:
            raise ValueError(f"Unknown im import mode {mode}")

        if pipeline:
            pipeline.set_phase(1, 2)

        # GATHER
        start_cycle, n_cycles = clamp_cycles(n_cycles_true)
        field_iz = zap.arrays(
            _do_gather,
            dict(
                input_field_i=list(range(start_field, start_field + n_fields)),
                output_field_i=list(range(0, n_fields)),
            ),
            _process_mode=True,
            _progress=progress,
            _stack=True,
            src_channels=src_channels,
            start_cycle=start_cycle,
            n_cycles=n_cycles,
            dim=target_dim,
            nd2_import_result=ims_import_result,
            mode=mode,
            npy_paths_by_field_channel_cycle=npy_paths_by_field_channel_cycle,
        )

    ims_import_result.n_fields = len(field_iz)
    ims_import_result.n_channels = n_channels
    ims_import_result.n_cycles = n_cycles
    ims_import_result.dim = target_dim

    # CLEAN
    for file in local.cwd // "__*":
        file.delete()

    return ims_import_result