def test_nn_call_bag(self, use_train_data=False):
    """
    Build a CallBag from the NN classifier results stored in self.test_nn.

    Arguments:
        use_train_data: When True, the true/predicted peptide indices and
            the cached PR come from the "train_*" fields of self.test_nn
            (useful for looking at over-fitting); otherwise from "test_*".

    Returns:
        A CallBag with classifier_name="nn".
    """
    nn_result = self.test_nn
    prefix = "train" if use_train_data else "test"

    true_pep_iz = getattr(nn_result, f"{prefix}_true_pep_iz")
    pred_pep_iz = getattr(nn_result, f"{prefix}_pred_pep_iz")

    if use_train_data:
        # The train fields are only populated if the task ran on the training set
        check.affirm(
            true_pep_iz is not None and pred_pep_iz is not None,
            "The test_nn task was not run with the training_set",
        )

    cached_pr = getattr(nn_result, f"{prefix}_peps_pr")

    # NOTE(review): scores always come from test_scores, even when
    # use_train_data=True -- confirm whether that is intended.
    return CallBag(
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=nn_result.test_scores,
        prep_result=self.prep,
        sim_result=self.sim,
        cached_pr=cached_pr,
        classifier_name="nn",
    )
def test_rf_call_bag(self, use_train_data=False):
    """
    Build a CallBag from the RF classifier results stored in self.test_rf.

    Arguments:
        use_train_data: When True, every field is taken from the "train_*"
            attributes of self.test_rf (useful for looking at over-fitting);
            otherwise from the "test_*" attributes.

    Returns:
        A CallBag with classifier_name="rf".
    """
    rf_result = self.test_rf
    prefix = "train" if use_train_data else "test"

    true_pep_iz = getattr(rf_result, f"{prefix}_true_pep_iz")
    pred_pep_iz = getattr(rf_result, f"{prefix}_pred_pep_iz")
    scores = getattr(rf_result, f"{prefix}_scores")
    all_class_scores = getattr(rf_result, f"{prefix}_all_class_scores")
    cached_pr = getattr(rf_result, f"{prefix}_peps_pr")
    cached_pr_abund = getattr(rf_result, f"{prefix}_peps_pr_abund")

    if use_train_data:
        # The train fields are only populated if the task ran on the training set
        check.affirm(
            true_pep_iz is not None and pred_pep_iz is not None,
            "The test_rf task was not run with the training_set",
        )

    return CallBag(
        true_pep_iz=true_pep_iz,
        pred_pep_iz=pred_pep_iz,
        scores=scores,
        all_class_scores=all_class_scores,
        prep_result=self.prep,
        sim_result=self.sim,
        cached_pr=cached_pr,
        cached_pr_abund=cached_pr_abund,
        classifier_name="rf",
    )
def it_handles_protease_none():
    """
    With protease=None and two labels, the permutator yields two schemes
    and the first element of each scheme (the protease) is None.
    """
    gen = gen_klass(n_edmans=1, protease=None, label_set=["A", "B"])
    perms = list(gen.run_parameter_permutator())
    check.affirm(len(perms) == 2, "permutator should return 2 schemes")
    # Identity comparison: `== None` can be fooled by objects overriding __eq__
    check.affirm(
        perms[0][0] is None and perms[1][0] is None,
        "both schemes should have protease=None",
    )
def _z_stack_import(
    nd2_path: Path,
    target_mea: int,
    import_result: ImsImportResult,
    dst_ch_i_to_src_ch_i: List[int],
    movie_n_slices_per_field,
):
    """
    Import a single ND2 file that holds multiple fields, each made of
    movie_n_slices_per_field consecutive slices.

    Each group of slices is written as one output field of shape
    (n_dst_channels, movie_n_slices_per_field, target_mea, target_mea);
    images smaller than target_mea are zero-padded at the high edges.

    Returns:
        (list of the field indices written, movie_n_slices_per_field)
    """
    # Scratch image re-used for every padded frame
    pad_buffer = np.zeros((target_mea, target_mea), OUTPUT_NP_TYPE)

    with _nd2(nd2_path) as nd2:
        n_slices_total = nd2.n_fields
        n_dst_channels = len(dst_ch_i_to_src_ch_i)
        src_dim = nd2.dim

        # The file must contain a whole number of fields
        assert n_slices_total % movie_n_slices_per_field == 0
        n_fields = n_slices_total // movie_n_slices_per_field

        for field_i in range(n_fields):
            field_arr = import_result.allocate_field(
                field_i,
                (n_dst_channels, movie_n_slices_per_field, target_mea, target_mea),
                OUTPUT_NP_TYPE,
            )
            field_ims = field_arr.arr()

            check.affirm(
                src_dim[0] <= target_mea and src_dim[1] <= target_mea,
                f"nd2 scatter requested {target_mea} which is smaller than {src_dim}",
            )

            needs_pad = src_dim[0] != target_mea or src_dim[1] != target_mea
            first_slice_i = field_i * movie_n_slices_per_field

            for dst_ch_i, src_ch_i in enumerate(dst_ch_i_to_src_ch_i):
                for cy_out_i in range(movie_n_slices_per_field):
                    cy_in_i = first_slice_i + cy_out_i
                    frame = nd2.get_field(cy_in_i, src_ch_i).astype(OUTPUT_NP_TYPE)

                    if needs_pad:
                        # CONVERT into a zero pad
                        pad_buffer[0:src_dim[0], 0:src_dim[1]] = frame
                        frame = pad_buffer

                    # Assignment copies, so re-using pad_buffer is safe
                    field_ims[dst_ch_i, cy_out_i, :, :] = frame

            # Task: Add quality

            import_result.save_field(field_i, field_arr)

    return list(range(n_fields)), movie_n_slices_per_field
def rolling_window(im, window_dim, n_samples, return_coords=False):
    """
    Sample im in windows of shape window_dim n_sample number of times;
    this may require overlapping the sample windows.

    The windows are taken over the LAST two dimensions of im. Any number
    of leading dimensions is carried through unchanged.

    Arguments:
        im: ndarray of ndim >= 2
        window_dim: 2-tuple, the size of the window (smaller than im.shape[-2:])
        n_samples: 2-tuple, the number of sample along each dimension
        return_coords: if True also return the top-left corner of each window

    Returns:
        ims of shape (*im.shape[:-2], n_samples[0], n_samples[1],
        window_dim[0], window_dim[1]) and, if return_coords, a
        (n_samples[0], n_samples[1], 2) array of (y, x) window starts.

    Raises:
        ValueError: if the windows can not span a dimension of im.
    """
    check.affirm(im.ndim >= 2)
    check.list_or_tuple_t(window_dim, int, expected_len=2)
    check.list_or_tuple_t(n_samples, int, expected_len=2)

    extra_dims = im.shape[0:-2]
    n_extra_dims = len(extra_dims)

    slices = [None, None]
    for d in range(2):
        if window_dim[d] * n_samples[d] < im.shape[n_extra_dims + d]:
            raise ValueError(
                f"Dimension of {im.shape[n_extra_dims+d]} can not be spanned by {n_samples[d]} spans of length {window_dim[d]}."
            )

        # Evenly spaced window starts from 0 to the last position that fits
        start = np.linspace(
            0, im.shape[n_extra_dims + d] - window_dim[d], n_samples[d], dtype=int
        )
        slices[d] = [slice(i, i + window_dim[d]) for i in start]

    ims = np.zeros(
        (*extra_dims, n_samples[0], n_samples[1], window_dim[0], window_dim[1]),
        dtype=im.dtype,
    )
    coords = np.zeros((n_samples[0], n_samples[1], 2))

    for y, yy in enumerate(slices[0]):
        for x, xx in enumerate(slices[1]):
            coords[y, x] = (yy.start, xx.start)
            # Ellipsis covers zero, one, or many leading dimensions
            ims[..., y, x, :, :] = im[..., yy, xx]

    if return_coords:
        return ims, coords
    else:
        return ims
def _do_movie_import_npy(
    scan_result,
    input_field_i,
    output_field_i,
    start_cycle,
    n_cycles,
    target_mea,
    import_result,
    dst_ch_i_to_src_ch_i,
):
    """
    Import one "movie" field from .npy files.

    In this mode, each field is a collection of images taken sequentially
    without moving stage. Cycles [start_cycle, start_cycle + n_cycles) of
    input_field_i are loaded, zero-padded up to (target_mea, target_mea)
    when smaller, and saved as output_field_i.

    Returns:
        (output_field_i, n_cycles)
    """
    n_dst_channels = len(dst_ch_i_to_src_ch_i)
    src_dim = scan_result.dim

    # Scratch image re-used for every padded frame
    pad_buffer = np.zeros((target_mea, target_mea), OUTPUT_NP_TYPE)

    field_arr = import_result.allocate_field(
        output_field_i,
        (n_dst_channels, n_cycles, target_mea, target_mea),
        OUTPUT_NP_TYPE,
    )
    field_ims = field_arr.arr()

    assert start_cycle + n_cycles <= scan_result.n_cycles

    check.affirm(
        src_dim[0] <= target_mea and src_dim[1] <= target_mea,
        f"npy requested {target_mea} which is smaller than {src_dim}",
    )

    needs_pad = src_dim != (target_mea, target_mea)
    paths = scan_result.npy_paths_by_field_channel_cycle

    for dst_ch_i in range(n_dst_channels):
        src_ch_i = dst_ch_i_to_src_ch_i[dst_ch_i]

        for cy_out_i in range(n_cycles):
            cy_in_i = start_cycle + cy_out_i
            frame_path = paths[input_field_i, src_ch_i, cy_in_i]
            frame = np.load(str(frame_path)).astype(OUTPUT_NP_TYPE)
            assert frame.shape == src_dim

            if needs_pad:
                # CONVERT into a zero pad
                pad_buffer[0:src_dim[0], 0:src_dim[1]] = frame
                frame = pad_buffer

            # Assignment copies, so re-using pad_buffer is safe
            field_ims[dst_ch_i, cy_out_i, :, :] = frame

    # Task: Add quality

    import_result.save_field(output_field_i, field_arr)

    return output_field_i, n_cycles
def get_pro_ptm_locs(self, protein_id):
    """
    Return the ptm locations for protein_id.

    Each run reports its own value (via run.prep.get_pro_ptm_locs), so all
    runs are compared against the first one as a sanity check.

    Raises:
        ValueError: if any run disagrees with the first run.
    """

    def _ptms_of(run):
        return run.prep.get_pro_ptm_locs(protein_id=protein_id)

    ptms_by_run = self.all_lists(_ptms_of)

    agrees_with_first = [ptms_by_run[0] == other for other in ptms_by_run[1:]]
    check.affirm(all(agrees_with_first), "PTMs differ in runs!", ValueError)

    return ptms_by_run[0]
def _do_movie_import(nd2_path, output_field_i, start_cycle, n_cycles,
                     target_dim, nd2_import_result):
    """
    Import Nikon ND2 "movie" files.

    In this mode, each .nd2 file is a collection of images taken
    sequentially for a single field. This is in contrast to the typical
    mode where each .nd2 file is a chemical cycle spanning all
    fields/channels.

    Since all data for a given field is already in a single file, the
    parallel scatter/gather employed by the "normal" ND2 import task is
    not necessary.

    The "fields" from the .nd2 file become "cycles" as if the instrument
    had taken 1 field with a lot of cycles.

    Returns:
        (output_field_i, number of cycles actually present in the file)
    """
    ims = _nd2(nd2_path).get_fields()

    n_actual_cycles, n_channels = ims.shape[0], ims.shape[1]
    actual_dim = ims.shape[2:4]
    stop_cycle = start_cycle + n_cycles

    # The .nd2 file is usually of shape (n_fields, n_channels, dim, dim)
    # but in a movie, the n_fields is becoming the n_cycles so swap the
    # fields and channel putting ims into (n_channels, n_cycles, dim, dim)
    swapped = np.swapaxes(ims, 0, 1)

    assert stop_cycle <= n_actual_cycles
    chcy_ims = swapped[:, start_cycle:stop_cycle, :, :]

    check.affirm(
        actual_dim[0] <= target_dim and actual_dim[1] <= target_dim,
        f"nd2 scatter requested {target_dim} which is smaller than {actual_dim}",
    )

    already_target_size = (
        actual_dim[0] == target_dim and actual_dim[1] == target_dim
    )
    if not already_target_size:
        # CONVERT into a zero pad
        padded = np.zeros(
            (n_channels, n_cycles, target_dim, target_dim), dtype=ims.dtype
        )
        padded[:, :, 0:actual_dim[0], 0:actual_dim[1]] = chcy_ims[:, :, :, :]
        chcy_ims = padded

    # TODO Add quality

    nd2_import_result.save_field(output_field_i, chcy_ims)

    return output_field_i, n_actual_cycles
def _do_nd2_scatter(src_path, start_field, n_fields, cycle_i, n_channels,
                    target_dim):
    """
    Scatter a cycle .nd2 into individual numpy files.

    For each field in [start_field, start_field + n_fields) a metadata
    json is saved and one .npy per channel is written.

    target_dim is a scalar. The target will be put into this square form
    (images smaller than target_dim are zero-padded at the high edges).

    Returns:
        The list of .npy file names written.
    """
    nd2 = _nd2(src_path)

    ims = nd2.get_fields()
    _n_channels = ims.shape[1]
    actual_dim = ims.shape[2:4]
    assert n_channels == _n_channels

    check.affirm(
        actual_dim[0] <= target_dim and actual_dim[1] <= target_dim,
        f"nd2 scatter requested {target_dim} which is smaller than {actual_dim}",
    )

    if actual_dim[0] != target_dim or actual_dim[1] != target_dim:
        # CONVERT into a zero pad.
        # The padded array must hold every field of the file (not just
        # n_fields of them) because it is filled from all of ims and is
        # indexed below by the absolute field_i.
        new_ims = np.zeros(
            (ims.shape[0], _n_channels, target_dim, target_dim), dtype=ims.dtype
        )
        new_ims[:, :, 0:actual_dim[0], 0:actual_dim[1]] = ims[:, :, :, :]
        ims = new_ims

    dst_files = []
    for field_i in range(start_field, start_field + n_fields):
        info = Munch(
            x=nd2.x[field_i],
            y=nd2.y[field_i],
            z=nd2.z[field_i],
            pfs_status=nd2.pfs_status[field_i],
            pfs_offset=nd2.pfs_offset[field_i],
            exposure_time=nd2.exposure_time[field_i],
            camera_temp=nd2.camera_temp[field_i],
            cycle_i=cycle_i,
            field_i=field_i,
        )
        info_dst_file = _metadata_filename_by_field_cycle(field_i, cycle_i)
        utils.json_save(info_dst_file, info)

        for channel_i in range(n_channels):
            dst_file = _npy_filename_by_field_channel_cycle(
                field_i, channel_i, cycle_i)
            dst_files.append(dst_file)
            np.save(dst_file, ims[field_i, channel_i])

    return dst_files
def mat_lessflat(mat, dim1=None, dim2=None):
    """
    Un-flatten the second axis of a 2D matrix into two axes.

    At least one of dim1 or dim2 must be given; the missing one is
    derived by integer-dividing mat.shape[1] by the given one.

    Example, suppose mat is (2, 6)
        m = mat_lessflat(mat, dim2=3)
        assert m.shape == (2, 2, 3)
    """
    check.array_t(mat, ndim=2)
    check.affirm(not (dim1 is None and dim2 is None))

    n_rows, n_cols = mat.shape[0], mat.shape[1]

    if dim1 is None:
        dim1 = n_cols // dim2

    if dim2 is None:
        dim2 = n_cols // dim1

    return mat.reshape(n_rows, dim1, dim2)
def context(cy_ims, locs, reg_psf_samples, peak_mea):
    """
    Generator that sets up, yields, and tears down a RadiometryContext.

    Usage (presumably wrapped as a context manager by a decorator that is
    outside this view -- confirm):
        with radiometry.context(...) as ctx:
            zap.work_orders(do_radiometry, ...)

    Arguments:
        cy_ims: float64 array, (n_cycles, height, width)
        locs: float64 array, (n_peaks, 2)
        reg_psf_samples: 3D array, (n_divs, n_divs, 3)
        peak_mea: passed through to the context unchanged

    Raises:
        CException: if lib.context_init returns an error.
    """
    lib = load_lib()

    # VALIDATE the inputs and pull out their dimensions
    check.array_t(cy_ims, ndim=3, dtype=np.float64)
    n_cycles, height, width = cy_ims.shape

    check.array_t(locs, ndim=2, dtype=np.float64)
    check.affirm(locs.shape[1] == 2)
    n_peaks = locs.shape[0]

    check.array_t(reg_psf_samples, ndim=3)
    n_divs, n_divs_w, n_params = reg_psf_samples.shape
    assert n_divs == n_divs_w
    assert n_params == 3

    # ALLOCATE the output buffer that the context writes into
    out_radiometry = np.zeros((n_peaks, n_cycles, 4), dtype=np.float64)

    # The underscore fields carry the original ndarrays alongside their
    # F64Arr wrappers
    ctx_fields = dict(
        cy_ims=F64Arr.from_ndarray(cy_ims),
        locs=F64Arr.from_ndarray(locs),
        _locs=locs,
        n_cycles=n_cycles,
        n_peaks=n_peaks,
        n_divs=n_divs,
        peak_mea=peak_mea,
        height=height,
        width=width,
        reg_psf_samples=F64Arr.from_ndarray(reg_psf_samples),
        out_radiometry=F64Arr.from_ndarray(out_radiometry),
        _out_radiometry=out_radiometry,
    )
    ctx = RadiometryContext(**ctx_fields)

    init_error = lib.context_init(ctx)
    if init_error is not None:
        raise CException(init_error)

    try:
        yield ctx
    finally:
        # Always release the context, even if the caller's block raised
        lib.context_free(ctx)
def intersection_roi_from_aln_offsets(aln_offsets, raw_dim):
    """
    Compute the ROI that contains pixels from all frames given the
    aln_offsets (returned from align) and the dim of the original images.

    The returned ROI is in the coordinate space of frame [0], whose
    offset must be (0, 0).
    """
    offsets = np.array(aln_offsets)
    check.affirm(np.all(offsets[0] == (0, 0)),
                 "intersection roi must start with (0,0)")

    y_offsets = offsets[:, 0]
    x_offsets = offsets[:, 1]
    raw_h = raw_dim[0]
    raw_w = raw_dim[1]

    # Extent of the region covered by every frame, along each axis
    clip_h = np.min(y_offsets + raw_h) - np.max(y_offsets)
    clip_w = np.min(x_offsets + raw_w) - np.max(x_offsets)

    # Clamp the region into the bounds of frame [0]
    bot = max(0, -np.min(y_offsets))
    top = min(raw_h, bot + clip_h)
    lft = max(0, -np.min(x_offsets))
    rgt = min(raw_w, lft + clip_w)

    return ROI(loc=YX(bot, lft), dim=HW(top - bot, rgt - lft))
def sim(sim_params, prep_result, progress=None, pipeline=None):
    """
    Map the simulation over the peptides in prep_result.

    This is actually performed twice in order to get a train and
    (different!) test set. The "train" set includes decoys, the test set
    does not; furthermore the error modes and radiometry noise is
    different in each set.

    Arguments:
        sim_params: simulation parameters. NOTE: if random_seed is None
            it is overwritten in place with the current time.
        prep_result: source of the peptide sequences.
        progress: passed through to _run_sim.
        pipeline: optional; when given, set_phase is called per phase.

    Returns:
        A SimResult. All test_* fields are None when sim_params.is_survey.
    """
    if sim_params.random_seed is None:
        sim_params.random_seed = int(time.time())

    np.random.seed(sim_params.random_seed)

    # CREATE a *training-set* for all peptides (real and decoy)
    if pipeline:
        pipeline.set_phase(0, 2)

    # Sanity check that all the peps are accounted for
    pep_seqs_with_decoys = prep_result.pepseqs__with_decoys()
    n_peps = pep_seqs_with_decoys.pep_i.nunique()
    assert n_peps == prep_result.n_peps

    (
        train_dyemat,
        train_radmat,
        train_recalls,
        train_flus,
        train_flu_remainders,
    ) = _run_sim(
        sim_params,
        pep_seqs_with_decoys,
        name="train",
        n_peps=n_peps,
        n_samples=sim_params.n_samples_train,
        progress=progress,
    )

    if sim_params.is_survey:
        # A survey has no test set
        test_dyemat = None
        test_radmat = None
        test_recalls = None
        test_flus = None
        test_flu_remainders = None
    else:
        # CREATE a *test-set* for real-only peptides
        if pipeline:
            pipeline.set_phase(1, 2)

        (
            test_dyemat,
            test_radmat,
            test_recalls,
            test_flus,
            test_flu_remainders,
        ) = _run_sim(
            sim_params,
            prep_result.pepseqs__no_decoys(),
            name="test",
            n_peps=n_peps,
            n_samples=sim_params.n_samples_test,
            progress=progress,
        )

        # CHECK that the train and test are not identical in SOME non_zero_row
        # If they are, there was some sort of RNG seed errors which might happen
        # for example if sub-processes failed to re-init their RNG seeds.
        # Test this by looking at pep_i==1
        # (only the first 100 non-zero rows of pep_i==1 are compared)
        non_zero_rows = np.any(train_radmat[1] > 0, axis=(1, 2))
        non_zero_row_args = np.argwhere(non_zero_rows)[0:100]
        train_rows = train_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * train_radmat.shape[2] *
            train_radmat.shape[3],
        ))
        test_rows = test_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * test_radmat.shape[2] *
            test_radmat.shape[3],
        ))

        if train_rows.shape[
                0] > 0 and not sim_params.allow_train_test_to_be_identical:
            # The diagonal of cdist is the distance between the i-th train
            # row and the i-th test row; all zeros means identical rows.
            any_differences = np.any(
                np.diagonal(cdist(train_rows, test_rows)) != 0.0)
            check.affirm(any_differences, "Train and test sets are identical")

    return SimResult(
        params=sim_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_recalls=train_recalls,
        train_flus=train_flus,
        train_flu_remainders=train_flu_remainders,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_recalls=test_recalls,
        test_flus=test_flus,
        test_flu_remainders=test_flu_remainders,
    )
def sim_v1(sim_params, prep_result, progress=None, pipeline=None):
    """
    Map the simulation over the peptides in prep_result.

    This is actually performed twice in order to get a train and
    (different!) test set. The "train" set includes decoys, the test set
    does not; furthermore the error modes and radiometry noise is
    different in each set.

    After simulation the all-zero dyemat rows are removed from both sets
    (the train set keeps row 0, the nul row).

    Arguments:
        sim_params: simulation parameters. NOTE: if random_seed is None
            it is overwritten in place with the current time.
        prep_result: source of the peptide sequences.
        progress: passed through to _run_sim.
        pipeline: optional; when given, set_phase is called per phase.

    Returns:
        A SimV1Result. All test_* fields are None when sim_params.is_survey.
    """
    if sim_params.random_seed is None:
        sim_params.random_seed = int(time.time())

    np.random.seed(sim_params.random_seed)

    # CREATE a *training-set* for all peptides (real and decoy)
    if pipeline:
        pipeline.set_phase(0, 2)

    # Sanity check that all the peps are accounted for
    pep_seqs_with_decoys = prep_result.pepseqs__with_decoys()
    n_peps = pep_seqs_with_decoys.pep_i.nunique()
    assert n_peps == prep_result.n_peps

    (
        train_dyemat,
        train_radmat,
        train_pep_recalls,
        train_flus,
        train_flu_remainders,
        train_true_pep_iz,
    ) = _run_sim(
        sim_params,
        pep_seqs_with_decoys,
        name="train",
        n_peps=n_peps,
        n_samples=sim_params.n_samples_train,
        progress=progress,
    )

    if sim_params.is_survey:
        # A survey has no test set
        test_dyemat = None
        test_radmat = None
        test_recalls = None
        test_flus = None
        test_flu_remainders = None
        test_true_pep_iz = None
    else:
        # CREATE a *test-set* for real-only peptides
        if pipeline:
            pipeline.set_phase(1, 2)

        (
            test_dyemat,
            test_radmat,
            test_recalls,
            test_flus,
            test_flu_remainders,
            test_true_pep_iz,
        ) = _run_sim(
            sim_params,
            prep_result.pepseqs__no_decoys(),
            name="test",
            n_peps=n_peps,
            n_samples=sim_params.n_samples_test,
            progress=progress,
        )

        # CHECK that the train and test are not identical in SOME non_zero_row
        # If they are, there was some sort of RNG seed errors which might happen
        # for example if sub-processes failed to re-init their RNG seeds.
        # Test this by looking at pep_i==1
        # (only the first 100 non-zero rows of pep_i==1 are compared)
        non_zero_rows = np.any(train_radmat[1] > 0, axis=(1, 2))
        non_zero_row_args = np.argwhere(non_zero_rows)[0:100]
        train_rows = train_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * train_radmat.shape[2] *
            train_radmat.shape[3],
        ))
        test_rows = test_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * test_radmat.shape[2] *
            test_radmat.shape[3],
        ))

        if train_rows.shape[
                0] > 0 and not sim_params.allow_train_test_to_be_identical:
            # The diagonal of cdist is the distance between the i-th train
            # row and the i-th test row; all zeros means identical rows.
            any_differences = np.any(
                np.diagonal(cdist(train_rows, test_rows)) != 0.0)
            check.affirm(any_differences, "Train and test sets are identical")

    # MERGE the first two axes of each matrix into a single row axis.
    # NOTE(review): the return value of .reshape() is discarded, so this
    # only has an effect if these objects reshape themselves in place
    # (np.ndarray.reshape does NOT) -- confirm the type _run_sim returns.
    if train_dyemat is not None:
        train_dyemat.reshape((train_dyemat.shape[0] * train_dyemat.shape[1],
                              *train_dyemat.shape[2:]))
    if train_radmat is not None:
        train_radmat.reshape((train_radmat.shape[0] * train_radmat.shape[1],
                              *train_radmat.shape[2:]))
    if test_dyemat is not None:
        test_dyemat.reshape((test_dyemat.shape[0] * test_dyemat.shape[1],
                             *test_dyemat.shape[2:]))
    if test_radmat is not None:
        test_radmat.reshape((test_radmat.shape[0] * test_radmat.shape[1],
                             *test_radmat.shape[2:]))

    # REMOVE all-zero rows (EXCEPT THE FIRST which is the nul row)
    assert np.all(train_dyemat[0, :, :] == 0)
    some_non_zero_row_args = np.argwhere(
        ~np.all(train_dyemat[:, :, :] == 0, axis=(1, 2))).flatten()
    some_non_zero_row_args = np.concatenate(([0], some_non_zero_row_args))

    # TASK: Plucking out the non-zero rows doesn't work well
    # with Arrtay results -- I need to rethink that.
    # For now, I'm converting this back to np.ndarray
    train_dyemat = train_dyemat[some_non_zero_row_args]
    train_radmat = train_radmat[some_non_zero_row_args]
    train_true_pep_iz = train_true_pep_iz[some_non_zero_row_args]

    if test_dyemat is not None:
        assert np.all(test_dyemat[0, :, :] == 0)
        some_non_zero_row_args = np.argwhere(
            ~np.all(test_dyemat[:, :, :] == 0, axis=(1, 2))).flatten()
        # DO not add a nul row into the test data
        # some_non_zero_row_args = np.concatenate(([0], some_non_zero_row_args))
        test_dyemat = test_dyemat[some_non_zero_row_args]
        test_radmat = test_radmat[some_non_zero_row_args]
        test_true_pep_iz = test_true_pep_iz[some_non_zero_row_args]

    return SimV1Result(
        params=sim_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_pep_recalls=train_pep_recalls,
        train_flus=train_flus,
        train_flu_remainders=train_flu_remainders,
        train_true_pep_iz=train_true_pep_iz,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_recalls=test_recalls,
        test_flus=test_flus,
        test_true_pep_iz=test_true_pep_iz,
        test_flu_remainders=test_flu_remainders,
    )
def it_pushes_msg():
    """A failed affirm puts the supplied message onto the raised exception."""
    expected_msg = "abc"
    with zest.raises(check.CheckAffirmError) as raised:
        check.affirm(False, expected_msg)
    assert raised.exception.message == expected_msg
def it_accepts_exception_instance():
    """A failed affirm raises the exception instance passed as exp."""
    exception_instance = ValueError()
    with zest.raises(ValueError):
        check.affirm(False, exp=exception_instance)
def it_accepts_exception_type():
    """A failed affirm raises the exception type passed as exp."""
    exception_type = ValueError
    with zest.raises(exception_type):
        check.affirm(False, exp=exception_type)
def it_raises_checkerror_by_default():
    """With no exp given, a failed affirm raises check.CheckAffirmError."""
    default_error = check.CheckAffirmError
    with zest.raises(default_error):
        check.affirm(False)
def it_passes():
    """An affirm on a true condition does not raise."""
    condition = True
    check.affirm(condition)
def false_calls(self, elem_i, n_false):
    """
    Return the top n_false off-diagonal entries in the row and in the
    column of elem_i of this (square) confusion matrix.

    For a nice viz of the confusion matrix, see here:
    https://stackoverflow.com/a/50671617
    (Except that viz is transposed compared to our mats.)

    There's two kinds of off-diagonal failures wrt to any element "A":
        * FALSE-POSITIVES: Elements that are called A but are not A.
          Mnemonic: "im-POS-ters", ie the false "POS-itives"
        * FALSE-NEGATIVES: Elements that are not A but that steal calls
          from true A's. Think of these as "thieves" stealing from the
          truth.

    These are symmetric relationships: If B is an imposter of A then A is
    a thief of B. True negatives wrt to "A" are all of the elements
    outside the row and col of A.

    Returns:
        None if n_false >= the matrix dimension; otherwise a list of
        (label, index, fraction) tuples: the "FP{i}" entries (from the
        row of elem_i, as a fraction of the row sum) followed by the
        "FN{i}" entries (from the column, as a fraction of the column
        sum). Entries whose index is 0 are skipped.
    """
    n_dim = self.shape[0]
    assert self.shape[1] == n_dim

    if n_false >= n_dim:
        return None

    check.affirm(0 <= elem_i < n_dim, "elem_i out of range")

    # For now, only square matrices are supported
    assert self.shape[0] == self.shape[1]

    # Grab sums BEFORE removing the diagonal
    row_total = self[elem_i, :].sum()
    col_total = self[:, elem_i].sum()

    # FETCH the top falses (imposters and thieves) with the diag removed
    # to avoid self-collision. Work on a copy so self is untouched.
    off_diag = np.copy(self)
    np.fill_diagonal(off_diag, 0)

    fp_order = np.argsort(off_diag[elem_i, :])[::-1]
    fn_order = np.argsort(off_diag[:, elem_i])[::-1]

    calls = []

    for rank in range(n_false):
        other_i = fp_order[rank]
        if other_i > 0:
            frac = utils.np_safe_divide(
                off_diag[elem_i, other_i], row_total, default=0.0
            )
            calls.append((f"FP{rank}", other_i, float(frac)))

    for rank in range(n_false):
        other_i = fn_order[rank]
        if other_i > 0:
            frac = utils.np_safe_divide(
                off_diag[other_i, elem_i], col_total, default=0.0
            )
            calls.append((f"FN{rank}", other_i, float(frac)))

    return calls
def _do_movie_import_nd2(
    scan_result,
    input_field_i,
    output_field_i,
    start_cycle,
    n_cycles,
    target_mea,
    import_result,
    dst_ch_i_to_src_ch_i,
):
    """
    Import Nikon ND2 "movie" files.

    In this mode, each .nd2 file is a collection of images taken
    sequentially for a single field. This is in contrast to the typical
    mode where each .nd2 file is a chemical cycle spanning all
    fields/channels.

    Since all data for a given field is already in a single file, the
    parallel scatter/gather employed by the "normal" ND2 import task is
    not necessary.

    The "fields" from the .nd2 file become "cycles" as if the instrument
    had taken 1 field with a lot of cycles.

    Returns:
        (output_field_i, number of cycles actually present in the file)
    """
    # Scratch image re-used for every padded frame
    pad_buffer = np.zeros((target_mea, target_mea), OUTPUT_NP_TYPE)

    with _nd2(scan_result.nd2_paths[input_field_i]) as nd2:
        n_actual_cycles = nd2.n_fields
        n_dst_channels = len(dst_ch_i_to_src_ch_i)
        src_dim = nd2.dim

        field_arr = import_result.allocate_field(
            output_field_i,
            (n_dst_channels, n_cycles, target_mea, target_mea),
            OUTPUT_NP_TYPE,
        )
        field_ims = field_arr.arr()

        assert start_cycle + n_cycles <= n_actual_cycles

        check.affirm(
            src_dim[0] <= target_mea and src_dim[1] <= target_mea,
            f"nd2 scatter requested {target_mea} which is smaller than {src_dim}",
        )

        needs_pad = src_dim[0] != target_mea or src_dim[1] != target_mea

        for dst_ch_i in range(n_dst_channels):
            src_ch_i = dst_ch_i_to_src_ch_i[dst_ch_i]

            for cy_out_i in range(n_cycles):
                cy_in_i = start_cycle + cy_out_i
                frame = nd2.get_field(cy_in_i, src_ch_i).astype(OUTPUT_NP_TYPE)

                if needs_pad:
                    # CONVERT into a zero pad
                    pad_buffer[0:src_dim[0], 0:src_dim[1]] = frame
                    frame = pad_buffer

                # Assignment copies, so re-using pad_buffer is safe
                field_ims[dst_ch_i, cy_out_i, :, :] = frame

        # Task: Add quality

        import_result.save_field(output_field_i, field_arr)

    return output_field_i, n_actual_cycles
def sim_v2(sim_v2_params, prep_result, progress=None, pipeline=None):
    """
    Simulate train and (unless is_survey) test data for the peptides in
    prep_result.

    Training data:
        * always includes decoys
        * may include radiometry (sim_v2_params.train_includes_radmat)

    Test data:
        * does not include decoys
        * always includes radiometry
        * skipped if sim_v2_params.is_survey

    If prep_result.get_photobleaching() supplies a dyemat it is used
    directly; otherwise the dyemat is generated with _dyemat_sim.

    Arguments:
        sim_v2_params: simulation parameters.
        prep_result: source of the peptide sequences.
        progress: passed through to _dyemat_sim / _radmat_sim.
        pipeline: optional; when given, set_phase is called per phase.

    Returns:
        A SimV2Result. Fields that were not generated are None.
    """
    test_dyemat = None
    test_radmat = None
    test_true_pep_iz = None
    test_true_dye_iz = None
    test_true_row_ks = None
    train_radmat = None
    train_true_pep_iz = None
    train_true_dye_iz = None
    train_true_row_ks = None

    phase_i = 0
    n_phases = 1
    if sim_v2_params.train_includes_radmat:
        n_phases += 1
    if not sim_v2_params.is_survey:
        n_phases += 2

    # Training data
    #   * always includes decoys
    #   * may include radiometry
    # -----------------------------------------------------------------------

    # RANDOM cleanup
    # Make the pipeline have a stub so I don't have to if pipeline...
    # Get rid of phases and just pass in a name to display
    if pipeline:
        pipeline.set_phase(phase_i, n_phases)
        phase_i += 1

    n_channels, n_cycles = sim_v2_params.n_channels_and_cycles

    train_dyemat, train_dyepeps, train_pep_recalls = prep_result.get_photobleaching(
    )
    if train_dyemat is None:
        # This is a regular, non-photo-bleaching run
        pepseqs = prep_result.pepseqs__with_decoys()
        check.t(pepseqs, pd.DataFrame)  # (pep_i, aa, pep_off_in_pro)

        pcbs = sim_v2_params.pcbs(
            pepseqs)  # (p)ep_i, (c)hannel_i, (b)right_probability

        train_dyemat, train_dyepeps, train_pep_recalls = _dyemat_sim(
            sim_v2_params,
            pcbs,
            sim_v2_params.n_samples_train,
            progress,
        )

    n_dyts = train_dyemat.shape[0]
    check.array_t(
        train_dyemat,
        shape=(
            n_dyts,
            n_channels * n_cycles,
        ),  # unique dyetracks (n_rows, n_channels * n_cycles)
    )

    # dyepeps are a map between dyetracks and peptides with a count
    # Example:
    #   (2, 5, 110) => dyt_i=2 was generated by pep_i==5 110 times
    #   (2, 7, 50)  => dyt_i=2 was generated by pep_i==7 50 times
    check.array_t(train_dyepeps, shape=(None, 3))  # (dyt_i, pep_i, count)

    assert np.max(train_dyepeps[:, 0]) + 1 == n_dyts

    # SORT dyepeps by dyetrack (col 0) first then reverse by count (col 2)
    # Note that np.lexsort puts the primary sort key LAST in the argument
    # Seems like this sorting should be in _dyemat_sim?
    train_dyepeps = train_dyepeps[np.lexsort(
        (-train_dyepeps[:, 2], train_dyepeps[:, 0]))]

    if sim_v2_params.train_includes_radmat:
        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        # The last element must be bound to train_true_row_ks (it was
        # previously mis-spelled, leaving the returned field always None)
        (
            train_radmat,
            train_true_pep_iz,
            train_true_dye_iz,
            train_true_row_ks,
        ) = _radmat_sim(
            train_dyemat.reshape((
                train_dyemat.shape[0],
                sim_v2_params.n_channels,
                sim_v2_params.n_cycles,
            )),
            train_dyepeps,
            sim_v2_params.by_channel(),
            sim_v2_params.n_samples_train,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

    # Test data
    #   * does not include decoys
    #   * always includes radiometry
    #   * may include dyetracks
    #   * skipped if is_survey
    # -----------------------------------------------------------------------
    if not sim_v2_params.is_survey:
        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        test_dyemat, test_dyepeps, test_pep_recalls = prep_result.get_photobleaching(
        )
        if test_dyemat is None:
            # This is a regular, non-photo-bleaching run
            test_dyemat, test_dyepeps, test_pep_recalls = _dyemat_sim(
                sim_v2_params,
                sim_v2_params.pcbs(prep_result.pepseqs__no_decoys()),
                sim_v2_params.n_samples_test,
                progress,
            )

        # SORT dyepeps by dyetrack (col 0) first then reverse by count (col 2)
        # Note that np.lexsort puts the primary sort key LAST in the argument
        test_dyepeps = test_dyepeps[np.lexsort(
            (-test_dyepeps[:, 2], test_dyepeps[:, 0]))]

        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        # NOTE(review): the train call passes by_channel() where this call
        # passes channel__priors() -- confirm that the difference is intended.
        (
            test_radmat,
            test_true_pep_iz,
            test_true_dye_iz,
            test_true_row_ks,
        ) = _radmat_sim(
            test_dyemat.reshape(
                (test_dyemat.shape[0], sim_v2_params.n_channels,
                 sim_v2_params.n_cycles)),
            test_dyepeps,
            sim_v2_params.channel__priors(),
            sim_v2_params.n_samples_test,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

        if not sim_v2_params.allow_train_test_to_be_identical:
            # Move to a standalone _method
            # TASK: Add a dyepeps check
            if (train_radmat is not None
                    and train_radmat.shape[0] == test_radmat.shape[0]):
                # Flatten each radmat row to 1D before comparing train to test
                check.affirm(
                    not _any_identical_non_zero_rows(
                        train_radmat.reshape((
                            train_radmat.shape[0],
                            train_radmat.shape[1] * train_radmat.shape[2],
                        )),
                        test_radmat.reshape((
                            test_radmat.shape[0],
                            test_radmat.shape[1] * test_radmat.shape[2],
                        )),
                    ),
                    "Train and test sets are identical. Probably RNG bug.",
                )

        # REMOVE the test rows whose true peptide index is 0
        # Seems like the remove should go into _dye
        non_zero_rows = np.argwhere(test_true_pep_iz != 0).flatten()
        test_radmat = test_radmat[non_zero_rows]
        test_true_pep_iz = test_true_pep_iz[non_zero_rows]
        test_true_dye_iz = test_true_dye_iz[non_zero_rows]
        if test_true_row_ks is not None:
            test_true_row_ks = test_true_row_ks[non_zero_rows]

    sim_result_v2 = SimV2Result(
        params=sim_v2_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_pep_recalls=train_pep_recalls,
        train_true_pep_iz=train_true_pep_iz,
        train_true_dye_iz=train_true_dye_iz,
        train_dyepeps=train_dyepeps,
        train_true_row_ks=train_true_row_ks,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_true_pep_iz=test_true_pep_iz,
        test_true_dye_iz=test_true_dye_iz,
        test_true_row_ks=test_true_row_ks,
        _flus=None,
    )

    if sim_v2_params.generate_flus:
        # Why optional? Should it be optimized?
        sim_result_v2._generate_flu_info(prep_result)

    return sim_result_v2