Beispiel #1
0
    def test_nn_call_bag(self, use_train_data=False):
        """
        Build the CallBag for the NN classifier of this plaster.run.

        Arguments:
            use_train_data: When True the bag is built from the classifier's
                calls on the training set (useful for looking at
                over-fitting); otherwise from its calls on the test set.

        Note that scores are always taken from test_nn.test_scores, whichever
        data set is selected.
        """
        nn_result = self.test_nn

        if not use_train_data:
            true_iz = nn_result.test_true_pep_iz
            pred_iz = nn_result.test_pred_pep_iz
            pr = nn_result.test_peps_pr
        else:
            true_iz = nn_result.train_true_pep_iz
            pred_iz = nn_result.train_pred_pep_iz
            # Both arrays are only populated when the task ran on the training set
            has_train_calls = not (true_iz is None or pred_iz is None)
            check.affirm(
                has_train_calls,
                "The test_nn task was not run with the training_set",
            )
            pr = nn_result.train_peps_pr

        bag_kwargs = dict(
            true_pep_iz=true_iz,
            pred_pep_iz=pred_iz,
            scores=nn_result.test_scores,
            prep_result=self.prep,
            sim_result=self.sim,
            cached_pr=pr,
            classifier_name="nn",
        )
        return CallBag(**bag_kwargs)
Beispiel #2
0
    def test_rf_call_bag(self, use_train_data=False):
        """
        Build the CallBag for the RF classifier of this plaster.run.

        Arguments:
            use_train_data: When True the bag is built from the classifier's
                calls on the training set (useful for looking at
                over-fitting); otherwise from its calls on the test set.
        """
        rf_result = self.test_rf

        if not use_train_data:
            true_iz = rf_result.test_true_pep_iz
            pred_iz = rf_result.test_pred_pep_iz
            call_scores = rf_result.test_scores
            class_scores = rf_result.test_all_class_scores
            pr = rf_result.test_peps_pr
            pr_abund = rf_result.test_peps_pr_abund
        else:
            true_iz = rf_result.train_true_pep_iz
            pred_iz = rf_result.train_pred_pep_iz
            call_scores = rf_result.train_scores
            class_scores = rf_result.train_all_class_scores
            pr = rf_result.train_peps_pr
            pr_abund = rf_result.train_peps_pr_abund
            # Both arrays are only populated when the task ran on the training set
            has_train_calls = not (true_iz is None or pred_iz is None)
            check.affirm(
                has_train_calls,
                "The test_rf task was not run with the training_set",
            )

        bag_kwargs = dict(
            true_pep_iz=true_iz,
            pred_pep_iz=pred_iz,
            scores=call_scores,
            all_class_scores=class_scores,
            prep_result=self.prep,
            sim_result=self.sim,
            cached_pr=pr,
            cached_pr_abund=pr_abund,
            classifier_name="rf",
        )
        return CallBag(**bag_kwargs)
Beispiel #3
0
 def it_handles_protease_none():
     """
     With protease=None and two labels the permutator must yield exactly two
     schemes, and the first element of each scheme must be None.
     """
     gen = gen_klass(n_edmans=1, protease=None, label_set=["A", "B"])
     perms = list(gen.run_parameter_permutator())
     check.affirm(len(perms) == 2, "permutator should return 2 schemes")
     # Identity comparison: None is a singleton, and "is" cannot be
     # redirected by a custom __eq__ on whatever the scheme holds.
     check.affirm(
         perms[0][0] is None and perms[1][0] is None,
         "both schemes should have protease=None",
     )
Beispiel #4
0
def _z_stack_import(
    nd2_path: Path,
    target_mea: int,
    import_result: ImsImportResult,
    dst_ch_i_to_src_ch_i: List[int],
    movie_n_slices_per_field,
):
    """
    Import a single ND2 file that holds multiple fields.

    The frames of the file are consumed in consecutive runs of
    movie_n_slices_per_field frames; each run becomes one output field whose
    "cycles" are the frames of that run. Frames smaller than
    (target_mea, target_mea) are zero padded at the bottom/right.

    Arguments:
        nd2_path: path of the ND2 file to read.
        target_mea: edge length of the square output images.
        import_result: object used to allocate and save each output field.
        dst_ch_i_to_src_ch_i: for every destination channel index, the source
            channel index to read from the ND2 file.
        movie_n_slices_per_field: number of frames per output field; must
            divide the frame count of the file.

    Returns:
        (list of the output field indices, movie_n_slices_per_field)
    """
    # Scratch buffer for padding. Only the top-left source-sized region is
    # ever written, so the padding area stays zero across frames.
    pad_im = np.zeros((target_mea, target_mea), OUTPUT_NP_TYPE)

    with _nd2(nd2_path) as nd2:
        n_frames = nd2.n_fields
        n_dst_channels = len(dst_ch_i_to_src_ch_i)
        src_dim = nd2.dim

        assert n_frames % movie_n_slices_per_field == 0
        n_fields = n_frames // movie_n_slices_per_field

        for field_i in range(n_fields):
            field_shape = (
                n_dst_channels,
                movie_n_slices_per_field,
                target_mea,
                target_mea,
            )
            field_arr = import_result.allocate_field(
                field_i, field_shape, OUTPUT_NP_TYPE
            )
            field_ims = field_arr.arr()

            fits_target = src_dim[0] <= target_mea and src_dim[1] <= target_mea
            check.affirm(
                fits_target,
                f"nd2 scatter requested {target_mea} which is smaller than {src_dim}",
            )

            needs_pad = src_dim[0] != target_mea or src_dim[1] != target_mea
            first_frame_i = field_i * movie_n_slices_per_field

            for dst_ch_i, src_ch_i in enumerate(dst_ch_i_to_src_ch_i):
                for slice_i in range(movie_n_slices_per_field):
                    frame = nd2.get_field(first_frame_i + slice_i, src_ch_i)
                    im = frame.astype(OUTPUT_NP_TYPE)
                    if needs_pad:
                        # CONVERT into a zero pad
                        pad_im[0:src_dim[0], 0:src_dim[1]] = im
                        im = pad_im

                    field_ims[dst_ch_i, slice_i, :, :] = im

            # Task: Add quality
            import_result.save_field(field_i, field_arr)

    return list(range(n_fields)), movie_n_slices_per_field
Beispiel #5
0
def rolling_window(im, window_dim, n_samples, return_coords=False):
    """
    Sample im in windows of shape window_dim, n_samples times along each of
    its last two dimensions; this may require overlapping the sample windows.

    Arguments:
        im: ndarray of ndim >= 2. The last two dimensions are windowed; any
            leading dimensions are carried through unchanged.
        window_dim: 2-tuple of int, the size of the window (smaller than the
            last two dimensions of im)
        n_samples: 2-tuple of int, the number of samples along each of the
            two windowed dimensions
        return_coords: when True, also return the start coordinates of
            every window

    Returns:
        ims: ndarray with the dtype of im and shape
            (*leading_dims, n_samples[0], n_samples[1], window_dim[0], window_dim[1])
        coords: (only when return_coords is True) float ndarray of shape
            (n_samples[0], n_samples[1], 2) holding the start index of each
            window along the two windowed dimensions

    Raises:
        ValueError: when n_samples windows of the requested length can not
            span a windowed dimension.
    """
    check.affirm(im.ndim >= 2)
    check.list_or_tuple_t(window_dim, int, expected_len=2)
    check.list_or_tuple_t(n_samples, int, expected_len=2)
    extra_dims = im.shape[0:-2]
    n_extra_dims = len(extra_dims)

    slices = [None, None]
    for d in range(2):
        im_len = im.shape[n_extra_dims + d]
        if window_dim[d] * n_samples[d] < im_len:
            raise ValueError(
                f"Dimension of {im_len} can not be spanned by {n_samples[d]} spans of length {window_dim[d]}."
            )

        # Evenly spaced window starts from 0 to the last position that fits
        starts = np.linspace(0, im_len - window_dim[d], n_samples[d], dtype=int)
        slices[d] = [slice(i, i + window_dim[d]) for i in starts]

    ims = np.zeros(
        (*extra_dims, n_samples[0], n_samples[1], window_dim[0],
         window_dim[1]),
        dtype=im.dtype,
    )
    coords = np.zeros((n_samples[0], n_samples[1], 2))

    for y, yy in enumerate(slices[0]):
        for x, xx in enumerate(slices[1]):
            coords[y, x] = (yy.start, xx.start)
            # Ellipsis stands for however many leading dimensions there are
            # (zero, one or more), so a single statement covers every case.
            ims[..., y, x, :, :] = im[..., yy, xx]

    if return_coords:
        return ims, coords
    else:
        return ims
Beispiel #6
0
def _do_movie_import_npy(
    scan_result,
    input_field_i,
    output_field_i,
    start_cycle,
    n_cycles,
    target_mea,
    import_result,
    dst_ch_i_to_src_ch_i,
):
    """
    In this mode, each field is a collection of images taken sequentially without moving stage.

    Loads the .npy image of every (destination channel, cycle) pair of
    input_field_i, zero pads it up to (target_mea, target_mea) when the
    source is smaller, and stores it in a field allocated from import_result.
    The field is saved once, after all channels have been filled.

    Arguments:
        scan_result: provides dim, n_cycles and
            npy_paths_by_field_channel_cycle for the scanned source.
        input_field_i: field index used to look up the source image paths.
        output_field_i: field index used to allocate and save the output.
        start_cycle: first source cycle to import.
        n_cycles: number of consecutive cycles to import.
        target_mea: edge length of the square output images.
        import_result: object used to allocate and save the output field.
        dst_ch_i_to_src_ch_i: for every destination channel index, the source
            channel index to read.

    Returns:
        (output_field_i, n_cycles)
    """
    n_dst_channels = len(dst_ch_i_to_src_ch_i)
    actual_dim = scan_result.dim

    # Scratch buffer for padding. Only the top-left source-sized region is
    # ever written, so the padding area stays zero across images.
    working_im = np.zeros((target_mea, target_mea), OUTPUT_NP_TYPE)

    chcy_arr = import_result.allocate_field(
        output_field_i,
        (n_dst_channels, n_cycles, target_mea, target_mea),
        OUTPUT_NP_TYPE,
    )
    chcy_ims = chcy_arr.arr()

    assert start_cycle + n_cycles <= scan_result.n_cycles
    check.affirm(
        actual_dim[0] <= target_mea and actual_dim[1] <= target_mea,
        f"npy requested {target_mea} which is smaller than {actual_dim}",
    )

    # Element-wise comparison, as in the ND2 importers, so the result does
    # not depend on the sequence type of actual_dim.
    needs_pad = actual_dim[0] != target_mea or actual_dim[1] != target_mea

    for dst_ch_i in range(n_dst_channels):
        src_ch_i = dst_ch_i_to_src_ch_i[dst_ch_i]
        for cy_in_i in range(start_cycle, start_cycle + n_cycles):
            cy_out_i = cy_in_i - start_cycle

            im_path = scan_result.npy_paths_by_field_channel_cycle[
                input_field_i, src_ch_i, cy_in_i]
            im = np.load(str(im_path)).astype(OUTPUT_NP_TYPE)
            assert im.shape == actual_dim
            if needs_pad:
                # CONVERT into a zero pad
                working_im[0:actual_dim[0], 0:actual_dim[1]] = im
                im = working_im

            chcy_ims[dst_ch_i, cy_out_i, :, :] = im

    # Task: Add quality
    # Save once, after every channel is filled, rather than once per channel.
    import_result.save_field(output_field_i, chcy_arr)

    return output_field_i, n_cycles
Beispiel #7
0
 def get_pro_ptm_locs(self, protein_id):
     """
     Return the ptm list for protein_id.

     The ptm information is kept per run, so it is collected from every run
     and checked for agreement before the first run's value is returned.

     Raises:
         ValueError: when the runs do not all report the same ptms.
     """

     def _ptms_of(run):
         return run.prep.get_pro_ptm_locs(protein_id=protein_id)

     ptms_by_run = self.all_lists(_ptms_of)
     runs_agree = all(
         [ptms_by_run[0] == other for other in ptms_by_run[1:]]
     )
     check.affirm(runs_agree, "PTMs differ in runs!", ValueError)
     return ptms_by_run[0]
Beispiel #8
0
def _do_movie_import(nd2_path, output_field_i, start_cycle, n_cycles,
                     target_dim, nd2_import_result):
    """
    Import a Nikon ND2 "movie" file.

    A movie .nd2 file holds the images taken sequentially for one single
    field, unlike the typical mode where one .nd2 file is a chemical cycle
    spanning all fields/channels. Because everything for the field lives in
    one file, the parallel scatter/gather of the "normal" ND2 import task is
    not needed.

    The "fields" of the .nd2 file are treated as "cycles", as if the
    instrument had taken 1 field with a lot of cycles.

    Arguments:
        nd2_path: path of the movie .nd2 file.
        output_field_i: field index under which the images are saved.
        start_cycle: first frame of the file to keep.
        n_cycles: number of consecutive frames to keep.
        target_dim: edge length of the square output images; smaller source
            images are zero padded at the bottom/right.
        nd2_import_result: object whose save_field() receives the images.

    Returns:
        (output_field_i, number of frames found in the file). Note the
        second value is the total frame count, not n_cycles.
    """
    nd2 = _nd2(nd2_path)

    ims = nd2.get_fields()
    n_actual_cycles, n_channels = ims.shape[0], ims.shape[1]
    src_dim = ims.shape[2:4]

    assert start_cycle + n_cycles <= n_actual_cycles

    # The file is laid out as (n_fields, n_channels, dim, dim). In a movie
    # the fields play the role of cycles, so swap the first two axes to get
    # (n_channels, n_cycles, dim, dim) and keep only the requested cycles.
    end_cycle = start_cycle + n_cycles
    chcy_ims = np.swapaxes(ims, 0, 1)[:, start_cycle:end_cycle, :, :]

    fits_target = src_dim[0] <= target_dim and src_dim[1] <= target_dim
    check.affirm(
        fits_target,
        f"nd2 scatter requested {target_dim} which is smaller than {src_dim}",
    )

    if not (src_dim[0] == target_dim and src_dim[1] == target_dim):
        # CONVERT into a zero pad
        padded = np.zeros(
            (n_channels, n_cycles, target_dim, target_dim), dtype=ims.dtype
        )
        padded[:, :, 0:src_dim[0], 0:src_dim[1]] = chcy_ims
        chcy_ims = padded

    # TODO Add quality

    nd2_import_result.save_field(output_field_i, chcy_ims)

    return output_field_i, n_actual_cycles
Beispiel #9
0
def _do_nd2_scatter(src_path, start_field, n_fields, cycle_i, n_channels,
                    target_dim):
    """
    Scatter a cycle .nd2 into individual numpy files.

    For each field in [start_field, start_field + n_fields) a JSON metadata
    file is written, plus one .npy file per channel.

    Arguments:
        src_path: path of the cycle .nd2 file.
        start_field: first field index to scatter.
        n_fields: number of consecutive fields to scatter.
        cycle_i: cycle index, used in the output file names and metadata.
        n_channels: expected channel count; asserted against the file.
        target_dim: a scalar. The target will be put into this square form;
            smaller source images are zero padded at the bottom/right.

    Returns:
        List of the .npy file names written, ordered by field then channel.
    """
    nd2 = _nd2(src_path)

    ims = nd2.get_fields()
    n_src_fields = ims.shape[0]
    _n_channels = ims.shape[1]
    actual_dim = ims.shape[2:4]
    assert n_channels == _n_channels

    check.affirm(
        actual_dim[0] <= target_dim and actual_dim[1] <= target_dim,
        f"nd2 scatter requested {target_dim} which is smaller than {actual_dim}",
    )

    if actual_dim[0] != target_dim or actual_dim[1] != target_dim:
        # CONVERT into a zero pad
        # The padded buffer must hold every field of the file, not only
        # n_fields of them: ims is copied in whole and is later indexed by
        # the absolute field_i, which starts at start_field.
        new_ims = np.zeros((n_src_fields, _n_channels, target_dim, target_dim),
                           dtype=ims.dtype)
        new_ims[:, :, 0:actual_dim[0], 0:actual_dim[1]] = ims[:, :, :, :]
        ims = new_ims

    dst_files = []
    for field_i in range(start_field, start_field + n_fields):
        info = Munch(
            x=nd2.x[field_i],
            y=nd2.y[field_i],
            z=nd2.z[field_i],
            pfs_status=nd2.pfs_status[field_i],
            pfs_offset=nd2.pfs_offset[field_i],
            exposure_time=nd2.exposure_time[field_i],
            camera_temp=nd2.camera_temp[field_i],
            cycle_i=cycle_i,
            field_i=field_i,
        )
        info_dst_file = _metadata_filename_by_field_cycle(field_i, cycle_i)
        utils.json_save(info_dst_file, info)

        for channel_i in range(n_channels):
            dst_file = _npy_filename_by_field_channel_cycle(
                field_i, channel_i, cycle_i)
            dst_files.append(dst_file)
            np.save(dst_file, ims[field_i, channel_i])
    return dst_files
Beispiel #10
0
def mat_lessflat(mat, dim1=None, dim2=None):
    """
    Un-flatten the second axis of a 2-D matrix into two axes.

    At least one of dim1 / dim2 must be given; a missing one is derived by
    integer division of the number of columns.

    Example, suppose mat is (2, 6)

        m = mat_lessflat(mat, dim2=3)
        assert m.shape == (2, 2, 3)
    """
    check.array_t(mat, ndim=2)
    check.affirm(not (dim1 is None and dim2 is None))

    n_rows, n_cols = mat.shape
    if dim1 is None:
        dim1 = n_cols // dim2
    elif dim2 is None:
        dim2 = n_cols // dim1

    return mat.reshape(n_rows, dim1, dim2)
Beispiel #11
0
def context(cy_ims, locs, reg_psf_samples, peak_mea):
    """
    with radiometry.context(...) as ctx:
        zap.work_orders(do_radiometry, ...)

    Generator that validates its inputs, builds a RadiometryContext,
    initialises it through the loaded library, yields it, and always frees
    it through the library afterwards.

    NOTE(review): the usage above implies this generator is wrapped as a
    context manager; no decorator is visible on this block -- confirm.

    Arguments:
        cy_ims: float64 ndarray with 3 dimensions, unpacked as
            (n_cycles, height, width).
        locs: float64 ndarray of shape (n_peaks, 2).
        reg_psf_samples: ndarray of shape (n_divs, n_divs, 3).
        peak_mea: passed through to the context unchanged.

    Yields:
        The initialised RadiometryContext.

    Raises:
        CException: when lib.context_init returns a non-None error.
    """
    lib = load_lib()

    check.array_t(cy_ims, ndim=3, dtype=np.float64)
    n_cycles, height, width = cy_ims.shape

    check.array_t(locs, ndim=2, dtype=np.float64)
    check.affirm(locs.shape[1] == 2)
    n_peaks = locs.shape[0]

    check.array_t(reg_psf_samples, ndim=3)
    n_divs, n_divs_w, n_params = reg_psf_samples.shape
    # The PSF sample grid must be square with exactly 3 parameters per cell
    assert n_divs == n_divs_w
    assert n_params == 3

    # Output buffer handed to the library: 4 values per (peak, cycle)
    out_radiometry = np.zeros((n_peaks, n_cycles, 4), dtype=np.float64)

    ctx = RadiometryContext(
        cy_ims=F64Arr.from_ndarray(cy_ims),
        locs=F64Arr.from_ndarray(locs),
        # presumably kept so the ndarray stays referenced (and reachable
        # from Python) for the lifetime of ctx -- TODO confirm
        _locs=locs,
        n_cycles=n_cycles,
        n_peaks=n_peaks,
        n_divs=n_divs,
        peak_mea=peak_mea,
        height=height,
        width=width,
        reg_psf_samples=F64Arr.from_ndarray(reg_psf_samples),
        out_radiometry=F64Arr.from_ndarray(out_radiometry),
        # presumably the same keep-alive / Python-side access as _locs
        # -- TODO confirm
        _out_radiometry=out_radiometry,
    )

    # A non-None return value from context_init is treated as an error
    error = lib.context_init(ctx)
    if error is not None:
        raise CException(error)

    try:
        yield ctx
    finally:
        # Free the context even when the body of the with-block raised
        lib.context_free(ctx)
Beispiel #12
0
def intersection_roi_from_aln_offsets(aln_offsets, raw_dim):
    """
    Compute the ROI that contains pixels from all frames.

    Arguments:
        aln_offsets: per-frame offsets (as returned from align); the first
            one must be (0, 0).
        raw_dim: the dim of the original images.

    Returns:
        An ROI, in the coordinate space of frame [0], covering the region
        that has pixels from every frame.
    """
    aln_offsets = np.array(aln_offsets)
    first_is_origin = np.all(aln_offsets[0] == (0, 0))
    check.affirm(first_is_origin, "intersection roi must start with (0,0)")

    offs_0 = aln_offsets[:, 0]
    offs_1 = aln_offsets[:, 1]

    # Extent of the overlap along each axis: the smallest far edge minus
    # the largest near edge over all frames.
    clip_0 = np.min(offs_0 + raw_dim[0]) - np.max(offs_0)
    clip_1 = np.min(offs_1 + raw_dim[1]) - np.max(offs_1)

    bot = max(0, -np.min(offs_0))
    top = min(raw_dim[0], bot + clip_0)
    lft = max(0, -np.min(offs_1))
    rgt = min(raw_dim[1], lft + clip_1)

    return ROI(loc=YX(bot, lft), dim=HW(top - bot, rgt - lft))
Beispiel #13
0
def sim(sim_params, prep_result, progress=None, pipeline=None):
    """
    Map the simulation over the peptides in prep_result.

    This is actually performed twice in order to get a train and (different!) test set
    The "train" set includes decoys, the test set does not; furthermore
    the error modes and radiometry noise is different in each set.

    Arguments:
        sim_params: simulation parameters. Reads random_seed,
            n_samples_train, n_samples_test, is_survey and
            allow_train_test_to_be_identical. NOTE: when random_seed is
            None it is overwritten in place with the current time.
        prep_result: provides pepseqs__with_decoys(), pepseqs__no_decoys()
            and n_peps.
        progress: passed through to _run_sim.
        pipeline: optional; when truthy, set_phase() is called before each
            of the (up to) two simulation passes.

    Returns:
        A SimResult holding the train_* arrays and the test_* arrays. All
        test_* values are None when sim_params.is_survey is set.
    """

    if sim_params.random_seed is None:
        sim_params.random_seed = int(time.time())

    # Seeds the global numpy RNG for this process
    np.random.seed(sim_params.random_seed)

    # CREATE a *training-set* for all peptides (real and decoy)
    if pipeline:
        pipeline.set_phase(0, 2)

    # Sanity check that all the peps are accounted for
    pep_seqs_with_decoys = prep_result.pepseqs__with_decoys()
    n_peps = pep_seqs_with_decoys.pep_i.nunique()
    assert n_peps == prep_result.n_peps

    (
        train_dyemat,
        train_radmat,
        train_recalls,
        train_flus,
        train_flu_remainders,
    ) = _run_sim(
        sim_params,
        pep_seqs_with_decoys,
        name="train",
        n_peps=n_peps,
        n_samples=sim_params.n_samples_train,
        progress=progress,
    )

    if sim_params.is_survey:
        # Survey mode: no test set is simulated
        test_dyemat = None
        test_radmat = None
        test_recalls = None
        test_flus = None
        test_flu_remainders = None
    else:
        # CREATE a *test-set* for real-only peptides
        if pipeline:
            pipeline.set_phase(1, 2)

        # Note: n_peps here is still the count that includes decoys
        (
            test_dyemat,
            test_radmat,
            test_recalls,
            test_flus,
            test_flu_remainders,
        ) = _run_sim(
            sim_params,
            prep_result.pepseqs__no_decoys(),
            name="test",
            n_peps=n_peps,
            n_samples=sim_params.n_samples_test,
            progress=progress,
        )

        # CHECK that the train and test are not identical in SOME non_zero_row
        # If they are, there was some sort of RNG seed errors which might happen
        # for example if sub-processes failed to re-init their RNG seeds.
        # Test this by looking at pep_i==1
        non_zero_rows = np.any(train_radmat[1] > 0, axis=(1, 2))
        # argwhere on a 1-D mask gives an (n, 1) index array; keep at most
        # the first 100 rows
        non_zero_row_args = np.argwhere(non_zero_rows)[0:100]
        # Indexing with the (n, 1) array yields (n, 1, d2, d3); flatten each
        # row to a vector so rows can be compared by distance
        train_rows = train_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * train_radmat.shape[2] *
            train_radmat.shape[3],
        ))
        test_rows = test_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * test_radmat.shape[2] *
            test_radmat.shape[3],
        ))

        if train_rows.shape[
                0] > 0 and not sim_params.allow_train_test_to_be_identical:
            # The diagonal of the pairwise distances compares train row i
            # with test row i; at least one pair has to differ
            any_differences = np.any(
                np.diagonal(cdist(train_rows, test_rows)) != 0.0)
            check.affirm(any_differences, "Train and test sets are identical")

    return SimResult(
        params=sim_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_recalls=train_recalls,
        train_flus=train_flus,
        train_flu_remainders=train_flu_remainders,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_recalls=test_recalls,
        test_flus=test_flus,
        test_flu_remainders=test_flu_remainders,
    )
Beispiel #14
0
def sim_v1(sim_params, prep_result, progress=None, pipeline=None):
    """
    Map the simulation over the peptides in prep_result.

    This is actually performed twice in order to get a train and (different!) test set
    The "train" set includes decoys, the test set does not; furthermore
    the error modes and radiometry noise is different in each set.

    After simulation the all-zero dyemat rows are dropped from the train
    set (except row 0, which is kept as the nul row) and from the test set
    (where no nul row is kept); the radmat and true_pep_iz arrays are
    filtered with the same row indices.

    Arguments:
        sim_params: simulation parameters. Reads random_seed,
            n_samples_train, n_samples_test, is_survey and
            allow_train_test_to_be_identical. NOTE: when random_seed is
            None it is overwritten in place with the current time.
        prep_result: provides pepseqs__with_decoys(), pepseqs__no_decoys()
            and n_peps.
        progress: passed through to _run_sim.
        pipeline: optional; when truthy, set_phase() is called before each
            of the (up to) two simulation passes.

    Returns:
        A SimV1Result holding the train_* and test_* values. All test_*
        values are None when sim_params.is_survey is set.
    """

    if sim_params.random_seed is None:
        sim_params.random_seed = int(time.time())

    # Seeds the global numpy RNG for this process
    np.random.seed(sim_params.random_seed)

    # CREATE a *training-set* for all peptides (real and decoy)
    if pipeline:
        pipeline.set_phase(0, 2)

    # Sanity check that all the peps are accounted for
    pep_seqs_with_decoys = prep_result.pepseqs__with_decoys()
    n_peps = pep_seqs_with_decoys.pep_i.nunique()
    assert n_peps == prep_result.n_peps

    (
        train_dyemat,
        train_radmat,
        train_pep_recalls,
        train_flus,
        train_flu_remainders,
        train_true_pep_iz,
    ) = _run_sim(
        sim_params,
        pep_seqs_with_decoys,
        name="train",
        n_peps=n_peps,
        n_samples=sim_params.n_samples_train,
        progress=progress,
    )

    if sim_params.is_survey:
        # Survey mode: no test set is simulated
        test_dyemat = None
        test_radmat = None
        test_recalls = None
        test_flus = None
        test_flu_remainders = None
        test_true_pep_iz = None
    else:
        # CREATE a *test-set* for real-only peptides
        if pipeline:
            pipeline.set_phase(1, 2)

        # Note: n_peps here is still the count that includes decoys
        (
            test_dyemat,
            test_radmat,
            test_recalls,
            test_flus,
            test_flu_remainders,
            test_true_pep_iz,
        ) = _run_sim(
            sim_params,
            prep_result.pepseqs__no_decoys(),
            name="test",
            n_peps=n_peps,
            n_samples=sim_params.n_samples_test,
            progress=progress,
        )

        # CHECK that the train and test are not identical in SOME non_zero_row
        # If they are, there was some sort of RNG seed errors which might happen
        # for example if sub-processes failed to re-init their RNG seeds.
        # Test this by looking at pep_i==1
        non_zero_rows = np.any(train_radmat[1] > 0, axis=(1, 2))
        # argwhere on a 1-D mask gives an (n, 1) index array; keep at most
        # the first 100 rows
        non_zero_row_args = np.argwhere(non_zero_rows)[0:100]
        # Indexing with the (n, 1) array yields (n, 1, d2, d3); flatten each
        # row to a vector so rows can be compared by distance
        train_rows = train_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * train_radmat.shape[2] *
            train_radmat.shape[3],
        ))
        test_rows = test_radmat[1, non_zero_row_args].reshape((
            non_zero_row_args.shape[0],
            non_zero_row_args.shape[1] * test_radmat.shape[2] *
            test_radmat.shape[3],
        ))

        if train_rows.shape[
                0] > 0 and not sim_params.allow_train_test_to_be_identical:
            # The diagonal of the pairwise distances compares train row i
            # with test row i; at least one pair has to differ
            any_differences = np.any(
                np.diagonal(cdist(train_rows, test_rows)) != 0.0)
            check.affirm(any_differences, "Train and test sets are identical")

    # MERGE the first two axes of each matrix into a single row axis.
    # NOTE(review): the return values of these reshape() calls are
    # discarded. On a plain np.ndarray that makes them no-ops; they only
    # have an effect if _run_sim returns objects whose reshape() works in
    # place -- confirm against the type returned by _run_sim. The 3-D
    # indexing below relies on the merge having happened.
    if train_dyemat is not None:
        train_dyemat.reshape((train_dyemat.shape[0] * train_dyemat.shape[1],
                              *train_dyemat.shape[2:]))
    if train_radmat is not None:
        train_radmat.reshape((train_radmat.shape[0] * train_radmat.shape[1],
                              *train_radmat.shape[2:]))
    if test_dyemat is not None:
        test_dyemat.reshape((test_dyemat.shape[0] * test_dyemat.shape[1],
                             *test_dyemat.shape[2:]))
    if test_radmat is not None:
        test_radmat.reshape((test_radmat.shape[0] * test_radmat.shape[1],
                             *test_radmat.shape[2:]))

    # REMOVE all-zero rows (EXCEPT THE FIRST which is the nul row)
    assert np.all(train_dyemat[0, :, :] == 0)
    some_non_zero_row_args = np.argwhere(
        ~np.all(train_dyemat[:, :, :] == 0, axis=(1, 2))).flatten()
    # Row 0 is all-zero (asserted above) so it is not in the list yet;
    # prepend it to keep the nul row
    some_non_zero_row_args = np.concatenate(([0], some_non_zero_row_args))

    # TASK: Plucking out the non-zero rows doesn't work well
    # with Array results -- I need to rethink that.
    # For now, I'm converting this back to np.ndarray
    train_dyemat = train_dyemat[some_non_zero_row_args]
    train_radmat = train_radmat[some_non_zero_row_args]
    train_true_pep_iz = train_true_pep_iz[some_non_zero_row_args]

    if test_dyemat is not None:
        assert np.all(test_dyemat[0, :, :] == 0)
        some_non_zero_row_args = np.argwhere(
            ~np.all(test_dyemat[:, :, :] == 0, axis=(1, 2))).flatten()
        # DO not add a nul row into the test data
        # some_non_zero_row_args = np.concatenate(([0], some_non_zero_row_args))
        test_dyemat = test_dyemat[some_non_zero_row_args]
        test_radmat = test_radmat[some_non_zero_row_args]
        test_true_pep_iz = test_true_pep_iz[some_non_zero_row_args]

    return SimV1Result(
        params=sim_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_pep_recalls=train_pep_recalls,
        train_flus=train_flus,
        train_flu_remainders=train_flu_remainders,
        train_true_pep_iz=train_true_pep_iz,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_recalls=test_recalls,
        test_flus=test_flus,
        test_true_pep_iz=test_true_pep_iz,
        test_flu_remainders=test_flu_remainders,
    )
Beispiel #15
0
 def it_pushes_msg():
     """The message given to a failing affirm is carried on the raised exception."""
     with zest.raises(check.CheckAffirmError) as raised:
         check.affirm(False, "abc")
     pushed_msg = raised.exception.message
     assert pushed_msg == "abc"
Beispiel #16
0
 def it_accepts_exception_instance():
     """A failing affirm given an exception instance via exp is expected to raise that exception's type."""
     # The raised-exception handle is not inspected, so it is not bound.
     with zest.raises(ValueError):
         check.affirm(False, exp=ValueError())
Beispiel #17
0
 def it_accepts_exception_type():
     """A failing affirm given an exception class via exp is expected to raise that class."""
     with zest.raises(ValueError):
         check.affirm(False, exp=ValueError)
Beispiel #18
0
 def it_raises_checkerror_by_default():
     """A failing affirm without an exp argument is expected to raise check.CheckAffirmError."""
     with zest.raises(check.CheckAffirmError):
         check.affirm(False)
Beispiel #19
0
 def it_passes():
     """An affirm on a True condition is expected to complete without raising."""
     check.affirm(True)
Beispiel #20
0
    def false_calls(self, elem_i, n_false):
        """
        Return the top n_false false-positives and false-negatives of elem_i.

        For a nice viz of the confusion matrix, see here:
        https://stackoverflow.com/a/50671617
        (Except that viz is transposed compared to our mats.)

        With respect to any element "A" there are two kinds of off-diagonal
        failures:
            * FALSE-POSITIVES: Elements that are called A but are not A.
              Mnemonic: "im-POS-ters", ie the false "POS-itives"
            * FALSE-NEGATIVES: Elements that are not A but that steal calls
              from true A's; think of them as "thieves" stealing from
              the truth.

            The relationship is symmetric:
            If B is an imposter of A then A is a thief of B.

            True negatives wrt to "A" are all of the elements outside the row
            and col of A.

        Arguments:
            elem_i: index of the element of interest; must be in range.
            n_false: how many of the largest entries to report per kind.

        Returns:
            None when n_false >= the matrix dimension. Otherwise a list of
            (label, index, fraction) tuples: first the "FP{i}" entries taken
            from row elem_i (fraction of the row sum), then the "FN{i}"
            entries taken from column elem_i (fraction of the column sum).
            Entries whose index is 0 are left out.
        """

        # For now, only square matrices are supported
        n_dim = self.shape[0]
        assert self.shape[1] == n_dim

        if n_false >= n_dim:
            return None

        check.affirm(0 <= elem_i < n_dim, "elem_i out of range")

        # Grab sums BEFORE removing the diagonal
        row_total = self[elem_i, :].sum()
        col_total = self[:, elem_i].sum()

        # Work on a copy with the diagonal zeroed so that an element can
        # never show up as its own imposter or thief.
        off_diag = np.copy(self)
        np.fill_diagonal(off_diag, 0)
        fp_order = np.argsort(off_diag[elem_i, :])[::-1]
        fn_order = np.argsort(off_diag[:, elem_i])[::-1]

        false_positive_tuples = []
        for rank in range(n_false):
            other_i = fp_order[rank]
            if other_i > 0:
                frac = utils.np_safe_divide(
                    off_diag[elem_i, other_i], row_total, default=0.0
                )
                false_positive_tuples.append((f"FP{rank}", other_i, float(frac)))

        false_negative_tuples = []
        for rank in range(n_false):
            other_i = fn_order[rank]
            if other_i > 0:
                frac = utils.np_safe_divide(
                    off_diag[other_i, elem_i], col_total, default=0.0
                )
                false_negative_tuples.append((f"FN{rank}", other_i, float(frac)))

        return false_positive_tuples + false_negative_tuples
Beispiel #21
0
def _do_movie_import_nd2(
    scan_result,
    input_field_i,
    output_field_i,
    start_cycle,
    n_cycles,
    target_mea,
    import_result,
    dst_ch_i_to_src_ch_i,
):
    """
    Import a Nikon ND2 "movie" file.

    A movie .nd2 file holds the images taken sequentially for one single
    field, unlike the typical mode where one .nd2 file is a chemical cycle
    spanning all fields/channels. Because everything for the field lives in
    one file, the parallel scatter/gather of the "normal" ND2 import task is
    not needed.

    The "fields" of the .nd2 file are treated as "cycles", as if the
    instrument had taken 1 field with a lot of cycles.

    Arguments:
        scan_result: provides nd2_paths, indexed by input_field_i.
        input_field_i: index of the .nd2 file to read.
        output_field_i: field index used to allocate and save the output.
        start_cycle: first frame of the file to import.
        n_cycles: number of consecutive frames to import.
        target_mea: edge length of the square output images; smaller source
            images are zero padded at the bottom/right.
        import_result: object used to allocate and save the output field.
        dst_ch_i_to_src_ch_i: for every destination channel index, the source
            channel index to read.

    Returns:
        (output_field_i, number of frames found in the file). Note the
        second value is the total frame count, not n_cycles.
    """
    # Scratch buffer for padding. Only the top-left source-sized region is
    # ever written, so the padding area stays zero across frames.
    pad_im = np.zeros((target_mea, target_mea), OUTPUT_NP_TYPE)

    with _nd2(scan_result.nd2_paths[input_field_i]) as nd2:
        n_actual_cycles = nd2.n_fields
        n_dst_channels = len(dst_ch_i_to_src_ch_i)
        src_dim = nd2.dim

        field_shape = (n_dst_channels, n_cycles, target_mea, target_mea)
        field_arr = import_result.allocate_field(
            output_field_i, field_shape, OUTPUT_NP_TYPE
        )
        field_ims = field_arr.arr()

        assert start_cycle + n_cycles <= n_actual_cycles
        fits_target = src_dim[0] <= target_mea and src_dim[1] <= target_mea
        check.affirm(
            fits_target,
            f"nd2 scatter requested {target_mea} which is smaller than {src_dim}",
        )

        needs_pad = src_dim[0] != target_mea or src_dim[1] != target_mea

        for dst_ch_i, src_ch_i in enumerate(dst_ch_i_to_src_ch_i):
            for cy_out_i in range(n_cycles):
                frame = nd2.get_field(start_cycle + cy_out_i, src_ch_i)
                im = frame.astype(OUTPUT_NP_TYPE)

                if needs_pad:
                    # CONVERT into a zero pad
                    pad_im[0:src_dim[0], 0:src_dim[1]] = im
                    im = pad_im

                field_ims[dst_ch_i, cy_out_i, :, :] = im

        # Task: Add quality
        import_result.save_field(output_field_i, field_arr)

    return output_field_i, n_actual_cycles
Beispiel #22
0
def sim_v2(sim_v2_params, prep_result, progress=None, pipeline=None):
    """
    Run the v2 simulation and package everything into a SimV2Result.

    Training data:
        * A dyemat/dyepeps/pep_recalls triple is always produced, either
          from prep_result.get_photobleaching() or, when that returns a
          None dyemat, from _dyemat_sim over prep_result.pepseqs__with_decoys().
        * A radmat is produced only if sim_v2_params.train_includes_radmat.

    Test data (skipped entirely if sim_v2_params.is_survey):
        * Dyemat/dyepeps come from prep_result.get_photobleaching() or,
          when that returns a None dyemat, from _dyemat_sim over
          prep_result.pepseqs__no_decoys().
        * A radmat is always produced.
        * Rows whose true pep_i is 0 are dropped from the test outputs.

    Arguments:
        sim_v2_params: The simulation parameters (sample counts, channel
            and cycle counts, and the flags mentioned above).
        prep_result: Source of the peptide sequences and of any
            photobleaching data.
        progress: Passed through unchanged to _dyemat_sim and _radmat_sim.
        pipeline: Optional. When truthy, pipeline.set_phase(phase_i, n_phases)
            is called at the start of each phase.

    Returns:
        A SimV2Result. Fields belonging to a stage that was skipped are None.
    """
    test_dyemat = None
    test_radmat = None
    test_true_pep_iz = None
    test_true_dye_iz = None
    test_true_row_ks = None
    train_radmat = None
    train_true_pep_iz = None
    train_true_dye_iz = None
    train_true_row_ks = None

    # One phase for the train dyemat, one more for the optional train
    # radmat, and two more (dyemat + radmat) for the test set.
    n_phases = 1
    if sim_v2_params.train_includes_radmat:
        n_phases += 1
    if not sim_v2_params.is_survey:
        n_phases += 2

    phase_i = 0

    def _begin_phase():
        # Report the start of the next phase. Does nothing (and does not
        # advance the counter) when no pipeline was supplied.
        # TODO: Make the pipeline have a stub so this "if pipeline" is not
        # needed; get rid of phases and just pass in a name to display.
        nonlocal phase_i
        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

    # Training data
    #   * always includes decoys
    #   * may include radiometry
    # -----------------------------------------------------------------------
    _begin_phase()

    n_channels, n_cycles = sim_v2_params.n_channels_and_cycles

    (
        train_dyemat,
        train_dyepeps,
        train_pep_recalls,
    ) = prep_result.get_photobleaching()
    if train_dyemat is None:
        # This is a regular, non-photo-bleaching run
        pepseqs = prep_result.pepseqs__with_decoys()
        check.t(pepseqs, pd.DataFrame)  # (pep_i, aa, pep_off_in_pro)

        # (p)ep_i, (c)hannel_i, (b)right_probability
        pcbs = sim_v2_params.pcbs(pepseqs)

        train_dyemat, train_dyepeps, train_pep_recalls = _dyemat_sim(
            sim_v2_params,
            pcbs,
            sim_v2_params.n_samples_train,
            progress,
        )

    n_dyts = train_dyemat.shape[0]

    # unique dyetracks (n_rows, n_channels * n_cycles)
    check.array_t(train_dyemat, shape=(n_dyts, n_channels * n_cycles))

    # dyepeps are a map between dyetracks and peptides with a count
    # Example:
    #   (2, 5, 110) => dyt_i=2 was generated by pep_i==5 110 times
    #   (2, 7, 50)  => dyt_i=2 was generated by pep_i==7 50 times
    check.array_t(train_dyepeps, shape=(None, 3))  # (dyt_i, pep_i, count)
    assert np.max(train_dyepeps[:, 0]) + 1 == n_dyts

    # SORT dyepeps by dyetrack (col 0) first then reverse by count (col 2)
    # Note that np.lexsort puts the primary sort key LAST in the argument
    # TODO: Seems like this sorting should be in _dyemat_sim?
    train_order = np.lexsort((-train_dyepeps[:, 2], train_dyepeps[:, 0]))
    train_dyepeps = train_dyepeps[train_order]

    if sim_v2_params.train_includes_radmat:
        _begin_phase()

        # _radmat_sim takes the dyemat as (n_dyts, n_channels, n_cycles)
        train_dyemat_3d = train_dyemat.reshape(
            (
                train_dyemat.shape[0],
                sim_v2_params.n_channels,
                sim_v2_params.n_cycles,
            )
        )

        # NOTE(review): the test branch below passes
        # sim_v2_params.channel__priors() in the position where this call
        # passes sim_v2_params.by_channel() -- confirm that the difference
        # is intended.
        (
            train_radmat,
            train_true_pep_iz,
            train_true_dye_iz,
            train_true_row_ks,
        ) = _radmat_sim(
            train_dyemat_3d,
            train_dyepeps,
            sim_v2_params.by_channel(),
            sim_v2_params.n_samples_train,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

    # Test data
    #   * does not include decoys
    #   * always includes radiometry
    #   * may include dyetracks
    #   * skipped if is_survey
    # -----------------------------------------------------------------------
    if not sim_v2_params.is_survey:
        _begin_phase()

        test_dyemat, test_dyepeps, _ = prep_result.get_photobleaching()
        if test_dyemat is None:
            # This is a regular, non-photo-bleaching run
            test_dyemat, test_dyepeps, _ = _dyemat_sim(
                sim_v2_params,
                sim_v2_params.pcbs(prep_result.pepseqs__no_decoys()),
                sim_v2_params.n_samples_test,
                progress,
            )

        # SORT dyepeps by dyetrack (col 0) first then reverse by count (col 2)
        # Note that np.lexsort puts the primary sort key LAST in the argument
        test_order = np.lexsort((-test_dyepeps[:, 2], test_dyepeps[:, 0]))
        test_dyepeps = test_dyepeps[test_order]

        _begin_phase()

        test_dyemat_3d = test_dyemat.reshape(
            (
                test_dyemat.shape[0],
                sim_v2_params.n_channels,
                sim_v2_params.n_cycles,
            )
        )

        (
            test_radmat,
            test_true_pep_iz,
            test_true_dye_iz,
            test_true_row_ks,
        ) = _radmat_sim(
            test_dyemat_3d,
            test_dyepeps,
            sim_v2_params.channel__priors(),
            sim_v2_params.n_samples_test,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

        if not sim_v2_params.allow_train_test_to_be_identical:
            # TODO: Move to a standalone _method
            # TODO: Add a dyepeps check (e.g. join the train and test dyepeps
            # on pep_i and compare them).

            # The row-wise comparison is only attempted when both radmats
            # have the same number of rows.
            if (
                train_radmat is not None
                and train_radmat.shape[0] == test_radmat.shape[0]
            ):
                # Flatten (row, ch, cy) -> (row, ch * cy) for the comparison
                train_flat = train_radmat.reshape(
                    (
                        train_radmat.shape[0],
                        train_radmat.shape[1] * train_radmat.shape[2],
                    )
                )
                test_flat = test_radmat.reshape(
                    (
                        test_radmat.shape[0],
                        test_radmat.shape[1] * test_radmat.shape[2],
                    )
                )
                check.affirm(
                    not _any_identical_non_zero_rows(train_flat, test_flat),
                    "Train and test sets are identical. Probably RNG bug.",
                )

        # REMOVE the rows whose true pep_i is 0.
        # NOTE(review): an earlier comment said the first (nul) row should be
        # kept, but this filter does not special-case it -- confirm intent.
        # TODO: Seems like the remove should go into _dye
        non_zero_rows = np.argwhere(test_true_pep_iz != 0).flatten()
        test_radmat = test_radmat[non_zero_rows]
        test_true_pep_iz = test_true_pep_iz[non_zero_rows]
        test_true_dye_iz = test_true_dye_iz[non_zero_rows]
        if test_true_row_ks is not None:
            test_true_row_ks = test_true_row_ks[non_zero_rows]

    sim_result_v2 = SimV2Result(
        params=sim_v2_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_pep_recalls=train_pep_recalls,
        train_true_pep_iz=train_true_pep_iz,
        train_true_dye_iz=train_true_dye_iz,
        train_dyepeps=train_dyepeps,
        train_true_row_ks=train_true_row_ks,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_true_pep_iz=test_true_pep_iz,
        test_true_dye_iz=test_true_dye_iz,
        test_true_row_ks=test_true_row_ks,
        _flus=None,
    )

    if sim_v2_params.generate_flus:
        # TODO: Why optional? Should it be optimized?
        sim_result_v2._generate_flu_info(prep_result)

    return sim_result_v2