Example #1
def _default_read_ranges_tile_block(
    slices_arr, fileset_arr, slice_sig_sizes, sig_origins,
    inner_indices_start, inner_indices_stop, frame_indices, sig_size,
    px_to_bytes, bpp, frame_header_bytes, frame_footer_bytes, file_idxs,
    slice_offset, extra, sig_shape,
):
    result = NumbaList()

    # positions in the signal dimensions:
    for slice_idx in range(slices_arr.shape[0]):
        # (offset, size) arrays defining what data to read (in pixels)
        # NOTE: assumes contiguous tiling scheme
        # (i.e. a shape like (1, 1, ..., 1, X1, ..., XN))
        # where X1 is <= the dataset shape at that index, and X2, ..., XN are
        # equal to the dataset shape at that index
        slice_origin = slices_arr[slice_idx][0]
        slice_shape = slices_arr[slice_idx][1]
        slice_sig_size = slice_sig_sizes[slice_idx]
        sig_origin = sig_origins[slice_idx]

        read_ranges = NumbaList()

        # inner "depth" loop along the (flat) navigation axis of a tile:
        for i, inner_frame_idx in enumerate(range(inner_indices_start, inner_indices_stop)):
            inner_frame = frame_indices[inner_frame_idx]

            file_idx = file_idxs[i]
            f = fileset_arr[file_idx]

            frame_in_file_idx = inner_frame - f[0]

            px_to_bytes(
                bpp=bpp,
                frame_in_file_idx=frame_in_file_idx,
                slice_sig_size=slice_sig_size,
                sig_size=sig_size,
                sig_origin=sig_origin,
                frame_footer_bytes=frame_footer_bytes,
                frame_header_bytes=frame_header_bytes,
                file_idx=file_idx,
                read_ranges=read_ranges,
            )

        # the indices are compressed to the selected frames
        compressed_slice = np.array([
            [slice_offset + inner_indices_start] + [i for i in slice_origin],
            [inner_indices_stop - inner_indices_start] + [i for i in slice_shape],
        ])
        result.append((slice_idx, compressed_slice, read_ranges))

    return result
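For context, `px_to_bytes` appends one byte range per frame to `read_ranges`. A minimal sketch of such a callback, assuming densely packed pixels of `bpp` bits and the default 3-entry range layout of (file_idx, start, stop) (the helper name and offset formula are illustrative, not necessarily the library's actual default):

def _px_to_bytes_sketch(bpp, frame_in_file_idx, slice_sig_size, sig_size,
                        sig_origin, frame_footer_bytes, frame_header_bytes,
                        file_idx, read_ranges):
    # bytes occupied by one full frame on disk, including per-frame header/footer:
    frame_size = frame_header_bytes + sig_size * bpp // 8 + frame_footer_bytes
    # start of the pixel payload of this frame within the file:
    payload_offset = frame_in_file_idx * frame_size + frame_header_bytes
    start = payload_offset + sig_origin * bpp // 8
    stop = start + slice_sig_size * bpp // 8
    read_ranges.append((file_idx, start, stop))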
Example #2
    def transform(
        self, data: Union[str, Iterable[str]]
    ) -> np.ndarray:
        """Transform nucleotide sequences to tf-idf weighted bow representations

        Takes either a single sequence (string) or an iterable of them.

        Parameters
        ----------
        data: a sequence or list of sequences (strings)

        Returns
        -------
        array: an array containing (possibly one) tf-idf weighted representation(s).
        """
        seqs = NumbaList()
        if isinstance(data, str):
            seqs.append(data)
        elif isinstance(data, Iterable):
            for seq in data:
                seqs.append(seq)
        else:
            raise TypeError(
                f"Expected either a str or iterable of str, got {type(data)} instead."
            )
        result = multiple_oligofreq(seqs, self.k) * self.idfs
        result = result / np.linalg.norm(result, axis=1).reshape((-1, 1))
        # reshape by the number of collected sequences; len(data) would be wrong
        # for a single string input (its len() counts characters)
        result = result.reshape(len(seqs), -1)
        return result
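Hypothetical usage (the class name and constructor below are illustrative; only transform() is shown above, and fit() is assumed to set self.idfs and self.k):

vec = TfidfKmerVectorizer(k=3)                    # hypothetical wrapper class
vec.fit(["ACGTACGT", "CGTTAGCA"])                 # assumed to compute self.idfs
single = vec.transform("ACGTACGT")                # shape: (1, n_features)
batch = vec.transform(["ACGTACGT", "CGTTAGCA"])   # shape: (2, n_features)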
Example #3
def _create_group_indexer(states: pd.DataFrame,
                          assort_by: List[str]) -> nb.typed.List:
    """Create the group indexer.

    The indexer is a list where the positions correspond to the group number defined by
    assortative variables. The values inside the list are one-dimensional integer arrays
    containing the indices of states belonging to the group.

    If there are no assortative variables, all individuals are assigned to a single
    group with code 0 and the indexer is a list where the first position contains all
    indices of states.

    For efficiency reasons, we assign each group a number instead of identifying by
    the values of the assort_by variables directly.

    Note: This function is from sid commit 206886a14eeb3257deb71db91aba4e7fb2385fc2.

    Args:
        states (pandas.DataFrame): The states.
        assort_by (List[str]): List of variables that influence matching probabilities.

    Returns:
        indexer (numba.typed.List): The i_th entry are the indices of the i_th group.

    """
    states = states.reset_index()
    if assort_by:
        groups = states.groupby(assort_by).groups
        _, group_codes_values = factorize_assortative_variables(
            states, assort_by)

        indexer = NumbaList()
        for group in group_codes_values:
            # the keys of `groups` are not tuples if there was just one assort_by
            # variable, but the group_codes_values are
            if isinstance(group, tuple) and len(group) == 1:
                group = group[0]
            indexer.append(groups[group].to_numpy(dtype=DTYPE_INDEX))

    else:
        indexer = NumbaList()
        indexer.append(states.index.to_numpy(DTYPE_INDEX))

    return indexer
Example #4
def create_group_indexer(states, assort_by):
    """Create the group indexer.

    The indexer is a list where the positions correspond to the group number defined by
    assortative variables. The values inside the list are one-dimensional integer arrays
    containing the indices of states belonging to the group.

    If there are no assortative variables, all individuals are assigned to a single
    group with code 0 and the indexer is a list where the first position contains all
    indices of states.

    For efficiency reasons, we assign each group a number instead of identifying by
    the values of the assort_by variables directly.

    Args:
        states (pandas.DataFrame): See :ref:`states`
        assort_by (list): List of variables that influence matching probabilities.

    Returns:
        indexer (numba.typed.List): The i_th entry are the indices of the i_th group.

    """
    if assort_by:
        groups = states.groupby(assort_by).groups
        _, group_codes_values = factorize_assortative_variables(
            states, assort_by)

        indexer = NumbaList()
        for group in group_codes_values:
            # the keys of `groups` are not tuples if there was just one assort_by
            # variable, but the group_codes_values are
            if len(group) == 1:
                group = group[0]
            indexer.append(groups[group].to_numpy(dtype=DTYPE_INDEX))

    else:
        indexer = NumbaList()
        indexer.append(states.index.to_numpy(DTYPE_INDEX))

    return indexer
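Illustrative call with a toy states DataFrame (a sketch; assumes DTYPE_INDEX is an integer dtype such as np.uint32 and that factorize_assortative_variables enumerates the observed groups):

states = pd.DataFrame({"age_group": ["young", "old", "young", "old"]})
indexer = create_group_indexer(states, ["age_group"])
# indexer[i] contains the row positions of all individuals in group i,
# e.g. one entry could be array([0, 2]) for the "young" group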
Example #5
    def test_ex_jitclass_type_hints(self):
        # magictoken.ex_jitclass_type_hints.begin
        from typing import List
        from numba.experimental import jitclass
        from numba.typed import List as NumbaList

        @jitclass
        class Counter:
            value: int

            def __init__(self):
                self.value = 0

            def get(self) -> int:
                ret = self.value
                self.value += 1
                return ret

        @jitclass
        class ListLoopIterator:
            counter: Counter
            items: List[float]

            def __init__(self, items: List[float]):
                self.items = items
                self.counter = Counter()

            def get(self) -> float:
                idx = self.counter.get() % len(self.items)
                return self.items[idx]

        items = NumbaList([3.14, 2.718, 0.123, -4.])
        loop_itr = ListLoopIterator(items)
        # magictoken.ex_jitclass_type_hints.end

        for idx in range(10):
            self.assertEqual(loop_itr.counter.value, idx)
            self.assertAlmostEqual(loop_itr.get(), items[idx % len(items)])
            self.assertEqual(loop_itr.counter.value, idx + 1)
Example #6
    def _get_read_ranges_inner(
        start_at_frame,
        stop_before_frame,
        roi,
        depth,
        slices_arr,
        fileset_arr,
        sig_shape,
        bpp,
        extra=None,
        frame_header_bytes=0,
        frame_footer_bytes=0,
    ):
        result = NumbaList()

        sig_size = np.prod(np.array(sig_shape))

        if roi is None:
            frame_indices = np.arange(start_at_frame, stop_before_frame)
            slice_offset = start_at_frame
        else:
            frame_indices = _roi_to_indices(roi, start_at_frame,
                                            stop_before_frame)
            slice_offset = np.count_nonzero(
                roi.reshape((-1, ))[:start_at_frame])

        num_indices = frame_indices.shape[0]

        # indices into `frame_indices`:
        inner_indices_start = 0
        inner_indices_stop = min(depth, num_indices)

        # this should be `np.prod(..., axis=-1)`, which is not supported by numba yet:
        # slices that divide the signal dimensions:
        slice_sig_sizes = np.array([
            np.prod(slices_arr[slice_idx, 1, :])
            for slice_idx in range(slices_arr.shape[0])
        ])

        sig_origins = np.array([
            _ravel_multi_index(slices_arr[slice_idx][0], sig_shape)
            for slice_idx in range(slices_arr.shape[0])
        ])

        # outer "depth" loop skipping over `depth` frames at a time:
        while inner_indices_start < num_indices:
            file_idxs = np.array([
                _find_file_for_frame_idx(fileset_arr,
                                         frame_indices[inner_frame_idx])
                for inner_frame_idx in range(inner_indices_start,
                                             inner_indices_stop)
            ])

            for slice_idx, compressed_slice, read_ranges in read_ranges_tile_block(
                    slices_arr,
                    fileset_arr,
                    slice_sig_sizes,
                    sig_origins,
                    inner_indices_start,
                    inner_indices_stop,
                    frame_indices,
                    sig_size,
                    px_to_bytes,
                    bpp,
                    frame_header_bytes,
                    frame_footer_bytes,
                    file_idxs,
                    slice_offset,
                    extra=extra,
                    sig_shape=sig_shape,
            ):
                result.append((compressed_slice, read_ranges, slice_idx))

            inner_indices_start = inner_indices_start + depth
            inner_indices_stop = min(inner_indices_stop + depth, num_indices)

        result_slices = np.zeros((len(result), 2, 1 + len(sig_shape)),
                                 dtype=np.int64)
        for tile_idx, res in enumerate(result):
            result_slices[tile_idx] = res[0]

        if len(result) == 0:
            return (
                result_slices,
                np.zeros((len(result), depth, 3), dtype=np.int64),
                np.zeros((len(result)), dtype=np.int64),
            )

        max_rr_per_tile = max([len(res[1]) for res in result])

        slice_indices = np.zeros(len(result), dtype=np.int64)

        # read_ranges_tile_block can decide how many entries there are per read range,
        # so we need to generate a result array with the correct size:
        rr_num_entries = max(3, len(result[0][1][0]))
        result_ranges = np.zeros(
            (len(result), max_rr_per_tile, rr_num_entries), dtype=np.int64)
        for tile_idx, res in enumerate(result):
            for depth_idx, read_range in enumerate(res[1]):
                result_ranges[tile_idx][depth_idx] = read_range
            slice_indices[tile_idx] = res[2]

        return result_slices, result_ranges, slice_indices
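To make the three return values concrete, a consumer might iterate them like this (a sketch; read_bytes is a hypothetical I/O helper, and rows of result_ranges beyond a tile's actual read ranges are zero padding, skipped by the start == stop check):

for tile_idx in range(result_slices.shape[0]):
    origin = result_slices[tile_idx][0]  # flat nav offset, then sig origin
    shape = result_slices[tile_idx][1]   # depth, then sig shape of the tile
    for file_idx, start, stop in result_ranges[tile_idx][:, :3]:
        if stop > start:
            buf = read_bytes(file_idx, start, stop)  # hypothetical helper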
Example #7
def _mib_2x2_tile_block(
    slices_arr, fileset_arr, slice_sig_sizes, sig_origins,
    inner_indices_start, inner_indices_stop, frame_indices, sig_size,
    px_to_bytes, bpp, frame_header_bytes, frame_footer_bytes, file_idxs,
    slice_offset, extra, sig_shape,
):
    """
    Generate read ranges for 2x2 Merlin Quad raw data.

    The arrangement means that reading a contiguous block of data from the file,
    we get data from all four quadrants. The arrangement, and thus resulting
    array, looks like this:

    _________
    | 1 | 2 |
    ---------
    | 3 | 4 |
    ---------

    with the original data laid out like this:

    [4 | 3 | 2 | 1]

    (note that quadrants 3 and 4 are also flipped in x and y direction in the
    resulting array, compared to the original data)

    So if we read one row of raw data, we first get the bottom-most rows of
    quadrants 4 and 3, then the top-most rows of quadrants 2 and 1.

    This is similar to how FRMS6 works, and we generate the read ranges in a
    similar way here. In addition to the cut-and-flip from FRMS6, we also have
    the split in x direction in quadrants.
    """
    result = NumbaList()

    # positions in the signal dimensions:
    for slice_idx in range(slices_arr.shape[0]):
        # (offset, size) arrays defining what data to read (in pixels)
        # NOTE: assumes contiguous tiling scheme
        # (i.e. a shape like (1, 1, ..., 1, X1, ..., XN))
        # where X1 is <= the dataset shape at that index, and X2, ..., XN are
        # equal to the dataset shape at that index
        slice_origin = slices_arr[slice_idx][0]
        slice_shape = slices_arr[slice_idx][1]

        read_ranges = NumbaList()

        x_shape = slice_shape[1]
        x_size = x_shape * bpp // 8  # back in bytes
        x_size_half = x_size // 2
        stride = x_size_half * 4

        sig_size_bytes = sig_size * bpp // 8

        y_start = slice_origin[0]
        y_stop = slice_origin[0] + slice_shape[0]

        y_size_half = sig_shape[0] // 2
        y_size = sig_shape[0]

        # inner "depth" loop along the (flat) navigation axis of a tile:
        for i, inner_frame_idx in enumerate(range(inner_indices_start, inner_indices_stop)):
            inner_frame = frame_indices[inner_frame_idx]
            file_idx = file_idxs[i]
            f = fileset_arr[file_idx]
            frame_in_file_idx = inner_frame - f[0]
            file_header_bytes = f[3]

            # we are reading a part of a single frame, so we first need to find
            # the offset caused by headers:
            header_offset = file_header_bytes + frame_header_bytes * (frame_in_file_idx + 1)

            # now let's figure in the current frame index:
            # (go down into the file by full frames; `sig_size`)
            offset = header_offset + frame_in_file_idx * sig_size_bytes

            # in total, we generate depth * 2 * (y_stop - y_start) read ranges per tile
            for y in range(y_start, y_stop):
                if y < y_size_half:

                    # top: no y-flip, no x-flip
                    flip = 0

                    # quadrant 1, left part of the result: we have the three other blocks
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(
                        offset, 3 * x_size_half, stride, y, x_size_half
                    )
                    read_ranges.append((
                        file_idx,
                        start,
                        stop,
                        flip,
                    ))
                    # quadrant 2, right part of the result: we have the two other blocks
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(
                        offset, 2 * x_size_half, stride, y, x_size_half
                    )
                    read_ranges.append((
                        file_idx,
                        start,
                        stop,
                        flip,
                    ))
                else:
                    # bottom: both x and y flip
                    flip = 1
                    y = y_size - y - 1
                    # quadrant 3, left part of the result: we have the one other block
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(
                        offset, 1 * x_size_half, stride, y, x_size_half
                    )
                    read_ranges.append((
                        file_idx,
                        start,
                        stop,
                        flip,
                    ))
                    # quadrant 4, right part of the result: we have no other blocks
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(offset, 0, stride, y, x_size_half)
                    read_ranges.append((
                        file_idx,
                        start,
                        stop,
                        flip,
                    ))

        # the indices are compressed to the selected frames
        compressed_slice = np.array([
            [slice_offset + inner_indices_start] + [i for i in slice_origin],
            [inner_indices_stop - inner_indices_start] + [i for i in slice_shape],
        ])
        result.append((slice_idx, compressed_slice, read_ranges))

    return result
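The row-addressing helper assumed above maps a row within a quadrant to a byte range. A hypothetical reimplementation consistent with the calls in this function (not necessarily the actual `_get_row_start_stop`):

def _get_row_start_stop_sketch(frame_offset, block_offset, stride, y, row_bytes):
    # one raw row of the file holds `stride` bytes, i.e. four quadrant
    # half-rows; `block_offset` selects the half-row of the wanted quadrant
    start = frame_offset + y * stride + block_offset
    return start, start + row_bytes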
Example #8
def calculate_infections(states, contacts, params, indexers, group_probs,
                         seed):
    """Calculate infections from contacts.

    This function mainly converts the relevant parts from states and contacts into
    numpy arrays or other objects that are supported in numba nopython mode and
    then calls ``calculate_infections_numba``.

    Args:
        states (pandas.DataFrame): see :ref:`states`.
        contacts (pandas.DataFrame): One column per contact_model. Same index as states.
        params (pandas.DataFrame): See :ref:`params`.
        indexers (dict): Dict of numba.typed.List. The i_th entry of each list
            holds the indices of the i_th group.
        group_probs (dict): Dict of arrays of shape (n_groups, n_groups).
            probs[i, j] is the probability that an individual from group i
            meets someone from group j.
        seed (itertools.count): Seed counter to control randomness.

    Returns:
        infected_sr (pd.Series): Boolean Series that is True for newly infected people.
        states (pandas.DataFrame): Copy of states with updated immune column.

    """
    is_meet_group = np.array([k not in group_probs for k in indexers])
    states = states.copy()
    infectious = states["infectious"].to_numpy(copy=True)
    immune = states["immune"].to_numpy(copy=True)
    group_codes = states[[f"group_codes_{cm}" for cm in indexers]].to_numpy()
    infect_probs = np.array(
        [params.loc[("infection_prob", cm, None), "value"] for cm in indexers])

    group_probs_list = NumbaList()
    for gp in group_probs.values():
        group_probs_list.append(gp)
    # nopython mode fails if we leave the list empty or put a 1d array inside the list.
    if len(group_probs_list) == 0:
        group_probs_list.append(np.zeros((0, 0)))

    indexers_list = NumbaList()
    for ind in indexers.values():
        indexers_list.append(ind)

    np.random.seed(next(seed))
    loop_entries = np.array(
        list(itertools.product(range(len(states)), range(len(indexers)))))

    indices = np.random.choice(len(loop_entries),
                               replace=False,
                               size=len(loop_entries))
    loop_order = loop_entries[indices]

    infected, infection_counter, immune, missed = _calculate_infections_numba(
        contacts,
        infectious,
        immune,
        group_codes,
        group_probs_list,
        indexers_list,
        infect_probs,
        next(seed),
        is_meet_group,
        loop_order,
    )

    infected_sr = pd.Series(infected, index=states.index, dtype=bool)
    states["infection_counter"] += infection_counter
    for i, contact_model in enumerate(group_probs):
        states[f"missed_{contact_model}"] = missed[:, i]

    states["immune"] = immune

    return infected_sr, states
Example #9
def _sample_data_for_calculate_infections_numba(
    n_individuals=None,
    n_contacts=None,
    infectious_share=None,
    group_shares=None,
    group_probabilities=None,
    infection_prob=None,
    seed=None,
):
    """Sample data for the calculation of new infections."""
    if seed is not None:
        np.random.seed(seed)

    if n_individuals is None:
        n_individuals = np.random.randint(5, 1_000)

    if n_contacts is None:
        contacts = np.random.randint(2, 6, size=n_individuals)
    else:
        contacts = np.full(n_individuals, n_contacts)

    if infectious_share is None:
        infectious_share = np.random.uniform(0.000001, 1)

    infectious = np.zeros(n_individuals, dtype=bool)  # np.bool was removed in NumPy 1.24
    mask = np.random.choice(n_individuals, size=int(n_individuals * infectious_share))
    infectious[mask] = True

    immune = infectious.copy()

    if group_shares is None:
        n_groups = np.random.randint(1, 4)
        group_shares = np.random.uniform(0.1, 1, size=n_groups)
    group_shares = group_shares / group_shares.sum()
    group_shares[-1] = 1 - group_shares[:-1].sum()

    n_groups = len(group_shares)
    group_codes = np.random.choice(n_groups, p=group_shares, size=n_individuals)

    if group_probabilities is None:
        group_probabilities = np.random.uniform(0.00001, 1, size=(n_groups, n_groups))
        group_probabilities = group_probabilities / group_probabilities.sum(
            axis=1, keepdims=True
        )
    group_probs_list = NumbaList()
    group_probs_list.append(group_probabilities)

    indexer = NumbaList()
    for group in range(n_groups):
        indexer.append(np.where(group_codes == group)[0])

    indexers_list = NumbaList()
    indexers_list.append(indexer)

    if infection_prob is None:
        ip = np.random.uniform()
        infection_prob = np.array([ip])

    is_meet_group = np.array([False])

    loop_order = np.array(list(itertools.product(range(n_individuals), range(1))))

    return (
        contacts.reshape(-1, 1),
        infectious,
        immune,
        group_codes.reshape(-1, 1),
        group_probs_list,
        indexers_list,
        infection_prob,
        is_meet_group,
        loop_order,
    )
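Illustrative use in a test (a sketch; _calculate_infections_numba is the jitted kernel called in Example #8, with the seed inserted as the eighth positional argument):

args = _sample_data_for_calculate_infections_numba(n_individuals=50, seed=42)
(contacts, infectious, immune, group_codes, group_probs_list,
 indexers_list, infection_prob, is_meet_group, loop_order) = args
infected, infection_counter, immune, missed = _calculate_infections_numba(
    contacts, infectious, immune, group_codes, group_probs_list,
    indexers_list, infection_prob, 0, is_meet_group, loop_order,
)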
Example #10
def numba_apply(s: pd.Series, f) -> pd.Series:
    # convert the values into a numba.typed.List so a jitted `f` can consume
    # them in nopython mode
    result = f(NumbaList(s.values))
    return pd.Series(result)
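Illustrative usage (assumes f is compiled with numba.njit and returns a sequence of the same length):

import numba

@numba.njit
def double_all(xs):
    out = NumbaList()
    for x in xs:
        out.append(x * 2.0)
    return out

s = pd.Series([1.0, 2.0, 3.0])
doubled = numba_apply(s, double_all)  # pd.Series([2.0, 4.0, 6.0])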