def _default_read_ranges_tile_block( slices_arr, fileset_arr, slice_sig_sizes, sig_origins, inner_indices_start, inner_indices_stop, frame_indices, sig_size, px_to_bytes, bpp, frame_header_bytes, frame_footer_bytes, file_idxs, slice_offset, extra, sig_shape, ): result = NumbaList() # positions in the signal dimensions: for slice_idx in range(slices_arr.shape[0]): # (offset, size) arrays defining what data to read (in pixels) # NOTE: assumes contiguous tiling scheme # (i.e. a shape like (1, 1, ..., 1, X1, ..., XN)) # where X1 is <= the dataset shape at that index, and X2, ..., XN are # equal to the dataset shape at that index slice_origin = slices_arr[slice_idx][0] slice_shape = slices_arr[slice_idx][1] slice_sig_size = slice_sig_sizes[slice_idx] sig_origin = sig_origins[slice_idx] read_ranges = NumbaList() # inner "depth" loop along the (flat) navigation axis of a tile: for i, inner_frame_idx in enumerate(range(inner_indices_start, inner_indices_stop)): inner_frame = frame_indices[inner_frame_idx] file_idx = file_idxs[i] f = fileset_arr[file_idx] frame_in_file_idx = inner_frame - f[0] px_to_bytes( bpp=bpp, frame_in_file_idx=frame_in_file_idx, slice_sig_size=slice_sig_size, sig_size=sig_size, sig_origin=sig_origin, frame_footer_bytes=frame_footer_bytes, frame_header_bytes=frame_header_bytes, file_idx=file_idx, read_ranges=read_ranges, ) # the indices are compressed to the selected frames compressed_slice = np.array([ [slice_offset + inner_indices_start] + [i for i in slice_origin], [inner_indices_stop - inner_indices_start] + [i for i in slice_shape], ]) result.append((slice_idx, compressed_slice, read_ranges)) return result
def transform(
    self, data: Union[str, Iterable[str]]
) -> np.ndarray:
    """Transform nucleotide sequences to tf-idf weighted bow representations

    Takes either a single sequence (string) or an iterable of them.

    Parameters
    ----------
    data:
        a sequence or an iterable of sequences (strings). Iterables that
        do not support ``len()`` (e.g. generators) are accepted as well.

    Returns
    -------
    array:
        a 2-d array with one row per input sequence, containing the
        L2-normalised tf-idf weighted representation(s). A single string
        yields an array with exactly one row.

    Raises
    ------
    TypeError
        if ``data`` is neither a string nor an iterable.
    """
    seqs = NumbaList()
    if isinstance(data, str):
        seqs.append(data)
    elif isinstance(data, Iterable):
        for seq in data:
            seqs.append(seq)
    else:
        raise TypeError(
            f"Expected either a str or iterable of str, got {type(data)} instead."
        )
    result = multiple_oligofreq(seqs, self.k) * self.idfs
    # normalise every row to unit Euclidean length
    result = result / np.linalg.norm(result, axis=1).reshape((-1, 1))
    # The row count must be the number of sequences collected above.
    # ``len(data)`` would be the number of characters for a single string
    # and is undefined for iterables without a length.
    result = result.reshape(len(seqs), -1)
    return result
def _create_group_indexer(states: pd.DataFrame, assort_by: Dict[str, List[str]]) -> nb.typed.List:
    """Build the list that maps each group number to the rows in that group.

    Position ``i`` of the returned list holds a one-dimensional integer array
    with the indices of all states that belong to group ``i``, where groups
    are defined by the assortative variables. Groups are addressed by number
    rather than by the values of the ``assort_by`` variables for efficiency.

    Without assortative variables everybody is put into a single group, so
    the list has exactly one entry containing all indices of ``states``.

    Note:
        This function is from sid commit
        206886a14eeb3257deb71db91aba4e7fb2385fc2.

    Args:
        states (pandas.DataFrame): The states. The index is reset before
            grouping, so the returned indices refer to the reset index.
        assort_by (List[str]): List of variables that influence matching
            probabilities.

    Returns:
        indexer (numba.typed.List): The i_th entry are the indices of the
            i_th group.

    """
    states = states.reset_index()
    indexer = NumbaList()

    # No assortative variables: one group that contains every row.
    if not assort_by:
        indexer.append(states.index.to_numpy(DTYPE_INDEX))
        return indexer

    groups = states.groupby(assort_by).groups
    _, group_codes_values = factorize_assortative_variables(states, assort_by)

    for code_value in group_codes_values:
        # With a single assort_by variable the keys of ``groups`` are plain
        # values while the group code values are 1-tuples, so unwrap them.
        if isinstance(code_value, tuple) and len(code_value) == 1:
            key = code_value[0]
        else:
            key = code_value
        indexer.append(groups[key].to_numpy(dtype=DTYPE_INDEX))

    return indexer
def create_group_indexer(states, assort_by):
    """Create the group indexer.

    The indexer is a list where the positions correspond to the group number
    defined by assortative variables. The values inside the list are
    one-dimensional integer arrays containing the indices of states belonging
    to the group.

    If there are no assortative variables, all individuals are assigned to a
    single group with code 0 and the indexer is a list where the first
    position contains all indices of states.

    For efficiency reasons, we assign each group a number instead of
    identifying by the values of the assort_by variables directly.

    Args:
        states (pandas.DataFrame): See :ref:`states`
        assort_by (list): List of variables that influence matching
            probabilities.

    Returns:
        indexer (numba.typed.List): The i_th entry are the indices of the
            i_th group.

    """
    if assort_by:
        groups = states.groupby(assort_by).groups
        _, group_codes_values = factorize_assortative_variables(
            states, assort_by)

        indexer = NumbaList()
        for group in group_codes_values:
            # The keys of groups are not tuples if there was just one
            # assort_by variable, but the group_codes_values may be. Only
            # unwrap real 1-tuples: calling ``len`` on a scalar code value
            # would raise a TypeError.
            if isinstance(group, tuple) and len(group) == 1:
                group = group[0]
            indexer.append(groups[group].to_numpy(dtype=DTYPE_INDEX))
    else:
        indexer = NumbaList()
        indexer.append(states.index.to_numpy(DTYPE_INDEX))

    return indexer
def test_ex_jitclass_type_hints(self):
    """Check the jitclass example that declares its fields via type hints.

    The code between the ``magictoken`` markers is left untouched on
    purpose. NOTE(review): the markers presumably delimit a snippet that is
    extracted into the documentation - confirm before editing that region.
    """
    # magictoken.ex_jitclass_type_hints.begin
    from typing import List
    from numba.experimental import jitclass
    from numba.typed import List as NumbaList

    @jitclass
    class Counter:
        value: int

        def __init__(self):
            self.value = 0

        def get(self) -> int:
            ret = self.value
            self.value += 1
            return ret

    @jitclass
    class ListLoopIterator:
        counter: Counter
        items: List[float]

        def __init__(self, items: List[float]):
            self.items = items
            self.counter = Counter()

        def get(self) -> float:
            idx = self.counter.get() % len(self.items)
            return self.items[idx]

    items = NumbaList([3.14, 2.718, 0.123, -4.])
    loop_itr = ListLoopIterator(items)
    # magictoken.ex_jitclass_type_hints.end

    # Ten calls on a four-element list: the iterator has to wrap around,
    # and every call must advance the embedded counter by exactly one.
    for idx in range(10):
        self.assertEqual(loop_itr.counter.value, idx)
        self.assertAlmostEqual(loop_itr.get(), items[idx % len(items)])
        self.assertEqual(loop_itr.counter.value, idx + 1)
def _get_read_ranges_inner(
    start_at_frame, stop_before_frame, roi, depth,
    slices_arr, fileset_arr, sig_shape,
    bpp, extra=None, frame_header_bytes=0, frame_footer_bytes=0,
):
    """
    Compute tile slices and read ranges for the frames in
    `[start_at_frame, stop_before_frame)`.

    The selected frames are processed in blocks of at most `depth` frames.
    For every block, `read_ranges_tile_block` yields one tile per signal
    slice; the collected tiles are then packed into dense `int64` arrays.

    `read_ranges_tile_block` and `px_to_bytes` are not parameters; they are
    resolved from an outer scope that is not visible in this block.

    Parameters
    ----------
    start_at_frame, stop_before_frame
        Half-open range of frame indices to cover.
    roi
        `None` to use all frames in the range. Otherwise it is flattened with
        `reshape((-1,))` and passed to `_roi_to_indices`.
        NOTE(review): presumably a boolean mask over all frames - confirm.
    depth
        Maximum number of frames per tile.
    slices_arr
        Signal slices; `slices_arr[idx, 0]` is used as origin and
        `slices_arr[idx, 1]` as shape.
    fileset_arr
        Passed to `_find_file_for_frame_idx` and `read_ranges_tile_block`.
    sig_shape
        Signal shape; its product is the signal size in pixels.
    bpp, extra, frame_header_bytes, frame_footer_bytes
        Forwarded unchanged to `read_ranges_tile_block`.

    Returns
    -------
    result_slices
        Shape `(n_tiles, 2, 1 + len(sig_shape))`.
    result_ranges
        Shape `(n_tiles, max_rr_per_tile, rr_num_entries)`, zero-padded for
        tiles with fewer read ranges; `(0, depth, 3)` if there are no tiles.
    slice_indices
        Shape `(n_tiles,)`, the slice index of every tile.
    """
    result = NumbaList()

    # number of pixels in a single frame
    sig_size = np.prod(np.array(sig_shape))

    if roi is None:
        frame_indices = np.arange(start_at_frame, stop_before_frame)
        slice_offset = start_at_frame
    else:
        frame_indices = _roi_to_indices(roi, start_at_frame, stop_before_frame)
        # number of non-zero roi entries before `start_at_frame`
        slice_offset = np.count_nonzero(roi.reshape((-1, ))[:start_at_frame])

    num_indices = frame_indices.shape[0]

    # indices into `frame_indices`:
    inner_indices_start = 0
    inner_indices_stop = min(depth, num_indices)

    # this should be `np.prod(..., axis=-1)``, which is not supported by numba yet:
    # slices that divide the signal dimensions:
    slice_sig_sizes = np.array([
        np.prod(slices_arr[slice_idx, 1, :])
        for slice_idx in range(slices_arr.shape[0])
    ])

    # one value per slice, computed by `_ravel_multi_index` from the slice
    # origin and `sig_shape`
    sig_origins = np.array([
        _ravel_multi_index(slices_arr[slice_idx][0], sig_shape)
        for slice_idx in range(slices_arr.shape[0])
    ])

    # outer "depth" loop skipping over `depth` frames at a time:
    while inner_indices_start < num_indices:
        # file index for every frame of the current block
        file_idxs = np.array([
            _find_file_for_frame_idx(fileset_arr, frame_indices[inner_frame_idx])
            for inner_frame_idx in range(inner_indices_start, inner_indices_stop)
        ])

        for slice_idx, compressed_slice, read_ranges in read_ranges_tile_block(
            slices_arr, fileset_arr, slice_sig_sizes, sig_origins,
            inner_indices_start, inner_indices_stop, frame_indices, sig_size,
            px_to_bytes, bpp, frame_header_bytes, frame_footer_bytes, file_idxs,
            slice_offset, extra=extra, sig_shape=sig_shape,
        ):
            # note the reordering: the slice index moves to the last position
            result.append((compressed_slice, read_ranges, slice_idx))

        inner_indices_start = inner_indices_start + depth
        # the last block may contain fewer than `depth` frames
        inner_indices_stop = min(inner_indices_stop + depth, num_indices)

    result_slices = np.zeros((len(result), 2, 1 + len(sig_shape)), dtype=np.int64)
    for tile_idx, res in enumerate(result):
        result_slices[tile_idx] = res[0]

    # without tiles, `max(...)` and `result[0]` below would fail, so return
    # empty arrays early
    if len(result) == 0:
        return (
            result_slices,
            np.zeros((len(result), depth, 3), dtype=np.int64),
            np.zeros((len(result)), dtype=np.int64),
        )

    max_rr_per_tile = max([len(res[1]) for res in result])

    slice_indices = np.zeros(len(result), dtype=np.int64)

    # read_ranges_tile_block can decide how many entries there are per read range,
    # so we need to generate a result array with the correct size:
    # (the width is taken from the first read range of the first tile)
    rr_num_entries = max(3, len(result[0][1][0]))
    result_ranges = np.zeros(
        (len(result), max_rr_per_tile, rr_num_entries), dtype=np.int64)
    for tile_idx, res in enumerate(result):
        for depth_idx, read_range in enumerate(res[1]):
            result_ranges[tile_idx][depth_idx] = read_range
        slice_indices[tile_idx] = res[2]

    return result_slices, result_ranges, slice_indices
def _mib_2x2_tile_block(
    slices_arr, fileset_arr, slice_sig_sizes, sig_origins,
    inner_indices_start, inner_indices_stop, frame_indices, sig_size,
    px_to_bytes, bpp, frame_header_bytes, frame_footer_bytes, file_idxs,
    slice_offset, extra, sig_shape,
):
    """
    Generate read ranges for 2x2 Merlin Quad raw data.

    The arrangement means that reading a contiguous block of data from the file,
    we get data from all four quadrants. The arrangement, and thus resulting
    array, looks like this:

    _________
    | 1 | 2 |
    ---------
    | 3 | 4 |
    ---------

    with the original data laid out like this:

    [4 | 3 | 2 | 1]

    (note that quadrants 3 and 4 are also flipped in x and y direction in the
    resulting array, compared to the original data)

    So if we read one row of raw data, we first get the bottom-most rows from 4
    and 3 first, then the top-most rows from 2 and 1.

    This is similar to how FRMS6 works, and we generate the read ranges in a
    similar way here. In addition to the cut-and-flip from FRMS6, we also have
    the split in x direction in quadrants.

    Parameters
    ----------
    slices_arr
        Indexed as `slices_arr[slice_idx][0]` (origin) and
        `slices_arr[slice_idx][1]` (shape); the first two entries of each are
        used as the y and x components.
    fileset_arr
        Per-file records; `f[0]` is subtracted from the frame index and `f[3]`
        is used as the file header size in bytes.
        NOTE(review): `f[0]` is presumably the first frame index of the
        file - confirm.
    inner_indices_start, inner_indices_stop
        Half-open range of positions in `frame_indices` covered by this block.
    frame_indices
        Frame indices to read.
    sig_size
        Frame size in pixels; converted to bytes with `bpp`.
    bpp
        Bits per pixel (sizes are computed as `pixels * bpp // 8`).
    frame_header_bytes
        Size of the per-frame header in bytes.
    file_idxs
        File index for each frame of this block, indexed relative to
        `inner_indices_start`.
    slice_offset
        Added to `inner_indices_start` to form the first entry of the
        compressed slice origin.
    sig_shape
        Signal shape; `sig_shape[0]` is the full frame height.
    slice_sig_sizes, sig_origins, px_to_bytes, frame_footer_bytes, extra
        Accepted but not used by this implementation.

    Returns
    -------
    NumbaList
        One `(slice_idx, compressed_slice, read_ranges)` tuple per slice, where
        each read range is a `(file_idx, start, stop, flip)` tuple.
    """
    result = NumbaList()

    # positions in the signal dimensions:
    for slice_idx in range(slices_arr.shape[0]):
        # (offset, size) arrays defining what data to read (in pixels)
        # NOTE: assumes contiguous tiling scheme
        # (i.e. a shape like (1, 1, ..., 1, X1, ..., XN))
        # where X1 is <= the dataset shape at that index, and X2, ..., XN are
        # equal to the dataset shape at that index
        slice_origin = slices_arr[slice_idx][0]
        slice_shape = slices_arr[slice_idx][1]

        read_ranges = NumbaList()

        x_shape = slice_shape[1]
        x_size = x_shape * bpp // 8  # back in bytes
        x_size_half = x_size // 2
        # one raw row holds four half-width blocks (one per quadrant)
        stride = x_size_half * 4

        sig_size_bytes = sig_size * bpp // 8

        y_start = slice_origin[0]
        y_stop = slice_origin[0] + slice_shape[0]

        y_size_half = sig_shape[0] // 2
        y_size = sig_shape[0]

        # inner "depth" loop along the (flat) navigation axis of a tile:
        for i, inner_frame_idx in enumerate(range(inner_indices_start, inner_indices_stop)):
            inner_frame = frame_indices[inner_frame_idx]
            file_idx = file_idxs[i]
            f = fileset_arr[file_idx]
            frame_in_file_idx = inner_frame - f[0]
            file_header_bytes = f[3]

            # we are reading a part of a single frame, so we first need to find
            # the offset caused by headers:
            # (`+ 1` also skips the header of the frame we are reading)
            header_offset = file_header_bytes + frame_header_bytes * (frame_in_file_idx + 1)

            # now let's figure in the current frame index:
            # (go down into the file by full frames; `sig_size`)
            offset = header_offset + frame_in_file_idx * sig_size_bytes

            # in total, we generate depth * 2 * (y_stop - y_start) read ranges per tile
            for y in range(y_start, y_stop):
                if y < y_size_half:
                    # top: no y-flip, no x-flip
                    flip = 0
                    # quadrant 1, left part of the result: we have the three other blocks
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(
                        offset, 3 * x_size_half, stride, y, x_size_half
                    )
                    read_ranges.append((
                        file_idx, start, stop, flip,
                    ))
                    # quadrant 2, right part of the result: we have the two other blocks
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(
                        offset, 2 * x_size_half, stride, y, x_size_half
                    )
                    read_ranges.append((
                        file_idx, start, stop, flip,
                    ))
                else:
                    # bottom: both x and y flip
                    flip = 1
                    # mirror the row index; rebinding `y` does not affect
                    # the next iteration of the `for` loop
                    y = y_size - y - 1
                    # quadrant 3, left part of the result: we have the one other block
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(
                        offset, 1 * x_size_half, stride, y, x_size_half
                    )
                    read_ranges.append((
                        file_idx, start, stop, flip,
                    ))
                    # quadrant 4, right part of the result: we have no other blocks
                    # in the original data in front of us
                    start, stop = _get_row_start_stop(offset, 0, stride, y, x_size_half)
                    read_ranges.append((
                        file_idx, start, stop, flip,
                    ))

        # the indices are compressed to the selected frames
        compressed_slice = np.array([
            [slice_offset + inner_indices_start] + [i for i in slice_origin],
            [inner_indices_stop - inner_indices_start] + [i for i in slice_shape],
        ])
        result.append((slice_idx, compressed_slice, read_ranges))

    return result
def calculate_infections(states, contacts, params, indexers, group_probs, seed):
    """Determine who gets newly infected given the contacts of this period.

    The function prepares numba-compatible inputs (numpy arrays and typed
    lists) from ``states``, ``params``, ``indexers`` and ``group_probs``,
    draws a random processing order and delegates the actual work to
    ``_calculate_infections_numba``.

    Args:
        states (pandas.DataFrame): see :ref:`states`.
        contacts (pandas.DataFrame): One column per contact_model. Same index
            as states.
        params (pandas.DataFrame): See :ref:`params`.
        indexers (dict): Dict of numba.Typed.List The i_th entry of the lists
            are the indices of the i_th group.
        group_probs (dict): dict of arrays of shape n_group, n_groups.
            probs[i, j] is the probability that an individual from group i
            meets someone from group j.
        seed (itertools.count): Seed counter to control randomness. Two seeds
            are consumed: the first for the loop order, the second is handed
            to the numba function.

    Returns:
        infected_sr (pd.Series): Boolean Series that is True for newly
            infected people.
        states (pandas.DataFrame): Copy of states with updated ``immune``,
            ``infection_counter`` and ``missed_{contact_model}`` columns.

    """
    states = states.copy()

    # A contact model without group probabilities is flagged as "meet group".
    is_meet_group = np.array([model not in group_probs for model in indexers])

    infectious = states["infectious"].to_numpy(copy=True)
    immune = states["immune"].to_numpy(copy=True)

    group_code_columns = [f"group_codes_{cm}" for cm in indexers]
    group_codes = states[group_code_columns].to_numpy()

    infect_probs = np.array(
        [params.loc[("infection_prob", cm, None), "value"] for cm in indexers]
    )

    group_probs_list = NumbaList()
    for probs in group_probs.values():
        group_probs_list.append(probs)
    # nopython mode fails, if we leave the list empty or put a 1d array
    # inside the list.
    if len(group_probs_list) == 0:
        group_probs_list.append(np.zeros((0, 0)))

    indexers_list = NumbaList()
    for indexer in indexers.values():
        indexers_list.append(indexer)

    # Randomize the order in which (individual, contact model) pairs are
    # processed.
    np.random.seed(next(seed))
    all_pairs = itertools.product(range(len(states)), range(len(indexers)))
    loop_entries = np.array(list(all_pairs))
    n_entries = len(loop_entries)
    shuffled_positions = np.random.choice(n_entries, replace=False, size=n_entries)
    loop_order = loop_entries[shuffled_positions]

    numba_seed = next(seed)
    infected, infection_counter, immune, missed = _calculate_infections_numba(
        contacts,
        infectious,
        immune,
        group_codes,
        group_probs_list,
        indexers_list,
        infect_probs,
        numba_seed,
        is_meet_group,
        loop_order,
    )

    infected_sr = pd.Series(infected, index=states.index, dtype=bool)

    states["infection_counter"] += infection_counter
    for column, contact_model in enumerate(group_probs):
        states[f"missed_{contact_model}"] = missed[:, column]
    states["immune"] = immune

    return infected_sr, states
def _sample_data_for_calculate_infections_numba(
    n_individuals=None,
    n_contacts=None,
    infectious_share=None,
    group_shares=None,
    group_probabilities=None,
    infection_prob=None,
    seed=None,
):
    """Sample data for the calculation of new infections.

    Every argument that is left at ``None`` is drawn randomly with
    ``np.random``.

    Args:
        n_individuals (int, optional): Number of individuals. Drawn from
            ``[5, 1000)`` if omitted.
        n_contacts (int, optional): Number of contacts assigned to every
            individual. If omitted, each individual gets a random number
            from ``[2, 6)``.
        infectious_share (float, optional): Share used to determine how many
            indices are drawn (with replacement) and marked as infectious.
        group_shares (array-like, optional): Probabilities used to assign
            individuals to groups. If omitted, between one and three groups
            with random shares are created.
        group_probabilities (numpy.ndarray, optional): Array of shape
            (n_groups, n_groups). If omitted, a random matrix is drawn and
            its rows are normalised to sum to one.
        infection_prob (numpy.ndarray, optional): Infection probabilities.
            If omitted, a one-element array with a uniform draw is used.
        seed (int, optional): If given, passed to ``np.random.seed`` before
            anything is sampled.

    Returns:
        tuple: ``(contacts, infectious, immune, group_codes,
        group_probs_list, indexers_list, infection_prob, is_meet_group,
        loop_order)``. ``contacts`` and ``group_codes`` are reshaped to a
        single column, ``immune`` is a copy of ``infectious``.

    """
    if seed is not None:
        np.random.seed(seed)

    if n_individuals is None:
        n_individuals = np.random.randint(5, 1_000)

    if n_contacts is None:
        contacts = np.random.randint(2, 6, size=n_individuals)
    else:
        contacts = np.full(n_individuals, n_contacts)

    if infectious_share is None:
        infectious_share = np.random.uniform(0.000001, 1)

    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    infectious = np.zeros(n_individuals, dtype=bool)
    mask = np.random.choice(n_individuals, size=int(n_individuals * infectious_share))
    infectious[mask] = True

    immune = infectious.copy()

    if group_shares is None:
        n_groups = np.random.randint(1, 4)
        group_shares = np.random.uniform(0.1, 1, size=n_groups)
        group_shares = group_shares / group_shares.sum()
        # Put the remainder on the last share so that the shares sum to one.
        group_shares[-1] = 1 - group_shares[:-1].sum()

    n_groups = len(group_shares)
    group_codes = np.random.choice(n_groups, p=group_shares, size=n_individuals)

    if group_probabilities is None:
        group_probabilities = np.random.uniform(0.00001, 1, size=(n_groups, n_groups))
        group_probabilities = group_probabilities / group_probabilities.sum(
            axis=1, keepdims=True
        )

    # A single contact model is sampled, so the outer lists have one entry.
    group_probs_list = NumbaList()
    group_probs_list.append(group_probabilities)

    indexer = NumbaList()
    for group in range(n_groups):
        indexer.append(np.where(group_codes == group)[0])

    indexers_list = NumbaList()
    indexers_list.append(indexer)

    if infection_prob is None:
        ip = np.random.uniform()
        infection_prob = np.array([ip])

    is_meet_group = np.array([False])

    loop_order = np.array(list(itertools.product(range(n_individuals), range(1))))

    return (
        contacts.reshape(-1, 1),
        infectious,
        immune,
        group_codes.reshape(-1, 1),
        group_probs_list,
        indexers_list,
        infection_prob,
        is_meet_group,
        loop_order,
    )
def numba_apply(s: pd.Series, f) -> pd.Series:
    """Call ``f`` on the values of ``s`` packed into a numba typed list.

    Whatever ``f`` returns is wrapped in a new ``pd.Series``. No index is
    passed explicitly, so the index of ``s`` is not carried over.
    """
    typed_values = NumbaList(s.values)
    return pd.Series(f(typed_values))