def synthetic_stack( num_hyb: int=DEFAULT_NUM_HYB, num_ch: int=DEFAULT_NUM_CH, num_z: int=DEFAULT_NUM_Z, tile_height: int=DEFAULT_HEIGHT, tile_width: int=DEFAULT_WIDTH, tile_data_provider: Callable[[int, int, int, int, int], np.ndarray]=default_tile_data_provider, tile_extras_provider: Callable[[int, int, int], Any]=default_tile_extras_provider ) -> ImageStack: """generate a synthetic ImageStack Returns ------- ImageStack : imagestack containing a tensor of (2, 3, 4, 30, 20) whose values are all 1. """ img = TileSet( {Coordinates.X, Coordinates.Y, Indices.HYB, Indices.CH, Indices.Z}, { Indices.HYB: num_hyb, Indices.CH: num_ch, Indices.Z: num_z, }, default_tile_shape=(tile_height, tile_width), ) for hyb in range(num_hyb): for ch in range(num_ch): for z in range(num_z): tile = Tile( { Coordinates.X: (0.0, 0.001), Coordinates.Y: (0.0, 0.001), Coordinates.Z: (0.0, 0.001), }, { Indices.HYB: hyb, Indices.CH: ch, Indices.Z: z, }, extras=tile_extras_provider(hyb, ch, z), ) tile.numpy_array = tile_data_provider(hyb, ch, z, tile_height, tile_width) img.add_tile(tile) stack = ImageStack(img) return stack
def synthesize() -> Tuple[Stack, list]: """Synthesize synthetic spatial image-based transcriptomics data Returns ------- Stack : starfish Stack containing synthetic spots list : codebook matching the synthetic data """ # set random seed so that data is consistent across tests random.seed(2) np.random.seed(2) NUM_HYB = 4 NUM_CH = 2 NUM_Z = 1 HEIGHT = 100 WIDTH = 100 assert WIDTH == HEIGHT # for compatibility with the parameterization of the code def choose(n, k): if n == k: return [[1] * k] subsets = [[0] + a for a in choose(n - 1, k)] if k > 0: subsets += [[1] + a for a in choose(n - 1, k - 1)] return subsets def graham_sloane_codes(n): # n is length of codeword # number of on bits is 4 def code_sum(codeword): return sum([i * c for i, c in enumerate(codeword)]) % n return [c for c in choose(n, 4) if code_sum(c) == 0] p = { # number of on bits (not used with current codebook) 'N_high': 4, # length of barcode 'N_barcode': NUM_CH * NUM_HYB, # mean number of flourophores per transcripts - depends on amplification strategy (e.g HCR, bDNA) 'N_flour': 200, # mean number of photons per flourophore - depends on exposure time, bleaching rate of dye 'N_photons_per_flour': 50, # mean number of background photons per pixel - depends on tissue clearing and autoflourescence 'N_photon_background': 1000, # quantum efficiency of the camera detector units number of electrons per photon 'detection_efficiency': .25, # camera read noise per pixel in units electrons 'N_background_electrons': 1, # number of RNA puncta; keep this low to reduce overlap probability 'N_spots': 20, # height and width of image in pixel units 'N_size': WIDTH, # standard devitation of gaussian in pixel units 'psf': 2, # dynamic range of camera sensor 37,000 assuming a 16-bit AD converter 'graylevel': 37000.0 / 2 ** 16, # 16-bit AD converter 'bits': 16 } codebook = graham_sloane_codes(p['N_barcode']) def generate_spot(p): position = rand(2) gene = random.choice(range(len(codebook))) barcode = array(codebook[gene]) photons = [poisson(p['N_photons_per_flour']) * poisson(p['N_flour']) * b for b in barcode] return DataFrame({'position': [position], 'barcode': [barcode], 'photons': [photons], 'gene': gene}) # right now there is no jitter on x-y positions of the spots, we might want to make it a vector spots = concat([generate_spot(p) for _ in range(p['N_spots'])]) # type: ignore image = zeros((p['N_barcode'], p['N_size'], p['N_size'],)) for s in spots.itertuples(): image[:, int(p['N_size'] * s.position[0]), int(p['N_size'] * s.position[1])] = s.photons image_with_background = image + poisson(p['N_photon_background'], size=image.shape) filtered = array([gaussian(im, p['psf']) for im in image_with_background]) filtered = filtered * p['detection_efficiency'] + normal(scale=p['N_background_electrons'], size=filtered.shape) signal = np.array([(x / p['graylevel']).astype(int).clip(0, 2 ** p['bits']) for x in filtered]) def select_uint_dtype(array): """choose appropriate dtype based on values of an array""" max_val = np.max(array) for dtype in [np.uint8, np.uint16, np.uint32, np.uint64]: if max_val <= dtype(-1): return array.astype(dtype) raise ValueError('value exceeds dynamic range of largest numpy type') corrected_signal = select_uint_dtype(signal) rescaled_signal: np.ndarray = rescale_intensity(corrected_signal) # set up the tile set image_data = TileSet( {Coordinates.X, Coordinates.Y, Indices.HYB, Indices.CH, Indices.Z}, { Indices.HYB: NUM_HYB, Indices.CH: NUM_CH, Indices.Z: NUM_Z, }, default_tile_shape=(HEIGHT, WIDTH), ) # fill the TileSet experiment_indices = list(product(range(NUM_HYB), range(NUM_CH), range(NUM_Z))) for i, (hyb, ch, z) in enumerate(experiment_indices): tile = Tile( { Coordinates.X: (0.0, 0.001), Coordinates.Y: (0.0, 0.001), Coordinates.Z: (0.0, 0.001), }, { Indices.HYB: hyb, Indices.CH: ch, Indices.Z: z, } ) tile.numpy_array = rescaled_signal[i] image_data.add_tile(tile) data_stack = ImageStack(image_data) # make a max projection and pretend that's the dots image, which we'll create another ImageStack for this dots_data = TileSet( {Coordinates.X, Coordinates.Y, Indices.HYB, Indices.CH, Indices.Z}, { Indices.HYB: 1, Indices.CH: 1, Indices.Z: 1, }, default_tile_shape=(HEIGHT, WIDTH), ) tile = Tile( { Coordinates.X: (0.0, 0.001), Coordinates.Y: (0.0, 0.001), Coordinates.Z: (0.0, 0.001), }, { Indices.HYB: 0, Indices.CH: 0, Indices.Z: 0, } ) tile.numpy_array = np.max(rescaled_signal, 0) dots_data.add_tile(tile) dots_stack = ImageStack(dots_data) # TODO can we mock up a nuclei image somehow? # put the data together into a top-level Stack results = Stack.from_data(data_stack, aux_dict={'dots': dots_stack}) # make the codebook(s) codebook = [] for _, code_record in spots.iterrows(): codeword = [] for code_value, (hyb, ch, z) in zip(code_record['barcode'], experiment_indices): if code_value != 0: codeword.append({ Indices.HYB: hyb, Indices.CH: ch, Indices.Z: z, "v": code_value }) codebook.append( { 'codeword': codeword, 'gene_name': code_record['gene'] } ) return results, codebook