def setUp(self):
    self.frames_meta = meta_utils.make_dataframe()
    self.channel_ids = [0, 1, 2, 3, 4]
    self.slice_ids = [5, 6, 7]
    self.time_ids = [50]
    self.pos_ids = [3, 14, 26]
    # Create one metadata row per (channel, slice, time, position) combination
    for (c, z, t, p) in itertools.product(self.channel_ids,
                                          self.slice_ids,
                                          self.time_ids,
                                          self.pos_ids):
        meta_row = dict.fromkeys(meta_utils.DF_NAMES)
        meta_row['channel_idx'] = c
        meta_row['slice_idx'] = z
        meta_row['time_idx'] = t
        meta_row['pos_idx'] = p
        meta_row['sha256'] = 'AAAABBBB'
        meta_row['file_name'] = self._get_imname(meta_row)
        self.frames_meta = self.frames_meta.append(meta_row, ignore_index=True)
    self.im_height = 10
    self.im_width = 20
    self.im_colors = 1
    # Global metadata describing the full dataset
    self.global_meta = {
        "storage_dir": 'storage_dir_path',
        "nbr_frames": self.frames_meta.shape[0],
        "im_height": self.im_height,
        "im_width": self.im_width,
        "im_colors": self.im_colors,
        "bit_depth": 'uint16',
        "nbr_slices": len(np.unique(self.frames_meta["slice_idx"])),
        "nbr_channels": len(np.unique(self.frames_meta["channel_idx"])),
        "nbr_timepoints": len(np.unique(self.frames_meta["time_idx"])),
        "nbr_positions": len(np.unique(self.frames_meta["pos_idx"])),
    }
    self.storage_inst = data_storage.DataStorage('test_storage', 12)
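# For reference, a sketch of the naming convention that _get_imname
# presumably follows, based on file names used elsewhere in these tests
# (e.g. 'im_c000_z000_t005_p050.png'). The helper itself lives in the
# uploader classes, so treat this as an illustration, not its actual
# implementation.
def _example_imname(meta_row, ext='.png'):
    # Zero-pad each index to three digits
    return 'im_c{:03d}_z{:03d}_t{:03d}_p{:03d}{}'.format(
        meta_row['channel_idx'],
        meta_row['slice_idx'],
        meta_row['time_idx'],
        meta_row['pos_idx'],
        ext,
    )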
def test_get_stack_from_meta(self):
    # Upload image stack
    storage_dir = "raw_frames/ML-2005-05-23-10-00-00-0001"
    self.data_storage.upload_frames(self.stack_names, self.im_stack)
    global_meta = {
        "storage_dir": storage_dir,
        "nbr_frames": 5,
        "im_height": 10,
        "im_width": 15,
        "nbr_slices": 5,
        "nbr_channels": 1,
        "im_colors": 1,
        "bit_depth": "uint16",
        "nbr_timepoints": 1,
        "nbr_positions": 1,
    }
    # Download slices 1:4
    frames_meta = meta_utils.make_dataframe(nbr_frames=3)
    for i in range(3):
        sha = meta_utils.gen_sha256(self.im_stack[..., i + 1])
        frames_meta.loc[i] = [0, i + 1, 0, "A", self.stack_names[i + 1], 0, sha]
    im_stack, dim_order = self.data_storage.get_stack_from_meta(
        global_meta=global_meta,
        frames_meta=frames_meta,
    )
    # Stack has X = 10, Y = 15, grayscale, Z = 3, C = 1, T = 1, P = 1,
    # so expected stack shape and order should be:
    expected_shape = (10, 15, 3)
    nose.tools.assert_equal(im_stack.shape, expected_shape)
    nose.tools.assert_equal(dim_order, "XYZ")
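# The assertions above rely on get_stack_from_meta dropping singleton
# dimensions from both the stack and the dim_order string. A rough sketch
# of that behavior, assuming the full order is XYGZCTP (G = im_colors)
# and that X and Y are always kept; this is an illustration, not the
# storage class's actual implementation.
import numpy as np

def _example_squeeze(stack, dim_order='XYGZCTP'):
    # Keep X and Y plus every axis with more than one element
    keep = [i for i, s in enumerate(stack.shape) if s > 1 or i < 2]
    squeezed = stack.reshape([stack.shape[i] for i in keep])
    return squeezed, ''.join(dim_order[i] for i in keep)

# E.g. a (10, 15, 1, 3, 1, 1, 1) stack becomes (10, 15, 3) with order 'XYZ'.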
def test_make_dataframe():
    nbr_frames = 3
    test_col_names = ["A", "B"]
    frames_meta = meta_utils.make_dataframe(
        nbr_frames=nbr_frames,
        col_names=test_col_names,
    )
    nose.tools.assert_equal(frames_meta.shape, (nbr_frames, len(test_col_names)))
    nose.tools.assert_equal(test_col_names, list(frames_meta))
def test_make_empty_dataframe():
    expected_names = [
        "channel_idx",
        "slice_idx",
        "time_idx",
        "channel_name",
        "file_name",
        "pos_idx",
        "sha256",
    ]
    frames_meta = meta_utils.make_dataframe()
    nose.tools.assert_equal(expected_names, list(frames_meta))
    nose.tools.assert_true(frames_meta.empty)
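# The two tests above pin down make_dataframe's contract: the default
# columns are the DF_NAMES asserted in test_make_empty_dataframe, and
# nbr_frames preallocates that many empty rows. A minimal sketch of a
# function satisfying both tests (the real implementation lives in
# meta_utils):
import pandas as pd

DF_NAMES = ["channel_idx", "slice_idx", "time_idx", "channel_name",
            "file_name", "pos_idx", "sha256"]

def _example_make_dataframe(nbr_frames=None, col_names=DF_NAMES):
    if nbr_frames is not None:
        # Preallocate empty rows so .loc[i] assignment works later
        return pd.DataFrame(index=range(nbr_frames), columns=col_names)
    return pd.DataFrame(columns=col_names)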
def split_file(self, file_path, schema_filename):
    """
    Splits file into frames and gets metadata for each frame.
    set_frame_info must be called prior to this function call.

    :param str file_path: Full path to file
    :param str schema_filename: Full path to schema file name
    :return dataframe frames_meta: Metadata for all frames
    :return np.array im_stack: Image stack extracted from file
    """
    frames = tifffile.TiffFile(file_path)
    # Get global metadata
    nbr_frames = len(frames.pages)
    # Create image stack with image bit depth 16 or 8
    im_stack = np.empty((self.frame_shape[0],
                         self.frame_shape[1],
                         self.im_colors,
                         nbr_frames),
                        dtype=self.bit_depth)
    # Get metadata schema
    meta_schema = json_ops.read_json_file(schema_filename)
    # Convert frames to numpy stack and collect metadata.
    # Separate structured metadata (with known fields)
    # from unstructured, the latter goes into frames_json
    frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    # Pandas doesn't really support inserting dicts into dataframes,
    # so micromanager metadata goes into a separate list
    for i in range(nbr_frames):
        page = frames.pages[i]
        im_stack[..., i] = np.atleast_3d(page.asarray())
        # Get dict with metadata from json schema
        json_i, meta_i = json_ops.get_metadata_from_tags(
            page=page,
            meta_schema=meta_schema,
            validate=True,
        )
        self.frames_json.append(json_i)
        # Add required metadata fields to data frame
        meta_names = meta_utils.META_NAMES
        df_names = meta_utils.DF_NAMES
        for meta_name, df_name in zip(meta_names, df_names):
            if meta_name in meta_i.keys():
                frames_meta.loc[i, df_name] = meta_i[meta_name]
        # Create a file name and add it
        im_name = self._get_imname(frames_meta.loc[i])
        frames_meta.loc[i, "file_name"] = im_name
    return frames_meta, im_stack
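# A hypothetical usage sketch of split_file; 'splitter' and the paths
# are placeholders, and set_frame_info is assumed to have been called so
# frame_shape, im_colors and bit_depth are populated:
#
# frames_meta, im_stack = splitter.split_file(
#     file_path='/data/pos_000.ome.tif',
#     schema_filename='/config/metadata_schema.json',
# )
# splitter.data_uploader.upload_frames(
#     file_names=list(frames_meta['file_name']),
#     im_stack=im_stack,
# )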
def test_get_stack_from_meta(self):
    # Upload image stack
    storage_dir = "raw_frames/ML-2005-05-23-10-00-00-0001"
    data_storage = s3_storage.S3Storage(storage_dir, self.nbr_workers)
    data_storage.upload_frames(self.stack_names, self.im_stack)
    global_meta = {
        "storage_dir": storage_dir,
        "nbr_frames": 2,
        "im_height": 10,
        "im_width": 15,
        "nbr_slices": 1,
        "nbr_channels": 2,
        "im_colors": 1,
        "bit_depth": "uint16",
        "nbr_timepoints": 1,
        "nbr_positions": 1,
    }
    frames_meta = meta_utils.make_dataframe(
        nbr_frames=global_meta["nbr_frames"],
    )
    nbr_frames = self.im_stack.shape[2]
    sha = [None] * nbr_frames
    for i in range(nbr_frames):
        sha[i] = meta_utils.gen_sha256(self.im_stack[..., i])
    frames_meta.loc[0] = [0, 0, 0, "A", "im1.png", 0, sha[0]]
    frames_meta.loc[1] = [1, 0, 0, "B", "im2.png", 0, sha[1]]
    im_stack, dim_order = data_storage.get_stack_from_meta(
        global_meta,
        frames_meta,
    )
    # Stack has X = 10, Y = 15, grayscale, Z = 1, C = 2, T = 1, P = 1,
    # so expected stack shape and order should be:
    expected_shape = (10, 15, 2)
    nose.tools.assert_equal(im_stack.shape, expected_shape)
    nose.tools.assert_equal(dim_order, "XYC")
def get_frames_and_metadata(self, filename_parser='parse_idx_from_name'):
    """
    Frame metadata is extracted from each frame, and frames are uploaded
    on a file by file basis.
    Since metadata is separated from the files, the file name must contain
    the required indices channel_idx, slice_idx, time_idx and pos_idx.
    By default, the file name is assumed to contain four integers
    corresponding to these four indices. If that's not the case, you can
    specify a custom parser from filename_parsers.py.
    A global metadata dict is assumed to be in the same directory, in a
    file named metadata.txt. If it isn't, global_json will be empty and
    frame info will be determined from the first frame.
    The metadata.txt file (if present) is assumed to contain at least the
    following fields (with example values):
    'Summary': {
        'PixelType': 'GRAY16',
        'BitDepth': 16,
        'Width': 15,
        'Height': 10
    }

    :param str filename_parser: Function name in filename_parsers.py
    """
    assert os.path.isdir(self.data_path), \
        "Directory doesn't exist: {}".format(self.data_path)
    try:
        parse_func = getattr(file_parsers, filename_parser)
    except AttributeError as e:
        raise AttributeError(
            "Must use a filename_parsers function for file name. {}".format(e))
    frame_paths = natsort.natsorted(
        glob.glob(os.path.join(self.data_path, "*.tif")),
    )
    nbr_frames = len(frame_paths)
    metadata_path = os.path.join(self.data_path, "metadata.txt")
    if len(glob.glob(metadata_path)) == 1:
        self.global_json = json_ops.read_json_file(metadata_path)
        self.set_frame_info(self.global_json["Summary"])
    else:
        # No metadata.txt file in dir, get frame info from first frame
        self.set_frame_info_from_file(frame_paths[0])
        self.global_json = {}
    self.frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    self.frames_json = []
    # Loop over all the frames to get metadata
    for i, frame_path in enumerate(frame_paths):
        # Get structured frame metadata
        self.frames_meta.loc[i] = self._set_frame_meta(
            parse_func=parse_func,
            file_name=frame_path,
        )
    # Use multiprocessing for more efficient file read and upload
    file_names = self.frames_meta['file_name']
    with concurrent.futures.ProcessPoolExecutor(self.nbr_workers) as ex:
        res = ex.map(self.serialize_upload, zip(frame_paths, file_names))
    # Collect metadata for each uploaded file
    for i, (sha256, dict_i) in enumerate(res):
        self.frames_json.append(json.loads(dict_i))
        self.frames_meta.loc[i, 'sha256'] = sha256
    # Set global metadata
    self.set_global_meta(nbr_frames=nbr_frames)
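# A minimal sketch of what a parse_idx_from_name-style parser could look
# like, assuming the default convention of four integers in the file name
# ordered as channel, slice, time, position; the actual parser lives in
# filename_parsers.py.
import os
import re

def _example_parse_idx_from_name(file_path):
    ints = re.findall(r'\d+', os.path.basename(file_path))
    assert len(ints) >= 4, "Expected 4 indices in {}".format(file_path)
    keys = ['channel_idx', 'slice_idx', 'time_idx', 'pos_idx']
    # E.g. 'im_c001_z005_t000_p010.tif' -> {'channel_idx': 1, ...}
    return {key: int(val) for key, val in zip(keys, ints)}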
def get_frames_and_metadata(self, schema_filename, positions=None):
    """
    Reads an ome.tiff file into memory and separates image frames and
    metadata. Workaround in case I need to read ome-xml:
    https://github.com/soft-matter/pims/issues/125
    It is assumed that all metadata lives as dicts inside tiff frame tags.
    NOTE: It seems like the IJMetadata Info field is a dict converted into
    a string, and it's only present in the first frame.

    :param str schema_filename: Full path to metadata json schema file
    :param [None, list of ints] positions: Position files to upload.
        If None, all positions are uploaded.
    """
    if positions is None:
        positions = []
    if os.path.isfile(self.data_path):
        # Run through processing only once
        file_paths = [self.data_path]
        # Only one file so don't consider positions
        positions = []
    else:
        # Get position files in the folder
        file_paths = glob.glob(os.path.join(self.data_path, "*.ome.tif"))
        assert len(file_paths) > 0, \
            "Can't find ome.tifs in {}".format(self.data_path)
    # Parse positions
    if isinstance(positions, str):
        positions = json_ops.str2json(positions)
    if isinstance(positions, int):
        positions = [positions]
    # Read first file to find available positions
    frames = tifffile.TiffFile(file_paths[0])
    # Get global metadata
    page = frames.pages[0]
    # Set frame info. This should not vary between positions
    self.set_frame_info(page)
    # IJMetadata only exists in first frame, so that goes into global json
    self.global_json = json_ops.get_global_json(
        page=page,
        file_name=self.data_path,
    )
    # Validate given positions
    if len(positions) > 0:
        file_paths = self._validate_file_paths(
            positions=positions,
            glob_paths=file_paths,
        )
    self.frames_meta = meta_utils.make_dataframe()
    self.frames_json = []
    pos_prog_bar = tqdm(file_paths, desc='Position')
    for file_path in pos_prog_bar:
        file_meta, im_stack = self.split_file(
            file_path,
            schema_filename,
        )
        sha = self._generate_hash(im_stack)
        file_meta['sha256'] = sha
        self.frames_meta = self.frames_meta.append(
            file_meta,
            ignore_index=True,
        )
        # Upload frames in file to S3
        self.data_uploader.upload_frames(
            file_names=list(file_meta["file_name"]),
            im_stack=im_stack,
        )
    # Finally, set global metadata from frames_meta
    self.set_global_meta(nbr_frames=self.frames_meta.shape[0])
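# A hypothetical call uploading only positions 1 and 3 from a folder of
# ome.tifs ('splitter' and the schema path are placeholders):
#
# splitter.get_frames_and_metadata(
#     schema_filename='/config/metadata_schema.json',
#     positions=[1, 3],
# )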
def setUp(self):
    super().setUp()
    self.dataset_serial = 'TEST-2005-10-09-20-00-00-0001'
    self.global_meta = {
        "storage_dir": "dir_name",
        "nbr_frames": 6,
        "im_height": 256,
        "im_width": 512,
        "im_colors": 1,
        "bit_depth": "uint16",
        "nbr_slices": 2,
        "nbr_channels": 3,
        "nbr_timepoints": 1,
        "nbr_positions": 1,
    }
    self.global_json_meta = {'status': 'test'}
    self.microscope = 'test_microscope'
    self.description = 'This is a test'
    self.storage_dir = 'testing/TEST-2005-10-09-20-00-00-0001'
    self.sha256 = 'aaabbbccc'
    self.frames_meta = meta_utils.make_dataframe(6)
    self.frames_json_meta = []
    self.meta_dict = {'local_key': 'local_value'}
    self.channel_names = ['brightfield', 'phase', '405']
    for i, (c, z) in enumerate(itertools.product(range(3), range(2))):
        im_name = 'im_c00{}_z00{}_t005_p050.png'.format(c, z)
        self.frames_meta.loc[i, 'file_name'] = im_name
        self.frames_meta.loc[i, 'channel_idx'] = c
        self.frames_meta.loc[i, 'channel_name'] = self.channel_names[c]
        self.frames_meta.loc[i, 'slice_idx'] = z
        self.frames_meta.loc[i, 'pos_idx'] = 50
        self.frames_meta.loc[i, 'time_idx'] = 5
        self.frames_meta.loc[i, 'sha256'] = self.sha256
        self.frames_json_meta.append(self.meta_dict)
    # Insert frames
    self.db_inst = db_ops.DatabaseOperations(
        dataset_serial=self.dataset_serial,
    )
    self.db_inst.insert_frames(
        session=self.session,
        description='test frames',
        frames_meta=self.frames_meta,
        frames_json_meta=self.frames_json_meta,
        global_meta=self.global_meta,
        global_json_meta=self.global_json_meta,
        microscope=self.microscope,
        parent_dataset=None,
    )
    # Add a few more datasets for queries
    self.dataset_ids = [
        'PROJECT-2010-04-01-00-00-00-0001',
        'PROJECT-2010-05-01-00-00-00-0001',
        'PROJECT-2010-06-01-00-00-00-0001',
    ]
    self.descriptions = [
        'First dataset test',
        'Second dataset',
        'Third dataset',
    ]
    self.microscopes = ['scope1', 'scope2', 'scope2']
    for i in range(len(self.dataset_ids)):
        new_dataset = db_ops.DataSet(
            dataset_serial=self.dataset_ids[i],
            description=self.descriptions[i],
            frames=True,
            microscope=self.microscopes[i],
            parent_id=None,
        )
        self.session.add(new_dataset)
    self.session.commit()
    # Query frames
    self.frames = self.session.query(db_ops.Frames) \
        .join(db_ops.FramesGlobal) \
        .join(db_ops.DataSet) \
        .filter(db_ops.DataSet.dataset_serial == self.dataset_serial) \
        .order_by(db_ops.Frames.file_name)
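# A sketch of the kind of dataset query the extra rows above make
# possible, using the session and models set up in this class; this is
# an illustration, not one of the repo's tests:
#
# scope2_sets = self.session.query(db_ops.DataSet) \
#     .filter(db_ops.DataSet.microscope == 'scope2') \
#     .all()
# assert len(scope2_sets) == 2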
def get_frames_and_metadata(self, filename_parser=None):
    """
    Reads tif files into memory and separates image frames and metadata.
    Use this class if no MicroManagerMetadata tag is present, but you have
    an ImageDescription tag.
    It assumes that if there is any information on the number of channels,
    slices, timepoints or positions, it is embedded as a string in the
    ImageDescription tag of the first frame.
    It assumes the acquisition order is:
    1) channels, 2) slices, 3) positions, 4) frames.
    There is no way of validating the order because only the number of
    each is specified, so use at your own risk.

    :param str filename_parser: Optional function name that will generate
        global json metadata from file name.
    """
    assert os.path.isfile(self.data_path), \
        "File doesn't exist: {}".format(self.data_path)
    frames = tifffile.TiffFile(self.data_path)
    # Get global metadata
    page = frames.pages[0]
    nbr_frames = len(frames.pages)
    float2uint = self.set_frame_info(page)
    # Create image stack with image bit depth 16 or 8
    self.im_stack = np.empty((self.frame_shape[0],
                              self.frame_shape[1],
                              self.im_colors,
                              nbr_frames),
                             dtype=self.bit_depth)
    # Get what little channel info there is from image description
    indices = self._get_params_from_str(
        page.tags["ImageDescription"].value,
    )
    # Get global json metadata
    if filename_parser is not None:
        parse_func = getattr(file_parsers, filename_parser)
        self.global_json = parse_func(self.data_path)
    else:
        self.global_json = {}
    self.global_json["file_origin"] = self.data_path
    # Convert frames to numpy stack and collect metadata
    self.frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    self.frames_json = []
    # Loop over all the frames to get data and metadata
    variable_iterator = itertools.product(
        range(indices['nbr_timepoints']),
        range(indices['nbr_positions']),
        range(indices['nbr_slices']),
        range(indices['nbr_channels']),
    )
    for i, (time_idx, pos_idx, slice_idx, channel_idx) in \
            enumerate(variable_iterator):
        page = frames.pages[i]
        try:
            im = page.asarray()
        except ValueError as e:
            raise ValueError(
                "Can't read page {} of {}: {}".format(i, self.data_path, e))
        if float2uint:
            assert im.max() < 65536, \
                "Im > 16 bit, max: {}".format(im.max())
            im = im.astype(np.uint16)
        self.im_stack[..., i] = np.atleast_3d(im)
        tiftags = page.tags
        # Get all frame specific metadata
        dict_i = {}
        for t in tiftags.keys():
            # IJMetadata often contains an ndarray LUT which is not serializable
            if t != 'IJMetadata':
                dict_i[t] = tiftags[t].value
        self.frames_json.append(dict_i)
        meta_row = dict.fromkeys(meta_utils.DF_NAMES)
        meta_row["channel_name"] = None
        meta_row["channel_idx"] = channel_idx
        meta_row["time_idx"] = time_idx
        meta_row["pos_idx"] = pos_idx
        meta_row["slice_idx"] = slice_idx
        meta_row["file_name"] = self._get_imname(meta_row)
        self.frames_meta.loc[i] = meta_row
    sha = self._generate_hash(self.im_stack)
    self.frames_meta['sha256'] = sha
    # Set global metadata
    self.set_global_meta(nbr_frames=nbr_frames)
    self.data_uploader.upload_frames(
        file_names=self.frames_meta["file_name"],
        im_stack=self.im_stack,
    )
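# _get_params_from_str isn't shown here. For reference, ImageJ writes
# ImageDescription as newline-separated key=value pairs, e.g.
# "ImageJ=1.52e\nimages=20\nchannels=2\nslices=5\nframes=2". A sketch of
# a parser extracting the counts this method iterates over, defaulting
# each to 1 when a key is absent (an assumption, not the repo's
# implementation; 'positions' in particular is not a standard ImageJ key):
def _example_params_from_str(im_description):
    ij_keys = {'channels': 'nbr_channels',
               'slices': 'nbr_slices',
               'frames': 'nbr_timepoints',
               'positions': 'nbr_positions'}
    indices = {name: 1 for name in ij_keys.values()}
    for line in im_description.splitlines():
        key, _, val = line.partition('=')
        if key in ij_keys:
            indices[ij_keys[key]] = int(val)
    return indices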