def setUp(self):
    self.frames_meta = meta_utils.make_dataframe()
    self.channel_ids = [0, 1, 2, 3, 4]
    self.slice_ids = [5, 6, 7]
    self.time_ids = [50]
    self.pos_ids = [3, 14, 26]
    # Create one metadata row per (channel, slice, time, position) combination
    for (c, z, t, p) in itertools.product(self.channel_ids,
                                          self.slice_ids,
                                          self.time_ids,
                                          self.pos_ids):
        meta_row = dict.fromkeys(meta_utils.DF_NAMES)
        meta_row['channel_idx'] = c
        meta_row['slice_idx'] = z
        meta_row['time_idx'] = t
        meta_row['pos_idx'] = p
        meta_row['sha256'] = 'AAAABBBB'
        meta_row['file_name'] = self._get_imname(meta_row)
        self.frames_meta = self.frames_meta.append(meta_row, ignore_index=True)
    self.im_height = 10
    self.im_width = 20
    self.im_colors = 1
    # Global metadata describing the full dataset
    self.global_meta = {
        "storage_dir": 'storage_dir_path',
        "nbr_frames": self.frames_meta.shape[0],
        "im_height": self.im_height,
        "im_width": self.im_width,
        "im_colors": self.im_colors,
        "bit_depth": 'uint16',
        "nbr_slices": len(np.unique(self.frames_meta["slice_idx"])),
        "nbr_channels": len(np.unique(self.frames_meta["channel_idx"])),
        "nbr_timepoints": len(np.unique(self.frames_meta["time_idx"])),
        "nbr_positions": len(np.unique(self.frames_meta["pos_idx"])),
    }
    self.storage_inst = data_storage.DataStorage('test_storage', 12)
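# For reference, a sketch of the naming convention that _get_imname
# presumably follows, based on file names used elsewhere in these tests
# (e.g. 'im_c000_z000_t005_p050.png'). The helper itself lives in the
# uploader classes, so treat this as an illustration, not its actual
# implementation.
def _example_imname(meta_row, ext='.png'):
    # Zero-pad each index to three digits
    return 'im_c{:03d}_z{:03d}_t{:03d}_p{:03d}{}'.format(
        meta_row['channel_idx'],
        meta_row['slice_idx'],
        meta_row['time_idx'],
        meta_row['pos_idx'],
        ext,
    )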
def test_get_stack_from_meta(self):
    # Upload image stack
    storage_dir = "raw_frames/ML-2005-05-23-10-00-00-0001"
    self.data_storage.upload_frames(self.stack_names, self.im_stack)
    global_meta = {
        "storage_dir": storage_dir,
        "nbr_frames": 5,
        "im_height": 10,
        "im_width": 15,
        "nbr_slices": 5,
        "nbr_channels": 1,
        "im_colors": 1,
        "bit_depth": "uint16",
        "nbr_timepoints": 1,
        "nbr_positions": 1,
    }
    # Download slices 1:4
    frames_meta = meta_utils.make_dataframe(nbr_frames=3)
    for i in range(3):
        sha = meta_utils.gen_sha256(self.im_stack[..., i + 1])
        frames_meta.loc[i] = [0, i + 1, 0, "A", self.stack_names[i + 1], 0, sha]
    im_stack, dim_order = self.data_storage.get_stack_from_meta(
        global_meta=global_meta,
        frames_meta=frames_meta,
    )
    # Stack has X = 10, Y = 15, grayscale, Z = 3, C = 1, T = 1, P = 1,
    # so expected stack shape and order should be:
    expected_shape = (10, 15, 3)
    nose.tools.assert_equal(im_stack.shape, expected_shape)
    nose.tools.assert_equal(dim_order, "XYZ")
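# The assertions above rely on get_stack_from_meta dropping singleton
# dimensions from both the stack and the dim_order string. A rough sketch
# of that behavior, assuming the full order is XYGZCTP (G = im_colors)
# and that X and Y are always kept; this is an illustration, not the
# storage class's actual implementation.
import numpy as np

def _example_squeeze(stack, dim_order='XYGZCTP'):
    # Keep X and Y plus every axis with more than one element
    keep = [i for i, s in enumerate(stack.shape) if s > 1 or i < 2]
    squeezed = stack.reshape([stack.shape[i] for i in keep])
    return squeezed, ''.join(dim_order[i] for i in keep)

# E.g. a (10, 15, 1, 3, 1, 1, 1) stack becomes (10, 15, 3) with order 'XYZ'.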
def test_make_dataframe():
    nbr_frames = 3
    test_col_names = ["A", "B"]
    frames_meta = meta_utils.make_dataframe(
        nbr_frames=nbr_frames,
        col_names=test_col_names,
    )
    nose.tools.assert_equal(frames_meta.shape, (nbr_frames, len(test_col_names)))
    nose.tools.assert_equal(test_col_names, list(frames_meta))
def test_make_empty_dataframe():
    expected_names = [
        "channel_idx",
        "slice_idx",
        "time_idx",
        "channel_name",
        "file_name",
        "pos_idx",
        "sha256",
    ]
    frames_meta = meta_utils.make_dataframe()
    nose.tools.assert_equal(expected_names, list(frames_meta))
    nose.tools.assert_true(frames_meta.empty)
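# The two tests above pin down make_dataframe's contract: the default
# columns are the DF_NAMES asserted in test_make_empty_dataframe, and
# nbr_frames preallocates that many empty rows. A minimal sketch of a
# function satisfying both tests (the real implementation lives in
# meta_utils):
import pandas as pd

DF_NAMES = ["channel_idx", "slice_idx", "time_idx", "channel_name",
            "file_name", "pos_idx", "sha256"]

def _example_make_dataframe(nbr_frames=None, col_names=DF_NAMES):
    if nbr_frames is not None:
        # Preallocate empty rows so .loc[i] assignment works later
        return pd.DataFrame(index=range(nbr_frames), columns=col_names)
    return pd.DataFrame(columns=col_names)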
def split_file(self, file_path, schema_filename):
    """
    Splits file into frames and gets metadata for each frame.
    set_frame_info must be called prior to this function call.

    :param str file_path: Full path to file
    :param str schema_filename: Full path to schema file name
    :return dataframe frames_meta: Metadata for all frames
    :return np.array im_stack: Image stack extracted from file
    """
    frames = tifffile.TiffFile(file_path)
    # Get global metadata
    nbr_frames = len(frames.pages)
    # Create image stack with image bit depth 16 or 8
    im_stack = np.empty((self.frame_shape[0],
                         self.frame_shape[1],
                         self.im_colors,
                         nbr_frames),
                        dtype=self.bit_depth)
    # Get metadata schema
    meta_schema = json_ops.read_json_file(schema_filename)
    # Convert frames to numpy stack and collect metadata.
    # Separate structured metadata (with known fields)
    # from unstructured, the latter goes into frames_json
    frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    # Pandas doesn't really support inserting dicts into dataframes,
    # so micromanager metadata goes into a separate list
    for i in range(nbr_frames):
        page = frames.pages[i]
        im_stack[..., i] = np.atleast_3d(page.asarray())
        # Get dict with metadata from json schema
        json_i, meta_i = json_ops.get_metadata_from_tags(
            page=page,
            meta_schema=meta_schema,
            validate=True,
        )
        self.frames_json.append(json_i)
        # Add required metadata fields to data frame
        meta_names = meta_utils.META_NAMES
        df_names = meta_utils.DF_NAMES
        for meta_name, df_name in zip(meta_names, df_names):
            if meta_name in meta_i.keys():
                frames_meta.loc[i, df_name] = meta_i[meta_name]
        # Create a file name and add it
        im_name = self._get_imname(frames_meta.loc[i])
        frames_meta.loc[i, "file_name"] = im_name
    return frames_meta, im_stack
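# A hypothetical usage sketch of split_file; 'splitter' and the paths
# are placeholders, and set_frame_info is assumed to have been called so
# frame_shape, im_colors and bit_depth are populated:
#
# frames_meta, im_stack = splitter.split_file(
#     file_path='/data/pos_000.ome.tif',
#     schema_filename='/config/metadata_schema.json',
# )
# splitter.data_uploader.upload_frames(
#     file_names=list(frames_meta['file_name']),
#     im_stack=im_stack,
# )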
def test_get_stack_from_meta(self):
    # Upload image stack
    storage_dir = "raw_frames/ML-2005-05-23-10-00-00-0001"
    data_storage = s3_storage.S3Storage(storage_dir, self.nbr_workers)
    data_storage.upload_frames(self.stack_names, self.im_stack)
    global_meta = {
        "storage_dir": storage_dir,
        "nbr_frames": 2,
        "im_height": 10,
        "im_width": 15,
        "nbr_slices": 1,
        "nbr_channels": 2,
        "im_colors": 1,
        "bit_depth": "uint16",
        "nbr_timepoints": 1,
        "nbr_positions": 1,
    }
    frames_meta = meta_utils.make_dataframe(
        nbr_frames=global_meta["nbr_frames"],
    )
    nbr_frames = self.im_stack.shape[2]
    sha = [None] * nbr_frames
    for i in range(nbr_frames):
        sha[i] = meta_utils.gen_sha256(self.im_stack[..., i])
    frames_meta.loc[0] = [0, 0, 0, "A", "im1.png", 0, sha[0]]
    frames_meta.loc[1] = [1, 0, 0, "B", "im2.png", 0, sha[1]]
    im_stack, dim_order = data_storage.get_stack_from_meta(
        global_meta,
        frames_meta,
    )
    # Stack has X = 10, Y = 15, grayscale, Z = 1, C = 2, T = 1, P = 1,
    # so expected stack shape and order should be:
    expected_shape = (10, 15, 2)
    nose.tools.assert_equal(im_stack.shape, expected_shape)
    nose.tools.assert_equal(dim_order, "XYC")
def get_frames_and_metadata(self, filename_parser='parse_idx_from_name'):
    """
    Frame metadata is extracted from each frame, and frames are uploaded
    on a file by file basis.
    Since metadata is separated from the files, the file name must contain
    the required indices channel_idx, slice_idx, time_idx and pos_idx.
    By default, the file name is assumed to contain four integers
    corresponding to these four indices. If that's not the case, you can
    specify a custom parser from filename_parsers.py.
    A global metadata dict is assumed to be in the same directory, in a
    file named metadata.txt. If it isn't, global_json will be empty and
    frame info will be determined from the first frame.
    The metadata.txt file (if present) is assumed to contain at least the
    following fields (with example values):
    'Summary': {
        'PixelType': 'GRAY16',
        'BitDepth': 16,
        'Width': 15,
        'Height': 10
    }

    :param str filename_parser: Function name in filename_parsers.py
    """
    assert os.path.isdir(self.data_path), \
        "Directory doesn't exist: {}".format(self.data_path)
    try:
        parse_func = getattr(file_parsers, filename_parser)
    except AttributeError as e:
        raise AttributeError(
            "Must use a filename_parsers function for file name. {}".format(e))
    frame_paths = natsort.natsorted(
        glob.glob(os.path.join(self.data_path, "*.tif")),
    )
    nbr_frames = len(frame_paths)
    metadata_path = os.path.join(self.data_path, "metadata.txt")
    if len(glob.glob(metadata_path)) == 1:
        self.global_json = json_ops.read_json_file(metadata_path)
        self.set_frame_info(self.global_json["Summary"])
    else:
        # No metadata.txt file in dir, get frame info from first frame
        self.set_frame_info_from_file(frame_paths[0])
        self.global_json = {}
    self.frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    self.frames_json = []
    # Loop over all the frames to get metadata
    for i, frame_path in enumerate(frame_paths):
        # Get structured frame metadata
        self.frames_meta.loc[i] = self._set_frame_meta(
            parse_func=parse_func,
            file_name=frame_path,
        )
    # Use multiprocessing for more efficient file read and upload
    file_names = self.frames_meta['file_name']
    with concurrent.futures.ProcessPoolExecutor(self.nbr_workers) as ex:
        res = ex.map(self.serialize_upload, zip(frame_paths, file_names))
    # Collect metadata for each uploaded file
    for i, (sha256, dict_i) in enumerate(res):
        self.frames_json.append(json.loads(dict_i))
        self.frames_meta.loc[i, 'sha256'] = sha256
    # Set global metadata
    self.set_global_meta(nbr_frames=nbr_frames)
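# A minimal sketch of what a parse_idx_from_name-style parser could look
# like, assuming the default convention of four integers in the file name
# ordered as channel, slice, time, position; the actual parser lives in
# filename_parsers.py.
import os
import re

def _example_parse_idx_from_name(file_path):
    ints = re.findall(r'\d+', os.path.basename(file_path))
    assert len(ints) >= 4, "Expected 4 indices in {}".format(file_path)
    keys = ['channel_idx', 'slice_idx', 'time_idx', 'pos_idx']
    # E.g. 'im_c001_z005_t000_p010.tif' -> {'channel_idx': 1, ...}
    return {key: int(val) for key, val in zip(keys, ints)}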
def get_frames_and_metadata(self, schema_filename, positions=None):
    """
    Reads an ome.tiff file into memory and separates image frames and
    metadata. Workaround in case I need to read ome-xml:
    https://github.com/soft-matter/pims/issues/125
    It is assumed that all metadata lives as dicts inside tiff frame tags.
    NOTE: It seems like the IJMetadata Info field is a dict converted into
    a string, and it's only present in the first frame.

    :param str schema_filename: Full path to metadata json schema file
    :param [None, list of ints] positions: Position files to upload.
        If None, all positions are uploaded.
    """
    if positions is None:
        positions = []
    if os.path.isfile(self.data_path):
        # Run through processing only once
        file_paths = [self.data_path]
        # Only one file so don't consider positions
        positions = []
    else:
        # Get position files in the folder
        file_paths = glob.glob(os.path.join(self.data_path, "*.ome.tif"))
        assert len(file_paths) > 0, \
            "Can't find ome.tifs in {}".format(self.data_path)
    # Parse positions
    if isinstance(positions, str):
        positions = json_ops.str2json(positions)
    if isinstance(positions, int):
        positions = [positions]
    # Read first file to find available positions
    frames = tifffile.TiffFile(file_paths[0])
    # Get global metadata
    page = frames.pages[0]
    # Set frame info. This should not vary between positions
    self.set_frame_info(page)
    # IJMetadata only exists in first frame, so that goes into global json
    self.global_json = json_ops.get_global_json(
        page=page,
        file_name=self.data_path,
    )
    # Validate given positions
    if len(positions) > 0:
        file_paths = self._validate_file_paths(
            positions=positions,
            glob_paths=file_paths,
        )
    self.frames_meta = meta_utils.make_dataframe()
    self.frames_json = []
    pos_prog_bar = tqdm(file_paths, desc='Position')
    for file_path in pos_prog_bar:
        file_meta, im_stack = self.split_file(
            file_path,
            schema_filename,
        )
        sha = self._generate_hash(im_stack)
        file_meta['sha256'] = sha
        self.frames_meta = self.frames_meta.append(
            file_meta,
            ignore_index=True,
        )
        # Upload frames in file to S3
        self.data_uploader.upload_frames(
            file_names=list(file_meta["file_name"]),
            im_stack=im_stack,
        )
    # Finally, set global metadata from frames_meta
    self.set_global_meta(nbr_frames=self.frames_meta.shape[0])
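# A hypothetical call uploading only positions 1 and 3 from a folder of
# ome.tifs ('splitter' and the schema path are placeholders):
#
# splitter.get_frames_and_metadata(
#     schema_filename='/config/metadata_schema.json',
#     positions=[1, 3],
# )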
def setUp(self):
    super().setUp()
    self.dataset_serial = 'TEST-2005-10-09-20-00-00-0001'
    self.global_meta = {
        "storage_dir": "dir_name",
        "nbr_frames": 6,
        "im_height": 256,
        "im_width": 512,
        "im_colors": 1,
        "bit_depth": "uint16",
        "nbr_slices": 2,
        "nbr_channels": 3,
        "nbr_timepoints": 1,
        "nbr_positions": 1,
    }
    self.global_json_meta = {'status': 'test'}
    self.microscope = 'test_microscope'
    self.description = 'This is a test'
    self.storage_dir = 'testing/TEST-2005-10-09-20-00-00-0001'
    self.sha256 = 'aaabbbccc'
    self.frames_meta = meta_utils.make_dataframe(6)
    self.frames_json_meta = []
    self.meta_dict = {'local_key': 'local_value'}
    self.channel_names = ['brightfield', 'phase', '405']
    for i, (c, z) in enumerate(itertools.product(range(3), range(2))):
        im_name = 'im_c00{}_z00{}_t005_p050.png'.format(c, z)
        self.frames_meta.loc[i, 'file_name'] = im_name
        self.frames_meta.loc[i, 'channel_idx'] = c
        self.frames_meta.loc[i, 'channel_name'] = self.channel_names[c]
        self.frames_meta.loc[i, 'slice_idx'] = z
        self.frames_meta.loc[i, 'pos_idx'] = 50
        self.frames_meta.loc[i, 'time_idx'] = 5
        self.frames_meta.loc[i, 'sha256'] = self.sha256
        self.frames_json_meta.append(self.meta_dict)
    # Insert frames
    self.db_inst = db_ops.DatabaseOperations(
        dataset_serial=self.dataset_serial,
    )
    self.db_inst.insert_frames(
        session=self.session,
        description='test frames',
        frames_meta=self.frames_meta,
        frames_json_meta=self.frames_json_meta,
        global_meta=self.global_meta,
        global_json_meta=self.global_json_meta,
        microscope=self.microscope,
        parent_dataset=None,
    )
    # Add a few more datasets for queries
    self.dataset_ids = [
        'PROJECT-2010-04-01-00-00-00-0001',
        'PROJECT-2010-05-01-00-00-00-0001',
        'PROJECT-2010-06-01-00-00-00-0001',
    ]
    self.descriptions = [
        'First dataset test',
        'Second dataset',
        'Third dataset',
    ]
    self.microscopes = ['scope1', 'scope2', 'scope2']
    for i in range(len(self.dataset_ids)):
        new_dataset = db_ops.DataSet(
            dataset_serial=self.dataset_ids[i],
            description=self.descriptions[i],
            frames=True,
            microscope=self.microscopes[i],
            parent_id=None,
        )
        self.session.add(new_dataset)
    self.session.commit()
    # Query frames
    self.frames = self.session.query(db_ops.Frames) \
        .join(db_ops.FramesGlobal) \
        .join(db_ops.DataSet) \
        .filter(db_ops.DataSet.dataset_serial == self.dataset_serial) \
        .order_by(db_ops.Frames.file_name)
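# A sketch of the kind of dataset query the extra rows above make
# possible, using the session and models set up in this class; this is
# an illustration, not one of the repo's tests:
#
# scope2_sets = self.session.query(db_ops.DataSet) \
#     .filter(db_ops.DataSet.microscope == 'scope2') \
#     .all()
# assert len(scope2_sets) == 2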
def get_frames_and_metadata(self, filename_parser=None):
    """
    Reads tif files into memory and separates image frames and metadata.
    Use this class if no MicroManagerMetadata tag is present, but you have
    an ImageDescription tag.
    It assumes that if there is any information on the number of channels,
    slices, timepoints or positions, it is embedded as a string in the
    ImageDescription tag of the first frame.
    It assumes the acquisition order is:
    1) channels, 2) slices, 3) positions, 4) frames.
    There is no way of validating the order because only the number of
    each is specified, so use at your own risk.

    :param str filename_parser: Optional function name that will generate
        global json metadata from file name.
    """
    assert os.path.isfile(self.data_path), \
        "File doesn't exist: {}".format(self.data_path)
    frames = tifffile.TiffFile(self.data_path)
    # Get global metadata
    page = frames.pages[0]
    nbr_frames = len(frames.pages)
    float2uint = self.set_frame_info(page)
    # Create image stack with image bit depth 16 or 8
    self.im_stack = np.empty((self.frame_shape[0],
                              self.frame_shape[1],
                              self.im_colors,
                              nbr_frames),
                             dtype=self.bit_depth)
    # Get what little channel info there is from image description
    indices = self._get_params_from_str(
        page.tags["ImageDescription"].value,
    )
    # Get global json metadata
    if filename_parser is not None:
        parse_func = getattr(file_parsers, filename_parser)
        self.global_json = parse_func(self.data_path)
    else:
        self.global_json = {}
    self.global_json["file_origin"] = self.data_path
    # Convert frames to numpy stack and collect metadata
    self.frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    self.frames_json = []
    # Loop over all the frames to get data and metadata
    variable_iterator = itertools.product(
        range(indices['nbr_timepoints']),
        range(indices['nbr_positions']),
        range(indices['nbr_slices']),
        range(indices['nbr_channels']),
    )
    for i, (time_idx, pos_idx, slice_idx, channel_idx) in \
            enumerate(variable_iterator):
        page = frames.pages[i]
        try:
            im = page.asarray()
        except ValueError as e:
            raise ValueError(
                "Can't read page {} of {}: {}".format(i, self.data_path, e))
        if float2uint:
            assert im.max() < 65536, \
                "Im > 16 bit, max: {}".format(im.max())
            im = im.astype(np.uint16)
        self.im_stack[..., i] = np.atleast_3d(im)
        tiftags = page.tags
        # Get all frame specific metadata
        dict_i = {}
        for t in tiftags.keys():
            # IJMetadata often contains an ndarray LUT which is not serializable
            if t != 'IJMetadata':
                dict_i[t] = tiftags[t].value
        self.frames_json.append(dict_i)
        meta_row = dict.fromkeys(meta_utils.DF_NAMES)
        meta_row["channel_name"] = None
        meta_row["channel_idx"] = channel_idx
        meta_row["time_idx"] = time_idx
        meta_row["pos_idx"] = pos_idx
        meta_row["slice_idx"] = slice_idx
        meta_row["file_name"] = self._get_imname(meta_row)
        self.frames_meta.loc[i] = meta_row
    sha = self._generate_hash(self.im_stack)
    self.frames_meta['sha256'] = sha
    # Set global metadata
    self.set_global_meta(nbr_frames=nbr_frames)
    self.data_uploader.upload_frames(
        file_names=self.frames_meta["file_name"],
        im_stack=self.im_stack,
    )
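# _get_params_from_str isn't shown here. For reference, ImageJ writes
# ImageDescription as newline-separated key=value pairs, e.g.
# "ImageJ=1.52e\nimages=20\nchannels=2\nslices=5\nframes=2". A sketch of
# a parser extracting the counts this method iterates over, defaulting
# each to 1 when a key is absent (an assumption, not the repo's
# implementation; 'positions' in particular is not a standard ImageJ key):
def _example_params_from_str(im_description):
    ij_keys = {'channels': 'nbr_channels',
               'slices': 'nbr_slices',
               'frames': 'nbr_timepoints',
               'positions': 'nbr_positions'}
    indices = {name: 1 for name in ij_keys.values()}
    for line in im_description.splitlines():
        key, _, val = line.partition('=')
        if key in ij_keys:
            indices[ij_keys[key]] = int(val)
    return indices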