Example no. 1
0
 def test_download_frames(self, mock_session):
     """Download a frames dataset and validate images, frames meta and global meta."""
     mock_session.return_value.__enter__.return_value = self.session
     # Set up a destination directory inside the temp dir
     self.tempdir.makedir('dest_dir')
     download_dir = os.path.join(self.temp_path, 'dest_dir')
     # Run the download under test
     data_downloader.download_data(
         dataset_serial=self.dataset_serial,
         login=self.credentials_path,
         dest=download_dir,
         storage_access=self.mount_point,
     )
     # Frames on disk are ordered slice first, then channel
     expected_order = [0, 2, 4, 1, 3, 5]
     frame_indices = itertools.product(
         range(self.nbr_channels),
         range(self.nbr_slices),
     )
     for i, (c, z) in enumerate(frame_indices):
         im_name = 'im_c00{}_z00{}_t000_p000.png'.format(c, z)
         im_path = os.path.join(download_dir, self.dataset_serial, im_name)
         im = cv2.imread(im_path, cv2.IMREAD_ANYDEPTH)
         numpy.testing.assert_array_equal(im, self.im[expected_order[i], ...])
     # Validate the frames metadata csv row by row
     frames_meta = pd.read_csv(
         os.path.join(download_dir, self.dataset_serial, 'frames_meta.csv'),
     )
     for i, row in frames_meta.iterrows():
         c = i // self.nbr_slices
         z = i % self.nbr_slices
         self.assertEqual(row.channel_idx, c)
         self.assertEqual(row.slice_idx, z)
         self.assertEqual(row.time_idx, 0)
         self.assertEqual(row.pos_idx, 0)
         self.assertEqual(
             row.file_name,
             'im_c00{}_z00{}_t000_p000.png'.format(c, z),
         )
         self.assertEqual(
             row.sha256,
             meta_utils.gen_sha256(self.im[expected_order[i], ...]),
         )
     # Validate the global metadata json
     meta_json = json_ops.read_json_file(
         os.path.join(
             download_dir,
             self.dataset_serial,
             'global_metadata.json',
         ),
     )
     self.assertEqual(meta_json['storage_dir'], self.frames_storage_dir)
     self.assertEqual(meta_json['nbr_frames'], 6)
     self.assertEqual(meta_json['im_width'], 15)
     self.assertEqual(meta_json['im_height'], 10)
     self.assertEqual(meta_json['nbr_slices'], self.nbr_slices)
     self.assertEqual(meta_json['nbr_channels'], self.nbr_channels)
     self.assertEqual(meta_json['im_colors'], 1)
     self.assertEqual(meta_json['nbr_timepoints'], 1)
     self.assertEqual(meta_json['nbr_positions'], 1)
     self.assertEqual(meta_json['bit_depth'], 'uint16')
def test_read_not_a_json_file():
    """Read a file whose content is truncated, unparseable JSON."""
    with TempDirectory() as tempdir:
        # Serialize a valid dict, then chop off the closing bracket so the
        # file content is no longer valid JSON
        invalid_json_str = json.dumps(
            {"drivername": "postgres", "username": "******"},
        )[:-1]
        tempdir.write('invalid_json_file.json', invalid_json_str.encode())
        invalid_path = os.path.join(tempdir.path, "invalid_json_file.json")
        # NOTE(review): no assertion follows — presumably read_json_file raises
        # on invalid JSON and a raises-style decorator was lost; confirm upstream
        json_object = json_ops.read_json_file(invalid_path)
Example no. 3
0
def get_connection_str(credentials_filename):
    """
    Bundles the JSON read of the login credentials file with
    a conversion to a URI for connecting to the database

    :param credentials_filename: JSON file containing DB credentials
    :return str connection_str: URI for connecting to the DB
    """
    # Reading with a schema name validates the credentials structure
    creds = json_ops.read_json_file(
        json_filename=credentials_filename,
        schema_name="CREDENTIALS_SCHEMA",
    )
    # Build the engine-compatible URI from the credential fields
    return json_to_uri(creds)
Example no. 4
0
    def split_file(self, file_path, schema_filename):
        """
        Splits file into frames and gets metadata for each frame.
        set_frame_info must be called prior to this function call.

        :param str file_path: Full path to file
        :param str schema_filename: Full path to schema file name
        :return dataframe frames_meta: Metadata for all frames
        :return np.array im_stack: Image stack extracted from file
        """
        tiff = tifffile.TiffFile(file_path)
        nbr_frames = len(tiff.pages)
        # Preallocate the stack; dtype comes from the bit depth determined
        # by the earlier set_frame_info call
        stack_shape = (
            self.frame_shape[0],
            self.frame_shape[1],
            self.im_colors,
            nbr_frames,
        )
        im_stack = np.empty(stack_shape, dtype=self.bit_depth)

        # Schema describing which metadata tags to extract from each page
        meta_schema = json_ops.read_json_file(schema_filename)
        # Structured fields (known names) go into the dataframe; the
        # unstructured remainder (e.g. micromanager dicts) is collected
        # per frame in self.frames_json since pandas doesn't handle dicts
        frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
        meta_names = meta_utils.META_NAMES
        df_names = meta_utils.DF_NAMES
        for idx, page in enumerate(tiff.pages):
            im_stack[..., idx] = np.atleast_3d(page.asarray())
            # Extract metadata for this page according to the json schema
            json_i, meta_i = json_ops.get_metadata_from_tags(
                page=page,
                meta_schema=meta_schema,
                validate=True,
            )
            self.frames_json.append(json_i)
            # Copy the required metadata fields into the dataframe row
            for meta_name, df_name in zip(meta_names, df_names):
                if meta_name in meta_i:
                    frames_meta.loc[idx, df_name] = meta_i[meta_name]
            # Derive the frame's file name from its indices and store it
            frames_meta.loc[idx, "file_name"] = self._get_imname(
                frames_meta.loc[idx],
            )
        return frames_meta, im_stack
def test_read_json_file():
    """A valid credentials file should round-trip through read_json_file."""
    with TempDirectory() as tempdir:
        # Dict matching the CREDENTIALS_SCHEMA fields
        valid_json = {
            "drivername": "postgres",
            "username": "******",
            "password": "******",
            "host": "db_host",
            "port": 666,
            "dbname": "db_name"
        }
        tempdir.write('valid_json_file.json', json.dumps(valid_json).encode())
        json_path = os.path.join(tempdir.path, "valid_json_file.json")
        json_object = json_ops.read_json_file(
            json_path,
            schema_name="CREDENTIALS_SCHEMA",
        )
        nose.tools.assert_equal(json_object, valid_json)
def test_read_nonexisting_json_file():
    # NOTE(review): there is no assertion here and the path does not exist,
    # so read_json_file is expected to raise — presumably this test relied on
    # a raises-style decorator that is not visible in this chunk; confirm
    # upstream that the decorator wasn't lost.
    json_ops.read_json_file("not_a_json_file.json")
Example no. 7
0
    def get_frames_and_metadata(self, filename_parser='parse_idx_from_name'):
        """
        Frame metadata is extracted from each frame, and frames are uploaded
        on a file by file basis.
        Since metadata is separated from files, the file name must contain the
        required indices channel_idx, slice_idx, time and pos_idx. By default,
        it will assume that the file name contains 4 integers corresponding to
        these 4 indices. If that's not the case, you can specify a custom parser
        in filename_parsers.
        A global metadata dict is assumed to be in the same directory in a file
        named metadata.txt. If not, the global_json will be empty and frame
        info will be determined from the first frame.
        The metadata.txt file (if there) is assumed to contain a minimum of the
        following (with example values):
            'Summary': {
                'PixelType': 'GRAY16',
                'BitDepth': 16,
                'Width': 15,
                'Height': 10
            }

        :param str filename_parser: Function name in filename_parsers.py
        :raises AttributeError: If filename_parser doesn't name a function
            in filename_parsers.py
        """
        assert os.path.isdir(self.data_path), \
            "Directory doesn't exist: {}".format(self.data_path)

        try:
            parse_func = getattr(file_parsers, filename_parser)
        except AttributeError as e:
            raise AttributeError(
                "Must use filename_parsers function for file name. {}".format(
                    e))

        # Natural sort so e.g. im_2 comes before im_10
        frame_paths = natsort.natsorted(
            glob.glob(os.path.join(self.data_path, "*.tif")), )
        nbr_frames = len(frame_paths)

        metadata_path = os.path.join(self.data_path, "metadata.txt")
        # Use a direct existence check instead of globbing: glob would treat
        # any metacharacters in data_path as a pattern and scan the directory
        if os.path.isfile(metadata_path):
            self.global_json = json_ops.read_json_file(metadata_path)
            self.set_frame_info(self.global_json["Summary"])
        else:
            # No metadata.txt file in dir, get frame info from first frame
            self.set_frame_info_from_file(frame_paths[0])
            self.global_json = {}

        self.frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
        self.frames_json = []
        # Loop over all the frames to get structured metadata from file names
        for i, frame_path in enumerate(frame_paths):
            self.frames_meta.loc[i] = self._set_frame_meta(
                parse_func=parse_func,
                file_name=frame_path,
            )
        # Use multiprocessing for more efficient file read and upload
        file_names = self.frames_meta['file_name']
        with concurrent.futures.ProcessPoolExecutor(self.nbr_workers) as ex:
            res = ex.map(self.serialize_upload, zip(frame_paths, file_names))
        # Collect sha256 and json metadata for each uploaded file
        for i, (sha256, dict_i) in enumerate(res):
            self.frames_json.append(json.loads(dict_i))
            self.frames_meta.loc[i, 'sha256'] = sha256
        # Set global metadata
        self.set_global_meta(nbr_frames=nbr_frames)
Example no. 8
0
import imaging_db.database.db_operations as db_ops
import imaging_db.metadata.json_operations as json_ops
import imaging_db.utils.db_utils as db_utils

# Edit this depending on where your database credential file is stored
# This assumes it's stored in dir above imagingDB
dir_name = os.path.abspath(os.path.join('..'))
DB_CREDENTIALS_PATH = os.path.join(dir_name, 'db_credentials.json')

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Overwrite the ini-file sqlalchemy.url path with a URI built from the
# credentials file (validated against CREDENTIALS_SCHEMA on read)
credentials_json = json_ops.read_json_file(json_filename=DB_CREDENTIALS_PATH,
                                           schema_name="CREDENTIALS_SCHEMA")

config.set_main_option('sqlalchemy.url',
                       db_utils.json_to_uri(credentials_json=credentials_json))

# NOTE(review): this prints the full DB URI, credentials included —
# consider masking the password before logging
print("Using url:", config.get_main_option('sqlalchemy.url'))

# Add model metadata object (used by Alembic for autogenerate diffs)
target_metadata = db_ops.Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.

Example no. 9
0
def upload_data_and_update_db(csv,
                              login,
                              config,
                              nbr_workers=None,
                              overwrite=False):
    """
    Takes a csv file in which each row represents a dataset, uploads the data
    to storage and metadata to database. If 'frames' is selected as upload
    type, each dataset will be split into individual 2D frames before moving
    to storage.
    TODO: Add logging instead of printing

    :param str login: Full path to json file containing login credentials
    :param str csv: Full path to csv file containing the following fields
        for each file to be uploaded:
            str dataset_id: Unique dataset ID <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
            str file_name: Full path to file to be uploaded
            str description: Short description of file
            str parent_dataset_id: Parent dataset unique ID if there is one
                list positions: Which position files in folder to upload.
                Uploads all if left empty and file_name is a folder.
                Only valid for ome-tiff uploads.
    :param  str config: Full path to json config file containing the fields:
            str upload_type: Specify if the file should be split prior to upload
                Valid options: 'frames' or 'file'
            str frames_format: Which file splitter class to use.
                Valid options:
                'ome_tiff' needs MicroManagerMetadata tag for each frame for metadata
                'tif_folder' when each file is already an individual frame
                and relies on MicroManager metadata
                'tif_id' needs ImageDescription tag in first frame page for metadata
            str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
            str storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage. (optional)
            str json_meta: If splitting to frames, give full path to json
                metadata schema for reading metadata (optional)
    :param int, None nbr_workers: Number of workers for parallel uploads
    :param bool overwrite: Use with caution if your upload was interrupted
            and you want to overwrite existing data in database and storage
    """
    # Assert that csv file exists and load it
    assert os.path.isfile(csv), \
        "File doesn't exist: {}".format(csv)
    files_data = pd.read_csv(csv)

    # Get database connection URI and verify we can connect
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Read and validate config json
    config_json = json_ops.read_json_file(
        json_filename=config,
        schema_name="CONFIG_SCHEMA",
    )
    # Assert that upload type is valid
    upload_type = config_json['upload_type'].lower()
    assert upload_type in {"file", "frames"}, \
        "upload_type should be 'file' or 'frames', not {}".format(
            upload_type,
        )
    if nbr_workers is not None:
        assert nbr_workers > 0, \
            "Nbr of worker must be >0, not {}".format(nbr_workers)
    # Import local or S3 storage class
    storage = 'local'
    if 'storage' in config_json:
        storage = config_json['storage']
    storage_class = aux_utils.get_storage_class(storage_type=storage)
    storage_access = None
    if 'storage_access' in config_json:
        storage_access = config_json['storage_access']

    # Make sure microscope is a string
    microscope = None
    if 'microscope' in config_json:
        if isinstance(config_json['microscope'], str):
            microscope = config_json['microscope']

    if upload_type == 'frames':
        # If upload type is frames, check for frames format
        assert 'frames_format' in config_json, \
            'You must specify the type of file(s)'
        splitter_class = aux_utils.get_splitter_class(
            config_json['frames_format'],
        )
    # Upload all files
    for file_nbr, row in files_data.iterrows():
        # Assert that ID is correctly formatted
        dataset_serial = row.dataset_id
        try:
            cli_utils.validate_id(dataset_serial)
        except AssertionError as e:
            # Format the message instead of passing a tuple to AssertionError,
            # which would stringify as ('Invalid ID:', AssertionError(...))
            raise AssertionError("Invalid ID: {}".format(e))

        # Get S3 directory based on upload type
        if upload_type == "frames":
            storage_dir = "/".join([FRAME_FOLDER_NAME, dataset_serial])
        else:
            storage_dir = "/".join([FILE_FOLDER_NAME, dataset_serial])
        # Instantiate database operations class
        db_inst = db_ops.DatabaseOperations(
            dataset_serial=dataset_serial,
        )
        # Make sure dataset is not already in database
        if not overwrite:
            with db_ops.session_scope(db_connection) as session:
                db_inst.assert_unique_id(session)
        # Check for parent dataset
        parent_dataset_id = 'None'
        if 'parent_dataset_id' in row:
            parent_dataset_id = row.parent_dataset_id
        # Check for dataset description; a missing csv value reads as NaN
        description = None
        if 'description' in row:
            if pd.notna(row.description):
                description = row.description

        if upload_type == "frames":
            # Instantiate splitter class
            frames_inst = splitter_class(
                data_path=row.file_name,
                storage_dir=storage_dir,
                storage_class=storage_class,
                storage_access=storage_access,
                overwrite=overwrite,
                file_format=FRAME_FILE_FORMAT,
                nbr_workers=nbr_workers,
            )
            # Get kwargs if any
            kwargs = {}
            if 'positions' in row:
                positions = row['positions']
                if not pd.isna(positions):
                    kwargs['positions'] = positions
            if 'schema_filename' in config_json:
                kwargs['schema_filename'] = config_json['schema_filename']
            if 'filename_parser' in config_json:
                filename_parser = config_json['filename_parser']
                kwargs['filename_parser'] = filename_parser
            # Extract metadata and split file into frames
            frames_inst.get_frames_and_metadata(**kwargs)

            # Add frames metadata to database
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_frames(
                        session=session,
                        description=description,
                        frames_meta=frames_inst.get_frames_meta(),
                        frames_json_meta=frames_inst.get_frames_json(),
                        global_meta=frames_inst.get_global_meta(),
                        global_json_meta=frames_inst.get_global_json(),
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                    )
            except AssertionError as e:
                print("Data set {} already in DB".format(dataset_serial), e)
        # File upload
        else:
            # Just upload file without opening it
            assert os.path.isfile(row.file_name), \
                "File doesn't exist: {}".format(row.file_name)
            data_uploader = storage_class(
                storage_dir=storage_dir,
                access_point=storage_access,
            )
            if not overwrite:
                data_uploader.assert_unique_id()
            try:
                data_uploader.upload_file(file_path=row.file_name)
                print("File {} uploaded to S3".format(row.file_name))
            except AssertionError as e:
                print("File already on S3, moving on to DB entry. {}".format(e))

            sha = meta_utils.gen_sha256(row.file_name)
            # Add file entry to DB once I can get it tested
            global_json = {"file_origin": row.file_name}
            file_name = row.file_name.split("/")[-1]
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_file(
                        session=session,
                        description=description,
                        storage_dir=storage_dir,
                        file_name=file_name,
                        global_json_meta=global_json,
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                        sha256=sha,
                    )
                print("File info for {} inserted in DB".format(dataset_serial))
            except AssertionError as e:
                # Include the exception so the cause of the skip is visible
                print("File {} already in database. {}".format(
                    dataset_serial, e))