def test_download_frames(self, mock_session):
    mock_session.return_value.__enter__.return_value = self.session
    # Create dest dir
    self.tempdir.makedir('dest_dir')
    dest_dir = os.path.join(self.temp_path, 'dest_dir')
    # Download data
    data_downloader.download_data(
        dataset_serial=self.dataset_serial,
        login=self.credentials_path,
        dest=dest_dir,
        storage_access=self.mount_point,
    )
    # Images are separated by slice first then channel
    im_order = [0, 2, 4, 1, 3, 5]
    it = itertools.product(range(self.nbr_channels), range(self.nbr_slices))
    for i, (c, z) in enumerate(it):
        im_name = 'im_c00{}_z00{}_t000_p000.png'.format(c, z)
        im_path = os.path.join(
            dest_dir,
            self.dataset_serial,
            im_name,
        )
        im = cv2.imread(im_path, cv2.IMREAD_ANYDEPTH)
        numpy.testing.assert_array_equal(im, self.im[im_order[i], ...])
    # Read and validate frames meta
    meta_path = os.path.join(
        dest_dir,
        self.dataset_serial,
        'frames_meta.csv',
    )
    frames_meta = pd.read_csv(meta_path)
    for i, row in frames_meta.iterrows():
        c = i // self.nbr_slices
        z = i % self.nbr_slices
        self.assertEqual(row.channel_idx, c)
        self.assertEqual(row.slice_idx, z)
        self.assertEqual(row.time_idx, 0)
        self.assertEqual(row.pos_idx, 0)
        im_name = 'im_c00{}_z00{}_t000_p000.png'.format(c, z)
        self.assertEqual(row.file_name, im_name)
        sha256 = meta_utils.gen_sha256(self.im[im_order[i], ...])
        self.assertEqual(row.sha256, sha256)
    # Read and validate global meta
    meta_path = os.path.join(
        dest_dir,
        self.dataset_serial,
        'global_metadata.json',
    )
    meta_json = json_ops.read_json_file(meta_path)
    self.assertEqual(meta_json['storage_dir'], self.frames_storage_dir)
    self.assertEqual(meta_json['nbr_frames'], 6)
    self.assertEqual(meta_json['im_width'], 15)
    self.assertEqual(meta_json['im_height'], 10)
    self.assertEqual(meta_json['nbr_slices'], self.nbr_slices)
    self.assertEqual(meta_json['nbr_channels'], self.nbr_channels)
    self.assertEqual(meta_json['im_colors'], 1)
    self.assertEqual(meta_json['nbr_timepoints'], 1)
    self.assertEqual(meta_json['nbr_positions'], 1)
    self.assertEqual(meta_json['bit_depth'], 'uint16')
# Expect a ValueError: json.JSONDecodeError (a ValueError subclass) should
# propagate from parsing the truncated file, assuming read_json_file doesn't
# wrap the error in another exception type.
@nose.tools.raises(ValueError)
def test_read_not_a_json_file():
    with TempDirectory() as tempdir:
        invalid_json = {"drivername": "postgres", "username": "******"}
        # Remove last bracket to make the JSON invalid
        invalid_json_str = json.dumps(invalid_json)[:-1]
        tempdir.write('invalid_json_file.json', invalid_json_str.encode())
        json_ops.read_json_file(
            os.path.join(tempdir.path, "invalid_json_file.json"),
        )
def get_connection_str(credentials_filename):
    """
    Bundles the JSON read of the login credentials file with a
    conversion to a URI for connecting to the database.

    :param str credentials_filename: JSON file containing DB credentials
    :return str connection_str: URI for connecting to the DB
    """
    # Read and validate json
    credentials_json = json_ops.read_json_file(
        json_filename=credentials_filename,
        schema_name="CREDENTIALS_SCHEMA",
    )
    # Convert json to string compatible with engine
    return json_to_uri(credentials_json)
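# A hedged usage sketch (not in the original module): the URI returned by
# get_connection_str can be handed straight to SQLAlchemy, which this repo
# already uses for its database layer. The credentials path is a placeholder.
def example_connect(credentials_filename='db_credentials.json'):
    from sqlalchemy import create_engine, text

    connection_str = get_connection_str(credentials_filename)
    engine = create_engine(connection_str)
    with engine.connect() as conn:
        # Trivial round trip to verify the credentials actually work
        conn.execute(text('SELECT 1'))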
def split_file(self, file_path, schema_filename):
    """
    Splits file into frames and gets metadata for each frame.
    set_frame_info must be called prior to this function call.

    :param str file_path: Full path to file
    :param str schema_filename: Full path to schema file name
    :return dataframe frames_meta: Metadata for all frames
    :return np.array im_stack: Image stack extracted from file
    """
    frames = tifffile.TiffFile(file_path)
    # Get global metadata
    nbr_frames = len(frames.pages)
    # Create image stack with image bit depth 16 or 8
    im_stack = np.empty((self.frame_shape[0],
                         self.frame_shape[1],
                         self.im_colors,
                         nbr_frames),
                        dtype=self.bit_depth)
    # Get metadata schema
    meta_schema = json_ops.read_json_file(schema_filename)
    # Convert frames to numpy stack and collect metadata.
    # Structured metadata (with known fields) is separated from
    # unstructured metadata; the latter goes into frames_json
    frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    # Pandas doesn't support inserting dicts into dataframes,
    # so micromanager metadata goes into a separate list
    for i in range(nbr_frames):
        page = frames.pages[i]
        im_stack[..., i] = np.atleast_3d(page.asarray())
        # Get dict with metadata from json schema
        json_i, meta_i = json_ops.get_metadata_from_tags(
            page=page,
            meta_schema=meta_schema,
            validate=True,
        )
        self.frames_json.append(json_i)
        # Add required metadata fields to data frame
        meta_names = meta_utils.META_NAMES
        df_names = meta_utils.DF_NAMES
        for meta_name, df_name in zip(meta_names, df_names):
            if meta_name in meta_i.keys():
                frames_meta.loc[i, df_name] = meta_i[meta_name]
        # Create a file name and add it
        im_name = self._get_imname(frames_meta.loc[i])
        frames_meta.loc[i, "file_name"] = im_name
    return frames_meta, im_stack
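# A hedged usage sketch: `splitter` stands for an instance of this class
# whose set_frame_info has already populated frame_shape, im_colors and
# bit_depth (the attributes split_file reads above). Paths are placeholders.
def example_split(splitter):
    frames_meta, im_stack = splitter.split_file(
        file_path='/data/raw/stack.ome.tif',
        schema_filename='metadata_schema.json',
    )
    # One metadata row and one stack slice per TIFF page
    assert len(frames_meta) == im_stack.shape[-1]
    return frames_meta, im_stack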
def test_read_json_file():
    with TempDirectory() as tempdir:
        valid_json = {
            "drivername": "postgres",
            "username": "******",
            "password": "******",
            "host": "db_host",
            "port": 666,
            "dbname": "db_name"
        }
        tempdir.write('valid_json_file.json', json.dumps(valid_json).encode())
        json_object = json_ops.read_json_file(
            os.path.join(tempdir.path, "valid_json_file.json"),
            schema_name="CREDENTIALS_SCHEMA",
        )
        nose.tools.assert_equal(json_object, valid_json)
# Assumed to raise FileNotFoundError since the file can't be opened;
# adjust if read_json_file wraps the error differently.
@nose.tools.raises(FileNotFoundError)
def test_read_nonexisting_json_file():
    json_ops.read_json_file("not_a_json_file.json")
def get_frames_and_metadata(self, filename_parser='parse_idx_from_name'):
    """
    Frame metadata is extracted from each frame, and frames are uploaded
    on a file by file basis.
    Since metadata is separated from files, the file name must contain
    the required indices channel_idx, slice_idx, time and pos_idx. By
    default, the file name is assumed to contain 4 integers corresponding
    to these 4 indices. If that's not the case, you can specify a custom
    parser from filename_parsers.py (a hedged sketch of one follows this
    function).
    A global metadata dict is assumed to be in the same directory in a
    file named metadata.txt. If not, global_json will be empty and frame
    info will be determined from the first frame.
    The metadata.txt file (if present) is assumed to contain a minimum of
    the following (with example values):
    'Summary': {
        'PixelType': 'GRAY16',
        'BitDepth': 16,
        'Width': 15,
        'Height': 10
    }

    :param str filename_parser: Function name in filename_parsers.py
    """
    assert os.path.isdir(self.data_path), \
        "Directory doesn't exist: {}".format(self.data_path)

    try:
        parse_func = getattr(file_parsers, filename_parser)
    except AttributeError as e:
        raise AttributeError(
            "filename_parser must name a function in filename_parsers.py. "
            "{}".format(e))

    frame_paths = natsort.natsorted(
        glob.glob(os.path.join(self.data_path, "*.tif")),
    )
    nbr_frames = len(frame_paths)

    metadata_path = os.path.join(self.data_path, "metadata.txt")
    if os.path.isfile(metadata_path):
        self.global_json = json_ops.read_json_file(metadata_path)
        self.set_frame_info(self.global_json["Summary"])
    else:
        # No metadata.txt file in dir, get frame info from first frame
        self.set_frame_info_from_file(frame_paths[0])
        self.global_json = {}

    self.frames_meta = meta_utils.make_dataframe(nbr_frames=nbr_frames)
    self.frames_json = []
    # Loop over all the frames to get metadata
    for i, frame_path in enumerate(frame_paths):
        # Get structured frames metadata
        self.frames_meta.loc[i] = self._set_frame_meta(
            parse_func=parse_func,
            file_name=frame_path,
        )
    # Use multiprocessing for more efficient file read and upload
    file_names = self.frames_meta['file_name']
    with concurrent.futures.ProcessPoolExecutor(self.nbr_workers) as ex:
        res = ex.map(self.serialize_upload, zip(frame_paths, file_names))
    # Collect metadata for each uploaded file
    for i, (sha256, dict_i) in enumerate(res):
        self.frames_json.append(json.loads(dict_i))
        self.frames_meta.loc[i, 'sha256'] = sha256
    # Set global metadata
    self.set_global_meta(nbr_frames=nbr_frames)
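# A hedged sketch of a custom parser for filename_parsers.py. The exact
# interface _set_frame_meta expects isn't shown here, so this assumes the
# parser receives a file name and returns a dict of the four required
# indices; adapt it to the real signature. The index names and the
# 'im_c###_z###_t###_p###' pattern are taken from the tests above.
import re

def parse_custom_idx(file_name):
    """
    Extract channel, slice, time and position indices from names such as
    'im_c001_z005_t000_p000.tif'.
    """
    match = re.search(r'c(\d+)_z(\d+)_t(\d+)_p(\d+)', file_name)
    assert match is not None, \
        "Can't parse indices from file name: {}".format(file_name)
    return {
        'channel_idx': int(match.group(1)),
        'slice_idx': int(match.group(2)),
        'time_idx': int(match.group(3)),
        'pos_idx': int(match.group(4)),
    }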
import os

from alembic import context

import imaging_db.database.db_operations as db_ops
import imaging_db.metadata.json_operations as json_ops
import imaging_db.utils.db_utils as db_utils

# Edit this depending on where your database credential file is stored.
# This assumes it's stored in the dir above imagingDB
dir_name = os.path.abspath(os.path.join('..'))
DB_CREDENTIALS_PATH = os.path.join(dir_name, 'db_credentials.json')

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Overwrite the ini-file sqlalchemy.url path
credentials_json = json_ops.read_json_file(
    json_filename=DB_CREDENTIALS_PATH,
    schema_name="CREDENTIALS_SCHEMA",
)
config.set_main_option(
    'sqlalchemy.url',
    db_utils.json_to_uri(credentials_json=credentials_json),
)
print("Using url:", config.get_main_option('sqlalchemy.url'))

# Add model metadata object
target_metadata = db_ops.Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
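# For reference, a db_credentials.json matching CREDENTIALS_SCHEMA; the keys
# mirror the valid_json fixture in the tests above, values are placeholders:
# {
#     "drivername": "postgres",
#     "username": "db_user",
#     "password": "db_password",
#     "host": "db_host",
#     "port": 5432,
#     "dbname": "db_name"
# }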
def upload_data_and_update_db(csv,
                              login,
                              config,
                              nbr_workers=None,
                              overwrite=False):
    """
    Takes a csv file in which each row represents a dataset, uploads the
    data to storage and the metadata to the database. If 'frames' is
    selected as upload type, each dataset will be split into individual
    2D frames before moving to storage.
    TODO: Add logging instead of printing

    :param str csv: Full path to csv file containing the following fields
        for each file to be uploaded:
        str dataset_id: Unique dataset ID <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
        str file_name: Full path to file to be uploaded
        str description: Short description of file
        str parent_dataset_id: Parent dataset unique ID if there is one
        list positions: Which position files in folder to upload.
            Uploads all if left empty and file_name is a folder.
            Only valid for ome-tiff uploads.
    :param str login: Full path to json file containing login credentials
    :param str config: Full path to json config file containing the fields:
        str upload_type: Specify if the file should be split prior to upload.
            Valid options: 'frames' or 'file'
        str frames_format: Which file splitter class to use. Valid options:
            'ome_tiff' needs MicroManagerMetadata tag for each frame
            'tif_folder' when each file is already an individual frame
            and relies on MicroManager metadata
            'tif_id' needs ImageDescription tag in first frame page
        str storage: 'local' (default) - data will be stored locally and
            synced to S3 the same day, or 'S3' - data will be uploaded
            directly to S3 then synced with local storage daily.
        str storage_access: If not using predefined storage locations,
            this parameter refers to mount_point for local storage and
            bucket_name for S3 storage. (optional)
        str json_meta: If splitting to frames, full path to json metadata
            schema for reading metadata (optional)
        str microscope: Microscope name (optional)
    :param int, None nbr_workers: Number of workers for parallel uploads
    :param bool overwrite: Use with caution if your upload was interrupted
        and you want to overwrite existing data in database and storage
    """
    # Assert that csv file exists and load it
    assert os.path.isfile(csv), \
        "File doesn't exist: {}".format(csv)
    files_data = pd.read_csv(csv)
    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Read and validate config json
    config_json = json_ops.read_json_file(
        json_filename=config,
        schema_name="CONFIG_SCHEMA",
    )
    # Assert that upload type is valid
    upload_type = config_json['upload_type'].lower()
    assert upload_type in {"file", "frames"}, \
        "upload_type should be 'file' or 'frames', not {}".format(
            upload_type,
        )
    if nbr_workers is not None:
        assert nbr_workers > 0, \
            "Number of workers must be > 0, not {}".format(nbr_workers)
    # Import local or S3 storage class
    storage = 'local'
    if 'storage' in config_json:
        storage = config_json['storage']
    storage_class = aux_utils.get_storage_class(storage_type=storage)
    storage_access = None
    if 'storage_access' in config_json:
        storage_access = config_json['storage_access']
    # Make sure microscope is a string
    microscope = None
    if 'microscope' in config_json:
        if isinstance(config_json['microscope'], str):
            microscope = config_json['microscope']

    if upload_type == 'frames':
        # If upload type is frames, check for frames format
        assert 'frames_format' in config_json, \
            'You must specify the type of file(s)'
        splitter_class = aux_utils.get_splitter_class(
            config_json['frames_format'],
        )
    # Upload all files
    for file_nbr, row in files_data.iterrows():
        # Assert that ID is correctly formatted
        dataset_serial = row.dataset_id
        try:
            cli_utils.validate_id(dataset_serial)
        except AssertionError as e:
            raise AssertionError("Invalid ID: {}".format(e))

        # Get storage directory based on upload type
        if upload_type == "frames":
            storage_dir = "/".join([FRAME_FOLDER_NAME, dataset_serial])
        else:
            storage_dir = "/".join([FILE_FOLDER_NAME, dataset_serial])
        # Instantiate database operations class
        db_inst = db_ops.DatabaseOperations(
            dataset_serial=dataset_serial,
        )
        # Make sure dataset is not already in database
        if not overwrite:
            with db_ops.session_scope(db_connection) as session:
                db_inst.assert_unique_id(session)
        # Check for parent dataset
        parent_dataset_id = 'None'
        if 'parent_dataset_id' in row:
            parent_dataset_id = row.parent_dataset_id
        # Check for dataset description
        description = None
        if 'description' in row:
            # NaN != NaN, so this skips missing descriptions
            if row.description == row.description:
                description = row.description

        if upload_type == "frames":
            # Instantiate splitter class
            frames_inst = splitter_class(
                data_path=row.file_name,
                storage_dir=storage_dir,
                storage_class=storage_class,
                storage_access=storage_access,
                overwrite=overwrite,
                file_format=FRAME_FILE_FORMAT,
                nbr_workers=nbr_workers,
            )
            # Get kwargs if any
            kwargs = {}
            if 'positions' in row:
                positions = row['positions']
                if not pd.isna(positions):
                    kwargs['positions'] = positions
            if 'schema_filename' in config_json:
                kwargs['schema_filename'] = config_json['schema_filename']
            if 'filename_parser' in config_json:
                filename_parser = config_json['filename_parser']
                kwargs['filename_parser'] = filename_parser
            # Extract metadata and split file into frames
            frames_inst.get_frames_and_metadata(**kwargs)
            # Add frames metadata to database
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_frames(
                        session=session,
                        description=description,
                        frames_meta=frames_inst.get_frames_meta(),
                        frames_json_meta=frames_inst.get_frames_json(),
                        global_meta=frames_inst.get_global_meta(),
                        global_json_meta=frames_inst.get_global_json(),
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                    )
            except AssertionError as e:
                print("Dataset {} already in DB".format(dataset_serial), e)
        # File upload
        else:
            # Just upload file without opening it
            assert os.path.isfile(row.file_name), \
                "File doesn't exist: {}".format(row.file_name)
            data_uploader = storage_class(
                storage_dir=storage_dir,
                access_point=storage_access,
            )
            if not overwrite:
                data_uploader.assert_unique_id()
            try:
                data_uploader.upload_file(file_path=row.file_name)
                print("File {} uploaded to storage".format(row.file_name))
            except AssertionError as e:
                print("File already in storage, moving on to DB entry. "
                      "{}".format(e))
            sha = meta_utils.gen_sha256(row.file_name)
            # Add file entry to DB
            global_json = {"file_origin": row.file_name}
            file_name = row.file_name.split("/")[-1]
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_file(
                        session=session,
                        description=description,
                        storage_dir=storage_dir,
                        file_name=file_name,
                        global_json_meta=global_json,
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                        sha256=sha,
                    )
                print("File info for {} inserted in DB".format(dataset_serial))
            except AssertionError as e:
                print("File {} already in database".format(dataset_serial), e)
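# A hedged end-to-end sketch assembled from the docstring above; every path,
# ID and value below is a placeholder.
#
# files.csv:
#     dataset_id,file_name,description
#     TEST-2019-01-01-10-00-00-0001,/data/raw/stack.ome.tif,Test upload
#
# config.json:
#     {
#         "upload_type": "frames",
#         "frames_format": "ome_tiff",
#         "storage": "local",
#         "storage_access": "/mnt/imaging_storage"
#     }
#
# upload_data_and_update_db(
#     csv='files.csv',
#     login='db_credentials.json',
#     config='config.json',
#     nbr_workers=4,
# )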