def setUp(self, MockPoolExecutor):
    """
    Set up temporary test directory and mock S3 bucket connection
    """
    # Magic mocking of multiprocessing
    MockPoolExecutor().__enter__().map = map_mock
    # Mock S3 directory for upload
    self.storage_dir = "raw_frames/SMS-2010-01-01-00-00-00-0001"
    # Create temporary directory and write temp image
    self.tempdir = TempDirectory()
    self.temp_path = self.tempdir.path
    # Temporary frame
    self.im = np.ones((10, 15), dtype=np.uint16)
    self.im[2:5, 3:12] = 10000
    # Save test tif files
    self.channel_names = ['phase', 'brightfield', '666']
    # Write files in dir
    for c in self.channel_names:
        for z in range(2):
            file_name = 'img_{}_t000_p050_z00{}.tif'.format(c, z)
            file_path = os.path.join(self.temp_path, file_name)
            ijmeta = {"Info": json.dumps({"c": c, "z": z})}
            tifffile.imsave(
                file_path,
                self.im + 5000 * z,
                ijmetadata=ijmeta,
            )
    # Write external metadata in dir
    self.meta_dict = {
        'Summary': {
            'Slices': 26,
            'PixelType': 'GRAY16',
            'Time': '2018-11-01 19:20:34 -0700',
            'z-step_um': 0.5,
            'PixelSize_um': 0,
            'BitDepth': 16,
            'Width': 15,
            'Height': 10,
        },
    }
    self.json_filename = os.path.join(self.temp_path, 'metadata.txt')
    json_ops.write_json_file(self.meta_dict, self.json_filename)
    # Setup mock S3 bucket
    self.mock = mock_s3()
    self.mock.start()
    self.conn = boto3.resource('s3', region_name='us-east-1')
    self.bucket_name = 'czbiohub-imaging'
    self.conn.create_bucket(Bucket=self.bucket_name)
    # Instantiate file parser class
    storage_class = aux_utils.get_storage_class('s3')
    self.frames_inst = tif_splitter.TifFolderSplitter(
        data_path=self.temp_path,
        storage_dir=self.storage_dir,
        storage_class=storage_class,
    )
    # Upload data
    self.frames_inst.get_frames_and_metadata(
        filename_parser='parse_sms_name',
    )
def setUp(self):
    # Setup mock local storage
    # Create temporary directory and write temp image
    self.tempdir = TempDirectory()
    self.temp_path = self.tempdir.path
    self.tempdir.makedir('storage_mount_point')
    mount_point = os.path.join(self.temp_path, 'storage_mount_point')
    self.test_path = "/datapath/testfile.tif"
    self.storage_dir = "raw_frames/ISP-2005-06-09-20-00-00-0001"
    storage_class = aux_utils.get_storage_class('local')
    self.mock_inst = file_splitter.FileSplitter(
        data_path=self.test_path,
        storage_dir=self.storage_dir,
        storage_class=storage_class,
        storage_access=mount_point,
    )
def setUp(self): """ Set up temporary test directory and mock S3 bucket connection """ # Test metadata parameters self.nbr_channels = 2 self.nbr_slices = 3 # Mock S3 dir self.storage_dir = "raw_frames/ML-2005-06-09-20-00-00-1000" # Create temporary directory and write temp image self.tempdir = TempDirectory() self.temp_path = self.tempdir.path # Temporary file with 6 frames, tifffile stores channels first self.im = 50 * np.ones((6, 10, 15), dtype=np.uint16) self.im[0, :5, 3:12] = 50000 self.im[2, :5, 3:12] = 40000 self.im[4, :5, 3:12] = 30000 # Metadata self.description = 'ImageJ=1.52e\nimages=6\nchannels=2\nslices=3\nmax=10411.0' # Save test tif file self.file_path = os.path.join(self.temp_path, "A1_2_PROTEIN_test.tif") tifffile.imsave( self.file_path, self.im, description=self.description, ) # Setup mock S3 bucket self.mock = mock_s3() self.mock.start() self.conn = boto3.resource('s3', region_name='us-east-1') self.bucket_name = 'czbiohub-imaging' self.conn.create_bucket(Bucket=self.bucket_name) # Instantiate file parser class storage_class = aux_utils.get_storage_class('s3') self.frames_inst = tif_id_splitter.TifIDSplitter( data_path=self.file_path, storage_dir="raw_frames/ML-2005-06-09-20-00-00-1000", storage_class=storage_class, ) # Upload data self.frames_inst.get_frames_and_metadata( filename_parser="parse_ml_name", )
def download_data(dataset_serial,
                  login,
                  dest,
                  storage='local',
                  storage_access=None,
                  metadata=True,
                  download=True,
                  nbr_workers=None,
                  positions=None,
                  times=None,
                  channels=None,
                  slices=None):
    """
    Find all files associated with a unique project identifier and download
    them to a local directory.

    :param str dataset_serial: Unique dataset identifier
    :param str login: Full path to json file containing database login
        credentials
    :param str dest: Local destination directory name
    :param str storage: 'local' (default) - data will be stored locally and
        synced to S3 the same day. Or 'S3' - data will be uploaded directly
        to S3 then synced with local storage daily.
    :param str/None storage_access: If not using predefined storage locations,
        this parameter refers to mount_point for local storage and
        bucket_name for S3 storage.
    :param bool metadata: Writes metadata (default True):
        global metadata in json, local metadata for each frame in csv
    :param bool download: Downloads all files associated with the dataset
        (default True). If False, only writes csvs with metadata.
        Only valid for datasets split into frames.
    :param int/None nbr_workers: Number of workers for parallel download.
        If None, it defaults to the number of machine processors * 5.
    :param list/None positions: Positions (FOVs) as integers
        (default None downloads all)
    :param list/None times: Timepoints as integers
        (default None downloads all)
    :param list/None channels: Channels as integer indices or strings for
        channel names (default None downloads all)
    :param list/None slices: Slice (z) integer indices
        (default None downloads all)
    """
    try:
        cli_utils.validate_id(dataset_serial)
    except AssertionError as e:
        raise AssertionError("Invalid ID:", e)

    # Create output directory as a subdirectory in dest named
    # dataset_serial. It stops if the subdirectory already exists to avoid
    # the risk of overwriting existing data
    dest_dir = os.path.join(dest, dataset_serial)
    try:
        os.makedirs(dest_dir, exist_ok=False)
    except FileExistsError as e:
        raise FileExistsError("Folder {} already exists, {}".format(
            dest_dir, e))

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Instantiate database class
    db_inst = db_ops.DatabaseOperations(dataset_serial=dataset_serial)
    # Import local or S3 storage class
    storage_class = aux_utils.get_storage_class(storage_type=storage)

    if metadata is False:
        # Just download file(s)
        assert download, \
            "You set metadata *and* download to False. You get nothing."
        with db_ops.session_scope(db_connection) as session:
            storage_dir, file_names = db_inst.get_filenames(session=session)
    else:
        # If channels can be converted to ints, they're indices
        if channels is not None:
            if not isinstance(channels, list):
                channels = [channels]
            try:
                channels = [int(c) for c in channels]
            except ValueError:
                # Channels are names, not indices
                assert all([isinstance(c, str) for c in channels]), \
                    "channels must be either all str or all int"
        # Get the metadata from the requested frames
        with db_ops.session_scope(db_connection) as session:
            global_meta, frames_meta = db_inst.get_frames_meta(
                session=session,
                positions=positions,
                times=times,
                channels=channels,
                slices=slices,
            )
        # Write global metadata to destination directory
        global_meta_filename = os.path.join(
            dest_dir,
            "global_metadata.json",
        )
        json_ops.write_json_file(
            meta_dict=global_meta,
            json_filename=global_meta_filename,
        )
        # Write info for each frame to destination directory
        local_meta_filename = os.path.join(
            dest_dir,
            "frames_meta.csv",
        )
        frames_meta.to_csv(local_meta_filename, sep=",")
        # Extract folder and file names if we want to download
        storage_dir = global_meta["storage_dir"]
        file_names = frames_meta["file_name"]

    if download:
        if nbr_workers is not None:
            assert nbr_workers > 0, \
                "Number of workers must be > 0, not {}".format(nbr_workers)
        data_loader = storage_class(
            storage_dir=storage_dir,
            nbr_workers=nbr_workers,
            access_point=storage_access,
        )
        data_loader.download_files(file_names, dest_dir)
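# Example call (a hypothetical usage sketch based on the docstring above; the
# dataset ID, login json path and destination directory are placeholders, not
# references to real data):
#
# download_data(
#     dataset_serial='ISP-2005-06-09-20-00-00-0001',
#     login='/path/to/db_credentials.json',
#     dest='/tmp/image_downloads',
#     storage='local',
#     channels=['phase'],
#     positions=[1, 3],
# )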
def setUp(self): """ Set up temporary test directory and mock S3 bucket connection """ # Test metadata parameters self.channel_idx = 1 self.slice_idx = 2 self.time_idx = 3 self.channel_name = "TESTCHANNEL" # Mock S3 dir self.storage_dir = "raw_frames/ISP-2005-06-09-20-00-00-0001" # Create temporary directory and write temp image self.tempdir = TempDirectory() self.temp_path = self.tempdir.path # Temporary frame self.im = np.ones((10, 15), dtype=np.uint16) self.im[2:5, 3:12] = 50000 # Metadata mmmetadata = self._get_mmmeta() ijmeta = self._get_ijmeta() extra_tags = [('MicroManagerMetadata', 's', 0, mmmetadata, True)] # Save test ome tif file self.file_path1 = os.path.join(self.temp_path, "test_Pos1.ome.tif") tifffile.imsave( self.file_path1, self.im, ijmetadata=ijmeta, extratags=extra_tags, ) mmmetadata = self._get_mmmeta(pos_idx=3) extra_tags = [('MicroManagerMetadata', 's', 0, mmmetadata, True)] # Save test ome tif file self.file_path3 = os.path.join(self.temp_path, "test_Pos3.ome.tif") tifffile.imsave( self.file_path3, self.im, ijmetadata=ijmeta, extratags=extra_tags, ) # Setup mock S3 bucket self.mock = mock_s3() self.mock.start() self.conn = boto3.resource('s3', region_name='us-east-1') self.bucket_name = 'czbiohub-imaging' self.conn.create_bucket(Bucket=self.bucket_name) # Instantiate file parser class self.storage_class = aux_utils.get_storage_class('s3') self.frames_inst = ometif_splitter.OmeTiffSplitter( data_path=self.temp_path, storage_dir="raw_frames/ISP-2005-06-09-20-00-00-0001", storage_class=self.storage_class, ) # Get path to json schema file dir_name = os.path.dirname(__file__) self.schema_file_path = os.path.realpath( os.path.join(dir_name, '..', '..', 'metadata_schema.json'), ) # Upload data self.frames_inst.get_frames_and_metadata( schema_filename=self.schema_file_path, positions='[1, 3]', )
def test_get_bad_storage_class():
    aux_utils.get_storage_class('no_valid_format')
def test_get_storage_class():
    storage_type = 'local'
    class_inst = aux_utils.get_storage_class(storage_type)
    nose.tools.assert_true(inspect.isclass(class_inst))
    nose.tools.assert_equal(class_inst.__name__, 'LocalStorage')
def upload_data_and_update_db(csv,
                              login,
                              config,
                              nbr_workers=None,
                              overwrite=False):
    """
    Takes a csv file in which each row represents a dataset, uploads the data
    to storage and the metadata to the database. If 'frames' is selected as
    upload type, each dataset will be split into individual 2D frames before
    moving to storage.
    TODO: Add logging instead of printing

    :param str csv: Full path to csv file containing the following fields
        for each file to be uploaded:
            str dataset_id: Unique dataset ID <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
            str file_name: Full path to file to be uploaded
            str description: Short description of file
            str parent_dataset_id: Parent dataset unique ID if there is one
            list positions: Which position files in folder to upload.
                Uploads all if left empty and file_name is a folder.
                Only valid for ome-tiff uploads.
    :param str login: Full path to json file containing login credentials
    :param str config: Full path to json config file containing the fields:
            str upload_type: Specify if the file should be split prior to
                upload. Valid options: 'frames' or 'file'
            str frames_format: Which file splitter class to use.
                Valid options:
                'ome_tiff' needs MicroManagerMetadata tag for each frame
                    for metadata
                'tif_folder' when each file is already an individual frame
                    and relies on MicroManager metadata
                'tif_id' needs ImageDescription tag in first frame page
                    for metadata
            str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
            str storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage. (optional)
            str json_meta: If splitting to frames, give full path to json
                metadata schema for reading metadata (optional)
    :param int/None nbr_workers: Number of workers for parallel uploads
    :param bool overwrite: Use with caution if your upload was interrupted
        and you want to overwrite existing data in database and storage
    """
    # Assert that csv file exists and load it
    assert os.path.isfile(csv), \
        "File doesn't exist: {}".format(csv)
    files_data = pd.read_csv(csv)
    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Read and validate config json
    config_json = json_ops.read_json_file(
        json_filename=config,
        schema_name="CONFIG_SCHEMA",
    )
    # Assert that upload type is valid
    upload_type = config_json['upload_type'].lower()
    assert upload_type in {"file", "frames"}, \
        "upload_type should be 'file' or 'frames', not {}".format(
            upload_type,
        )
    if nbr_workers is not None:
        assert nbr_workers > 0, \
            "Number of workers must be > 0, not {}".format(nbr_workers)
    # Import local or S3 storage class
    storage = 'local'
    if 'storage' in config_json:
        storage = config_json['storage']
    storage_class = aux_utils.get_storage_class(storage_type=storage)
    storage_access = None
    if 'storage_access' in config_json:
        storage_access = config_json['storage_access']
    # Make sure microscope is a string
    microscope = None
    if 'microscope' in config_json:
        if isinstance(config_json['microscope'], str):
            microscope = config_json['microscope']

    if upload_type == 'frames':
        # If upload type is frames, check frames format
        assert 'frames_format' in config_json, \
            'You must specify the type of file(s)'
        splitter_class = aux_utils.get_splitter_class(
            config_json['frames_format'],
        )
    # Upload all files
    for file_nbr, row in files_data.iterrows():
        # Assert that ID is correctly formatted
        dataset_serial = row.dataset_id
        try:
            cli_utils.validate_id(dataset_serial)
        except AssertionError as e:
            raise AssertionError("Invalid ID:", e)

        # Get S3 directory based on upload type
        if upload_type == "frames":
            storage_dir = "/".join([FRAME_FOLDER_NAME, dataset_serial])
        else:
            storage_dir = "/".join([FILE_FOLDER_NAME, dataset_serial])
        # Instantiate database operations class
        db_inst = db_ops.DatabaseOperations(
            dataset_serial=dataset_serial,
        )
        # Make sure dataset is not already in database
        if not overwrite:
            with db_ops.session_scope(db_connection) as session:
                db_inst.assert_unique_id(session)
        # Check for parent dataset
        parent_dataset_id = 'None'
        if 'parent_dataset_id' in row:
            parent_dataset_id = row.parent_dataset_id
        # Check for dataset description
        description = None
        if 'description' in row:
            # NaN != NaN, so this skips rows with a missing description
            if row.description == row.description:
                description = row.description

        if upload_type == "frames":
            # Instantiate splitter class
            frames_inst = splitter_class(
                data_path=row.file_name,
                storage_dir=storage_dir,
                storage_class=storage_class,
                storage_access=storage_access,
                overwrite=overwrite,
                file_format=FRAME_FILE_FORMAT,
                nbr_workers=nbr_workers,
            )
            # Get kwargs if any
            kwargs = {}
            if 'positions' in row:
                positions = row['positions']
                if not pd.isna(positions):
                    kwargs['positions'] = positions
            if 'schema_filename' in config_json:
                kwargs['schema_filename'] = config_json['schema_filename']
            if 'filename_parser' in config_json:
                filename_parser = config_json['filename_parser']
                kwargs['filename_parser'] = filename_parser
            # Extract metadata and split file into frames
            frames_inst.get_frames_and_metadata(**kwargs)
            # Add frames metadata to database
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_frames(
                        session=session,
                        description=description,
                        frames_meta=frames_inst.get_frames_meta(),
                        frames_json_meta=frames_inst.get_frames_json(),
                        global_meta=frames_inst.get_global_meta(),
                        global_json_meta=frames_inst.get_global_json(),
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                    )
            except AssertionError as e:
                print("Data set {} already in DB".format(dataset_serial), e)
        # File upload
        else:
            # Just upload file without opening it
            assert os.path.isfile(row.file_name), \
                "File doesn't exist: {}".format(row.file_name)
            data_uploader = storage_class(
                storage_dir=storage_dir,
                access_point=storage_access,
            )
            if not overwrite:
                data_uploader.assert_unique_id()
            try:
                data_uploader.upload_file(file_path=row.file_name)
                print("File {} uploaded to S3".format(row.file_name))
            except AssertionError as e:
                print("File already on S3, moving on to DB entry. {}".format(e))
            sha = meta_utils.gen_sha256(row.file_name)
            # Add file entry to DB once I can get it tested
            global_json = {"file_origin": row.file_name}
            file_name = row.file_name.split("/")[-1]
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_file(
                        session=session,
                        description=description,
                        storage_dir=storage_dir,
                        file_name=file_name,
                        global_json_meta=global_json,
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                        sha256=sha,
                    )
                print("File info for {} inserted in DB".format(dataset_serial))
            except AssertionError as e:
                print("File {} already in database".format(dataset_serial))
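# Example inputs (a hypothetical sketch based on the docstring above; all
# paths, IDs, and descriptions are placeholders, not real data):
#
# config.json:
# {
#     "upload_type": "frames",
#     "frames_format": "ome_tiff",
#     "storage": "local"
# }
#
# files.csv:
# dataset_id,file_name,description,parent_dataset_id,positions
# ISP-2005-06-09-20-00-00-0001,/data/test_dir,Test ome-tiff upload,,"[1, 3]"
#
# upload_data_and_update_db(
#     csv='/path/to/files.csv',
#     login='/path/to/db_credentials.json',
#     config='/path/to/config.json',
# )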