Example #1
def get_positions(db_credentials: str, dataset_serial: str):
    """
    Queries the database for a given dataset serial number and returns a list of available positions

    Parameters
    ----------
    db_credentials: str
        Absolute url to location of .json credentials
    dataset_serial: str
        dataset_serial field of a dataset in the database

    Returns
    -------
    List[int] of positions for a given experiment
    """
    credentials_str = db_utils.get_connection_str(db_credentials)

    with db_ops.session_scope(credentials_str) as session:
        frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_serial)
        # Collect positions while the session is still open
        positions = set()
        for f in frames:
            positions.add(f.pos_idx)
    return list(positions)
Example #2
def get_channels(db_credentials: str, dataset_id: str):
    """
    Queries the database for a dataset id and returns a dict of channel assignments

    Parameters
    ----------
    db_credentials: str
        Absolute url to location of .json credentials
    dataset_id: str
        dataset_serial field of a dataset in the database

    Returns
    -------
    Dict of (channel_idx:channel_name) pairs
    """
    db_inst = db_ops.DatabaseOperations(dataset_serial=dataset_id)
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        _, frames_meta = db_inst.get_frames_meta(session)

    df = frames_meta[['channel_name', 'channel_idx']].drop_duplicates()
    channels = {}
    for idx, row in df.iterrows():
        channels[row['channel_idx']] = row['channel_name']
    return channels
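A quick usage sketch for get_positions and get_channels above; the credentials path and dataset serial below are placeholders, and db_utils/db_ops are assumed to be imported from imagingDB as in the examples.

# Hypothetical inputs for illustration only
credentials = '/path/to/db_credentials.json'
dataset_serial = 'ML-2019-01-01-00-00-00-0001'

positions = get_positions(credentials, dataset_serial)
channels = get_channels(credentials, dataset_serial)
print("Available positions:", sorted(positions))
for channel_idx, channel_name in channels.items():
    print(channel_idx, channel_name)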
Example #3
def getIDs(db_credentials, string):
    """Return all dataset serials in the database that contain the given substring."""
    credentials_str = db_utils.get_connection_str(db_credentials)
    matching_ids = []
    with db_ops.session_scope(credentials_str) as session:
        datasets = session.query(db_ops.DataSet)
        # Collect matching serials while the session is still open
        for d in datasets:
            if string in d.dataset_serial:
                matching_ids.append(d.dataset_serial)
    return matching_ids
Example #4
def check_connection(db_connection):
    """
    Make sure you can connect to database before anything else.

    :param str db_connection: URI for connecting to the DB
    :raises IOError: If you can't connect to the DB
    """
    try:
        with db_ops.session_scope(db_connection) as session:
            db_ops.test_connection(session)
    except Exception as e:
        raise IOError("Can't connect to DB: {}".format(e))
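A minimal sketch of the typical call pattern: build the connection string from the credentials file and verify the connection before running any queries. The credentials path is a placeholder.

# Placeholder credentials path for illustration
login = '/path/to/db_credentials.json'
db_connection = db_utils.get_connection_str(login)
try:
    check_connection(db_connection)
except IOError as e:
    # Fail early with a readable message if the DB is unreachable
    print("Database connection check failed:", e)
    raise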
Example #5
def getPositions(db_credentials, dataset_identifier):
    """Return the list of position indices available for a dataset."""
    credentials_str = db_utils.get_connection_str(db_credentials)

    with db_ops.session_scope(credentials_str) as session:
        frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier)
        # Collect positions while the session is still open
        positions = set()
        for f in frames:
            positions.add(f.pos_idx)
    return list(positions)
Example #6
def migrate_db(credentials_filename):
    """
    Updates sha256 checksums for all files and frames

    :param credentials_filename: Full path to DB credentials file
    """
    # Edit this depending on where your database credential file is stored
    # This assumes it's stored in dir above imagingDB
    dir_name = os.path.abspath(os.path.join('..'))
    dest_dir = os.path.join(dir_name, 'temp_downloads')
    os.makedirs(dest_dir, exist_ok=True)
    credentials_str = db_utils.get_connection_str(
        credentials_filename=credentials_filename, )
    # Get files and compute checksums
    with db_ops.session_scope(credentials_str) as session:
        files = session.query(db_ops.FileGlobal)
        for file in files:
            if file.sha256 is None:
                data_loader = s3_storage.S3Storage(
                    storage_dir=file.storage_dir, )
                file_name = file.metadata_json["file_origin"]
                file_name = file_name.split("/")[-1]
                dest_path = os.path.join(dest_dir, file_name)
                data_loader.download_file(
                    file_name=file_name,
                    dest_path=dest_path,
                )
                checksum = meta_utils.gen_sha256(dest_path)
                file.sha256 = checksum

    # Get frames and compute checksums
    with db_ops.session_scope(credentials_str) as session:
        frames = session.query(db_ops.Frames)
        for frame in frames:
            if frame.sha256 is None:
                data_loader = s3_storage.S3Storage(
                    storage_dir=frame.frames_global.storage_dir, )
                im = data_loader.get_im(frame.file_name)
                checksum = meta_utils.gen_sha256(im)
                frame.sha256 = checksum
Example #7
    def getNbrPositions(self, dataset_identifier):
        importlib.reload(db_session)

        with db_ops.session_scope(self.credentials_filename) as session:
            # Find the Frames of interest
            frames_global = session.query(db_ops.FramesGlobal) \
                .join(db_ops.DataSet) \
                .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
             .all()

            nbr_positions = frames_global[0].nbr_positions

        return nbr_positions
Example #8
    def getAcqMeta(self, dataset_identifier):
        importlib.reload(db_session)

        with db_ops.session_scope(self.credentials_filename) as session:
            # Find the Frames of interest
            all_frames = session.query(db_ops.Frames) \
                .join(db_ops.FramesGlobal) \
                .join(db_ops.DataSet) \
                .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
             .all()

            acq_meta = all_frames[0].frames_global.metadata_json['IJMetadata']

        return acq_meta
Example #9
def query_data(login,
               project_id=None,
               microscope=None,
               start_date=None,
               end_date=None,
               description=None):
    """
    Provide CLI access to wrappers for common queries.
    Prints the dataset IDs of the datasets returned from the query to the
    standard output device.

    :param str login: Full path to json file containing database login
            credentials
    :param str project_id: First part of dataset_serial containing
            project ID (e.g. ML)
    :param str microscope: Microscope column
    :param str start_date: Format YYYY-MM-DD. Find >= dates in date_time column
    :param str end_date: Format YYYY-MM-DD. Find <= dates in date_time column
    :param str description: Find substring in description column
    """
    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)

    search_dict = {}
    if project_id is not None:
        search_dict['project_id'] = project_id
    if microscope is not None:
        search_dict['microscope'] = microscope
    if start_date is not None:
        search_dict['start_date'] = start_date
        if end_date is not None:
            cli_utils.assert_date_order(start_date, end_date)
    if end_date is not None:
        search_dict['end_date'] = end_date
    if description is not None:
        search_dict['description'] = description

    with db_ops.session_scope(db_connection) as session:
        datasets = db_ops.get_datasets(session, search_dict)
        print("Number of datasets matching your query: {}".format(
            len(datasets)))
        for i, d in enumerate(datasets):
            print(i, d.dataset_serial)
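A hedged example of calling query_data to list datasets matching a project ID, date range, and description substring; all argument values here are placeholders.

# Placeholder query parameters for illustration
query_data(
    login='/path/to/db_credentials.json',
    project_id='ML',
    start_date='2019-01-01',
    end_date='2019-12-31',
    description='beads',
)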
Example #10
    def getImageMeta(self, dataset_identifier):
        '''
        Return metadata for each frame in a list.
        '''

        importlib.reload(db_session)

        with db_ops.session_scope(self.credentials_filename) as session:
            # Find the Frames of interest
            all_frames = session.query(db_ops.Frames) \
             .join(db_ops.FramesGlobal) \
             .join(db_ops.DataSet) \
             .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
             .all()

            # Get the image metadata
            image_metadata = []
            for im in all_frames:
                image_metadata.append(im.metadata_json['MicroManagerMetadata'])

        return image_metadata
Example #11
def search_ids(db_credentials: str, string: str):
    """
    Retrieves all datasets in the database whose IDs contain a specified string

    Parameters
    ----------
    db_credentials: str
        Absolute url to location of .json credentials
    string: str
        String to match against dataset IDs

    Returns
    -------
    List[str] of dataset IDs that contain the specified string
    """
    credentials_str = db_utils.get_connection_str(db_credentials)
    matching_ids = []
    with db_ops.session_scope(credentials_str) as session:
        datasets = session.query(db_ops.DataSet)
        # Collect matching serials while the session is still open
        for d in datasets:
            if string in d.dataset_serial:
                matching_ids.append(d.dataset_serial)
    return matching_ids
Example #12
    def getFrames(self, dataset_identifier, channels='all', slices='all'):
        '''
        Get particular slices from an imaging dataset.

        Todo: add slicing for pos and time.
        '''

        # Open the session
        importlib.reload(db_session)

        with db_ops.session_scope(self.credentials_filename) as session:
            # Find the Frames of interest
            all_frames = session.query(db_ops.Frames) \
             .join(db_ops.FramesGlobal) \
             .join(db_ops.DataSet) \
             .filter(db_ops.DataSet.dataset_serial == dataset_identifier)

            # Filter by channel
            if channels == 'all':
                pass
            elif type(channels) is tuple:
                all_frames = all_frames.filter(
                    db_ops.Frames.channel_name.in_(channels))
            else:
                raise ValueError('Invalid channel query')

            # Filter by slice
            if slices == 'all':
                pass
            elif type(slices) is tuple:
                all_frames = all_frames.filter(
                    db_ops.Frames.slice_idx.in_(slices))
            else:
                raise ValueError('Invalid slice query')

            # Materialize the query so frames can be indexed and counted
            all_frames = all_frames.all()

            # Get the names of the files
            file_names = [im.file_name for im in all_frames]

            if len(file_names) == 0:
                raise ValueError('No images match query')

            # Get the bit depth
            bit_depth = all_frames[0].frames_global.bit_depth

            # Get the shape of the stack
            # TODO: get the shape from the acq meta
            stack_shape = (
                all_frames[0].frames_global.im_width,
                all_frames[0].frames_global.im_height,
                all_frames[0].frames_global.im_colors,
                len(all_frames),
            )

            # Get the folder
            s3_dir = all_frames[0].frames_global.s3_dir

            # Download the files
            data_loader = s3_storage.DataStorage(s3_dir=s3_dir)
            im_stack = data_loader.get_stack(file_names, stack_shape,
                                             bit_depth)

        return im_stack
Example #13
    def getStack(self,
                 dataset_identifier,
                 channel,
                 time_idx=0,
                 pos_idx=0,
                 verbose=False):
        ''' Download a stack at a given set of pos, time, channel indices

        Returns
        im_ordered : np.ndarray containing the image [time, chan, z, x, y]
        '''

        with db_ops.session_scope(self.credentials_filename) as session:
            # Find the Frames of interest
            all_frames = session.query(db_ops.Frames) \
             .join(db_ops.FramesGlobal) \
             .join(db_ops.DataSet) \
             .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
             .filter(db_ops.Frames.channel_name == channel) \
             .filter(db_ops.Frames.time_idx == time_idx) \
             .filter(db_ops.Frames.pos_idx == pos_idx) \
             .all()

            # Get the names of the files
            file_names = [im.file_name for im in all_frames]

            if len(file_names) == 0:
                raise ValueError('No images match query')

            # Get the bit depth
            bit_depth = all_frames[0].frames_global.bit_depth

            # Get the shape of the stack
            # TODO: get the shape from the acq meta
            stack_shape = (
                all_frames[0].frames_global.im_width,
                all_frames[0].frames_global.im_height,
                all_frames[0].frames_global.im_colors,
                len(all_frames),
            )

            # Get the folder
            s3_dir = all_frames[0].frames_global.s3_dir

            # Download the files
            data_loader = s3_storage.DataStorage(s3_dir=s3_dir)
            im_stack = data_loader.get_stack(file_names, stack_shape,
                                             bit_depth)

            im_ordered = np.zeros(
                (1, 1, stack_shape[3], stack_shape[0], stack_shape[1]),
                dtype='uint16')

            # Todo update get_stack so this isn't required...
            for im_idx in range(len(all_frames)):
                im_ordered[0, 0, im_idx, :, :] = im_stack[:, :, 0, im_idx]


        return im_ordered
Example #14
def make_experiment_csv(db_credentials: str,
                        csv_file: str,
                        image_ids: List[str],
                        channels: List[str],
                        metadata_format: str = 'micromanager',
                        positions: List[int] = [0],
                        time: int = 0,
                        data_path: str = '/Volumes/imaging/czbiohub-imaging'):
    """
    Creates a CSV file mapping imagingDB frames to indices in an ImageStack
    for usage with the spacetx format writer.

    Parameters
    ----------
    db_credentials : str
        Path to the database credentials file
    csv_file : str
        Path of the resulting CSV file
    image_ids : List[str]
        A list of the image IDs to include; the order determines the
        'round' index in the CSV
    channels : List[str]
        A list of the channels to be downloaded, in the index order
    metadata_format : str
        Format for the image metadata on imagingDB. For micromanager,
        set to 'micromanager'. Default value is 'micromanager'
    positions : List[int]
        Indices of the positions to download. The default value is [0]
    time : int
        Index of the time point to download. The default value is 0
    data_path : str
        Path to the image store volume

    Returns
    -------
    im_width, im_height of the frames queried
    """

    meta_keys = metadata_keys[metadata_format.lower()]

    fov = []
    rnd = []
    channel = []
    z = []
    file_path = []
    xc_min = []
    xc_max = []
    yc_min = []
    yc_max = []
    zc_min = []
    zc_max = []
    tile_width = []
    tile_height = []

    credentials_str = db_utils.get_connection_str(db_credentials)

    with db_ops.session_scope(credentials_str) as session:
        for r, im_id in enumerate(image_ids):
            for fov_idx, p in enumerate(positions):
                for chan_idx, c in enumerate(channels):
                    frames = session.query(db_ops.Frames) \
                        .join(db_ops.FramesGlobal) \
                        .join(db_ops.DataSet) \
                        .filter(db_ops.DataSet.dataset_serial == im_id) \
                        .filter(db_ops.Frames.pos_idx == p) \
                        .filter(db_ops.Frames.channel_name == c) \
                        .filter(db_ops.Frames.time_idx == time)

                    for frame in frames:
                        # Determine pixel size and image dimensions
                        pixel_size = frame.metadata_json[meta_keys['key']][
                            meta_keys['pixel_size']]
                        im_width = frame.frames_global.im_width
                        im_height = frame.frames_global.im_height

                        # Add frame indices
                        fov.append(fov_idx)
                        rnd.append(r)
                        channel.append(chan_idx)
                        z.append(frame.slice_idx)
                        # Clean any windows file path seps before adding path
                        fp = os.path.join(frame.frames_global.storage_dir,
                                          frame.file_name)
                        clean_fp = os.path.join(*fp.split('\\'))
                        file_path.append(clean_fp)
                        xc_min.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['xpos_um']])
                        xc_max.append(xc_min[-1] + im_width * pixel_size)
                        yc_min.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['ypos_um']])
                        yc_max.append(yc_min[-1] + im_height * pixel_size)
                        zc_min.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['zpos_um']])
                        zc_max.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['zpos_um']])
                        tile_width.append(im_width)
                        tile_height.append(im_height)

    sha = _calc_checksums(file_path, data_path)

    data = [
        fov, rnd, channel, z, file_path, sha, xc_min, xc_max, yc_min, yc_max,
        zc_min, zc_max, tile_width, tile_height
    ]
    columns = [
        'fov', 'round', 'ch', 'zplane', 'path', 'sha256', 'xc_min', 'xc_max',
        'yc_min', 'yc_max', 'zc_min', 'zc_max', 'tile_width', 'tile_height'
    ]
    im_df = pd.DataFrame(dict(zip(columns, data)))
    im_df.to_csv(csv_file)

    return im_width, im_height
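A sketch of how make_experiment_csv might be called for a two-round, two-channel experiment; the dataset IDs, channel names, and paths are placeholders.

# Placeholder IDs, channels, and paths for illustration
im_width, im_height = make_experiment_csv(
    db_credentials='/path/to/db_credentials.json',
    csv_file='experiment_tiles.csv',
    image_ids=['ML-2019-01-01-00-00-00-0001',
               'ML-2019-01-02-00-00-00-0001'],
    channels=['DAPI', 'FITC'],
    positions=[0, 1],
    time=0,
)
print("Tile size:", im_width, "x", im_height)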
Example #15
def download_data(dataset_serial,
                  login,
                  dest,
                  storage='local',
                  storage_access=None,
                  metadata=True,
                  download=True,
                  nbr_workers=None,
                  positions=None,
                  times=None,
                  channels=None,
                  slices=None):
    """
    Find all files associated with unique project identifier and
    download them to a local directory.

    :param str dataset_serial: Unique dataset identifier
    :param str login: Full path to json file containing database login
                credentials
    :param str dest: Local destination directory name
    :param str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
    :param str/None storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage.
    :param bool metadata: Writes metadata (default True):
                global metadata in json, local metadata for each frame in csv
    :param bool download: Downloads all files associated with dataset (default).
                If False, will only write csvs with metadata. Only for
                datasets split into frames
    :param int, None nbr_workers: Number of workers for parallel download
                If None, it defaults to number of machine processors * 5
    :param list, None positions: Positions (FOVs) as integers (default
                None downloads all)
    :param list, None times: Timepoints as integers (default None downloads all)
    :param list, None channels: Channels as integer indices or strings for channel
                names (default None downloads all)
    :param list, None slices: Slice (z) integer indices (Default None downloads all)
    """
    try:
        cli_utils.validate_id(dataset_serial)
    except AssertionError as e:
        raise AssertionError("Invalid ID:", e)

    # Create output directory as a subdirectory in dest named
    # dataset_serial. It stops if the subdirectory already exists to avoid
    # the risk of overwriting existing data
    dest_dir = os.path.join(dest, dataset_serial)
    try:
        os.makedirs(dest_dir, exist_ok=False)
    except FileExistsError as e:
        raise FileExistsError("Folder {} already exists, {}".format(
            dest_dir, e))

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)

    # Instantiate database class
    db_inst = db_ops.DatabaseOperations(dataset_serial=dataset_serial, )
    # Import local or S3 storage class
    storage_class = aux_utils.get_storage_class(storage_type=storage)

    if metadata is False:
        # Just download file(s)
        assert download,\
            "You set metadata *and* download to False. You get nothing."
        with db_ops.session_scope(db_connection) as session:
            storage_dir, file_names = db_inst.get_filenames(session=session, )
    else:
        # If channels can be converted to ints, they're indices
        if channels is not None:
            if not isinstance(channels, list):
                channels = [channels]
            try:
                channels = [int(c) for c in channels]
            except ValueError:
                # Channels are names, not indices
                assert all([isinstance(c, str) for c in channels]), \
                    "channels must be either all str or int"

        # Get the metadata from the requested frames
        with db_ops.session_scope(db_connection) as session:
            global_meta, frames_meta = db_inst.get_frames_meta(
                session=session,
                positions=positions,
                times=times,
                channels=channels,
                slices=slices,
            )
        # Write global metadata to destination directory
        global_meta_filename = os.path.join(
            dest_dir,
            "global_metadata.json",
        )
        json_ops.write_json_file(
            meta_dict=global_meta,
            json_filename=global_meta_filename,
        )
        # Write info for each frame to destination directory
        local_meta_filename = os.path.join(
            dest_dir,
            "frames_meta.csv",
        )
        frames_meta.to_csv(local_meta_filename, sep=",")
        # Extract folder and file names if we want to download
        storage_dir = global_meta["storage_dir"]
        file_names = frames_meta["file_name"]

    if download:
        if nbr_workers is not None:
            assert nbr_workers > 0,\
                "Nbr of worker must be >0, not {}".format(nbr_workers)
        data_loader = storage_class(
            storage_dir=storage_dir,
            nbr_workers=nbr_workers,
            access_point=storage_access,
        )
        data_loader.download_files(file_names, dest_dir)
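A usage sketch for download_data that fetches a subset of channels and slices for one position; the dataset serial, credentials path, and destination directory are placeholders.

# Placeholder arguments for illustration
download_data(
    dataset_serial='ML-2019-01-01-00-00-00-0001',
    login='/path/to/db_credentials.json',
    dest='/tmp/imaging_downloads',
    storage='local',
    metadata=True,
    download=True,
    positions=[0],
    channels=['DAPI'],
    slices=[0, 1, 2],
)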
Example #16
def upload_data_and_update_db(csv,
                              login,
                              config,
                              nbr_workers=None,
                              overwrite=False):
    """
    Takes a csv file in which each row represents a dataset, uploads the data
    to storage and metadata to database. If 'frames' is selected as upload
    type, each dataset will be split into individual 2D frames before moving
    to storage.
    TODO: Add logging instead of printing

    :param str csv: Full path to csv file containing the following fields
        for each file to be uploaded:
            str dataset_id: Unique dataset ID <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
            str file_name: Full path to file to be uploaded
            str description: Short description of file
            str parent_dataset_id: Parent dataset unique ID if there is one
            list positions: Which position files in folder to upload.
                Uploads all if left empty and file_name is a folder.
                Only valid for ome-tiff uploads.
    :param str login: Full path to json file containing login credentials
    :param  str config: Full path to json config file containing the fields:
            str upload_type: Specify if the file should be split prior to upload
                Valid options: 'frames' or 'file'
            str frames_format: Which file splitter class to use.
                Valid options:
                'ome_tiff' needs MicroManagerMetadata tag for each frame for metadata
                'tif_folder' when each file is already an individual frame
                and relies on MicroManager metadata
                'tif_id' needs ImageDescription tag in first frame page for metadata
            str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
            str storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage. (optional)
            str json_meta: If splitting to frames, give full path to json
                metadata schema for reading metadata (optional)
    :param int, None nbr_workers: Number of workers for parallel uploads
    :param bool overwrite: Use with caution if your upload was interrupted
            and you want to overwrite existing data in database and storage
    """
    # Assert that csv file exists and load it
    assert os.path.isfile(csv), \
        "File doesn't exist: {}".format(csv)
    files_data = pd.read_csv(csv)

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Read and validate config json
    config_json = json_ops.read_json_file(
        json_filename=config,
        schema_name="CONFIG_SCHEMA",
    )
    # Assert that upload type is valid
    upload_type = config_json['upload_type'].lower()
    assert upload_type in {"file", "frames"}, \
        "upload_type should be 'file' or 'frames', not {}".format(
            upload_type,
        )
    if nbr_workers is not None:
        assert nbr_workers > 0, \
            "Nbr of worker must be >0, not {}".format(nbr_workers)
    # Import local or S3 storage class
    storage = 'local'
    if 'storage' in config_json:
        storage = config_json['storage']
    storage_class = aux_utils.get_storage_class(storage_type=storage)
    storage_access = None
    if 'storage_access' in config_json:
        storage_access = config_json['storage_access']

    # Make sure microscope is a string
    microscope = None
    if 'microscope' in config_json:
        if isinstance(config_json['microscope'], str):
            microscope = config_json['microscope']

    if upload_type == 'frames':
        # If upload type is frames, check from frames format
        assert 'frames_format' in config_json, \
            'You must specify the type of file(s)'
        splitter_class = aux_utils.get_splitter_class(
            config_json['frames_format'],
        )
    # Upload all files
    for file_nbr, row in files_data.iterrows():
        # Assert that ID is correctly formatted
        dataset_serial = row.dataset_id
        try:
            cli_utils.validate_id(dataset_serial)
        except AssertionError as e:
            raise AssertionError("Invalid ID:", e)

        # Get S3 directory based on upload type
        if upload_type == "frames":
            storage_dir = "/".join([FRAME_FOLDER_NAME, dataset_serial])
        else:
            storage_dir = "/".join([FILE_FOLDER_NAME, dataset_serial])
        # Instantiate database operations class
        db_inst = db_ops.DatabaseOperations(
            dataset_serial=dataset_serial,
        )
        # Make sure dataset is not already in database
        if not overwrite:
            with db_ops.session_scope(db_connection) as session:
                db_inst.assert_unique_id(session)
        # Check for parent dataset
        parent_dataset_id = 'None'
        if 'parent_dataset_id' in row:
            parent_dataset_id = row.parent_dataset_id
        # Check for dataset description
        description = None
        if 'description' in row:
            # Skip empty (NaN) descriptions in the csv
            if not pd.isna(row.description):
                description = row.description

        if upload_type == "frames":
            # Instantiate splitter class
            frames_inst = splitter_class(
                data_path=row.file_name,
                storage_dir=storage_dir,
                storage_class=storage_class,
                storage_access=storage_access,
                overwrite=overwrite,
                file_format=FRAME_FILE_FORMAT,
                nbr_workers=nbr_workers,
            )
            # Get kwargs if any
            kwargs = {}
            if 'positions' in row:
                positions = row['positions']
                if not pd.isna(positions):
                    kwargs['positions'] = positions
            if 'schema_filename' in config_json:
                kwargs['schema_filename'] = config_json['schema_filename']
            if 'filename_parser' in config_json:
                filename_parser = config_json['filename_parser']
                kwargs['filename_parser'] = filename_parser
            # Extract metadata and split file into frames
            frames_inst.get_frames_and_metadata(**kwargs)

            # Add frames metadata to database
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_frames(
                        session=session,
                        description=description,
                        frames_meta=frames_inst.get_frames_meta(),
                        frames_json_meta=frames_inst.get_frames_json(),
                        global_meta=frames_inst.get_global_meta(),
                        global_json_meta=frames_inst.get_global_json(),
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                    )
            except AssertionError as e:
                print("Data set {} already in DB".format(dataset_serial), e)
        # File upload
        else:
            # Just upload file without opening it
            assert os.path.isfile(row.file_name), \
                "File doesn't exist: {}".format(row.file_name)
            data_uploader = storage_class(
                storage_dir=storage_dir,
                access_point=storage_access,
            )
            if not overwrite:
                data_uploader.assert_unique_id()
            try:
                data_uploader.upload_file(file_path=row.file_name)
                print("File {} uploaded to S3".format(row.file_name))
            except AssertionError as e:
                print("File already on S3, moving on to DB entry. {}".format(e))

            sha = meta_utils.gen_sha256(row.file_name)
            # Add file entry to DB once I can get it tested
            global_json = {"file_origin": row.file_name}
            file_name = row.file_name.split("/")[-1]
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_file(
                        session=session,
                        description=description,
                        storage_dir=storage_dir,
                        file_name=file_name,
                        global_json_meta=global_json,
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                        sha256=sha,
                    )
                print("File info for {} inserted in DB".format(dataset_serial))
            except AssertionError as e:
                print("File {} already in database: {}".format(
                    dataset_serial, e))