# Imports assumed from the surrounding modules; the project-local helpers
# (S3Helper, get_galaxy_file_name, get_block_dir, ...), the LOG logger and
# the constants (BUCKET_NAME, FAST_DISK, DB_LOGIN, ...) are defined
# elsewhere in the repository.
import os

import h5py
from boto.s3.key import Key
from sqlalchemy import create_engine, select


def copy_files_from_bucket_to_bucket(s3_helper, key_pogs, key_aws_pogs, galaxy_name, run_id, galaxy_id):
    """
    Download the HDF5 file from the old POGS bucket and push it into the new one.
    """
    filename_aws_pogs = '{0}/{1}.hdf5'.format(
        FAST_DISK,
        get_galaxy_file_name(galaxy_name, run_id, galaxy_id)
    )
    s3_helper.get_file_from_bucket(BUCKET_NAME_OLD_POGS, key_pogs, filename_aws_pogs)

    # Check the size - large files go up as a multipart upload
    if should_be_multipart(filename_aws_pogs):
        s3_helper.add_file_to_bucket_multipart(
            BUCKET_NAME,
            key_aws_pogs,
            filename_aws_pogs,
            reduced_redundancy=True,
            delete_source=True
        )
    else:
        s3_helper.add_file_to_bucket(
            BUCKET_NAME,
            key_aws_pogs,
            filename_aws_pogs,
            reduced_redundancy=True,
            delete_source=True
        )
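# A minimal sketch of should_be_multipart, which is referenced above but not
# shown in this file. The 100 MB threshold is an assumption; S3 rejects a
# single PUT above 5 GB, and multipart is the usual choice well below that.
def should_be_multipart(filename, threshold=100 * 1024 * 1024):
    # Assumed rule: anything at or over the threshold goes via multipart upload
    return os.path.getsize(filename) >= threshold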
def run(self):
    galaxy = self._connection.execute(
        select([GALAXY]).where(GALAXY.c.galaxy_id == self._galaxy_id)
    ).first()
    if galaxy is not None:
        LOG.info(
            'Processing {0}'.format(
                get_galaxy_file_name(
                    galaxy[GALAXY.c.name],
                    galaxy[GALAXY.c.run_id],
                    galaxy[GALAXY.c.galaxy_id],
                )
            )
        )
        self._dimension_x = galaxy[GALAXY.c.dimension_x]
        self._dimension_y = galaxy[GALAXY.c.dimension_y]
        self._find_mask()
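# A hedged sketch of the naming helper used throughout these functions; the
# real format string lives elsewhere in the repo, so the '__' separator below
# is purely an assumption to keep the examples self-contained.
# get_galaxy_file_name_pogs is analogous, using the old POGS bucket's convention.
def get_galaxy_file_name(name, run_id, galaxy_id):
    # Assumed convention: galaxy name, run id and galaxy id joined with '__'
    return '{0}__{1}__{2}'.format(name, run_id, galaxy_id)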
def copy_hdf5_files(args):
    s3_helper = S3Helper()
    bucket_aws_pogs = s3_helper.get_bucket(BUCKET_NAME)
    bucket_pogs = s3_helper.get_bucket(BUCKET_NAME_OLD_POGS)
    engine = create_engine(DB_LOGIN)
    connection = engine.connect()
    subquery = select([STEP_DONE.c.galaxy_id]).distinct()
    for galaxy in connection.execute(
            select([GALAXY]).where(~GALAXY.c.galaxy_id.in_(subquery)).order_by(GALAXY.c.galaxy_id)):
        if args.verbose:
            LOG.info('Checking galaxy_id: {0}, name: {1}'.format(galaxy[GALAXY.c.galaxy_id], galaxy[GALAXY.c.name]))
        # noinspection PyBroadException
        try:
            # Integer division so the block directory stays correct under Python 3 too
            block_dir = get_block_dir(galaxy[GALAXY.c.galaxy_id] // 1000)
            name_aws_pogs = get_galaxy_file_name(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
            key_aws_pogs = Key(bucket_aws_pogs)
            key_aws_pogs.key = '{0}/{1}.hdf5'.format(block_dir, name_aws_pogs)
            if not key_aws_pogs.exists():
                # Not in the new bucket yet - does it exist in the old POGS bucket?
                name_pogs = get_galaxy_file_name_pogs(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                key_pogs = Key(bucket_pogs)
                key_pogs.key = '{0}/{0}.hdf5'.format(name_pogs)
                if key_pogs.exists():
                    check_and_copy_or_restore(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id], args.verbose)
                else:
                    add_step_done_id(connection, galaxy[GALAXY.c.galaxy_id], STEP_DONE_ID_NO_HDF5_FILE)
        except BaseException:
            LOG.exception('Error during fetch, quitting')
            break

    if args.verbose:
        LOG.info('Done')
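# get_block_dir is not shown in this file. process_galaxy below builds the
# same path inline as '{0:04d}000'.format(galaxy_id // 1000), so a matching
# sketch would be:
def get_block_dir(block):
    # e.g. galaxy_id 12345 -> block 12 -> directory '0012000'
    return '{0:04d}000'.format(block)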
def check_and_copy_or_restore(galaxy_name, run_id, galaxy_id, verbose):
    # noinspection PyBroadException
    try:
        s3_helper = S3Helper(PROFILE_NAME)
        bucket_aws_pogs = s3_helper.get_bucket(BUCKET_NAME)
        bucket_pogs = s3_helper.get_bucket(BUCKET_NAME_OLD_POGS)
        name_aws_pogs = get_galaxy_file_name(galaxy_name, run_id, galaxy_id)
        name_pogs = get_galaxy_file_name_pogs(galaxy_name, run_id, galaxy_id)

        # Build the new key from the old key
        block_dir = get_block_dir(galaxy_id // 1000)
        key_aws_pogs = Key(bucket_aws_pogs)
        key_aws_pogs.key = '{0}/{1}.hdf5'.format(block_dir, name_aws_pogs)
        keyname_pogs = '{0}/{0}.hdf5'.format(name_pogs)

        # Work around a boto quirk: get_key issues a HEAD request, which is
        # what populates the restore metadata (storage_class, ongoing_restore,
        # expiry_date) that a bare Key would not have
        key_pogs = bucket_pogs.get_key(keyname_pogs)

        if key_pogs.storage_class == 'GLACIER' and key_pogs.expiry_date is None:
            # Still in Glacier - start a restore, or wait for one already in flight
            if key_pogs.ongoing_restore is None:
                if verbose:
                    LOG.info('Retrieving from glacier: {0}'.format(key_pogs.key))
                key_pogs.restore(days=10)
            else:
                if verbose:
                    LOG.info('The file is being retrieved from glacier: {0}'.format(key_pogs.key))
        else:
            if verbose:
                LOG.info('Copy: {0} to {1}'.format(key_pogs.key, key_aws_pogs.key))
            copy_files_from_bucket_to_bucket(s3_helper, key_pogs.key, key_aws_pogs.key, galaxy_name, run_id, galaxy_id)
    except BaseException:
        LOG.exception('Error during copy')
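# add_step_done_id (used by copy_hdf5_files above) is defined elsewhere. A
# minimal sketch, assuming STEP_DONE is a SQLAlchemy table with galaxy_id and
# step_done_id columns:
def add_step_done_id(connection, galaxy_id, step_done_id):
    # Record that this step is done (or impossible, e.g. no HDF5 file) for
    # the galaxy, so later runs skip it
    connection.execute(
        STEP_DONE.insert().values(galaxy_id=galaxy_id, step_done_id=step_done_id)
    )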
def process_galaxy(connection, galaxies, verbosity, profile_name='aws-pogs'):
    """
    Process the galaxies
    """
    # Build the tables
    build_dynamic_tables(connection)

    for galaxy in galaxies:
        galaxy_id = galaxy[GALAXY.c.galaxy_id]
        steps_done = get_step_done_ids(
            connection,
            galaxy_id,
            [STEP_DONE_ID_NO_HDF5_FILE, STEP_DONE_ID_ORIGINAL_VALUES, STEP_DONE_ID_SED_DATA],
            True,
            True
        )

        # Work out what SED and original-value data is still required
        data_required_sed = get_data_required_sed(connection, steps_done)
        data_required_original = get_data_required_original(steps_done)

        if len(data_required_sed) > 0 or len(data_required_original) > 0:
            # Copy the file from S3
            s3_helper = S3Helper(profile_name=profile_name)
            galaxy_id = int(galaxy[GALAXY.c.galaxy_id])
            galaxy_name = get_galaxy_file_name(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy_id)
            # Integer division keeps the block directory correct under Python 3
            s3_name = os.path.join('{0:04d}000'.format(galaxy_id // 1000), galaxy_name) + '.hdf5'
            hdf5_file_name = os.path.join(FAST_DISK, galaxy_name) + '.hdf5'
            copy_ok = s3_helper.copy_file_to_disk(BUCKET_NAME, s3_name, hdf5_file_name)

            if copy_ok:
                h5_file = h5py.File(hdf5_file_name, 'r')
                LOG.info('Processing SED for name: {0}, run_id: {1}, galaxy_id: {2}'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy_id))

                # Do we have anything to do?
                if len(data_required_sed) > 0:
                    # noinspection PyBroadException
                    try:
                        # Store the SED fit values
                        add_sed_data(connection, galaxy_id, h5_file, data_required_sed)
                    except Exception:
                        LOG.exception('An exception occurred in process_galaxy processing the SED values')
                else:
                    if verbosity >= 1:
                        LOG.info('Nothing to add - SED')

                if len(data_required_original) > 0:
                    # noinspection PyBroadException
                    try:
                        add_original_data(connection, galaxy_id, h5_file, data_required_original)
                    except Exception:
                        LOG.exception('An exception occurred in process_galaxy processing the original values')
                else:
                    if verbosity >= 1:
                        LOG.info('Nothing to add - Original Data')

                # Clean up after ourselves
                h5_file.close()
                os.remove(hdf5_file_name)
            else:
                LOG.error('The file for name: {0}, run_id: {1}, galaxy_id: {2} does not exist'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy_id))
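# A hedged usage sketch of how process_galaxy might be driven. The connection
# setup mirrors copy_hdf5_files above; the exact query that selects which
# galaxies still need work is an assumption, not the project's real driver.
if __name__ == '__main__':
    engine = create_engine(DB_LOGIN)
    connection = engine.connect()
    # Assumed: process every galaxy in id order; the real code likely filters
    # on the STEP_DONE bookkeeping shown above
    galaxies = connection.execute(
        select([GALAXY]).order_by(GALAXY.c.galaxy_id)
    ).fetchall()
    process_galaxy(connection, galaxies, verbosity=1)
    connection.close()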