def get_glacier_data_size(connection, bucket_name):
    """
    Returns the total number of bytes that we have stored in glacier.

    Checks the database first for a cached copy of this value so we don't
    have to keep re-requesting it.
    :param connection: The database connection.
    :param bucket_name: Name of the bucket to count.
    :return: The total size, in bytes, of the data stored in glacier.
    """
    # Load the most recent entry from the database.
    # If the timestamp on the most recent entry is less than 24 hours old, use it;
    # if not, do the full check and add a new entry in the db specifying the glacier size.
    day_ago = seconds_since_epoch(get_hours_ago(24))
    result = connection.execute(
        select([HDF5_GLACIER_STORAGE_SIZE])
        .where(HDF5_GLACIER_STORAGE_SIZE.c.count_time > day_ago))

    latest_time = 0
    latest_size = 0
    for row in result:
        if row['count_time'] > latest_time:
            latest_size = row['size']
            latest_time = row['count_time']

    if latest_time == 0 or latest_size == 0:
        # No recent cached entry; need to re-count.
        s3helper = S3Helper()

        LOG.info("Glacier data size expired, recounting...")
        size = s3helper.glacier_data_size(bucket_name)
        LOG.info("Glacier data size counted: {0} bytes".format(size))

        connection.execute(HDF5_GLACIER_STORAGE_SIZE.insert(),
                           size=size,
                           count_time=seconds_since_epoch(datetime.now()))
    else:
        size = latest_size

    return size
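# The time helpers used above are defined elsewhere in this codebase. For
# reference, a minimal sketch of the assumed semantics (timestamps are stored
# as integer seconds since the epoch; `time` would need to be imported):
#
#     def get_hours_ago(hours):
#         return datetime.now() - timedelta(hours=hours)
#
#     def seconds_since_epoch(when):
#         return int(time.mktime(when.timetuple()))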
def get_day_start_request_size(connection):
    """
    Get the volume of data that has been requested from glacier since the start of the day.

    :param connection: Database connection.
    :return: The amount of data, in bytes, that has been requested since the start of the day.
    """
    # Sum the sizes of all requests in the db made since the start of the day.
    start_time = seconds_since_epoch(get_start_of_day())
    result = connection.execute(
        select([HDF5_REQUEST_GALAXY_SIZE])
        .where(HDF5_REQUEST_GALAXY_SIZE.c.request_time > start_time))

    size = 0
    for line in result:
        # Add up the sizes
        size += line['size']

    return size
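# restore_file_size_check() is called by generate_files() below but defined
# elsewhere. A plausible sketch of it, combining the two helpers above; the
# name and the 5% daily budget are assumptions (the latter mirrors the old
# Glacier free retrieval allowance), not confirmed values from this codebase.
def restore_file_size_check_sketch(connection, bucket_name, file_size):
    """Return True if restoring file_size more bytes stays within today's restore budget."""
    requested_today = get_day_start_request_size(connection)
    # Assumed budget: 5% of the total data held in glacier per day.
    daily_budget = get_glacier_data_size(connection, bucket_name) * 0.05
    return requested_today + file_size <= daily_budget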
def generate_files(connection, hdf5_request_galaxy_ids, email, features, layers, pixel_types):
    """
    Get the FITS files for this request.

    :param connection: The database connection.
    :param hdf5_request_galaxy_ids: The request rows identifying the galaxies to process.
    :param email: The email address to notify when the files are ready.
    :param features: The requested features.
    :param layers: The requested layers.
    :param pixel_types: The requested pixel types.
    :return:
    """
    uuid_string = str(uuid.uuid4())
    results = []
    available_galaxies = []
    s3_helper = S3Helper()
    bucket_name = get_saved_files_bucket()

    # Check whether all the requested galaxies are available or not.
    # Request states: 0 = new, 1 = being processed, 2 = completed (link set), 3 = error.
    for hdf5_request_galaxy in hdf5_request_galaxy_ids:
        galaxy = connection.execute(
            select([GALAXY])
            .where(GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
        hdf5_request_galaxy = connection.execute(
            select([HDF5_REQUEST_GALAXY])
            .where(HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)).first()
        state = hdf5_request_galaxy.state

        if state != 0:  # was "is not 0"; identity comparison is unreliable for ints
            LOG.info('Skipping {0}, state is {1}'.format(galaxy[GALAXY.c.name], state))
            continue  # Skip

        key = get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])

        if s3_helper.file_exists(bucket_name, key):
            if s3_helper.file_archived(bucket_name, key):
                # File is archived.
                if s3_helper.file_restoring(bucket_name, key):
                    # If the file is restoring, we just need to wait for it.
                    LOG.info('Galaxy {0} is still restoring from glacier'.format(galaxy[GALAXY.c.name]))
                else:
                    # If the file is not restoring, we need to request it.
                    file_size = s3_helper.file_size(bucket_name, key)

                    if restore_file_size_check(connection, bucket_name, file_size):
                        # We're good to restore.
                        LOG.info('Making request for archived galaxy {0}'.format(galaxy[GALAXY.c.name]))
                        s3_helper.restore_archived_file(bucket_name, key)

                        connection.execute(HDF5_REQUEST_GALAXY_SIZE.insert(),
                                           hdf5_request_galaxy_id=hdf5_request_galaxy['hdf5_request_galaxy_id'],
                                           size=file_size,
                                           request_time=seconds_since_epoch(datetime.now()))
                    else:
                        # Don't restore or we risk spending a lot of money.
                        LOG.info('Daily galaxy restore size hit. Cannot request archived galaxy.')
            else:
                # File is not archived.
                LOG.info('Galaxy {0} is available in s3'.format(galaxy[GALAXY.c.name]))
                available_galaxies.append(hdf5_request_galaxy)
        else:
            LOG.error('Galaxy {0} does not exist on s3 or glacier!'.format(galaxy[GALAXY.c.name]))

    total_request_galaxies = len(hdf5_request_galaxy_ids)
    LOG.info('Need to have {0} galaxies available ({1} currently available)'.format(
        total_request_galaxies * GALAXY_EMAIL_THRESHOLD, len(available_galaxies)))
    if len(available_galaxies) >= total_request_galaxies * GALAXY_EMAIL_THRESHOLD:
        # Only proceed if at least the threshold of galaxies are available.
        LOG.info('{0}/{1} (>= {2}%) galaxies are available. Email will be sent'.format(
            len(available_galaxies), total_request_galaxies, GALAXY_EMAIL_THRESHOLD * 100))
        remaining_galaxies = total_request_galaxies - len(available_galaxies)

        for hdf5_request_galaxy in available_galaxies:
            result = HDF5ToFitsResult()
            results.append(result)
            connection.execute(
                HDF5_REQUEST_GALAXY.update()
                .where(HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)
                .values(state=1))
            # noinspection PyBroadException
            try:
                galaxy = connection.execute(
                    select([GALAXY])
                    .where(GALAXY.c.galaxy_id == hdf5_request_galaxy.galaxy_id)).first()
                result.galaxy_name = galaxy[GALAXY.c.name]
                LOG.info('Processing {0} ({1}) for {2}'.format(
                    galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id], email))

                # Make sure the galaxy is available.
                if galaxy[GALAXY.c.status_id] == STORED or galaxy[GALAXY.c.status_id] == DELETED:
                    output_dir = tempfile.mkdtemp()
                    try:
                        s3_helper = S3Helper()
                        LOG.info('Getting HDF5 file to {0}'.format(output_dir))
                        tmp_file = get_hdf5_file(s3_helper, output_dir, galaxy[GALAXY.c.name],
                                                 galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                        LOG.info('File stored in {0}'.format(tmp_file))

                        # We have the file.
                        if os.path.isfile(tmp_file):
                            int_flux_output = os.path.join(output_dir, 'intflux')
                            rad_output = os.path.join(output_dir, 'rad')

                            if not os.path.exists(int_flux_output):
                                os.mkdir(int_flux_output)

                            if not os.path.exists(rad_output):
                                os.mkdir(rad_output)

                            file_names = process_hdf5_file(
                                tmp_file,
                                galaxy[GALAXY.c.name],
                                galaxy[GALAXY.c.galaxy_id],
                                pixel_types,
                                features,
                                result,
                                layers,
                                output_dir,
                                rad_output,
                                int_flux_output,
                            )

                            url = zip_files(
                                s3_helper,
                                get_galaxy_file_name(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id],
                                                     galaxy[GALAXY.c.galaxy_id]),
                                uuid_string,
                                file_names,
                                output_dir)

                            connection.execute(
                                HDF5_REQUEST_GALAXY.update()
                                .where(HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)
                                .values(state=2, link=url, link_expires_at=datetime.now() + timedelta(days=10)))

                            result.error = None
                            result.link = url
                    except S3ResponseError as e:
                        # Handling for a strange s3 error.
                        LOG.error('Error retrieving galaxy {0} from s3. Retrying next run'.format(galaxy[GALAXY.c.name]))
                        LOG.error('{0}'.format(str(e)))
                        key = get_key_hdf5(galaxy[GALAXY.c.name], galaxy[GALAXY.c.run_id], galaxy[GALAXY.c.galaxy_id])
                        LOG.info('Key: {0}'.format(key))
                        LOG.info('Exists: {0}'.format(s3_helper.file_exists(bucket_name, key)))
                        result.error = traceback.format_exc()
                        remaining_galaxies += 1
                    finally:
                        # Delete the temp files now that we're done.
                        shutil.rmtree(output_dir)
                else:
                    connection.execute(
                        HDF5_REQUEST_GALAXY.update()
                        .where(HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)
                        .values(state=3))
                    result.error = 'Cannot process {0} ({1}) as the HDF5 file has not been generated'.format(
                        galaxy[GALAXY.c.name], galaxy[GALAXY.c.galaxy_id])
                    LOG.info(result.error)
            except:
                LOG.error('Major error')
                result.error = traceback.format_exc()
                connection.execute(
                    HDF5_REQUEST_GALAXY.update()
                    .where(HDF5_REQUEST_GALAXY.c.hdf5_request_galaxy_id == hdf5_request_galaxy.hdf5_request_galaxy_id)
                    .values(state=3))

        send_email(email, results, features, layers, pixel_types, remaining_galaxies)
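# Example (hypothetical) invocation of generate_files(), assuming a SQLAlchemy
# engine configured elsewhere and that the pending requests share one email and
# one set of features/layers/pixel_types; the table and column names match
# those used above:
#
#     connection = engine.connect()
#     pending = connection.execute(
#         select([HDF5_REQUEST_GALAXY])
#         .where(HDF5_REQUEST_GALAXY.c.state == 0)).fetchall()
#     generate_files(connection, pending, 'user@example.com',
#                    features, layers, pixel_types)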