def save_files_details(self):
    """ Crawl available files, then loop through the file listing, saving each file to DynamoDB """
    if self.event['objectFilesApi_execution_count'] == 1:
        marble_files = self._crawl_available_files_from_s3_or_cache(self.config['marble-content-bucket'], True)
        # rbsc_files = self._crawl_available_files_from_s3_or_cache(self.config['rbsc-image-bucket'], True)  # save in case we need to crawl the RBSC bucket ever again
        # all_files_listing = {**rbsc_files, **marble_files}
        all_files_listing = {**marble_files}
    else:
        all_files_listing = self._resume_execution()
    file_objects = []
    processing_complete = True
    for key, value in all_files_listing.items():
        if not value.get('recordProcessedFlag', False):
            file_objects.extend(self._save_file_objects_per_collection(value))
            value['recordProcessedFlag'] = True
            print("saved", len(value.get('files', [])), "files for collection:", key, int(time.time() - self.start_time), 'seconds.')
        if datetime.now() >= self.time_to_break:
            self._save_progress(all_files_listing)
            processing_complete = False
            break
    if processing_complete:
        self._clean_up_when_done()
        self.event['objectFilesApiComplete'] = True
        if self.event['local']:
            self._cache_s3_call(os.path.join(self.directory, "file_objects.json"), file_objects)
        else:
            write_s3_json(self.config['manifest-server-bucket'], 'objectFiles/all/index.json', file_objects)
    return file_objects
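# Hedged sketch, not part of the pipeline source: the shape of all_files_listing
# that save_files_details() iterates over, inferred from the keys the loop reads
# ('recordProcessedFlag', 'files'). The collection id and file entry below are
# hypothetical placeholders.
#
# example_all_files_listing = {
#     'example-collection': {
#         'recordProcessedFlag': False,  # flipped to True once the collection is saved
#         'files': [{'key': 'example-collection/0001.tif'}],  # hypothetical file record
#     },
# }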
def _save_progress(self, all_files_listing: dict):
    """ This is used to save progress in order to resume execution later """
    if self.event['local']:
        cache_file_name = os.path.join(self.directory, self.resumption_filename)
        self._cache_s3_call(cache_file_name, all_files_listing)
    else:
        s3_key = os.path.join(self.config['pipeline-control-folder'], self.resumption_filename)
        write_s3_json(self.config['process-bucket'], s3_key, all_files_listing)
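# Hedged sketch (assumption): write_s3_json is imported from a shared module
# elsewhere in the repo. This minimal boto3 version illustrates the contract the
# callers above rely on -- serialize a dict to JSON and store it at bucket/key.
import json

import boto3


def write_s3_json_sketch(bucket: str, key: str, content: dict):
    """ Minimal stand-in for the shared write_s3_json helper. """
    boto3.resource('s3').Object(bucket, key).put(
        Body=json.dumps(content),
        ContentType='application/json',
    )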
def cache_pipeline_config(config, event):
    if event.get('local', False):
        return
    test_required_fields(event)
    s3_path = "pipeline_runs/" + event['config-file']
    s3_bucket = event['process-bucket']
    write_s3_json(s3_bucket, s3_path, config)
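# Hedged usage sketch: the event keys below are the ones cache_pipeline_config
# reads; the bucket and file names are hypothetical.
#
# cache_pipeline_config(config, {
#     'local': False,
#     'config-file': 'run_2020-01-01.json',        # hypothetical run identifier
#     'process-bucket': 'example-process-bucket',  # hypothetical bucket name
# })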
def _save_curate_json(self, curate_json: dict):
    """ Once we retrieve curate_json, save it so we can process more easily next time. """
    item_id = curate_json.get('id', '')
    if self.config.get('local', True):
        filename = os.path.join(self.local_folder, "save", item_id + "_curate.json")
        with open(filename, 'w') as f:
            json.dump(curate_json, f, indent=2)
    else:
        key = os.path.join('save', item_id + '_curate.json')
        write_s3_json(self.config['process-bucket'], key, curate_json)
def _save_standard_json_for_future_processing(self, standard_json: dict):
    """ Once we get standard_json, save it so we can process more easily next time. """
    item_id = standard_json.get('id', '')
    if self.config.get('local', True) or self.save_standard_json_locally:
        filename = os.path.join(self.local_folder, "save", item_id + "_standard.json")
        with open(filename, 'w') as f:
            json.dump(standard_json, f, indent=2, sort_keys=True)
    else:
        key = os.path.join('save', item_id + '_standard.json')
        write_s3_json(self.config['process-bucket'], key, standard_json)
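# Hedged sketch (assumption): read_s3_json, the counterpart helper used by run()
# below, presumably fetches and deserializes what write_s3_json stored. A minimal
# boto3 version of that contract:
import json

import boto3


def read_s3_json_sketch(bucket: str, key: str) -> dict:
    """ Minimal stand-in for the shared read_s3_json helper. """
    response = boto3.resource('s3').Object(bucket, key).get()
    return json.loads(response['Body'].read())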
def _save_seed_files_to_s3(bucket_name, folder_name):
    local_folder = os.path.dirname(os.path.realpath(__file__)) + "/"
    for file_name in os.listdir(folder_name):
        local_file_name = os.path.join(local_folder, folder_name, file_name)
        if os.path.isfile(local_file_name):
            try:
                with io.open(local_file_name, 'r', encoding='utf-8') as json_file:
                    json_to_save = json.load(json_file)
                s3_key = os.path.join(folder_name, file_name)
                _delete_multipart_s3_file_if_necessary(bucket_name, s3_key)
                print('saving filename to s3 = ', file_name)
                write_s3_json(bucket_name, s3_key, json_to_save)
            except:  # noqa: E722 - intentionally ignore warning about bare except
                pass
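# Hedged sketch (assumption): s3_file_exists, used by run() below, presumably
# probes the key with a HEAD request; boto3 signals a missing key by raising
# ClientError with a 404 code.
import boto3
from botocore.exceptions import ClientError


def s3_file_exists_sketch(bucket: str, key: str) -> bool:
    """ Minimal stand-in for the shared s3_file_exists helper. """
    try:
        boto3.client('s3').head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as err:
        if err.response['Error']['Code'] == '404':
            return False
        raise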
def run(event, _context):
    """ run the process to retrieve and process web kiosk metadata """
    _suplement_event(event)
    config = setup_pipeline_config(event)
    google_config = load_config_ssm(config['google_keys_ssm_base'])
    config.update(google_config)
    museum_config = load_config_ssm(config['museum_keys_ssm_base'])
    config.update(museum_config)
    time_to_break = datetime.now() + timedelta(seconds=config['seconds-to-allow-for-processing'])
    print("Will break after ", time_to_break)
    mode = event.get("mode", "full")
    if mode not in ["full", "incremental", "ids"]:
        mode = "full"
    json_web_kiosk_class = ProcessWebKioskJsonMetadata(config, event, time_to_break)
    if event["museumExecutionCount"] == 1:
        if not event.get('local'):
            save_file_system_record(config.get('website-metadata-tablename'), 'Google', 'Museum')
            save_source_system_record(config.get('website-metadata-tablename'), 'EmbARK')
        composite_json = json_web_kiosk_class.get_composite_json_metadata(mode)
        museum_image_metadata = json_web_kiosk_class.find_images_for_composite_json_metadata(composite_json)
        composite_json = CleanUpCompositeJson(composite_json).cleaned_up_content
        event['countToProcess'] = len(composite_json.get('objects'))
        write_s3_json(config['process-bucket'], 'museum_composite_metadata.json', composite_json)
        write_s3_json(config['process-bucket'], 'museum_image_metadata.json', museum_image_metadata)
    else:
        composite_json = read_s3_json(config['process-bucket'], 'museum_composite_metadata.json')
        museum_image_metadata = read_s3_json(config['process-bucket'], 'museum_image_metadata.json')
    if composite_json:
        objects_processed = json_web_kiosk_class.process_composite_json_metadata(composite_json, museum_image_metadata)
        event['museumHarvestComplete'] = _done_processing(composite_json)
    else:
        print('No JSON to process')
    if event["museumExecutionCount"] >= event["maximumMuseumExecutions"]:
        event['museumHarvestComplete'] = True
    if event['museumHarvestComplete']:
        if s3_file_exists(config['process-bucket'], 'museum_composite_metadata.json'):
            delete_s3_key(config['process-bucket'], 'museum_composite_metadata.json')
        if s3_file_exists(config['process-bucket'], 'museum_image_metadata.json'):
            delete_s3_key(config['process-bucket'], 'museum_image_metadata.json')
    elif composite_json:
        write_s3_json(config['process-bucket'], 'museum_composite_metadata.json', composite_json)
        key = 'countHarvestedLoop' + str(event["museumExecutionCount"])
        event[key] = objects_processed
        event['countRemaining'] = len(composite_json.get('objects'))
    return event
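# Hedged usage sketch: a minimal invocation of run(), limited to keys the handler
# reads directly. All values are hypothetical; _suplement_event presumably fills
# in any remaining defaults before setup_pipeline_config runs.
#
# run({
#     'local': False,
#     'mode': 'full',
#     'museumExecutionCount': 1,
#     'maximumMuseumExecutions': 10,  # hypothetical cap on harvest loops
# }, None)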