def run(event, _context):
    """ Run the process to retrieve and process Aleph metadata. """
    _supplement_event(event)
    config = setup_pipeline_config(event)
    if config:
        time_to_break = datetime.now() + timedelta(
            seconds=config['seconds-to-allow-for-processing'])
        print("Will break after ", time_to_break)
        subject_terms = get_subject_terms_needing_expanded(
            config.get('website-metadata-tablename'), 1)
        if 'countToProcess' not in event:
            event['countToProcess'] = len(subject_terms)

        print('count of subject terms we need to expand = ',
              len(subject_terms))
        while len(subject_terms):
            subject = subject_terms.pop(0)
            expand_subject_term(subject,
                                config.get('website-metadata-tablename'))
            if datetime.now() > time_to_break:
                break
        event['countRemaining'] = len(subject_terms)
        if len(subject_terms) == 0:
            event['expandSubjectTermsComplete'] = True
        if event["expandSubjectTermsExecutionCount"] >= event[
                "maximumExpandSubjectTermsExecutions"]:
            event['expandSubjectTermsComplete'] = True

    return event
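
# Purely illustrative local driver (not part of the pipeline): the handler above is
# written to be re-invoked until it drains the subject-term queue or hits its
# execution ceiling, and in production that looping is presumably done by a state
# machine.  This sketch assumes `run` is importable from this module.
def drive_expand_subject_terms(event: dict, max_loops: int = 10) -> dict:
    for _ in range(max_loops):
        event = run(event, None)  # the handler's own counters decide when it is finished
        print('subject terms remaining:', event.get('countRemaining'))
        if event.get('expandSubjectTermsComplete'):
            break
    return event
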
    def setUp(self):
        self.event = {"local": True}
        self.event['local-path'] = str(
            Path(__file__).parent.absolute()) + "/../example/"
        self.config = setup_pipeline_config(self.event)
        self.config['curate-token'] = 'some_token'
        time_to_break = datetime.now() + timedelta(
            seconds=self.config['seconds-to-allow-for-processing'])
        self.get_curate_metadata_class = GetCurateMetadata(
            self.config, self.event, time_to_break)
    def setUp(self):
        self.event = {"local": True}
        self.event['local-path'] = str(
            Path(__file__).parent.absolute()) + "/../example/"
        self.config = setup_pipeline_config(self.event)
        self.create_standard_json_class = CreateStandardJson(self.config)
        local_folder = os.path.dirname(os.path.realpath(__file__)) + '/'  # fixture files live next to this test module
        filename = local_folder + 'test/zp38w953h0s_curate.json'
        with open(filename, 'r') as input_source:
            self.curate_json = json.load(input_source)
        filename = local_folder + 'test/zp38w953h0s_preliminary_standard.json'
        with open(filename, 'r') as input_source:
            self.preliminary_standard_json = json.load(input_source)
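    # A hypothetical test method in the same style; the CreateStandardJson method name
    # used below is an assumption, not the repository's actual API.  The fixtures are
    # the ones loaded in setUp above.
    def test_create_standard_json(self):
        """ Compare the converted curate fixture against the expected standard json. """
        actual_json = self.create_standard_json_class.create_standard_json(self.curate_json)
        self.assertEqual(actual_json, self.preliminary_standard_json)
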
def run(event: dict, context: dict) -> dict:
    """ Run the process to retrieve and process Aleph metadata. """
    _supplement_event(event)

    config = setup_pipeline_config(event)
    if config:
        time_to_break = datetime.now() + timedelta(
            seconds=config['seconds-to-allow-for-processing'])
        print("Will break after ", time_to_break)
        curate_config = load_config_ssm(config['curate_keys_ssm_base'])
        config.update(curate_config)

        unique_bendo_items = _get_unique_list_of_bendo_items(config)
        # print("unique_bendo_items = ", unique_bendo_items)
        unique_bendo_items = ['zp38w953h0s']  # , 'pv63fx74g23']  # overwrite for testing
        _get_detail_for_all_items(config, unique_bendo_items)
    return event
def run(event, context):
    """ run the process to retrieve and process web kiosk metadata """
    _suplement_event(event)
    config = setup_pipeline_config(event)
    google_config = load_config_ssm(config['google_keys_ssm_base'])
    config.update(google_config)
    google_credentials = json.loads(config["museum-google-credentials"])
    harvest_metadata_rules_class = HarvestMetadataRules(google_credentials)
    local_folder = os.path.dirname(os.path.realpath(__file__)) + '/'
    for site_name in event['sites']:
        harvest_metadata_rules_class.harvest_google_spreadsheet_info(site_name)
    s3_sync(config["process-bucket"], "sites", local_folder + "sites")
    try:
        copy_tree(local_folder + "sites/",
                  local_folder + "../process_manifest/sites/")
    except EnvironmentError as e:
        print('Unable to sync sites files to process_manifest ')
        capture_exception(e)
def run(event: dict, context: dict) -> dict:
    """ Run the process to retrieve and process Aleph metadata. """
    _supplement_event(event)

    config = setup_pipeline_config(event)
    if config:
        time_to_break = datetime.now() + timedelta(
            seconds=config['seconds-to-allow-for-processing'])
        print("Will break after ", time_to_break)
        if event.get('curateExecutionCount', 0) == 1 and not event.get('local', True):
            save_source_system_record(config.get('website-metadata-tablename'),
                                      'Curate')
            _save_seed_files_to_s3(config['process-bucket'], 'save')
        if not config.get('local', True):
            curate_config = load_config_ssm(config['curate_keys_ssm_base'])
            config.update(curate_config)
            save_file_system_record(config.get('website-metadata-tablename'),
                                    'Curate', 'Curate')
            if not event.get("ids", False):
                string_list_to_save = _read_harvest_ids_from_json(
                    './source_system_export_ids.json', 'Curate')
                save_harvest_ids(config, 'Curate', string_list_to_save,
                                 config.get('website-metadata-tablename'))
                event['ids'] = _read_harvest_ids_from_dynamo(
                    config.get('website-metadata-tablename'), 'Curate')
                event['countToProcess'] = len(event['ids'])

        if "ids" in event:
            print("ids to process: ", event["ids"])
            curate_api_class = CurateApi(config, event, time_to_break)
            event["curateHarvestComplete"] = curate_api_class.process_curate_items(event["ids"])
        event['countRemaining'] = len(event['ids'])
        if (event["curateExecutionCount"] >= event["maxCurateExecutions"]
                and not event["curateHarvestComplete"]):
            event["curateHarvestComplete"] = True
            sentry_sdk.capture_message(
                'Curate did not complete harvest after maximum executions threshold of '
                + str(event["maxCurateExecutions"]))
    return event
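
# Both the Curate handler above and the ArchivesSpace handler below seed event['ids']
# from ./source_system_export_ids.json.  The helper itself is not shown in these
# examples; the sketch below is only an assumption about its behaviour, presuming the
# file maps source-system names such as 'Curate' to lists of ids.
import json


def _read_harvest_ids_from_json(json_file_path: str, section: str = None) -> list:
    """ Hypothetical sketch: load harvest ids, optionally for a single source system. """
    with open(json_file_path, 'r') as json_file:
        contents = json.load(json_file)
    if section:
        return contents.get(section, [])
    return contents
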
def run(event: dict, _context: dict):
    """ Run the process to retrieve and process ArchivesSpace metadata.

    Information on the API can be found here:
        http://archivesspace.github.io/archivesspace/api/ """
    _supplement_event(event)
    _init_sentry()
    config = setup_pipeline_config(event)
    if not event.get("ids", False):
        string_list_to_save = _read_harvest_ids_from_json('./source_system_export_ids.json')
        save_harvest_ids(config, 'ArchivesSpace', string_list_to_save, config.get('website-metadata-tablename'))
        event['ids'] = _read_harvest_ids_from_dynamo(config.get('website-metadata-tablename'), 'ArchivesSpace')
        event['countToProcess'] = len(event['ids'])
    start_time = time.time()
    time_to_break = datetime.now() + timedelta(seconds=config['seconds-to-allow-for-processing'])
    print("Will break after ", time_to_break)
    if event.get('archivesSpaceExecutionCount', 0) == 1 and not event.get('local'):
        save_source_system_record(config.get('website-metadata-tablename'), 'ArchivesSpace')
    harvest_oai_eads_class = HarvestOaiEads(config)
    standard_json_helpers_class = StandardJsonHelpers(config)
    save_standard_json_to_dynamo_class = SaveStandardJsonToDynamo(config)
    ids = event.get("ids", [])
    while len(ids) > 0 and datetime.now() < time_to_break:
        standard_json = harvest_oai_eads_class.get_standard_json_from_archives_space_url(ids[0])
        if standard_json:
            print("ArchivesSpace ead_id = ", standard_json.get("id", ""), " source_system_url = ", ids[0], int(time.time() - start_time), 'seconds.')
            standard_json = standard_json_helpers_class.enhance_standard_json(standard_json)
            save_standard_json(config, standard_json)
            save_standard_json_to_dynamo_class.save_standard_json(standard_json)
        del ids[0]
    event['countRemaining'] = len(event['ids'])
    event['archivesSpaceHarvestComplete'] = (len(ids) == 0)
    event['eadsSavedToS3'] = os.path.join(config['process-bucket'], config['process-bucket-data-basepath'])
    if event["archivesSpaceExecutionCount"] >= event["maximumArchivesSpaceExecutions"]:
        event['archivesSpaceHarvestComplete'] = True
    return event
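
# The ArchivesSpace handler above reads event["archivesSpaceExecutionCount"] and
# event["maximumArchivesSpaceExecutions"] without guards, so _supplement_event
# presumably seeds them.  The real helper is not part of these examples; the sketch
# below is an assumption based only on the keys the handler uses.
def _supplement_event(event: dict) -> dict:
    """ Hypothetical sketch: default the flags and counters the handler expects. """
    event.setdefault('archivesSpaceHarvestComplete', False)
    event.setdefault('maximumArchivesSpaceExecutions', 10)  # assumed ceiling, not taken from the source
    event['archivesSpaceExecutionCount'] = event.get('archivesSpaceExecutionCount', 0) + 1  # assumes the helper also counts this execution
    return event
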
def run(event, _context):
    """ run the process to retrieve and process web kiosk metadata """
    _suplement_event(event)
    config = setup_pipeline_config(event)
    google_config = load_config_ssm(config['google_keys_ssm_base'])
    config.update(google_config)
    museum_config = load_config_ssm(config['museum_keys_ssm_base'])
    config.update(museum_config)
    time_to_break = datetime.now() + timedelta(
        seconds=config['seconds-to-allow-for-processing'])
    print("Will break after ", time_to_break)

    mode = event.get("mode", "full")
    if mode not in ["full", "incremental", "ids"]:
        mode = "full"
    json_web_kiosk_class = ProcessWebKioskJsonMetadata(config, event,
                                                       time_to_break)
    if event["museumExecutionCount"] == 1:
        if not event.get('local'):
            save_file_system_record(config.get('website-metadata-tablename'),
                                    'Google', 'Museum')
            save_source_system_record(config.get('website-metadata-tablename'),
                                      'EmbARK')
        composite_json = json_web_kiosk_class.get_composite_json_metadata(mode)
        museum_image_metadata = json_web_kiosk_class.find_images_for_composite_json_metadata(
            composite_json)
        composite_json = CleanUpCompositeJson(
            composite_json).cleaned_up_content
        event['countToProcess'] = len(composite_json.get('objects'))
        write_s3_json(config['process-bucket'],
                      'museum_composite_metadata.json', composite_json)
        write_s3_json(config['process-bucket'], 'museum_image_metadata.json',
                      museum_image_metadata)
    else:
        composite_json = read_s3_json(config['process-bucket'],
                                      'museum_composite_metadata.json')
        museum_image_metadata = read_s3_json(config['process-bucket'],
                                             'museum_image_metadata.json')

    if composite_json:
        objects_processed = json_web_kiosk_class.process_composite_json_metadata(
            composite_json, museum_image_metadata)
        event['museumHarvestComplete'] = _done_processing(composite_json)
    else:
        print('No JSON to process')

    if event["museumExecutionCount"] >= event["maximumMuseumExecutions"]:
        event['museumHarvestComplete'] = True
    if event['museumHarvestComplete']:
        if s3_file_exists(config['process-bucket'],
                          'museum_composite_metadata.json'):
            delete_s3_key(config['process-bucket'],
                          'museum_composite_metadata.json')
        if s3_file_exists(config['process-bucket'],
                          'museum_image_metadata.json'):
            delete_s3_key(config['process-bucket'],
                          'museum_image_metadata.json')
    elif composite_json:
        write_s3_json(config['process-bucket'],
                      'museum_composite_metadata.json', composite_json)
        key = 'countHarvestedLoop' + str(event["museumExecutionCount"])
        event[key] = objects_processed
    event['countRemaining'] = len(composite_json.get('objects')) if composite_json else 0  # guard the no-JSON case above
    return event
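
# Illustrative invocation payload for the museum handler above.  The key names are
# taken from the reads in that handler; the values are made up for illustration.
example_museum_event = {
    "local": False,
    "mode": "incremental",       # the handler accepts "full", "incremental", or "ids"
    "museumExecutionCount": 1,   # a count of 1 takes the branch that fetches fresh composite metadata
    "maximumMuseumExecutions": 5
}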