Beispiel #1
0
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception, e:
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        raise e
Beispiel #2
0
def main(collection_key=None,
         url_couchdb=None,
         object_auth=None,
         get_if_object=False):
    cleanup_work_dir()  # remove files from /tmp
    doc_ids, report_errors = ImageHarvester(
        url_couchdb=url_couchdb,
        object_auth=object_auth,
        get_if_object=get_if_object).by_collection(collection_key)
Beispiel #3
0
def main(collection_key=None,
         url_couchdb=None,
         object_auth=None,
         get_if_object=False,
         ignore_content_type=False):
    cleanup_work_dir()  # remove files from /tmp
    doc_ids, report_errors = ImageHarvester(
        url_couchdb=url_couchdb,
        object_auth=object_auth,
        get_if_object=get_if_object,
        ignore_content_type=ignore_content_type).by_collection(collection_key)
Beispiel #4
0
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))

    log_handler.pop_application()
    mail_handler.pop_application()
Beispiel #5
0
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(subject,
                          'Finished metadata harvest for CID: {}\n'
                          'Fetched: {}\nSaved: {}'.format(
                              collection.id,
                              num_recs,
                              num_saved))

    log_handler.pop_application()
    mail_handler.pop_application()