def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False):
    '''Runs a UCLDC ingest process for the given collection.

    Sets up mail/error handling, resolves the harvest configuration, and
    instantiates the Collection for ``url_api_collection``.

    :param user_email: address (or first address) to notify about the run
    :param url_api_collection: registry API URL identifying the collection
    :raises Exception: re-raises any failure from Collection() after
        emailing/logging the error via the mail handler
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        # ERROR-level messages get mailed to the user + sys admins
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        # fall back to the ini file when redis connection info not supplied
        config = config_harvest(config_file=config_file)
    try:
        collection = Collection(url_api_collection)
    except Exception as e:  # was Py2-only "except Exception, e" — SyntaxError on Py3
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        raise e
def main(collection_key=None, url_couchdb=None, object_auth=None,
         get_if_object=False):
    '''Run an image harvest over every document in a CouchDB collection.

    :param collection_key: key of the collection whose docs get harvested
    :param url_couchdb: base URL of the CouchDB instance
    :param object_auth: credentials passed through to the harvester
    :param get_if_object: re-fetch even when an object already exists
    '''
    cleanup_work_dir()  # remove files from /tmp
    harvester = ImageHarvester(
        url_couchdb=url_couchdb,
        object_auth=object_auth,
        get_if_object=get_if_object)
    doc_ids, report_errors = harvester.by_collection(collection_key)
def main(collection_key=None, url_couchdb=None, object_auth=None,
         get_if_object=False, ignore_content_type=False):
    '''Run an image harvest over every document in a CouchDB collection.

    :param collection_key: key of the collection whose docs get harvested
    :param url_couchdb: base URL of the CouchDB instance
    :param object_auth: credentials passed through to the harvester
    :param get_if_object: re-fetch even when an object already exists
    :param ignore_content_type: skip the content-type check when fetching
    '''
    cleanup_work_dir()  # remove files from /tmp
    harvester = ImageHarvester(
        url_couchdb=url_couchdb,
        object_auth=object_auth,
        get_if_object=get_if_object,
        ignore_content_type=ignore_content_type)
    doc_ids, report_errors = harvester.by_collection(collection_key)
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    Drives the full pipeline: fetch -> enrich -> save -> delete check ->
    count check -> dashboard cleanup, then publishes a results message.

    :param user_email: address (or first address) to notify about the run
    :param url_api_collection: registry API URL identifying the collection
    :param kwargs: passed through to fetcher.main
    :raises Exception: on any pipeline stage returning an error status, or
        when run against prod and the collection is not ready_for_publication
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        # ERROR-level messages get mailed to the user + sys admins
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        # fall back to the ini file when redis connection info not supplied
        config = config_harvest(config_file=config_file)
    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        # guard: prod ingests require explicit sign-off on the collection
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')
    resp = save_records.main([None, ingest_doc_id])
    # save_records returns the number of saved records; negative means error.
    # NOTE: the message was split by a raw newline inside the string literal
    # (a SyntaxError); restored to a single-line format string.
    if resp < 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))
    resp = remove_deleted_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))
    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))
    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))
    log_handler.pop_application()
    mail_handler.pop_application()
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    Drives the full pipeline: fetch -> enrich -> save -> delete check ->
    count check -> dashboard cleanup, then publishes a results message.

    :param user_email: address (or first address) to notify about the run
    :param url_api_collection: registry API URL identifying the collection
    :param kwargs: passed through to fetcher.main
    :raises Exception: on any pipeline stage returning an error status, or
        when run against prod and the collection is not ready_for_publication
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        # ERROR-level messages get mailed to the user + sys admins
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        # fall back to the ini file when redis connection info not supplied
        config = config_harvest(config_file=config_file)
    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        # guard: prod ingests require explicit sign-off on the collection
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')
    resp = save_records.main([None, ingest_doc_id])
    # save_records returns the number of saved records; negative means error.
    # NOTE: the message was split by a raw newline inside the string literal
    # (a SyntaxError); restored to a single-line format string.
    if resp < 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))
    resp = remove_deleted_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))
    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))
    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(subject, 'Finished metadata harvest for CID: {}\n'
                          'Fetched: {}\nSaved: {}'.format(
                              collection.id, num_recs, num_saved))
    log_handler.pop_application()
    mail_handler.pop_application()