def by_collection(self, collection_key=None):
    '''Harvest images for all documents in a collection.
    If collection_key is None, try to harvest images for every document
    in the database. (Not recommended.)
    '''
    if collection_key:
        v = couchdb_pager(
            self._couchdb,
            view_name=self._view,
            startkey='"{0}"'.format(collection_key),
            endkey='"{0}"'.format(collection_key),
            include_docs='true')
    else:
        # use _all_docs view
        v = couchdb_pager(self._couchdb, include_docs='true')
    doc_ids = []
    report_errors = defaultdict(list)
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        try:
            reports = self.harvest_image_for_doc(r.doc)
        except ImageHarvestError as e:
            report_errors[e.dict_key].append((e.doc_id, str(e)))
        doc_ids.append(r.doc['_id'])
        dt_end = datetime.datetime.now()
        # throttle: sleep for as long as the harvest of this doc took
        time.sleep((dt_end - dt_start).total_seconds())
    report_list = [
        ' : '.join((key, str(val))) for key, val in report_errors.items()
    ]
    report_msg = '\n'.join(report_list)
    subject = format_results_subject(collection_key,
                                     'Image harvest to CouchDB {env}')
    publish_to_harvesting(subject, ''.join(
        ('Processed {} documents\n'.format(len(doc_ids)), report_msg)))
    return doc_ids, report_errors

def by_collection(self, collection_key=None): """If collection_key is none, trying to grab all of the images. (Not recommended) """ if collection_key: v = couchdb_pager( self._couchdb, view_name=self._view, startkey='"{0}"'.format(collection_key), endkey='"{0}"'.format(collection_key), include_docs="true", ) else: # use _all_docs view v = couchdb_pager(self._couchdb, include_docs="true") doc_ids = [] for r in v: dt_start = dt_end = datetime.datetime.now() reports = self.harvest_image_for_doc(r.doc) doc_ids.append(r.doc["_id"]) dt_end = datetime.datetime.now() time.sleep((dt_end - dt_start).total_seconds()) publish_to_harvesting( "Image harvested {}".format(collection_key), "Processed {} documents".format(len(doc_ids)) ) return doc_ids
def delete_solr_item_by_id(item_id):
    url_solr = os.environ['URL_SOLR']
    body = 'stream.body=<delete><id>{}</id></delete>'.format(item_id)
    url_delete = '{}/update?{}&commit=true'.format(url_solr, body)
    response = requests.get(url_delete)
    response.raise_for_status()
    subject = format_results_subject(item_id,
                                     'Deleted document from Solr {env} ')
    publish_to_harvesting(subject, 'DELETED {}'.format(item_id))

def delete_collection(cid):
    print("DELETING COLLECTION: {}".format(cid), file=sys.stderr)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_deleted, deleted_docs = delete_id_list(ids, _couchdb=_couchdb)
    publish_to_harvesting(
        'Deleted CouchDB Collection {}'.format(cid),
        'Deleted {} documents from CouchDB collection {}'.format(
            num_deleted, cid))
    return num_deleted, deleted_docs

def delete_solr_collection(collection_key):
    '''Delete a solr collection for the environment'''
    url_solr = os.environ['URL_SOLR']
    COLLECTION_URL_FORMAT = 'https://registry.cdlib.org/api/v1/collection/{}/'
    collection_url = COLLECTION_URL_FORMAT.format(collection_key)
    query = 'stream.body=<delete><query>collection_url:"{}"</query>' \
            '</delete>&commit=true'.format(collection_url)
    url_delete = '{}/update?{}'.format(url_solr, query)
    response = requests.get(url_delete)
    response.raise_for_status()
    publish_to_harvesting(
        'Deleted solr collection {}'.format(collection_key),
        'DELETED {}'.format(collection_key))

def delete_collection(cid):
    print("DELETING COLLECTION: {}".format(cid), file=sys.stderr)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_deleted, deleted_docs = delete_id_list(ids, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Deleted documents from CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Deleted {} documents from CouchDB collection CID: {}'.format(
            num_deleted, cid))
    return num_deleted, deleted_docs

def main(url_remote_couchdb, url_api_collection):
    '''Update the current environment's couchdb from a remote couchdb
    collection
    '''
    collection = Collection(url_api_collection)
    total, updated, created = update_collection_from_remote(
        url_remote_couchdb, url_api_collection)
    msg = 'Synced {} documents to production for CouchDB collection {}'.format(
        total, collection.id)
    msg += '\nUpdated {} documents, created {} documents.'.format(
        updated, created)
    publish_to_harvesting(
        'Synced CouchDB Collection {}'.format(collection.id), msg)

def delete_solr_collection(collection_key):
    '''Delete a solr collection for the environment'''
    url_solr = os.environ['URL_SOLR']
    COLLECTION_URL_FORMAT = 'https://registry.cdlib.org/api/v1/collection/{}/'
    collection_url = COLLECTION_URL_FORMAT.format(collection_key)
    query = 'stream.body=<delete><query>collection_url:"{}"</query>' \
            '</delete>&commit=true'.format(collection_url)
    url_delete = '{}/update?{}'.format(url_solr, query)
    response = requests.get(url_delete)
    response.raise_for_status()
    subject = format_results_subject(collection_key,
                                     'Deleted documents from Solr {env} ')
    publish_to_harvesting(subject, 'DELETED {}'.format(collection_key))

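# Both delete helpers above push the <delete> command through Solr's
# stream.body parameter on a GET request. A minimal sketch of the same
# delete-by-query sent as a POST body instead, which avoids URL-length
# limits; this is an alternative approach, not the source's, and
# delete_solr_collection_post is a hypothetical name.
import os

import requests


def delete_solr_collection_post(collection_url):
    url_solr = os.environ['URL_SOLR']
    body = '<delete><query>collection_url:"{}"</query></delete>'.format(
        collection_url)
    # Solr's XML update handler accepts the command as the request body
    response = requests.post(
        '{}/update?commit=true'.format(url_solr),
        data=body,
        headers={'Content-Type': 'text/xml'})
    response.raise_for_status()
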
def update_couch_docs_by_collection(cid, fieldName, newValue):
    print("UPDATING DOCS FOR COLLECTION: {}".format(cid), file=sys.stderr)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(
        ids, fieldName, newValue, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs

def update_couch_docs_by_collection(cid, fieldName, newValue, substring):
    print("UPDATING DOCS FOR COLLECTION: {}".format(cid), file=sys.stderr)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    ids = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(
        ids, fieldName, newValue, substring, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs

def execute_job(self, job, queue):
    """Spawns a work horse to perform the actual work and passes it a job.
    The worker will wait for the work horse and make sure it executes
    within the given timeout bounds, or will end the work horse with
    SIGALRM.
    """
    worker_name = (self.key.rsplit(':', 1)[1]).rsplit('.', 1)[0]
    subject, msg = create_execute_job_message("Started", worker_name, job)
    logging.info(msg)
    publish_to_harvesting(subject, msg)
    self.set_state('busy')
    self.fork_work_horse(job, queue)
    self.monitor_work_horse(job)
    subject, msg = create_execute_job_message("Completed", worker_name, job)
    logging.info(msg)
    publish_to_harvesting(subject, msg)
    self.set_state('idle')

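# execute_job above has the shape of an override of rq's Worker.execute_job
# (fork a work horse, then monitor it) with SNS notifications wrapped around
# the start and end. A minimal sketch of running such a subclass, assuming a
# recent rq; HarvestWorker and the queue name 'normal-stage' are
# illustrative, not from the source.
from redis import Redis
from rq import Queue, Worker


class HarvestWorker(Worker):
    '''Worker subclass carrying the execute_job override shown above.'''
    # execute_job defined as above


if __name__ == '__main__':
    conn = Redis()
    worker = HarvestWorker(
        [Queue('normal-stage', connection=conn)], connection=conn)
    worker.work()
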
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report

def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report

def sync_couch_collection_to_solr(collection_key):
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        '{} documents updated'.format(len(results)))
    return results


def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes. Setting the
    "since" to 0 will result in getting a _changes record for each
    document, essentially dumping the db to solr.
    '''

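# The docstring above describes polling CouchDB's _changes feed with a
# "since" checkpoint. A minimal sketch of that loop using python-couchdb's
# Database.changes(); the server URL, database name, and function name are
# assumptions, not from the source.
import couchdb


def iter_changed_docs(url_couchdb, dbname, since=0):
    # since=0 yields a change row for every document (a full db dump);
    # a persisted last_seq yields only documents changed since that point.
    db = couchdb.Server(url_couchdb)[dbname]
    changes = db.changes(since=since, include_docs=True)
    for change in changes['results']:
        doc = change.get('doc')
        if doc:
            yield doc
    # callers should store changes['last_seq'] for the next incremental run
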
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)
    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')
    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))
    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))
    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))
    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))
    log_handler.pop_application()
    mail_handler.pop_application()

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))
    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))
    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    publish_to_harvesting(
        'Harvesting completed for {}'.format(collection.id),
        'Finished harvest for {}'.format(collection.id))
    # the image_harvest should be a separate job, with a long timeout
    if run_image_harvest:
        job = queue_image_harvest(
            config['redis_host'],
            config['redis_port'],
            config['redis_password'],
            config['redis_connect_timeout'],
            config['couchdb_url'],
            collection.id,
            rq_queue,
            object_auth=collection.auth)
        logger.info("Started job for image_harvest:{}".format(job.result))
    log_handler.pop_application()
    mail_handler.pop_application()

def exception_to_sns(job, *exc_info):
    '''Exception handler that reports job failures to the SNS msg queue'''
    subject = 'FAILED: job {}'.format(job.description)
    message = 'ERROR: job {} failed\n{}'.format(job.description, exc_info[1])
    logging.error(message)
    publish_to_harvesting(subject, message)
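
# exception_to_sns matches the rq exception-handler signature
# (job, exc_type, exc_value, traceback). A minimal sketch of attaching it to
# a worker, assuming an rq version whose Worker accepts exception_handlers;
# the queue name 'harvest' is illustrative.
from redis import Redis
from rq import Queue, Worker

conn = Redis()
worker = Worker(
    [Queue('harvest', connection=conn)],
    connection=conn,
    exception_handlers=[exception_to_sns])
worker.work()  # failed jobs now flow through exception_to_sns to SNS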