def test_iso_utc_with_tz():
    """iso_utc_with_tz() should yield a UTC datetime as a string with
    timezone symbol Z
    """
    utc_now = datetime.utcnow()
    utc_now_iso_string_with_tz = utc_now.isoformat() + "Z"
    assert utc_now_iso_string_with_tz == iso_utc_with_tz(utc_now)
    # The following was added because of the regression described in #7699.
    time1 = iso_utc_with_tz()
    sleep(1)
    time2 = iso_utc_with_tz()
    assert time1 != time2
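# A minimal sketch of the helper exercised by the test above, assuming it
# lives alongside the project's time utilities; the real implementation may
# differ. It formats a naive UTC datetime as ISO 8601 with a trailing "Z".
from datetime import datetime

def iso_utc_with_tz(dt=None):
    """Return `dt` (or the current UTC time) as an ISO 8601 string ending
    in the "Z" timezone designator."""
    if dt is None:
        dt = datetime.utcnow()  # assumption: naive datetimes are UTC
    return dt.isoformat() + "Z"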
def main(argv):
    print "WARNING: Bulk data is now exported/maintained using elasticdump."
    print "See https://github.com/dpla/automation/blob/develop/ansible/roles/exporter/files/export-provider.sh"

    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc,
               "dashboard_cleanup_process/status") != "complete":
        print "Error, dashboard cleanup process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": "running",
        "upload_bulk_data_process/start_time": iso_utc_with_tz(),
        "upload_bulk_data_process/end_time": None,
        "upload_bulk_data_process/error": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # TODO: as in the fetch_records.py script, we need profile in this scope
    # and the file shouldn't have to be opened again
    with open(ingestion_doc["profile_path"], "r") as profile:
        contributor = getprop(json.load(profile), "contributor/name")

    resp = export_database.main([None, "source", contributor, "upload"])
    if resp == -1:
        status = "error"
        error_msg = "Error uploading bulk data"
    else:
        status = "complete"
        error_msg = None

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": status,
        "upload_bulk_data_process/error": error_msg,
        "upload_bulk_data_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def create_ingestion_doc_and_backup_db(self, provider):
    """Creates the ingestion document and backs up the provider documents if
    this is not the first ingestion, then returns the ingestion document id.
    """
    ingestion_doc = {
        "provider": provider,
        "type": "ingestion",
        "ingestDate": iso_utc_with_tz(),
        "countAdded": 0,
        "countChanged": 0,
        "countDeleted": 0
    }

    last_ingestion_doc = self._get_last_ingestion_doc_for(provider)
    if not last_ingestion_doc:
        ingestion_sequence = 1
    else:
        # Since this is not the first ingestion we will back up the
        # provider documents and update the current ingestion document with
        # the backup database name.
        ingestion_sequence = last_ingestion_doc["ingestionSequence"] + 1
        backup_db_name = self._backup_db(provider)
        ingestion_doc["backupDB"] = backup_db_name
        self.dashboard_db.save(ingestion_doc)

    ingestion_doc["ingestionSequence"] = ingestion_sequence
    ingestion_doc_id = self.dashboard_db.save(ingestion_doc)[0]
    return ingestion_doc_id
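# Hypothetical usage sketch (provider name and variable names assumed): the
# returned id is what the downstream pipeline scripts receive as the
# ingestion document id.
couch = Couch()
ingestion_doc_id = couch.create_ingestion_doc_and_backup_db("artstor")
print "Created ingestion document %s" % ingestion_doc_id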
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "check_counts_process/status") != "complete":
        print "Error, check counts process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": "running",
        "dashboard_cleanup_process/start_time": iso_utc_with_tz(),
        "dashboard_cleanup_process/end_time": None,
        "dashboard_cleanup_process/error": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.dashboard_cleanup(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None
    print "Total dashboard documents deleted: %s" % total_deleted

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": status,
        "dashboard_cleanup_process/error": error_msg,
        "dashboard_cleanup_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "delete_process/status") != "complete":
        print "Error, delete process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "check_counts_process/status": "running",
        "check_counts_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Check each count against the threshold
    alerts = []
    count_type = ("Added", "Changed", "Deleted")
    for ctype in count_type:
        count = int(ingestion_doc["count" + ctype])
        threshold = int(ingestion_doc["thresholds"][ctype.lower()])
        if count > threshold:
            alerts.append("%s items %s exceeds threshold of %s" %
                          (count, ctype.lower(), threshold))

    error_msg = None
    if alerts:
        config_file = "akara.ini"
        config = ConfigParser.ConfigParser()
        config.readfp(open(config_file))
        to = [s.strip() for s in config.get("Alert", "To").split(",")]
        frm = config.get("Alert", "From")
        month = dateparser.parse(ingestion_doc["ingestDate"]).strftime("%B")
        alerts = "\n".join(alerts)
        msg = MIMEText(alerts)
        msg["Subject"] = "Threshold(s) exceeded for %s ingestion of %s" % \
                         (month, ingestion_doc["provider"])
        msg["To"] = ", ".join(to)
        msg["From"] = frm
        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e
def update_bulk_download_document(self, contributor, file_path, file_size):
    """Creates/updates a document for a contributor's bulk data file and
    returns the document id
    """
    bulk_download_doc = self._get_bulk_download_doc(contributor)
    bulk_download_doc.update({
        "file_path": file_path,
        "file_size": file_size,
        "last_updated": iso_utc_with_tz()
    })
    return self.bulk_download_db.save(bulk_download_doc)[0]
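# Hypothetical usage sketch: record the location and size of a freshly
# exported bulk data file for a contributor (path and names illustrative).
import os

file_path = "/home/exporter/example_provider.tar.gz"
doc_id = couch.update_bulk_download_document("Example Provider",
                                             file_path,
                                             os.path.getsize(file_path))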
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {
        'check_counts_process/status': 'running',
        'check_counts_process/start_time': iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" \
            % (error_msg, tb)
def create_sitemap_index(path):
    global CONFIG
    site_map_uri = CONFIG.get("Sitemap", "SitemapURI")
    fpath = os.path.join(path, "all_item_urls.xml")
    with open(fpath, "w") as f:
        line = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
               '<sitemapindex xmlns="' + \
               'http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        f.write(line)
        for item in os.listdir(path):
            # Skip file being written to
            if item == "all_item_urls.xml":
                continue
            # sitemaps.dp.la is a CNAME pointing to the CDN host
            file_uri = url_join(site_map_uri, item)
            lastmod_dt = datetime.utcfromtimestamp(
                os.path.getmtime(os.path.join(path, item)))
            line = "\t<sitemap>\n" + \
                   "\t\t<loc>%s</loc>\n\t\t<lastmod>%s</lastmod>\n" % \
                   (file_uri, iso_utc_with_tz(lastmod_dt)) + "\t</sitemap>\n"
            f.write(line)
        f.write("</sitemapindex>")
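# Illustrative output of create_sitemap_index() (URI and timestamp made up);
# each sitemap file found in `path` becomes one <sitemap> entry:
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
#       <sitemap>
#           <loc>http://sitemaps.dp.la/all_item_urls_0001.xml</loc>
#           <lastmod>2014-06-01T12:00:00Z</lastmod>
#       </sitemap>
#   </sitemapindex>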
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    # We need profile in this scope, and the file shouldn't have to
    # be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }

    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1

    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)  # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e

    if error_msg:
        print >> sys.stderr, ("********************\n" +
                              "Error sending alert email: %s" % error_msg)
        print >> sys.stderr, ("Alerts:\n%s" % alerts +
                              "\n********************")

    # Update ingestion document
    status = "complete"
    kwargs = {
        "check_counts_process/status": status,
        "check_counts_process/error": error_msg,
        "check_counts_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1


if __name__ == '__main__':
    main(sys.argv)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" \
            % (error_msg, tb)

    # Update ingestion document
    kwargs = {'check_counts_process/status': 'complete',
              'check_counts_process/error': error_msg,
              'check_counts_process/end_time': iso_utc_with_tz()}
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error updating ingestion document %s\n%s" \
            % (i_doc["_id"], tb)
        return 1

    return 0


if __name__ == '__main__':
    rv = main(sys.argv)
    sys.exit(rv)
pipeline = ",".join(profile[args.pipeline]) else: pipeline = args.pipeline provider = profile.get(u"name") contributor = profile.get(u"contributor", {}) # Create ingestion document couch = Couch() ingestion_doc_id = create_ingestion_document.main([None, args.profile_path]) ingestion_doc = couch.dashboard_db[ingestion_doc_id] # Update ingestion document kwargs = { "poll_storage_process/status": "running", "poll_storage_process/start_time": iso_utc_with_tz(), "poll_storage_process/end_time": None, } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return # Back up current data resp = couch._back_up_data(ingestion_doc) if resp == -1: # Fatal error, do not continue with save process kwargs = { "poll_storage_process/status": "error",
def _ts_for_err(self):
    return "[%s]" % iso_utc_with_tz()
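# Hypothetical usage sketch, from within another method of the same class:
# prefix an error line with a bracketed UTC timestamp, e.g.
# "[2014-06-01T12:00:00.000000Z] fetch failed".
print >> sys.stderr, "%s %s" % (self._ts_for_err(), "fetch failed")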
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for items.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    item_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")
    coll_enrichments = request_headers.get(u"Pipeline-Coll", "").split(",")

    records = json.loads(body)

    # Counts for enrich script
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        if record.get("ingestType") == "collection":
            wsgi_header = "HTTP_PIPELINE_COLL"
            enrichments = coll_enrichments
        else:
            wsgi_header = "HTTP_PIPELINE_ITEM"
            enrichments = item_enrichments
            # Preserve record prior to any enrichments
            record["originalRecord"] = record.copy()
            record["ingestType"] = "item"

        # Explicitly populate ingestDate as UTC
        record["ingestDate"] = iso_utc_with_tz()

        error, enriched_record_text = pipe(record, ctype, enrichments,
                                           wsgi_header)
        enriched_record = json.loads(enriched_record_text)

        if error:
            errors.append(error)

        ingest_type = record.get("ingestType")
        # Enriched record should have an _id
        if enriched_record.get("_id", None):
            # Item records should have sourceResource
            if (ingest_type == "item" and
                    not "sourceResource" in enriched_record):
                logger.error("Record %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
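# Hypothetical client-side sketch of calling the enrich service above. The
# endpoint URL, port, and enrichment service names are assumptions; only the
# Pipeline-Item / Pipeline-Coll headers and the JSON list of records come
# from the function itself.
import json
import requests

records = [{"ingestType": "item", "_id": "example--1", "title": "Example"}]
resp = requests.post("http://localhost:8889/enrich",
                     data=json.dumps(records),
                     headers={"Content-Type": "application/json",
                              "Pipeline-Item": "/select-id,/shred",
                              "Pipeline-Coll": "/select-id"})
result = json.loads(resp.content)
print "Enriched %s items" % result["enriched_item_count"]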
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    batch_size = 500
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": iso_utc_with_tz(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)

        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": iso_utc_with_tz(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    sync_point = 5000
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break

            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            #if total_items > sync_point:
            #    print "Syncing views"
            #    couch.sync_views(couch.dpla_db.name)
            #    sync_point = total_items + 10000

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                         ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            #print "Syncing views"
            #couch.sync_views(couch.dpla_db.name)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"

    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": iso_utc_with_tz(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return total_items if status == "complete" else -1
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    batch_size = 500
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": iso_utc_with_tz(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)

        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": iso_utc_with_tz(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    sync_point = 5000
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break

            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            if total_items > sync_point:
                print "Syncing views"
                couch.sync_views(couch.dpla_db.name)
                sync_point = total_items + 10000

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                         ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            print "Syncing views"
            couch.sync_views(couch.dpla_db.name)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"

    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": iso_utc_with_tz(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return 0 if status == "complete" else -1
def _create_ingestion_document(self, provider, uri_base, profile_path,
                               thresholds, fetcher_threads=1):
    """Creates and returns an ingestion document for the provider.
    """
    ingestion_doc = {
        "provider": provider,
        "type": "ingestion",
        "ingestionSequence": None,
        "ingestDate": iso_utc_with_tz(),
        "countAdded": 0,
        "countChanged": 0,
        "countDeleted": 0,
        "uri_base": uri_base,
        "profile_path": profile_path,
        "fetcher_threads": fetcher_threads,
        "fetch_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "data_dir": None,
            "error": None,
            "total_items": None,
            "total_collections": None
        },
        "enrich_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "data_dir": None,
            "error": None,
            "total_items": None,
            "thresholdDeleted": 0,
            "total_collections": None,
            "missing_id": None,
            "missing_source_resource": None
        },
        "save_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "error": None,
            "total_items": None,
            "total_collections": None
        },
        "delete_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "error": None
        },
        "check_counts_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "error": None
        },
        "dashboard_cleanup_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "error": None
        },
        "upload_bulk_data_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "error": None
        },
        "poll_storage_process": {
            "status": None,
            "start_time": None,
            "end_time": None,
            "error": None,
            "total_items": None,
            "total_collections": None,
            "missing_id": None,
            "missing_source_resource": None
        }
    }
    ingestion_doc.update({"thresholds": thresholds})

    # Set the ingestion sequence
    latest_ingestion_doc = self._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc is None:
        ingestion_sequence = 1
    else:
        ingestion_sequence = 1 + latest_ingestion_doc["ingestionSequence"]
    ingestion_doc["ingestionSequence"] = ingestion_sequence

    # Save the ingestion document and get its ID
    ingestion_doc_id = self.dashboard_db.save(ingestion_doc)[0]
    return ingestion_doc_id
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    # We need profile in this scope, and the file shouldn't have to
    # be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }

    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1

    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)  # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
            ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {
        'enriched_items': 0,
        'enriched_colls': 0,
        'missing_id': 0,
        'missing_source_resource': 0
    }

    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                        (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if not status == "error":
            status = "complete"

    # Prepare fields for ingestion document update
    couch_kwargs = {
        "enrich_process/status": status,
        "enrich_process/error": dashboard_errors,
        "enrich_process/end_time": iso_utc_with_tz(),
        "enrich_process/total_items": stats['enriched_items'],
        "enrich_process/total_collections": stats['enriched_colls'],
        "enrich_process/missing_id": stats['missing_id'],
        "enrich_process/missing_source_resource":
            stats['missing_source_resource']
    }
    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
            ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1