def main(argv): parser = define_arguments() args = parser.parse_args(argv[1:]) config = ConfigParser.ConfigParser() config.readfp(open("akara.ini")) batch_size = int(config.get("CouchDb", "BatchSize")) couch = Couch() ingestion_doc = couch.dashboard_db[args.ingestion_document_id] if getprop(ingestion_doc, "enrich_process/status") != "complete": print "Cannot save, enrich process did not complete" return -1 # Update ingestion document kwargs = { "save_process/status": "running", "save_process/start_time": datetime.now().isoformat(), "save_process/end_time": None, "save_process/error": None, "save_process/total_saved": None } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return -1 # Back up provider data if args.backup: resp = couch._back_up_data(ingestion_doc) if resp == -1: # Fatal error, do not continue with save process kwargs = { "save_process/status": "error", "save_process/end_time": datetime.now().isoformat(), "save_process/error": "Error backing up DPLA records" } couch.update_ingestion_doc(ingestion_doc, **kwargs) return resp error_msg = None enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir") total_items = 0 total_collections = 0 docs = {} for file in os.listdir(enrich_dir): filename = os.path.join(enrich_dir, file) with open(filename, "r") as f: try: file_docs = json.loads(f.read()) except: error_msg = "Error loading " + filename break # Save when docs is about to exceed the batch size print >> sys.stderr, "Read file %s" % filename if docs and len(docs) + len(file_docs) > batch_size: resp, error_msg = couch.process_and_post_to_dpla(docs, ingestion_doc) if resp == -1: docs = None break items = len([doc for doc in docs.values() if doc.get("ingestType") == "item"]) total_items += items total_collections += len(docs) - items print "Saved %s documents" % (total_items + total_collections) # Set docs for the next iteration docs = file_docs else: docs.update(file_docs) # Last save if docs: resp, error_msg = couch.process_and_post_to_dpla(docs, ingestion_doc) if resp != -1: items = len([doc for doc in docs.values() if doc.get("ingestType") == "item"]) total_items += items total_collections += len(docs) - items print "Saved %s documents" % (total_items + total_collections) print "Total items: %s" % total_items print "Total collections: %s" % total_collections if error_msg: status = "error" else: status = "complete" kwargs = { "save_process/status": status, "save_process/error": error_msg, "save_process/end_time": datetime.now().isoformat(), "save_process/total_items": total_items, "save_process/total_collections": total_collections } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return -1 # Compress enrich dir, then delete make_tarfile(enrich_dir) shutil.rmtree(enrich_dir) return 0 if status == "complete" else -1
ingestion_doc = couch.dashboard_db[ingestion_doc_id] # Update ingestion document kwargs = { "poll_storage_process/status": "running", "poll_storage_process/start_time": iso_utc_with_tz(), "poll_storage_process/end_time": None, } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return # Back up current data resp = couch._back_up_data(ingestion_doc) if resp == -1: # Fatal error, do not continue with save process kwargs = { "poll_storage_process/status": "error", "poll_storage_process/end_time": iso_utc_with_tz(), "poll_storage_process/error": "Error backing up DPLA records" } couch.update_ingestion_doc(ingestion_doc, **kwargs) return # Fetch provider documents docs = [] enrich_errors = [] save_errors = []
ingestion_doc = couch.dashboard_db[ingestion_doc_id] # Update ingestion document kwargs = { "poll_storage_process/status": "running", "poll_storage_process/start_time": datetime.now().isoformat(), "poll_storage_process/end_time": None, } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return # Back up current data resp = couch._back_up_data(ingestion_doc) if resp == -1: # Fatal error, do not continue with save process kwargs = { "poll_storage_process/status": "error", "poll_storage_process/end_time": datetime.now().isoformat(), "poll_storage_process/error": "Error backing up DPLA records" } couch.update_ingestion_doc(ingestion_doc, **kwargs) return # Fetch provider documents docs = [] errors = [] total_enriched = 0
def main(argv): parser = define_arguments() args = parser.parse_args(argv[1:]) batch_size = 500 couch = Couch() ingestion_doc = couch.dashboard_db[args.ingestion_document_id] if getprop(ingestion_doc, "enrich_process/status") != "complete": print "Cannot save, enrich process did not complete" return -1 # Update ingestion document kwargs = { "save_process/status": "running", "save_process/start_time": iso_utc_with_tz(), "save_process/end_time": None, "save_process/error": None, "save_process/total_saved": None } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return -1 # Back up provider data if args.backup: resp = couch._back_up_data(ingestion_doc) if resp == -1: # Fatal error, do not continue with save process kwargs = { "save_process/status": "error", "save_process/end_time": iso_utc_with_tz(), "save_process/error": "Error backing up DPLA records" } couch.update_ingestion_doc(ingestion_doc, **kwargs) return resp error_msg = None enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir") total_items = 0 total_collections = 0 sync_point = 5000 docs = {} for file in os.listdir(enrich_dir): filename = os.path.join(enrich_dir, file) with open(filename, "r") as f: try: file_docs = json.loads(f.read()) except: error_msg = "Error loading " + filename break # Save when docs is about to exceed the batch size print >> sys.stderr, "Read file %s" % filename if docs and len(docs) + len(file_docs) > batch_size: resp, error_msg = couch.process_and_post_to_dpla(docs, ingestion_doc) if resp == -1: docs = None break items = len([doc for doc in docs.values() if doc.get("ingestType") == "item"]) total_items += items total_collections += len(docs) - items print "Saved %s documents" % (total_items + total_collections) #if total_items > sync_point: # print "Syncing views" # couch.sync_views(couch.dpla_db.name) # sync_point = total_items + 10000 # Set docs for the next iteration docs = file_docs else: docs.update(file_docs) # Last save if docs: resp, error_msg = couch.process_and_post_to_dpla(docs, ingestion_doc) if resp != -1: items = len([doc for doc in docs.values() if doc.get("ingestType") == "item"]) total_items += items total_collections += len(docs) - items print "Saved %s documents" % (total_items + total_collections) #print "Syncing views" #couch.sync_views(couch.dpla_db.name) print "Total items: %s" % total_items print "Total collections: %s" % total_collections if error_msg: status = "error" else: status = "complete" kwargs = { "save_process/status": status, "save_process/error": error_msg, "save_process/end_time": iso_utc_with_tz(), "save_process/total_items": total_items, "save_process/total_collections": total_collections } try: couch.update_ingestion_doc(ingestion_doc, **kwargs) except: print "Error updating ingestion document " + ingestion_doc["_id"] return -1 # Compress enrich dir, then delete make_tarfile(enrich_dir) shutil.rmtree(enrich_dir) return total_items if status == "complete" else -1