def nara_update_links():
    """Rewrite the isShownAt link for every NARA document.

    Streams all NARA documents from CouchDB, sets isShownAt to the
    research.archives.gov description URL for any document that carries an
    originalRecord/arc-id-desc value, and bulk-POSTs the changed documents
    back in batches of 1000. Timing for the initial view read and for the
    processing pass is reported on stderr.
    """
    couch = Couch()
    url = "http://research.archives.gov/description/"
    docs = []
    print >> sys.stderr, "Fetching all documents"
    count = 0
    # Initialize up front so the summary print below cannot raise NameError
    # when the provider has no documents at all.
    view_time = 0
    start = time.time()
    for doc in couch._query_all_dpla_provider_docs("nara"):
        if count == 0:
            # The first yield marks the end of the CouchDB view read.
            view_time = time.time() - start
            start = time.time()
        count += 1
        arc_id_desc = getprop(doc, "originalRecord/arc-id-desc",
                              keyErrorAsNone=True)
        if arc_id_desc:
            doc.update({"isShownAt": url + arc_id_desc})
            docs.append(doc)

        # POST every 1000 documents
        if len(docs) == 1000:
            print >> sys.stderr, "Processed %s documents" % count
            couch._bulk_post_to(couch.dpla_db, docs)
            docs = []

    # Last POST. Was couch.bulk_post_to, which does not match the
    # _bulk_post_to method used everywhere else; use the same method here.
    if docs:
        print >> sys.stderr, "Processed %s documents" % count
        couch._bulk_post_to(couch.dpla_db, docs)

    process_time = time.time() - start
    print >> sys.stderr, "Done"
    print >> sys.stderr, "View time: %s" % view_time
    print >> sys.stderr, "Process time: %s" % process_time
def item_docs(provider_name=None):
    """Generate item-type documents.

    When provider_name is given, only that provider's documents are
    scanned; otherwise every DPLA document is considered. Documents whose
    ingestType is not "item" are skipped.
    """
    couch = Couch()
    candidates = (couch._query_all_dpla_provider_docs(provider_name)
                  if provider_name
                  else couch.all_dpla_docs())
    for candidate in candidates:
        if candidate.get("ingestType") != "item":
            continue
        yield candidate
def item_docs(provider_name=None):
    """Yield each document with ingestType "item".

    With a provider_name the scan is limited to that provider's documents;
    without one, all DPLA documents are examined.
    """
    couch = Couch()
    if provider_name:
        source = couch._query_all_dpla_provider_docs(provider_name)
    else:
        source = couch.all_dpla_docs()
    for record in source:
        if record.get("ingestType") == "item":
            yield record
def main(argv=None, couch=None, provider_name=None):
    """Backfill ingestion bookkeeping for a provider's legacy documents.

    Tags every legacy document with ingestionSequence 1, creates matching
    "added" records in the dashboard database, and updates the ingestion
    document's counts, POSTing in batches of 1000.

    Proceeds only when the provider has no existing ingestion documents
    but does have legacy documents; otherwise reports the problem on
    stderr and does nothing.

    For testing, couch and provider_name are supplied as parameters;
    otherwise they come from the command-line arguments in argv.
    """
    if not couch:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        # Probe the generator for at least one legacy document. Catch only
        # StopIteration: a bare except here would also report genuine
        # CouchDB failures as "No documents found".
        next(couch._query_all_dpla_provider_docs(provider_name))
    except StopIteration:
        print >> sys.stderr, "Error: No documents found for %s" % provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc_id):
        # Push both batches, then record how many records were added.
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc_id,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)

        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)
            added_docs.append({"id": doc["_id"],
                               "type": "record",
                               "status": "added",
                               "provider": provider_name,
                               "ingestionSequence": 1})
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc_id)
                # Reset
                docs = []
                added_docs = []
        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc_id)

        print >> sys.stderr, "Complete"
if args.pipeline in profile: pipeline = ",".join(profile[args.pipeline]) else: pipeline = args.pipeline provider = profile.get(u"name") contributor = profile.get(u"contributor", {}) # Create ingestion document couch = Couch() ingestion_doc_id = couch.create_ingestion_doc_and_backup_db(provider) # Fetch provider documents docs = [] count = 0 for doc in couch._query_all_dpla_provider_docs(provider): docs.append(doc) count += 1 # Enrich in batches of 1000 if len(docs) == 1000: enriched_docs = enrich(docs, args.uri_base, pipeline) couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id) print "Enriched %s documents" % count docs = [] # Enrich last batch if docs: enriched_docs = enrich(docs, args.uri_base, pipeline) couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id) print "Enriched %s documents" % count if __name__ == "__main__":
def main(argv=None, couch=None, provider_name=None):
    """Backfill ingestion bookkeeping for a provider's legacy documents.

    Tags every legacy document with ingestionSequence 1, creates matching
    "added" records in the dashboard database, and updates the ingestion
    document's counts, POSTing in batches of 1000.

    Proceeds only when the provider has no existing ingestion documents
    but does have legacy documents; otherwise reports the problem on
    stderr and does nothing.

    For testing, couch and provider_name are supplied as parameters;
    otherwise they come from the command-line arguments in argv.
    """
    if not couch:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        # Probe the generator for at least one legacy document. Catch only
        # StopIteration: a bare except here would also report genuine
        # CouchDB failures as "No documents found".
        next(couch._query_all_dpla_provider_docs(provider_name))
    except StopIteration:
        print >> sys.stderr, "Error: No documents found for %s" % provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc):
        # Push both batches, then record how many records were added on the
        # ingestion document itself.
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)
        ingest_doc = couch.dashboard_db[ingest_doc_id]

        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)
            added_docs.append({"id": doc["_id"],
                               "type": "record",
                               "status": "added",
                               "provider": provider_name,
                               "ingestionSequence": 1})
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc)
                # Reset
                docs = []
                added_docs = []
        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc)

        print >> sys.stderr, "Complete"
couch.update_ingestion_doc(ingestion_doc, **kwargs) return # Fetch provider documents docs = [] enrich_errors = [] save_errors = [] total_enriched = 0 enriched_items = 0 enriched_colls = 0 missing_id = 0 missing_source_resource = 0 saved_items = 0 saved_colls = 0 for doc in couch._query_all_dpla_provider_docs(provider): docs.append(doc) # Enrich in batches of batch_size if len(docs) == couch.batch_size: error, data = enrich_batch(docs, ingestion_doc, pipeline, couch) docs = [] if error is None: # Update counts enrich_errors.extend(data["errors"]) enriched_items += data["enriched_item_count"] enriched_colls += data["enriched_coll_count"] missing_id += data["missing_id_count"] missing_source_resource += data["missing_source_resource_count"] total_enriched += len(data["enriched_records"]) print "Enriched %s" % total_enriched