Ejemplo n.º 1
0
def nara_update_links():
    """Rewrite the isShownAt URL on every NARA document in the DPLA database.

    Streams all documents for provider "nara", and for each one that carries
    an originalRecord/arc-id-desc value, points isShownAt at the NARA
    research-archives description page for that ARC id.  Documents are
    re-posted to CouchDB in batches of 1000.  Progress and timing are
    reported on stderr.
    """
    couch = Couch()
    url = "http://research.archives.gov/description/"
    docs = []
    print >> sys.stderr, "Fetching all documents"
    count = 0
    # Initialized up front so the summary print below cannot raise a
    # NameError when the provider has no documents at all.
    view_time = 0
    start = time.time()
    for doc in couch._query_all_dpla_provider_docs("nara"):
        # Time-to-first-document approximates the CouchDB view fetch cost;
        # the clock then restarts to measure pure processing time.
        if count == 0:
            view_time = time.time() - start
            start = time.time()
        count += 1
        arc_id_desc = getprop(doc, "originalRecord/arc-id-desc",
                              keyErrorAsNone=True)
        if arc_id_desc:
            doc.update({"isShownAt": url + arc_id_desc})
            docs.append(doc)

        # POST every 1000 documents
        if len(docs) == 1000:
            print >> sys.stderr, "Processed %s documents" % count
            couch._bulk_post_to(couch.dpla_db, docs)
            docs = []

    # Last POST
    if docs:
        print >> sys.stderr, "Processed %s documents" % count
        # Bug fix: this previously called couch.bulk_post_to (no underscore),
        # which does not match the _bulk_post_to helper used for the batched
        # POSTs above and would fail on the final partial batch.
        couch._bulk_post_to(couch.dpla_db, docs)

    process_time = time.time() - start
    print >> sys.stderr, "Done"
    print >> sys.stderr, "View time: %s" % view_time
    print >> sys.stderr, "Process time: %s" % process_time
Ejemplo n.º 2
0
def item_docs(provider_name=None):
    """Yield all item documents for the given provider, else all providers"""
    couch = Couch()
    candidates = (couch._query_all_dpla_provider_docs(provider_name)
                  if provider_name
                  else couch.all_dpla_docs())
    for candidate in candidates:
        # Skip anything that is not an item record (e.g. collections).
        if candidate.get("ingestType") != "item":
            continue
        yield candidate
Ejemplo n.º 3
0
def item_docs(provider_name=None):
    """Yield all item documents for the given provider, else all providers"""
    couch = Couch()
    if provider_name is None or not provider_name:
        source = couch.all_dpla_docs()
    else:
        source = couch._query_all_dpla_provider_docs(provider_name)
    # Generator: documents are produced lazily as the caller consumes them.
    for record in source:
        if record.get("ingestType") == "item":
            yield record
def main(argv=None, couch=None, provider_name=None):
    """Backfill ingestionSequence 1 onto a provider's legacy DPLA documents.

    Creates an ingestion document, stamps every legacy document for the
    provider with ingestionSequence 1, and records each as an "added" record
    in the dashboard database, posting in batches of 1000.

    Aborts (with a message on stderr) if the provider already has any
    ingestion documents or has no documents at all.

    For testing, couch and provider_name are supplied as parameters;
    otherwise they are taken from the command line via define_arguments().
    """
    if couch:
        # Test path: both couch and provider_name were supplied directly.
        # (Previously this branch held a no-op "provider_name = provider_name".)
        pass
    else:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        # Peek at a fresh iterator just to verify at least one document
        # exists; the value itself is not needed.
        next(couch._query_all_dpla_provider_docs(provider_name))
    except StopIteration:
        # Bug fix: this was a bare "except:", which also swallowed real
        # errors (e.g. connection failures) and misreported them as
        # "no documents found".  Only an exhausted iterator means that.
        print >> sys.stderr, "Error: No documents found for %s" % provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc_id):
        # Push one batch to both databases and bump the ingestion document's
        # "added" count by the batch size.
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc_id,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)

        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)

            added_docs.append({
                "id": doc["_id"],
                "type": "record",
                "status": "added",
                "provider": provider_name,
                "ingestionSequence": 1
            })
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc_id)
                # Reset
                docs = []
                added_docs = []

        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc_id)

        print >> sys.stderr, "Complete"
Ejemplo n.º 5
0
    if args.pipeline in profile:
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = couch.create_ingestion_doc_and_backup_db(provider)

    # Fetch provider documents
    docs = []
    count = 0
    for doc in couch._query_all_dpla_provider_docs(provider):
        docs.append(doc)
        count += 1
        # Enrich in batches of 1000
        if len(docs) == 1000:
            enriched_docs = enrich(docs, args.uri_base, pipeline)
            couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id)
            print "Enriched %s documents" % count
            docs = []
    # Enrich last batch
    if docs:
        enriched_docs = enrich(docs, args.uri_base, pipeline)
        couch.process_and_post_to_dpla(enriched_docs, ingestion_doc_id)
        print "Enriched %s documents" % count

if __name__ == "__main__":
def main(argv=None, couch=None, provider_name=None):
    """Backfill ingestionSequence 1 onto a provider's legacy DPLA documents.

    Creates an ingestion document, stamps every legacy document for the
    provider with ingestionSequence 1, and records each as an "added" record
    in the dashboard database, posting in batches of 1000.

    Aborts (with a message on stderr) if the provider already has any
    ingestion documents or has no documents at all.
    """
    # For testing, couch and provider_name will be provided as params
    if couch:
        # NOTE(review): this assignment is a no-op; the branch exists only
        # so the else-branch CLI parsing is skipped in the test path.
        provider_name = provider_name
    else:
        couch = Couch()
        parser = define_arguments()
        args = parser.parse_args(argv[1:])
        provider_name = args.provider_name

    provider_legacy_docs = couch._query_all_dpla_provider_docs(provider_name)
    ingest_docs = couch._query_all_provider_ingestion_docs(provider_name)

    # Proceed only if there are no ingestion documents for the provider but
    # there are provider_legacy_docs.
    proceed = True
    if len(ingest_docs) > 0:
        num = len(ingest_docs)
        print >> sys.stderr, "Error: %s ingestion document(s) exists" % num
        proceed = False
    try:
        # Peek at a fresh iterator to verify at least one document exists.
        next_item = next(couch._query_all_dpla_provider_docs(provider_name))
    except:
        # NOTE(review): bare except — any failure here (not just an empty
        # iterator) is reported as "no documents found".
        print >> sys.stderr, "Error: No documents found for %s" % provider_name
        proceed = False

    def _post(dpla_docs, dashboard_docs, ingest_doc):
        # Push one batch to both databases and bump the ingestion document's
        # "added" count by the batch size.
        couch._bulk_post_to(couch.dpla_db, dpla_docs)
        couch._bulk_post_to(couch.dashboard_db, dashboard_docs)
        couch._update_ingestion_doc_counts(ingest_doc,
                                           countAdded=len(dashboard_docs))

    if proceed:
        ingest_doc_id = couch.create_ingestion_doc_and_backup_db(provider_name)
        # Fetch the full ingestion document; _update_ingestion_doc_counts
        # here takes the document itself, not its id — TODO confirm against
        # the Couch class API.
        ingest_doc = couch.dashboard_db[ingest_doc_id]

        docs = []
        added_docs = []
        print >> sys.stderr, "Fetching all docs..."
        count = 0
        for doc in provider_legacy_docs:
            count += 1
            doc["ingestionSequence"] = 1
            docs.append(doc)

            added_docs.append({"id": doc["_id"],
                               "type": "record",
                               "status": "added",
                               "provider": provider_name,
                               "ingestionSequence": 1})
            # POST every 1000
            if len(docs) == 1000:
                print >> sys.stderr, "Processed %s docs" % count
                _post(docs, added_docs, ingest_doc)
                # Reset
                docs = []
                added_docs = []

        # Last POST
        if docs:
            print >> sys.stderr, "Processed %s docs" % count
            _post(docs, added_docs, ingest_doc)

        print >> sys.stderr, "Complete"
Ejemplo n.º 7
0
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
        return

    # Fetch provider documents
    docs = []
    enrich_errors = []
    save_errors = []
    total_enriched = 0
    enriched_items = 0
    enriched_colls = 0
    missing_id = 0
    missing_source_resource = 0
    saved_items = 0
    saved_colls = 0

    for doc in couch._query_all_dpla_provider_docs(provider):
        docs.append(doc)
        # Enrich in batches of batch_size
        if len(docs) == couch.batch_size:
            error, data = enrich_batch(docs, ingestion_doc, pipeline, couch)
            docs = []
            if error is None:
                # Update counts
                enrich_errors.extend(data["errors"])
                enriched_items += data["enriched_item_count"]
                enriched_colls += data["enriched_coll_count"]
                missing_id += data["missing_id_count"]
                missing_source_resource += data["missing_source_resource_count"]
                total_enriched += len(data["enriched_records"])
                print "Enriched %s" % total_enriched