Code example #1
def test_iso_utc_with_tz():
    """iso_utc_with_tz() should yield a UTC datetime as a string with timezone symbol Z"""
    utc_now = datetime.utcnow()
    utc_now_iso_string_with_tz = utc_now.isoformat() + "Z"
    assert utc_now_iso_string_with_tz == iso_utc_with_tz(utc_now)
    # The following was added because of the regression described in #7699.
    time1 = iso_utc_with_tz()
    sleep(1)
    time2 = iso_utc_with_tz() 
    assert time1 != time2
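The test above pins down the expected behavior: the function formats a UTC datetime in ISO 8601 with a literal "Z" suffix and falls back to the current time when called with no argument. A minimal sketch consistent with that test (not necessarily the project's actual implementation) could look like:

from datetime import datetime

def iso_utc_with_tz(dt=None):
    """Return an ISO 8601 UTC timestamp string ending in "Z".

    Sketch only, inferred from the test above; the real dpla/ingestion
    helper may differ in edge cases.
    """
    if dt is None:
        dt = datetime.utcnow()
    return dt.isoformat() + "Z"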
Code example #2
def test_iso_utc_with_tz():
    """iso_utc_with_tz() should yield a UTC datetime as a string with timezone symbol Z"""
    utc_now = datetime.utcnow()
    utc_now_iso_string_with_tz = utc_now.isoformat() + "Z"
    assert utc_now_iso_string_with_tz == iso_utc_with_tz(utc_now)
    # The following was added because of the regression described in #7699.
    time1 = iso_utc_with_tz()
    sleep(1)
    time2 = iso_utc_with_tz()
    assert time1 != time2
Code example #3
def main(argv):
    print "WARNING: Bulk data is now exported/maintained using elasticdump."
    print "See https://github.com/dpla/automation/blob/develop/ansible/roles/exporter/files/export-provider.sh"

    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc,
               "dashboard_cleanup_process/status") != "complete":
        print "Error, dashboard cleanup process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": "running",
        "upload_bulk_data_process/start_time": iso_utc_with_tz(),
        "upload_bulk_data_process/end_time": None,
        "upload_bulk_data_process/error": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # TODO: as in the fetch_records.py script, we need profile in this scope
    #       and the file shouldn't have to be opened again 
    with open(ingestion_doc["profile_path"], "r") as profile:
        contributor = getprop(json.load(profile), "contributor/name")

    resp = export_database.main([None, "source", contributor, "upload"])
    if resp == -1:
        status = "error"
        error_msg = "Error uploading bulk data"
    else:
        status = "complete"
        error_msg = None

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": status,
        "upload_bulk_data_process/error": error_msg,
        "upload_bulk_data_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
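Throughout these scripts, getprop and Couch.update_ingestion_doc are called with slash-delimited keys such as "upload_bulk_data_process/status" that address nested fields of the ingestion document. A rough illustration of that convention, assuming the keys are split on "/" and walked into nested dicts (hypothetical helper, not the project's code):

def set_prop(doc, path, value):
    # Hypothetical helper, not the project's code: interpret "a/b/c" as a
    # path into nested dicts, creating intermediate dicts as needed.
    keys = path.split("/")
    for key in keys[:-1]:
        doc = doc.setdefault(key, {})
    doc[keys[-1]] = value

ingestion_doc = {}
set_prop(ingestion_doc, "upload_bulk_data_process/status", "running")
set_prop(ingestion_doc, "upload_bulk_data_process/start_time",
         "2014-01-01T00:00:00.000000Z")
# ingestion_doc now holds:
# {"upload_bulk_data_process": {"status": "running",
#                               "start_time": "2014-01-01T00:00:00.000000Z"}}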
Code example #4
    def create_ingestion_doc_and_backup_db(self, provider):
        """Creates the ingestion document and backs up the provider documents
           if this is not the first ingestion, then returns the ingestion
           document id.
        """
        ingestion_doc = {
            "provider": provider,
            "type": "ingestion",
            "ingestDate": iso_utc_with_tz(),
            "countAdded": 0,
            "countChanged": 0,
            "countDeleted": 0
        }

        last_ingestion_doc = self._get_last_ingestion_doc_for(provider)
        if not last_ingestion_doc:
            ingestion_sequence = 1
        else:
            # Since this is not the first ingestion we will back up the
            # provider documents and update the current ingestion document with
            # the backup database name.
            ingestion_sequence = last_ingestion_doc["ingestionSequence"] + 1
            backup_db_name = self._backup_db(provider)
            ingestion_doc["backupDB"] = backup_db_name
            self.dashboard_db.save(ingestion_doc)

        ingestion_doc["ingestionSequence"] = ingestion_sequence
        ingestion_doc_id = self.dashboard_db.save(ingestion_doc)[0]
        return ingestion_doc_id
Code example #5
File: couch.py  Project: dpla/ingestion
    def create_ingestion_doc_and_backup_db(self, provider):
        """Creates the ingestion document and backs up the provider documents
           if this is not the first ingestion, then returns the ingestion
           document id.
        """
        ingestion_doc = {
            "provider": provider,
            "type": "ingestion",
            "ingestDate": iso_utc_with_tz(),
            "countAdded": 0,
            "countChanged": 0,
            "countDeleted": 0
        }

        last_ingestion_doc = self._get_last_ingestion_doc_for(provider)
        if not last_ingestion_doc:
            ingestion_sequence = 1
        else:
            # Since this is not the first ingestion we will back up the
            # provider documents and update the current ingestion document with
            # the backup database name.
            ingestion_sequence = last_ingestion_doc["ingestionSequence"] + 1
            backup_db_name = self._backup_db(provider)
            ingestion_doc["backupDB"] = backup_db_name
            self.dashboard_db.save(ingestion_doc)
            

        ingestion_doc["ingestionSequence"] = ingestion_sequence
        ingestion_doc_id = self.dashboard_db.save(ingestion_doc)[0]
        return ingestion_doc_id
Code example #6
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "save_process/status") != "complete":
        print "Error, save process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "delete_process/status": "running",
        "delete_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.process_deleted_docs(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None

    msg = "Total documents deleted: %s" % total_deleted
    print msg
    logger.info(msg)

    # Update ingestion document
    kwargs = {
        "delete_process/status": status,
        "delete_process/error": error_msg,
        "delete_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Code example #7
File: dashboard_cleanup.py  Project: dpla/ingestion
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "check_counts_process/status") != "complete":
        print "Error, checkk counts process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": "running",
        "dashboard_cleanup_process/start_time": iso_utc_with_tz(),
        "dashboard_cleanup_process/end_time": None,
        "dashboard_cleanup_process/error": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    resp, total_deleted = couch.dashboard_cleanup(ingestion_doc)
    if resp == -1:
        status = "error"
        error_msg = "Error deleting documents; only %s deleted" % total_deleted
    else:
        status = "complete"
        error_msg = None
    print "Total dashboard documents deleted: %s" % total_deleted

    # Update ingestion document
    kwargs = {
        "dashboard_cleanup_process/status": status,
        "dashboard_cleanup_process/error": error_msg,
        "dashboard_cleanup_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Code example #8
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "delete_process/status") != "complete":
        print "Error, delete process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "check_counts_process/status": "running",
        "check_counts_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Check each count against the threshold
    alerts = []
    count_type = ("Added", "Changed", "Deleted")
    for ctype in count_type:
        count = int(ingestion_doc["count" + ctype])
        threshold = int(ingestion_doc["thresholds"][ctype.lower()])
        if count > threshold:
            alerts.append("%s items %s exceeds threshold of %s" %
                          (count, ctype.lower(), threshold))

    error_msg = None
    if alerts:
        config_file = "akara.ini"
        config = ConfigParser.ConfigParser()                                    
        config.readfp(open(config_file))
        to = [s.strip() for s in config.get("Alert", "To").split(",")]
        frm = config.get("Alert", "From")

        month = dateparser.parse(ingestion_doc["ingestDate"]).strftime("%B")
        alerts = "\n".join(alerts)
        msg = MIMEText(alerts)
        msg["Subject"] = "Threshold(s) exceeded for %s ingestion of %s" % \
                         (month, ingestion_doc["provider"])
        msg["To"] = ", ".join(to)
        msg["From"] = frm

        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e
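To make the threshold comparison concrete, here is the same loop run against made-up counts (illustrative values only, not real ingestion data):

ingestion_doc = {
    "countAdded": 1200, "countChanged": 40, "countDeleted": 3,
    "thresholds": {"added": 1000, "changed": 500, "deleted": 100}
}
alerts = []
for ctype in ("Added", "Changed", "Deleted"):
    count = int(ingestion_doc["count" + ctype])
    threshold = int(ingestion_doc["thresholds"][ctype.lower()])
    if count > threshold:
        alerts.append("%s items %s exceeds threshold of %s" %
                      (count, ctype.lower(), threshold))
print alerts   # ['1200 items added exceeds threshold of 1000']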
Code example #9
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "delete_process/status") != "complete":
        print "Error, delete process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "check_counts_process/status": "running",
        "check_counts_process/start_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Check each count against the threshold
    alerts = []
    count_type = ("Added", "Changed", "Deleted")
    for ctype in count_type:
        count = int(ingestion_doc["count" + ctype])
        threshold = int(ingestion_doc["thresholds"][ctype.lower()])
        if count > threshold:
            alerts.append("%s items %s exceeds threshold of %s" %
                          (count, ctype.lower(), threshold))

    error_msg = None
    if alerts:
        config_file = "akara.ini"
        config = ConfigParser.ConfigParser()
        config.readfp(open(config_file))
        to = [s.strip() for s in config.get("Alert", "To").split(",")]
        frm = config.get("Alert", "From")

        month = dateparser.parse(ingestion_doc["ingestDate"]).strftime("%B")
        alerts = "\n".join(alerts)
        msg = MIMEText(alerts)
        msg["Subject"] = "Threshold(s) exceeded for %s ingestion of %s" % \
                         (month, ingestion_doc["provider"])
        msg["To"] = ", ".join(to)
        msg["From"] = frm

        try:
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e
Code example #10
    def update_bulk_download_document(self, contributor, file_path, file_size):
        """Creates/updates a document for a contributor's bulk data file and
           returns the document id
        """

        bulk_download_doc = self._get_bulk_download_doc(contributor)
        bulk_download_doc.update({
            "file_path": file_path,
            "file_size": file_size,
            "last_updated": iso_utc_with_tz()
        })

        return self.bulk_download_db.save(bulk_download_doc)[0]
Code example #11
File: couch.py  Project: dpla/ingestion
    def update_bulk_download_document(self, contributor, file_path,
                                      file_size):
        """Creates/updates a document for a contributor's bulk data file and
           returns the document id
        """

        bulk_download_doc = self._get_bulk_download_doc(contributor)
        bulk_download_doc.update({
            "file_path": file_path,
            "file_size": file_size,
            "last_updated": iso_utc_with_tz()
            })

        return self.bulk_download_db.save(bulk_download_doc)[0]
Code example #12
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {
        'check_counts_process/status': 'running',
        'check_counts_process/start_time': iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % (error_msg,
                                                                    tb)
Code example #13
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    i_doc = couch.dashboard_db[args.ingestion_document_id]
    if i_doc['delete_process']['status'] != 'complete':
        print >> sys.stderr, 'Error: delete process did not complete'
        return 1

    # Update ingestion document to indicate that we're running
    kwargs = {'check_counts_process/status': 'running',
              'check_counts_process/start_time': iso_utc_with_tz()}
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print "Error updating ingestion document %s\n%s" % (i_doc["_id"], tb)
        return 1

    error_msg = None
    try:
        config = ConfigParser.ConfigParser()                                    
        config.readfp(open('akara.ini'))
        to = [s.strip() for s in config.get('Alert', 'To').split(',')]
        frm = config.get('Alert', 'From')
        body = "%s\n\n%s" % (alerts(i_doc), statistics(i_doc))
        msg = MIMEText(body)
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % (error_msg,
                                                                    tb)
Code example #14
def create_sitemap_index(path):
    global CONFIG
    site_map_uri = CONFIG.get("Sitemap", "SitemapURI")
    fpath = os.path.join(path, "all_item_urls.xml")
    with open(fpath, "w") as f:
        line = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
               '<sitemapindex xmlns="' + \
               'http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        f.write(line)
        for item in os.listdir(path):
            # Skip file being written to
            if item == "all_item_urls.xml":
                continue
            # sitemaps.dp.la is a CNAME pointing to the CDN host
            file_uri = url_join(site_map_uri, item)
            lastmod_dt = datetime.utcfromtimestamp(
                os.path.getmtime(os.path.join(path, item)))
            line = "\t<sitemap>\n" + \
                   "\t\t<loc>%s</loc>\n\t\t<lastmod>%s</lastmod>\n" % \
                   (file_uri, iso_utc_with_tz(lastmod_dt)) + "\t</sitemap>\n"
            f.write(line)
        f.write("</sitemapindex>")
Code example #15
File: create_sitemap.py  Project: dpla/ingestion
def create_sitemap_index(path):
    global CONFIG
    site_map_uri = CONFIG.get("Sitemap", "SitemapURI")
    fpath = os.path.join(path, "all_item_urls.xml")
    with open(fpath, "w") as f:
        line = '<?xml version="1.0" encoding="UTF-8"?>\n' + \
               '<sitemapindex xmlns="' + \
               'http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        f.write(line)
        for item in os.listdir(path):
            # Skip file being written to
            if item == "all_item_urls.xml":
                continue
            # sitemaps.dp.la is a CNAME pointing to the CDN host
            file_uri = url_join(site_map_uri, item)
            lastmod_dt = datetime.utcfromtimestamp(os.path.getmtime(
                                                os.path.join(path, item)
                                                ))
            line = "\t<sitemap>\n" + \
                   "\t\t<loc>%s</loc>\n\t\t<lastmod>%s</lastmod>\n" % \
                   (file_uri, iso_utc_with_tz(lastmod_dt)) + "\t</sitemap>\n"
            f.write(line)
        f.write("</sitemapindex>")
Code example #16
File: fetch_records.py  Project: dpla/ingestion
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Code example #17
            s = smtplib.SMTP("localhost")
            s.sendmail(frm, to, msg.as_string())
            s.quit()
        except Exception, e:
            error_msg = e

    if error_msg:
        print >> sys.stderr, ("********************\n" +
                              "Error sending alert email: %s" % error_msg)
        print >> sys.stderr, ("Alerts:\n%s" % alerts +
                              "\n********************")

    # Update ingestion document
    status = "complete"
    kwargs = {
        "check_counts_process/status": status,
        "check_counts_process/error": error_msg,
        "check_counts_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1


if __name__ == '__main__':
    main(sys.argv)
Code example #18
        msg['Subject'] = "%s ingest #%s" % (i_doc['provider'],
                                            i_doc['ingestionSequence'])
        msg['To'] = ', '.join(to)
        msg['From'] = frm
        s = smtplib.SMTP("localhost")
        s.sendmail(frm, to, msg.as_string())
        s.quit()
    except Exception, e:
        error_msg = e
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error sending alert email: %s\n%s" % (error_msg,
                                                                    tb)

    # Update ingestion document
    kwargs = {'check_counts_process/status': 'complete',
              'check_counts_process/error': error_msg,
              'check_counts_process/end_time': iso_utc_with_tz()}
    try:
        couch.update_ingestion_doc(i_doc, **kwargs)
    except:
        tb = traceback.format_exc(5)
        print >> sys.stderr, "Error updating ingestion document %s\n%s" \
                             % (i_doc["_id"], tb)
        return 1

    return 0

if __name__ == '__main__':
    rv = main(sys.argv)
    sys.exit(rv)
Code example #19
        pipeline = ",".join(profile[args.pipeline])
    else:
        pipeline = args.pipeline
    provider = profile.get(u"name")
    contributor = profile.get(u"contributor", {})

    # Create ingestion document
    couch = Couch()
    ingestion_doc_id = create_ingestion_document.main([None,
                                                       args.profile_path])
    ingestion_doc = couch.dashboard_db[ingestion_doc_id]

    # Update ingestion document
    kwargs = {
        "poll_storage_process/status": "running",
        "poll_storage_process/start_time": iso_utc_with_tz(),
        "poll_storage_process/end_time": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return

    # Back up current data
    resp = couch._back_up_data(ingestion_doc)

    if resp == -1: 
        # Fatal error, do not continue with save process
        kwargs = { 
            "poll_storage_process/status": "error",
Code example #20
File: couch.py  Project: dpla/ingestion
    def _ts_for_err(self):
        return "[%s]" % iso_utc_with_tz()
Code example #21
File: enrich.py  Project: dpla/ingestion
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for items.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    item_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")
    coll_enrichments = request_headers.get(u"Pipeline-Coll", "").split(",")

    records = json.loads(body)

    # Counts for enrich script
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        if record.get("ingestType") == "collection":
            wsgi_header = "HTTP_PIPELINE_COLL"
            enrichments = coll_enrichments
        else:
            wsgi_header = "HTTP_PIPELINE_ITEM"
            enrichments = item_enrichments
            # Preserve record prior to any enrichments
            record["originalRecord"] = record.copy()         
            record["ingestType"] = "item"

        # Explicitly populate ingestDate as UTC
        record["ingestDate"] = iso_utc_with_tz()

        error, enriched_record_text = pipe(record, ctype, enrichments,
                                           wsgi_header)
        enriched_record = json.loads(enriched_record_text)
        if error:
            errors.append(error)

        ingest_type = record.get("ingestType")
        # Enriched record should have an _id
        if enriched_record.get("_id", None):
            # Item records should have sourceResource
            if (ingest_type == "item" and not "sourceResource" in
                enriched_record):
                logger.error("Records %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
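The docstring above says the enrichment pipeline is assembled from URIs passed in two request headers. Purely as an illustration of that contract, a caller might post records like this; the URL, port, and enrichment URIs below are placeholders, not the project's real deployment:

import json
import urllib2

records = [{"_id": "example-record", "ingestType": "item"}]
headers = {
    "Content-Type": "application/json",
    "Pipeline-Item": ",".join([
        "http://localhost:8889/select-id",
        "http://localhost:8889/enrich-date"
    ]),
    "Pipeline-Coll": "http://localhost:8889/select-id"
}
request = urllib2.Request("http://localhost:8889/enrich",
                          json.dumps(records), headers)
response = json.loads(urllib2.urlopen(request).read())
print "Enriched items: %s" % response["enriched_item_count"]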
Code example #22
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    batch_size = 500

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": iso_utc_with_tz(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)

        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": iso_utc_with_tz(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    sync_point = 5000
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break

            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            #if total_items > sync_point:
            #    print "Syncing views"
            #    couch.sync_views(couch.dpla_db.name)
            #    sync_point = total_items + 10000

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                         ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            #print "Syncing views"
            #couch.sync_views(couch.dpla_db.name)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": iso_utc_with_tz(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return total_items if status == "complete" else -1
Code example #23
    def _ts_for_err(self):
        return "[%s]" % iso_utc_with_tz()
Code example #24
File: save_records.py  Project: dpla/ingestion
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    batch_size = 500

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc, "enrich_process/status") != "complete":
        print "Cannot save, enrich process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "save_process/status": "running",
        "save_process/start_time": iso_utc_with_tz(),
        "save_process/end_time": None,
        "save_process/error": None,
        "save_process/total_saved": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Back up provider data
    if args.backup:
        resp = couch._back_up_data(ingestion_doc)

        if resp == -1:
            # Fatal error, do not continue with save process
            kwargs = {
                "save_process/status": "error",
                "save_process/end_time": iso_utc_with_tz(),
                "save_process/error": "Error backing up DPLA records"
            }
            couch.update_ingestion_doc(ingestion_doc, **kwargs)
            return resp

    error_msg = None
    enrich_dir = getprop(ingestion_doc, "enrich_process/data_dir")
    total_items = 0
    total_collections = 0
    sync_point = 5000
    docs = {}
    for file in os.listdir(enrich_dir):
        filename = os.path.join(enrich_dir, file)
        with open(filename, "r") as f:
            try:
                file_docs = json.loads(f.read())
            except:
                error_msg = "Error loading " + filename
                break

        # Save when docs is about to exceed the batch size
        print >> sys.stderr, "Read file %s" % filename
        if docs and len(docs) + len(file_docs) > batch_size:
            resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                             ingestion_doc)
            if resp == -1:
                docs = None
                break

            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)

            if total_items > sync_point:
                print "Syncing views"
                couch.sync_views(couch.dpla_db.name)
                sync_point = total_items + 10000

            # Set docs for the next iteration
            docs = file_docs
        else:
            docs.update(file_docs)

    # Last save
    if docs:
        resp, error_msg = couch.process_and_post_to_dpla(docs,
                                                         ingestion_doc)
        if resp != -1:
            items = len([doc for doc in docs.values() if
                         doc.get("ingestType") == "item"])
            total_items += items
            total_collections += len(docs) - items
            print "Saved %s documents" % (total_items + total_collections)
            print "Syncing views"
            couch.sync_views(couch.dpla_db.name)

    print "Total items: %s" % total_items
    print "Total collections: %s" % total_collections

    if error_msg:
        status = "error"
    else:
        status = "complete"
    kwargs = {
        "save_process/status": status,
        "save_process/error": error_msg,
        "save_process/end_time": iso_utc_with_tz(),
        "save_process/total_items": total_items,
        "save_process/total_collections": total_collections
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # Compress enrich dir, then delete
    make_tarfile(enrich_dir)
    shutil.rmtree(enrich_dir)

    return 0 if status == "complete" else -1
Code example #25
File: couch.py  Project: dpla/ingestion
    def _create_ingestion_document(self, provider, uri_base, profile_path,
                                   thresholds, fetcher_threads=1):
        """Creates and returns an ingestion document for the provider.
        """

        ingestion_doc = {
            "provider": provider,
            "type": "ingestion",
            "ingestionSequence": None,
            "ingestDate": iso_utc_with_tz(),
            "countAdded": 0,
            "countChanged": 0,
            "countDeleted": 0,
            "uri_base": uri_base,
            "profile_path": profile_path,
            "fetcher_threads": fetcher_threads,
            "fetch_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "data_dir": None,
                "error": None,
                "total_items": None,
                "total_collections": None 
            },
            "enrich_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "data_dir": None,
                "error": None,
                "total_items": None,
            "thresholdDeleted": 0,
                "total_collections": None,
                "missing_id": None,
                "missing_source_resource": None
            },
            "save_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None,
                "total_items": None,
                "total_collections": None
            },
            "delete_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "check_counts_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "dashboard_cleanup_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "upload_bulk_data_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },

            "poll_storage_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None,
                "total_items": None,
                "total_collections": None,
                "missing_id": None,
                "missing_source_resource": None
            }
        }
        ingestion_doc.update({"thresholds": thresholds})

        # Set the ingestion sequence
        latest_ingestion_doc = self._get_last_ingestion_doc_for(provider)
        if latest_ingestion_doc is None:
            ingestion_sequence = 1
        else:
            ingestion_sequence = 1 + latest_ingestion_doc["ingestionSequence"]
        ingestion_doc["ingestionSequence"] = ingestion_sequence

        # Save the ingestion document and get its ID
        ingestion_doc_id = self.dashboard_db.save(ingestion_doc)[0]

        return ingestion_doc_id
Code example #26
    def _create_ingestion_document(self,
                                   provider,
                                   uri_base,
                                   profile_path,
                                   thresholds,
                                   fetcher_threads=1):
        """Creates and returns an ingestion document for the provider.
        """

        ingestion_doc = {
            "provider": provider,
            "type": "ingestion",
            "ingestionSequence": None,
            "ingestDate": iso_utc_with_tz(),
            "countAdded": 0,
            "countChanged": 0,
            "countDeleted": 0,
            "uri_base": uri_base,
            "profile_path": profile_path,
            "fetcher_threads": fetcher_threads,
            "fetch_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "data_dir": None,
                "error": None,
                "total_items": None,
                "total_collections": None
            },
            "enrich_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "data_dir": None,
                "error": None,
                "total_items": None,
                "thresholdDeleted": 0,
                "total_collections": None,
                "missing_id": None,
                "missing_source_resource": None
            },
            "save_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None,
                "total_items": None,
                "total_collections": None
            },
            "delete_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "check_counts_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "dashboard_cleanup_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "upload_bulk_data_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None
            },
            "poll_storage_process": {
                "status": None,
                "start_time": None,
                "end_time": None,
                "error": None,
                "total_items": None,
                "total_collections": None,
                "missing_id": None,
                "missing_source_resource": None
            }
        }
        ingestion_doc.update({"thresholds": thresholds})

        # Set the ingestion sequence
        latest_ingestion_doc = self._get_last_ingestion_doc_for(provider)
        if latest_ingestion_doc is None:
            ingestion_sequence = 1
        else:
            ingestion_sequence = 1 + latest_ingestion_doc["ingestionSequence"]
        ingestion_doc["ingestionSequence"] = ingestion_sequence

        # Save the ingestion document and get its ID
        ingestion_doc_id = self.dashboard_db.save(ingestion_doc)[0]

        return ingestion_doc_id
Code example #27
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Code example #28
def enrich(body, ctype):
    """
    Establishes a pipeline of services identified by an ordered list of URIs
    provided in two request headers, one for collections and one for items.

    Returns a JSON dump of the collections and records enriched along with a
    count of records enriched.
    """
    request_headers = copy_headers_to_dict(request.environ)
    item_enrichments = request_headers.get(u"Pipeline-Item", "").split(",")
    coll_enrichments = request_headers.get(u"Pipeline-Coll", "").split(",")

    records = json.loads(body)

    # Counts for enrich script
    enriched_coll_count = 0
    enriched_item_count = 0
    missing_id_count = 0
    missing_source_resource_count = 0

    errors = []
    enriched_records = {}
    for record in records:
        if record.get("ingestType") == "collection":
            wsgi_header = "HTTP_PIPELINE_COLL"
            enrichments = coll_enrichments
        else:
            wsgi_header = "HTTP_PIPELINE_ITEM"
            enrichments = item_enrichments
            # Preserve record prior to any enrichments
            record["originalRecord"] = record.copy()
            record["ingestType"] = "item"

        # Explicitly populate ingestDate as UTC
        record["ingestDate"] = iso_utc_with_tz()

        error, enriched_record_text = pipe(record, ctype, enrichments,
                                           wsgi_header)
        enriched_record = json.loads(enriched_record_text)
        if error:
            errors.append(error)

        ingest_type = record.get("ingestType")
        # Enriched record should have an _id
        if enriched_record.get("_id", None):
            # Item records should have sourceResource
            if (ingest_type == "item"
                    and not "sourceResource" in enriched_record):
                logger.error("Records %s does not have sourceResource: %s" %
                             (enriched_record["_id"], enriched_record))
                missing_source_resource_count += 1
            else:
                enriched_records[enriched_record["_id"]] = enriched_record
                if ingest_type == "item":
                    enriched_item_count += 1
                else:
                    enriched_coll_count += 1
        else:
            missing_id_count += 1

    data = {
        "enriched_records": enriched_records,
        "enriched_coll_count": enriched_coll_count,
        "enriched_item_count": enriched_item_count,
        "missing_id_count": missing_id_count,
        "missing_source_resource_count": missing_source_resource_count,
        "errors": errors
    }

    return json.dumps(data)
Code example #29
File: enrich_records.py  Project: mlhale7/ingestion
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {
        'enriched_items': 0,
        'enriched_colls': 0,
        'missing_id': 0,
        'missing_source_resource': 0
    }

    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                            (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if not status == "error":
            status = "complete"
        # Prepare fields for ingestion document update
        couch_kwargs = {
            "enrich_process/status": status,
            "enrich_process/error": dashboard_errors,
            "enrich_process/end_time": iso_utc_with_tz(),
            "enrich_process/total_items": stats['enriched_items'],
            "enrich_process/total_collections": stats['enriched_colls'],
            "enrich_process/missing_id": stats['missing_id'],
            "enrich_process/missing_source_resource": \
                    stats['missing_source_resource']
        }

    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1
Code example #30
File: enrich_records.py  Project: dpla/ingestion
def main(argv):
    global threads_working
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    fetch_dir = getprop(ingestion_doc, "fetch_process/data_dir")
    enrich_dir = create_enrich_dir(ingestion_doc["provider"])
    if getprop(ingestion_doc, "fetch_process/status") != "complete":
        print >> sys.stderr, "Cannot enrich, fetch process did not complete"
        return 1

    # Update ingestion document
    status = "running"
    kwargs = {
        "enrich_process/status": status,
        "enrich_process/data_dir": enrich_dir,
        "enrich_process/start_time": iso_utc_with_tz(),
        "enrich_process/end_time": None,
        "enrich_process/error": None,
        "enrich_process/total_items": None,
        "enrich_process/total_collections": None,
        "enrich_process/missing_id": None,
        "enrich_process/missing_source_resource": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                ingestion_doc["_id"]
        return 1

    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.loads(f.read())

    # Counts for logger info
    stats = {'enriched_items': 0,
             'enriched_colls': 0,
             'missing_id': 0,
             'missing_source_resource': 0
            }

    # Initialize queue and threads
    queue, thread_errors = queue_and_errors(ingestion_doc=ingestion_doc,
                                            profile=profile,
                                            stats=stats,
                                            enrich_dir=enrich_dir)
    # Initialize list of input files
    listing = os.listdir(fetch_dir)
    # Initialize counters and statistics
    dashboard_errors = []
    file_count = 0
    status = None
    total_files = len(listing)
    files = iter(listing)

    try:
        # Keep the queue full of filenames
        while True:
            time.sleep(0.25)
            try:
                if print_errors_thrown(thread_errors):
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
                if not queue.full():
                    basename = files.next()
                    filename = os.path.join(fetch_dir, basename)
                    file_count += 1
                    print "Enqueuing: %s (%s of %s)" % \
                            (filename, file_count, total_files)
                    queue.put(filename)
            except StopIteration:
                break
        # Wait for queue to be empty before returning
        while True:
            if queue.empty() and not threads_working:
                if not print_errors_thrown(thread_errors):
                    break
                else:
                    dashboard_errors.extend(thread_errors)
                    raise Exception()
            time.sleep(0.25)
    except KeyboardInterrupt:
        status = "error"
        msg = "\nCaught keyboard interrupt"
        print >> sys.stderr, msg
        dashboard_errors.append(msg)
    except Exception as e:
        if e.message:
            print >> sys.stderr, e.message
        status = "error"
        dashboard_errors.append(e.message)
    finally:
        print "Enriched items: %s" % stats['enriched_items']
        print "Enriched collections: %s" % stats['enriched_colls']
        print "Missing ID: %s" % stats['missing_id']
        print "Missing sourceResource: %s" % stats['missing_source_resource']
        if not status == "error":
            status = "complete"
        # Prepare fields for ingestion document update
        couch_kwargs = {
            "enrich_process/status": status,
            "enrich_process/error": dashboard_errors,
            "enrich_process/end_time": iso_utc_with_tz(),
            "enrich_process/total_items": stats['enriched_items'],
            "enrich_process/total_collections": stats['enriched_colls'],
            "enrich_process/missing_id": stats['missing_id'],
            "enrich_process/missing_source_resource": \
                    stats['missing_source_resource']
        }

    try:
        # Update ingestion document
        couch.update_ingestion_doc(ingestion_doc, **couch_kwargs)
    except:
        print >> sys.stderr, "Error updating ingestion document " + \
                             ingestion_doc["_id"]
        return 1

    return 0 if status == "complete" else 1