Example #1
def test_webfeedjson():
    from amara.thirdparty import json
    url = server() + "akara.webfeed.json?url=http://feeds.delicious.com/v2/rss/recent%3Fmin=1%26count=15"
    response = urlopen(url)
    results = json.load(response)
    print results
Example #2
def test_get_last_ingestion_document():
    with open(DATA) as f:
        data = json.load(f)

    couch.ingest(data, PROVIDER, json_content=True)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 1

    couch.ingest(data, PROVIDER, json_content=True)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 2

    couch.ingest(data, PROVIDER, json_content=True)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 3

    couch.ingest(data, PROVIDER, json_content=True)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 4

    couch.rollback(PROVIDER, 2)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 4

    couch.ingest(data, PROVIDER, json_content=True)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 5
Example #3
def _make_log2json_request(query_args):
    from amara.thirdparty import json
    url = server() + "akara.wwwlog.json" + query_args
    req = urllib2.Request(url)
    req.add_header("Content-Type", "text/plain")
    response = urllib2.urlopen(req, _apache_query_data)
    return json.load(response)
Example #4
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = "akara.ini"
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id
Example #7
def get_provider_id(profile_path):
    with open(profile_path, "r") as f:
        try:
            profile = json.load(f)
            return profile['contributor']['@id']
        except Exception, err:
            print "Error, could not load profile in %s: %s" % (__name__, err)
            return None
Example #9
def test_atom_json():
    from amara.thirdparty import json
    url = server() + "akara.atom.json?url=http://zepheira.com/feed/atom/"
    response = urlopen(url)
    results = json.load(response)
    items = results["items"]
    for item in items:
        assert "title" in item
Example #12
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False
Example #14
def test_rdfa2json():
    from amara.thirdparty import json
    url = server() + "akara.rdfa.json?url=http://zepheira.com/"
    results = json.load(urllib2.urlopen(url))
    for item in results["items"]:
        if "canonical" in item:
            assert "zepheira.com" in item["canonical"]
            break
    else:
        raise AssertionError("Could not find 'canonical'")
Example #15
def test_rdfa2json_with_date():
    from amara.thirdparty import json
    url = server() + "akara.rdfa.json?url=http://www.myspace.com/parishilton"
    results = json.load(urllib2.urlopen(url))
    for item in results["items"]:
        if "canonical" in item:
            assert True
            break
    else:
        raise AssertionError("Could not find myspace:lastLogin")
Example #18
def test_basic():
    "test ..."
    records = [
    {
        "id": "uogbuji",
        "label": "Uche Ogbuji",
        "birthstone": "Topaz",
        "country": "US",
        "mystery_code": 1,
        "type": "Person"
    },
    {
        "id": "emiller",
        "label": "Eric Miller",
        "birthstone": "Agate?",
        "country": "US",
        "mystery_code": 2,
        "type": "Person"
    },
    {
        "id": "mbaker",
        "label": "Mark Baker",
        "country": "US",
        "mystery_code": 3,
        "type": "Person"
    }
    ]

    outf_handle, outf_file = tempfile.mkstemp(prefix='exhibit_emitter_test_')

    outf = open(outf_file, 'w')
    emitter1 = emitter.emitter(outf)
    for rec in records:
        emitter1.send(rec)
    emitter1.send(emitter.ITEMS_DONE_SIGNAL)
    TYPES1 = {
            "Person" : {
                "mystery_code": { "valueType": "number" },
            }
        }

    emitter1.send(TYPES1)
    emitter1.close()
    outf.close()
    result = json.load(open(outf_file, 'r'))
    #logging.debug('Result: {0}'.format(repr(result)))

    items = result[u"items"]

    #logging.debug('Result: {0}'.format(repr(result)))
    assert items[0] == records[0]
    assert items[1] == records[1]
    assert items[2] == records[2]
    #assert results == None, "Boo! "
    return
Example #19
def main(argv):
    print "WARNING: Bulk data is now exported/maintained using elasticdump."
    print "See https://github.com/dpla/automation/blob/develop/ansible/roles/exporter/files/export-provider.sh"

    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]
    if getprop(ingestion_doc,
               "dashboard_cleanup_process/status") != "complete":
        print "Error, dashboard cleanup process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": "running",
        "upload_bulk_data_process/start_time": iso_utc_with_tz(),
        "upload_bulk_data_process/end_time": None,
        "upload_bulk_data_process/error": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # TODO: as in the fetch_records.py script, we need profile in this scope
    #       and the file shouldn't have to be opened again 
    with open(ingestion_doc["profile_path"], "r") as profile:
        contributor = getprop(json.load(profile), "contributor/name")

    resp = export_database.main([None, "source", contributor, "upload"])
    if resp == -1:
        status = "error"
        error_msg = "Error uploading bulk data"
    else:
        status = "complete"
        error_msg = None

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": status,
        "upload_bulk_data_process/error": error_msg,
        "upload_bulk_data_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1
Example #20
def parse_documents(documents):
    """
    Parses the provided string with json into object.

    Arguments:
        documents String - documents from couchdb in string format

    Returns:
        Object with parsed json.
    """
    io = StringIO(documents)
    return json.load(io)
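A minimal usage sketch for parse_documents, assuming the function above is in scope; the JSON string is invented for illustration:

# Hypothetical CouchDB-style payload, serialized as a string.
documents = '{"total_rows": 2, "rows": [{"id": "doc1"}, {"id": "doc2"}]}'
parsed = parse_documents(documents)
assert parsed["total_rows"] == 2
assert parsed["rows"][0]["id"] == "doc1"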
Example #21
def main(argv):
    couch = Couch()
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    with open(args.profile_path, "r") as f:
        profile = json.load(f)

    provider = profile.get("name")
    if confirm_deletion(provider):
        couch._delete_all_provider_documents(provider)
    else:
        return False
Example #23
def lucky_google(q=None):
    '''
    A simple and fun transform to return the first hit for a given search
    
    Sample request:
    * curl "http://localhost:8880/akara.luckygoogle?q=zepheira"
    '''
    if q is None:
        raise AssertionError(Q_REQUIRED)
    query = urllib.urlencode({'q': q})
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query
    json_content = json.load(urllib.urlopen(url))
    results = json_content['responseData']['results']
    return results[0]['url'].encode('utf-8') + '\n'
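A hedged calling sketch for lucky_google; the query term is arbitrary and the result depends entirely on the live search service:

# Returns the URL of the first hit, newline-terminated (see the function above).
first_hit = lucky_google(q="zepheira")
print(first_hit)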
Example #25
File: geo.py Project: dpla/zen
    def __call__(self, place):
        query = urllib.urlencode(dict(username=self._user, q=place.encode('utf-8'), maxRows='2'))
        #print self._servicebase + query
        req = self._servicebase + query
        stream = urllib2.urlopen(req)
        resultset = json.load(stream)
        #print resultset.get(u'geonames')
        if resultset.get(u'geonames'):
            result = resultset[u'geonames'][0]
            #lat, long_ = result[u'lat'], result[u'lng']
            ll = "{lat},{lng}".format(**result)
            self._logger.debug(u"geolookup via geonames {0} yields: {1}".format(self._servicebase + query, repr((place, ll))))
            return {place: ll} if ll else {}
        else:
            return {}
Example #26
    def ingest(self, file, provider, json_content=None):
        if not json_content:
            with open(file) as f:
                content = json.load(f)
        else:
            content = file

        ingestion_doc_id = self.create_ingestion_doc_and_backup_db(provider)
        url = server() + "enrich"
        resp, content = H.request(url,
                                  "POST",
                                  body=json.dumps(content),
                                  headers=headers)
        docs = json.loads(content)
        self.process_and_post_to_dpla(docs, ingestion_doc_id)
        self.process_deleted_docs(ingestion_doc_id)
        return ingestion_doc_id
Example #27
def update_document(body, ctype):
    from StringIO import StringIO
    io = StringIO(body) 
    parsed_doc = json.load(io) 
    document_id = parsed_doc[u"id"]
    document  = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " % (document, document_id, ) )
Example #28
def test_ical2json():
    from amara.thirdparty import json

    ical_filename = os.path.join(RESOURCE_DIR, "icalendar_test.ics")
    url = server() + "ical.json"

    req = urllib2.Request(url)
    req.add_header('Content-Type', 'text/calendar')

    data = open(ical_filename).read()

    response = urllib2.urlopen(req, data)
    results = json.load(response)

    items = results["items"]
    assert len(items) == 2
    assert items[0]["summary"] == "Bastille Day Party"
    assert items[1]["summary"] == "Akara test"
Example #29
def create_fetcher(profile_path, uri_base, config_file):
    fetcher_types = {
        'ia': lambda p, u, c: IAFetcher(p, u, c),
        'uva': lambda p, u, c: UVAFetcher(p, u, c),
        'mwdl': lambda p, u, c: MWDLFetcher(p, u, c),
        'nypl': lambda p, u, c: NYPLFetcher(p, u, c),
        'nara': lambda p, u, c: NARAFetcher(p, u, c),
        'edan': lambda p, u, c: EDANFetcher(p, u, c),
        'hathi': lambda p, u, c: HathiFetcher(p, u, c),
        'oai_verbs': lambda p, u, c: OAIVerbsFetcher(p, u, c),
    }

    with open(profile_path, "r") as f:
        profile = json.load(f)
    type = profile.get("type")
    fetcher = fetcher_types.get(type)(profile, uri_base, config_file)

    return fetcher
Example #30
def update_document(body, ctype):
    logger.debug(body)
    from StringIO import StringIO
    io = StringIO(body) 
    parsed_doc = json.load(io) 
    document_id = parsed_doc[u"id"]
    document  = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document, headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " % (document, document_id, ) )
Example #32
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]
    thresholds = profile["thresholds"]
    fetcher_threads = profile.get("fetcher_threads") or 1

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if (latest_ingestion_doc and
        getprop(latest_ingestion_doc,
                "dashboard_cleanup_process/status") != "complete"):
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path,
                                                             thresholds,
                                                             fetcher_threads)
    msg = "Ingestion document %s created." % ingestion_document_id
    logger.debug(msg)
    print msg

    return ingestion_document_id
Example #33
    def ingest(self, file, provider, json_content=None):
        if not json_content:
            with open(file) as f:
                content = json.load(f)
        else:
            content = file

        uri_base = server()[:-1]
        ingestion_doc_id = self._create_ingestion_document(provider, uri_base,
                                                           "profiles/clemson.pjs")
        ingestion_doc = self.dashboard_db[ingestion_doc_id]

        url = server() + "enrich"
        body = json.dumps(content)
        resp, content = H.request(url, "POST", body=body, headers=headers)
        data = json.loads(content)
        docs = data["enriched_records"]
        self._back_up_data(ingestion_doc)
        self.process_and_post_to_dpla(docs, ingestion_doc)
        self.process_deleted_docs(ingestion_doc)
        return ingestion_doc_id
Example #34
    def item_page(self, url, logtag="Requesting item at URL: {0}"):
        if self._proxy:
            url = "{0}?url={1}".format(self._proxy, quote(url))
        self._logger.debug(logtag.format(url))
        start_t = time.time()
        resp, content = self._h.request(url)
        retrieved_t = time.time()
        self._logger.debug("Retrieved in {0}s".format(retrieved_t - start_t))
        cachekey = hashlib.md5(content).hexdigest()
        self._logger.debug('MD5 Hash of HTTP body: {0}'.format(cachekey))
        if self._cachedir:
            try:
                json_stream = open(os.path.join(self._cachedir, cachekey+'.extract.js'))
                cached = json.load(json_stream)
                self._logger.debug('Loaded from cache: {0}'.format(cachekey))
                doc = None
            except (IOError, ValueError):
                doc = htmlparse(content)
                cached = None
        else:
            # No cache directory configured: parse the response body directly
            # (also keeps doc/cached defined for the return below).
            doc = htmlparse(content)
            cached = None
        parsed_t = time.time()
        self._logger.debug("Parsed in {0}s".format(parsed_t - retrieved_t))
        return resp, doc, cachekey, cached
Example #35
def create_fetcher(profile_path, uri_base, config_file):
    """
    Given a fetcher type, creates, imports, and instantiates the appropriate
    Fetcher subclass.
    """

    def _create_ia_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.ia_fetcher import IAFetcher
        return IAFetcher(profile, uri_base, config_file)

    def _create_uva_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.uva_fetcher import UVAFetcher
        return UVAFetcher(profile, uri_base, config_file)

    def _create_nypl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.nypl_fetcher import NYPLFetcher
        return NYPLFetcher(profile, uri_base, config_file)

    def _create_nara_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.nara_fetcher import NARAFetcher
        return NARAFetcher(profile, uri_base, config_file)

    def _create_edan_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.edan_fetcher import EDANFetcher
        return EDANFetcher(profile, uri_base, config_file)

    def _create_mwdl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.mwdl_fetcher import MWDLFetcher
        return MWDLFetcher(profile, uri_base, config_file)

    def _create_getty_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.getty_fetcher import GettyFetcher
        return GettyFetcher(profile, uri_base, config_file)

    def _create_hathi_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.hathi_fetcher import HathiFetcher
        return HathiFetcher(profile, uri_base, config_file)

    def _create_oai_verbs_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.oai_verbs_fetcher import OAIVerbsFetcher
        return OAIVerbsFetcher(profile, uri_base, config_file)

    def _create_mdl_api_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.mdl_api_fetcher import MDLAPIFetcher
        return MDLAPIFetcher(profile, uri_base, config_file)

    def _create_cdl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.cdl_fetcher import CDLFetcher
        return CDLFetcher(profile, uri_base, config_file)

    fetchers = {
        'ia':           lambda p, u, c: _create_ia_fetcher(p, u, c),
        'uva':          lambda p, u, c: _create_uva_fetcher(p, u, c),
        'nypl':         lambda p, u, c: _create_nypl_fetcher(p, u, c),
        'nara':         lambda p, u, c: _create_nara_fetcher(p, u, c),
        'edan':         lambda p, u, c: _create_edan_fetcher(p, u, c),
        'mwdl':         lambda p, u, c: _create_mwdl_fetcher(p, u, c),
        'getty':        lambda p, u, c: _create_getty_fetcher(p, u, c),
        'hathi':        lambda p, u, c: _create_hathi_fetcher(p, u, c),
        'oai_verbs':    lambda p, u, c: _create_oai_verbs_fetcher(p, u, c),
        'mdl':          lambda p, u, c: _create_mdl_api_fetcher(p, u, c),
        'cdl':          lambda p, u, c: _create_cdl_fetcher(p, u, c)
    }

    with open(profile_path, "r") as f:
        profile = json.load(f)
    type = profile.get("type")

    return fetchers.get(type)(profile, uri_base, config_file)
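A possible invocation of create_fetcher, mirroring the main() examples above; the profile path comes from Example 33 and the port lookup from the akara.ini examples, so treat the concrete values as assumptions:

import ConfigParser

config = ConfigParser.ConfigParser()
config.readfp(open("akara.ini"))
uri_base = "http://localhost:" + config.get("Akara", "Port")
# Dispatches on profile["type"]; only the selected fetcher module is imported.
fetcher = create_fetcher("profiles/clemson.pjs", uri_base, "akara.ini")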
Example #36
def test_multiple_ingestions():
    import copy

    with open(DATA) as f:
        data = json.load(f)

    data_deleted = copy.deepcopy(data)
    add_later = []
    for i in range(10):
        add_later.append(data_deleted["records"].pop(2*i))

    data_changed = copy.deepcopy(data_deleted)
    for i in range(5):
        data_changed["records"][3*i]["title"] = "Changed"

    data_added = copy.deepcopy(data_changed)
    data_added["records"] += add_later

    first_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_first = len(dashboard_db_docs)

    second_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_second = len(dashboard_db_docs)

    third_ingestion_doc_id = couch.ingest(data_deleted, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_third = len(dashboard_db_docs)

    fourth_ingestion_doc_id = couch.ingest(data_changed, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_fourth = len(dashboard_db_docs)

    fifth_ingestion_doc_id = couch.ingest(data_added, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_fifth = len(dashboard_db_docs)

    # Second ingestion should have an extra ingestion doc
    assert int(total_dashboard_docs_first) + 1 == int(total_dashboard_docs_second)
    # Third ingestion should have extra ingestion doc + 10 deleted
    assert int(total_dashboard_docs_second) + 11 == int(total_dashboard_docs_third)
    # Fourth ingestion should have extra ingestion doc + 5 changed
    assert int(total_dashboard_docs_third) + 6 == int(total_dashboard_docs_fourth)
    # Fifth ingestion should have extra ingestion doc + 10 added
    assert int(total_dashboard_docs_fourth) + 11 == int(total_dashboard_docs_fifth)
    

    assert couch.dashboard_db.get(first_ingestion_doc_id)["countAdded"] == 244
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(second_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countDeleted"] == 0
    

    assert couch.dashboard_db.get(third_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countDeleted"] == 10

    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countChanged"] == 5
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countAdded"] == 10
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countDeleted"] == 0
Example #37
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Example #38
def create_fetcher(profile_path, uri_base, config_file):
    """
    Given a fetcher type, creates, imports, and instantiates the appropriate
    Fetcher subclass.
    """
    def _create_ia_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.ia_fetcher import IAFetcher
        return IAFetcher(profile, uri_base, config_file)

    def _create_uva_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.uva_fetcher import UVAFetcher
        return UVAFetcher(profile, uri_base, config_file)

    def _create_nypl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.nypl_fetcher import NYPLFetcher
        return NYPLFetcher(profile, uri_base, config_file)

    def _create_nara_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.nara_fetcher import NARAFetcher
        return NARAFetcher(profile, uri_base, config_file)

    def _create_edan_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.edan_fetcher import EDANFetcher
        return EDANFetcher(profile, uri_base, config_file)

    def _create_mwdl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.mwdl_fetcher import MWDLFetcher
        return MWDLFetcher(profile, uri_base, config_file)

    def _create_getty_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.getty_fetcher import GettyFetcher
        return GettyFetcher(profile, uri_base, config_file)

    def _create_hathi_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.hathi_fetcher import HathiFetcher
        return HathiFetcher(profile, uri_base, config_file)

    def _create_oai_verbs_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.oai_verbs_fetcher import OAIVerbsFetcher
        return OAIVerbsFetcher(profile, uri_base, config_file)

    def _create_mdl_api_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.mdl_api_fetcher import MDLAPIFetcher
        return MDLAPIFetcher(profile, uri_base, config_file)

    def _create_cdl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.cdl_fetcher import CDLFetcher
        return CDLFetcher(profile, uri_base, config_file)

    fetchers = {
        'ia': lambda p, u, c: _create_ia_fetcher(p, u, c),
        'uva': lambda p, u, c: _create_uva_fetcher(p, u, c),
        'nypl': lambda p, u, c: _create_nypl_fetcher(p, u, c),
        'nara': lambda p, u, c: _create_nara_fetcher(p, u, c),
        'edan': lambda p, u, c: _create_edan_fetcher(p, u, c),
        'mwdl': lambda p, u, c: _create_mwdl_fetcher(p, u, c),
        'getty': lambda p, u, c: _create_getty_fetcher(p, u, c),
        'hathi': lambda p, u, c: _create_hathi_fetcher(p, u, c),
        'oai_verbs': lambda p, u, c: _create_oai_verbs_fetcher(p, u, c),
        'mdl': lambda p, u, c: _create_mdl_api_fetcher(p, u, c),
        'cdl': lambda p, u, c: _create_cdl_fetcher(p, u, c)
    }

    with open(profile_path, "r") as f:
        profile = json.load(f)
    type = profile.get("type")

    return fetchers.get(type)(profile, uri_base, config_file)
Example #39
def test_multiplex():
    records = [
    {
        "id": "uogbuji",
        "label": "Uche Ogbuji",
        "birthstone": "Topaz",
        "country": "US",
        "mystery_code": 1,
        "type": "Person"
    },
    {
        "id": "emiller",
        "label": "Eric Miller",
        "birthstone": "Agate?",
        "country": "US",
        "mystery_code": 2,
        "type": "Person"
    },
    {
        "id": "mbaker",
        "label": "Mark Baker",
        "country": "US",
        "mystery_code": 3,
        "type": "Person"
    }
    ]

    outf1_handle, outf1_file = tempfile.mkstemp(prefix='exhibit_emitter_test_')
    outf2_handle, outf2_file = tempfile.mkstemp(prefix='exhibit_emitter_test_')

    outf1 = open(outf1_file, 'w')
    outf2 = open(outf2_file, 'w')
    emitter1 = emitter.emitter(outf1)
    emitter2 = emitter.emitter(outf2)
    for rec in records:
        emitter1.send(rec)
        rec2 = { u"id": rec[u"id"] }
        emitter2.send(rec2)
    emitter1.send(emitter.ITEMS_DONE_SIGNAL)
    emitter2.send(emitter.ITEMS_DONE_SIGNAL)
    TYPES1 = {
            "Person" : {
                "mystery_code": { "valueType": "number" },
            }
        }

    emitter1.send(TYPES1)
    emitter2.send(None)
    emitter1.close()
    emitter2.close()
    outf1.close()
    outf2.close()
    result1 = json.load(open(outf1_file, 'r'))
    result2 = json.load(open(outf2_file, 'r'))
    #logging.debug('Result: {0}'.format(repr(result)))

    items1 = result1[u"items"]
    items2 = result2[u"items"]

    #logging.debug('Result: {0}'.format(repr(result)))
    assert items1[0] == records[0]
    assert items1[1] == records[1]
    assert items1[2] == records[2]
    assert items2[0] == { u"id": records[0][u"id"] }
    assert items2[1] == { u"id": records[1][u"id"] }
    assert items2[2] == { u"id": records[2][u"id"] }
    #assert results == None, "Boo! "
    return
Example #40
def test_multiple_ingestions():
    import copy

    with open(DATA) as f:
        data = json.load(f)

    data_deleted = copy.deepcopy(data)
    add_later = []
    for i in range(10):
        add_later.append(data_deleted["items"].pop(2 * i))

    data_changed = copy.deepcopy(data_deleted)
    for i in range(5):
        data_changed["items"][3 * i]["title"] = "Changed"

    data_added = copy.deepcopy(data_changed)
    data_added["items"] += add_later

    first_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [
        doc for doc in couch._query_all_docs(couch.dashboard_db)
    ]
    total_dashboard_docs_first = len(dashboard_db_docs)

    second_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [
        doc for doc in couch._query_all_docs(couch.dashboard_db)
    ]
    total_dashboard_docs_second = len(dashboard_db_docs)

    third_ingestion_doc_id = couch.ingest(data_deleted,
                                          PROVIDER,
                                          json_content=True)
    dashboard_db_docs = [
        doc for doc in couch._query_all_docs(couch.dashboard_db)
    ]
    total_dashboard_docs_third = len(dashboard_db_docs)

    fourth_ingestion_doc_id = couch.ingest(data_changed,
                                           PROVIDER,
                                           json_content=True)
    dashboard_db_docs = [
        doc for doc in couch._query_all_docs(couch.dashboard_db)
    ]
    total_dashboard_docs_fourth = len(dashboard_db_docs)

    fifth_ingestion_doc_id = couch.ingest(data_added,
                                          PROVIDER,
                                          json_content=True)
    dashboard_db_docs = [
        doc for doc in couch._query_all_docs(couch.dashboard_db)
    ]
    total_dashboard_docs_fifth = len(dashboard_db_docs)

    # Second ingestion should have an extra ingestion doc
    assert int(total_dashboard_docs_first) + 1 == int(
        total_dashboard_docs_second)
    # Third ingestion should have extra ingestion doc + 10 deleted
    assert int(total_dashboard_docs_second) + 11 == int(
        total_dashboard_docs_third)
    # Fourth ingestion should have extra ingestion doc + 5 changed
    assert int(total_dashboard_docs_third) + 6 == int(
        total_dashboard_docs_fourth)
    # Fifth ingestion should have extra ingestion doc + 10 added
    assert int(total_dashboard_docs_fourth) + 11 == int(
        total_dashboard_docs_fifth)

    assert couch.dashboard_db.get(first_ingestion_doc_id)["countAdded"] == 244
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(second_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(third_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countDeleted"] == 10

    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countChanged"] == 5
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countAdded"] == 10
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countDeleted"] == 0
Example #41
def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Example #42
def test_multiple_ingestions():
    import copy

    with open(DATA) as f:
        data = json.load(f)

    data_deleted = copy.deepcopy(data)
    add_later = []
    for i in range(10):
        add_later.append(data_deleted.pop(2*i))

    data_changed = copy.deepcopy(data_deleted)
    for i in range(5):
        data_changed[3*i]["title"] = "Changed"

    data_added = copy.deepcopy(data_changed)
    data_added += add_later

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    first_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(1)]
    first_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db) if
                                   doc.get("type")])

    assert total_dashboard_records == first_ingestion_dashboard_items + \
                                      ingestions
    
    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    second_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(2)]
    second_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db) if
                                   doc.get("type")])

    assert total_dashboard_records == first_ingestion_dashboard_items + \
                                      second_ingestion_dashboard_items + \
                                      ingestions
    # Verify second backup exists
    second_backup = couch.dashboard_db[second_ingestion_doc_id]["backupDB"]
    assert second_backup in couch.server

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    third_ingestion_doc_id = couch.ingest(data_deleted, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(3)]
    third_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db) if
                                   doc.get("type")])

    assert total_dashboard_records == first_ingestion_dashboard_items + \
                                      second_ingestion_dashboard_items + \
                                      third_ingestion_dashboard_items + \
                                      ingestions
    # Verify second and third backups exist
    third_backup = couch.dashboard_db[third_ingestion_doc_id]["backupDB"]
    assert second_backup in couch.server
    assert third_backup in couch.server

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    fourth_ingestion_doc_id = couch.ingest(data_changed, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(4)]
    fourth_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db) if
                                   doc.get("type")])

    assert total_dashboard_records == second_ingestion_dashboard_items + \
                                      third_ingestion_dashboard_items + \
                                      fourth_ingestion_dashboard_items + \
                                      ingestions
    # Verify second, third, and fourth backups exist
    fourth_backup = couch.dashboard_db[fourth_ingestion_doc_id]["backupDB"]
    assert second_backup in couch.server
    assert third_backup in couch.server
    assert fourth_backup in couch.server

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    fifth_ingestion_doc_id = couch.ingest(data_added, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(5)]
    fifth_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db) if
                                   doc.get("type")])

    assert total_dashboard_records == third_ingestion_dashboard_items + \
                                      fourth_ingestion_dashboard_items + \
                                      fifth_ingestion_dashboard_items + \
                                      ingestions
    # Verify second backup was removed
    assert second_backup not in couch.server
    # Verify third, fourth, and fifth backups exist
    fifth_backup = couch.dashboard_db[fifth_ingestion_doc_id]["backupDB"]
    assert third_backup in couch.server
    assert fourth_backup in couch.server
    assert fifth_backup in couch.server

    # Verify count fields for each ingestion document
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countAdded"] == 243
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(second_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countDeleted"] == 0
    
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countDeleted"] == 10

    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countChanged"] == 5
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countAdded"] == 10
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countDeleted"] == 0