def test_webfeedjson():
    from amara.thirdparty import json
    url = server() + "akara.webfeed.json?url=http://feeds.delicious.com/v2/rss/recent%3Fmin=1%26count=15"
    response = urlopen(url)
    results = json.load(response)
    print results

def test_get_last_ingestion_document():
    with open(DATA) as f:
        data = json.load(f)

    # Each ingestion bumps the sequence number by one.
    for sequence in range(1, 5):
        couch.ingest(data, PROVIDER, json_content=True)
        ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
        assert ingestion_doc["ingestionSequence"] == sequence

    # Rolling back to an earlier sequence does not rewind the latest
    # ingestion document; the next ingestion continues from the last
    # sequence number.
    couch.rollback(PROVIDER, 2)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 4

    couch.ingest(data, PROVIDER, json_content=True)
    ingestion_doc = couch._get_last_ingestion_doc_for(PROVIDER)
    assert ingestion_doc["ingestionSequence"] == 5

def _make_log2json_request(query_args):
    from amara.thirdparty import json
    url = server() + "akara.wwwlog.json" + query_args
    req = urllib2.Request(url)
    req.add_header("Content-Type", "text/plain")
    response = urllib2.urlopen(req, _apache_query_data)
    return json.load(response)

def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = "akara.ini"
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if latest_ingestion_doc and \
       getprop(latest_ingestion_doc, "delete_process/status") != "complete":
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path)
    logger.debug("Ingestion document %s created." % ingestion_document_id)

    return ingestion_document_id

def get_provider_id(profile_path):
    with open(profile_path, "r") as f:
        try:
            profile = json.load(f)
            return profile['contributor']['@id']
        except Exception, err:
            print "Error, could not load profile in %s: %s" % (__name__, err)
            return None

def test_atom_json():
    from amara.thirdparty import json
    url = server() + "akara.atom.json?url=http://zepheira.com/feed/atom/"
    response = urlopen(url)
    results = json.load(response)
    items = results["items"]
    for item in items:
        assert "title" in item

def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except Exception, e:
            print "Error reading profile: %s" % e
            return False

def test_rdfa2json():
    from amara.thirdparty import json
    url = server() + "akara.rdfa.json?url=http://zepheira.com/"
    results = json.load(urllib2.urlopen(url))
    for item in results["items"]:
        if "canonical" in item:
            assert "zepheira.com" in item["canonical"]
            break
    else:
        raise AssertionError("Could not find 'canonical'")

def test_rdfa2json_with_date():
    from amara.thirdparty import json
    url = server() + "akara.rdfa.json?url=http://www.myspace.com/parishilton"
    results = json.load(urllib2.urlopen(url))
    for item in results["items"]:
        if "canonical" in item:
            break
    else:
        raise AssertionError("Could not find 'canonical'")

def test_basic():
    "test ..."
    records = [
        {
            "id": "uogbuji",
            "label": "Uche Ogbuji",
            "birthstone": "Topaz",
            "country": "US",
            "mystery_code": 1,
            "type": "Person"
        },
        {
            "id": "emiller",
            "label": "Eric Miller",
            "birthstone": "Agate?",
            "country": "US",
            "mystery_code": 2,
            "type": "Person"
        },
        {
            "id": "mbaker",
            "label": "Mark Baker",
            "country": "US",
            "mystery_code": 3,
            "type": "Person"
        }
    ]

    outf_handle, outf_file = tempfile.mkstemp(prefix='exhibit_emitter_test_')
    outf = open(outf_file, 'w')

    emitter1 = emitter.emitter(outf)
    for rec in records:
        emitter1.send(rec)
    emitter1.send(emitter.ITEMS_DONE_SIGNAL)

    TYPES1 = {
        "Person": {
            "mystery_code": {"valueType": "number"},
        }
    }
    emitter1.send(TYPES1)
    emitter1.close()
    outf.close()

    result = json.load(open(outf_file, 'r'))
    #logging.debug('Result: {0}'.format(repr(result)))
    items = result[u"items"]
    assert items[0] == records[0]
    assert items[1] == records[1]
    assert items[2] == records[2]
    return

def main(argv):
    print "WARNING: Bulk data is now exported/maintained using elasticdump."
    print "See https://github.com/dpla/automation/blob/develop/ansible/roles/exporter/files/export-provider.sh"

    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    if getprop(ingestion_doc, "dashboard_cleanup_process/status") != "complete":
        print "Error, dashboard cleanup process did not complete"
        return -1

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": "running",
        "upload_bulk_data_process/start_time": iso_utc_with_tz(),
        "upload_bulk_data_process/end_time": None,
        "upload_bulk_data_process/error": None,
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    # TODO: as in the fetch_records.py script, we need profile in this scope
    # and the file shouldn't have to be opened again
    with open(ingestion_doc["profile_path"], "r") as profile:
        contributor = getprop(json.load(profile), "contributor/name")

    resp = export_database.main([None, "source", contributor, "upload"])
    if resp == -1:
        status = "error"
        error_msg = "Error uploading bulk data"
    else:
        status = "complete"
        error_msg = None

    # Update ingestion document
    kwargs = {
        "upload_bulk_data_process/status": status,
        "upload_bulk_data_process/error": error_msg,
        "upload_bulk_data_process/end_time": iso_utc_with_tz()
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        print "Error updating ingestion document " + ingestion_doc["_id"]
        return -1

    return 0 if status == "complete" else -1

def parse_documents(documents):
    """
    Parses the provided JSON string into an object.

    Arguments:
        documents String - documents from CouchDB in string format

    Returns:
        Object with parsed JSON.
    """
    io = StringIO(documents)
    return json.load(io)

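# A minimal usage sketch for parse_documents() above; the JSON string below
# is a hypothetical stand-in for a CouchDB response body.
docs = parse_documents('{"rows": [{"id": "doc1"}, {"id": "doc2"}]}')
assert docs["rows"][0]["id"] == "doc1"
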
def main(argv):
    couch = Couch()
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    with open(args.profile_path, "r") as f:
        profile = json.load(f)
    provider = profile.get("name")

    if confirm_deletion(provider):
        couch._delete_all_provider_documents(provider)
    else:
        return False

def lucky_google(q=None):
    '''
    A simple and fun transform to return the first hit for a given search

    Sample request:
    * curl "http://localhost:8880/akara.luckygoogle?q=zepheira"
    '''
    if q is None:
        raise AssertionError(Q_REQUIRED)
    query = urllib.urlencode({'q': q})
    url = 'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query
    json_content = json.load(urllib.urlopen(url))
    results = json_content['responseData']['results']
    return results[0]['url'].encode('utf-8') + '\n'

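# A minimal usage sketch for lucky_google() above, assuming the wrapped
# Google AJAX Search endpoint is reachable; the query term is only an
# example.
print lucky_google(q='zepheira')
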
def __call__(self, place):
    query = urllib.urlencode(dict(username=self._user,
                                  q=place.encode('utf-8'),
                                  maxRows='2'))
    req = self._servicebase + query
    stream = urllib2.urlopen(req)
    resultset = json.load(stream)
    if resultset.get(u'geonames'):
        result = resultset[u'geonames'][0]
        ll = "{lat},{lng}".format(**result)
        self._logger.debug(u"geolookup via geonames {0} yields: {1}".format(
            self._servicebase + query, repr((place, ll))))
        return {place: ll} if ll else {}
    else:
        return {}

def ingest(self, file, provider, json_content=None):
    if not json_content:
        with open(file) as f:
            content = json.load(f)
    else:
        content = file

    ingestion_doc_id = self.create_ingestion_doc_and_backup_db(provider)

    url = server() + "enrich"
    resp, content = H.request(url, "POST", body=json.dumps(content),
                              headers=headers)
    docs = json.loads(content)

    self.process_and_post_to_dpla(docs, ingestion_doc_id)
    self.process_deleted_docs(ingestion_doc_id)
    return ingestion_doc_id

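# A minimal sketch of the enrich round-trip performed by ingest() above,
# assuming a local Akara "enrich" endpoint; the base URL, headers, and sample
# payload are hypothetical stand-ins for server() and the module-level
# `headers`.
import json
import httplib2

h = httplib2.Http()
payload = {"records": [{"id": "item1", "title": "Sample"}]}  # hypothetical
resp, content = h.request("http://localhost:8889/enrich", "POST",
                          body=json.dumps(payload),
                          headers={"Content-Type": "application/json"})
enriched_docs = json.loads(content)
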
def update_document(body, ctype):
    from StringIO import StringIO
    io = StringIO(body)
    parsed_doc = json.load(io)
    document_id = parsed_doc[u"id"]
    document = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document,
                              headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " %
                     (document, document_id, ))

def test_ical2json():
    from amara.thirdparty import json
    ical_filename = os.path.join(RESOURCE_DIR, "icalendar_test.ics")
    url = server() + "ical.json"
    req = urllib2.Request(url)
    req.add_header('Content-Type', 'text/calendar')
    data = open(ical_filename).read()
    response = urllib2.urlopen(req, data)
    results = json.load(response)
    items = results["items"]
    assert len(items) == 2
    assert items[0]["summary"] == "Bastille Day Party"
    assert items[1]["summary"] == "Akara test"

def create_fetcher(profile_path, uri_base, config_file):
    # Dispatch table mapping a profile "type" to its Fetcher class.
    fetcher_types = {
        'ia': IAFetcher,
        'uva': UVAFetcher,
        'mwdl': MWDLFetcher,
        'nypl': NYPLFetcher,
        'nara': NARAFetcher,
        'edan': EDANFetcher,
        'hathi': HathiFetcher,
        'oai_verbs': OAIVerbsFetcher,
    }

    with open(profile_path, "r") as f:
        profile = json.load(f)
    fetcher_type = profile.get("type")
    fetcher = fetcher_types.get(fetcher_type)(profile, uri_base, config_file)
    return fetcher

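# A minimal usage sketch for create_fetcher() above; the profile path,
# uri_base, and config file below are hypothetical examples.
fetcher = create_fetcher("profiles/oai_provider.pjs",  # hypothetical profile
                         "http://localhost:8889",      # hypothetical uri_base
                         "akara.ini")
print type(fetcher).__name__
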
def update_document(body, ctype):
    logger.debug(body)
    from StringIO import StringIO
    io = StringIO(body)
    parsed_doc = json.load(io)
    document_id = parsed_doc[u"id"]
    document = body

    logger.debug("Storing the document: " + document_id)
    import httplib2
    h = httplib2.Http()
    h.force_exception_as_status_code = True
    url = join(COUCH_DATABASE, document_id)
    resp, content = h.request(url, 'PUT', body=document,
                              headers=COUCH_AUTH_HEADER)
    if str(resp.status).startswith('2'):
        return content
    else:
        logger.error("Couldn't store the document %s with the id: %s. " %
                     (document, document_id, ))

def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    config = ConfigParser.ConfigParser()
    config.readfp(open(config_file))
    uri_base = "http://localhost:" + config.get("Akara", "Port")

    with open(args.profile_path, "r") as f:
        try:
            profile = json.load(f)
        except:
            print "Error, could not load profile in %s" % __name__
            return None
    provider = profile["name"]
    thresholds = profile["thresholds"]
    fetcher_threads = profile.get("fetcher_threads") or 1

    couch = Couch()
    latest_ingestion_doc = couch._get_last_ingestion_doc_for(provider)
    if (latest_ingestion_doc and
        getprop(latest_ingestion_doc,
                "dashboard_cleanup_process/status") != "complete"):
        error_msg = "Error, last ingestion did not complete. Review " + \
                    "dashboard document %s for errors." % \
                    latest_ingestion_doc["_id"]
        logger.error(error_msg)
        print error_msg
        return None

    ingestion_document_id = couch._create_ingestion_document(provider,
                                                             uri_base,
                                                             args.profile_path,
                                                             thresholds,
                                                             fetcher_threads)
    msg = "Ingestion document %s created." % ingestion_document_id
    logger.debug(msg)
    print msg

    return ingestion_document_id

def ingest(self, file, provider, json_content=None):
    if not json_content:
        with open(file) as f:
            content = json.load(f)
    else:
        content = file

    uri_base = server()[:-1]
    ingestion_doc_id = self._create_ingestion_document(
        provider, uri_base, "profiles/clemson.pjs")
    ingestion_doc = self.dashboard_db[ingestion_doc_id]

    url = server() + "enrich"
    body = json.dumps(content)
    resp, content = H.request(url, "POST", body=body, headers=headers)
    data = json.loads(content)
    docs = data["enriched_records"]

    self._back_up_data(ingestion_doc)
    self.process_and_post_to_dpla(docs, ingestion_doc)
    self.process_deleted_docs(ingestion_doc)
    return ingestion_doc_id

def item_page(self, url, logtag="Requesting item at URL: {0}"):
    if self._proxy:
        url = "{0}?url={1}".format(self._proxy, quote(url))
    self._logger.debug(logtag.format(url))
    start_t = time.time()
    resp, content = self._h.request(url)
    retrieved_t = time.time()
    self._logger.debug("Retrieved in {0}s".format(retrieved_t - start_t))
    cachekey = hashlib.md5(content).hexdigest()
    self._logger.debug('MD5 Hash of HTTP body: {0}'.format(cachekey))
    if self._cachedir:
        try:
            json_stream = open(os.path.join(self._cachedir,
                                            cachekey + '.extract.js'))
            cached = json.load(json_stream)
            self._logger.debug('Loaded from cache: {0}'.format(cachekey))
            doc = None
        except (IOError, ValueError):
            doc = htmlparse(content)
            cached = None
    else:
        # No cache directory configured; always parse the fresh response.
        doc = htmlparse(content)
        cached = None
    parsed_t = time.time()
    self._logger.debug("Parsed in {0}s".format(parsed_t - retrieved_t))
    return resp, doc, cachekey, cached

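# A minimal sketch of the cache-key scheme used by item_page() above: the MD5
# digest of the raw HTTP body names the cached extraction result. The values
# for "content" and "cachedir" here are hypothetical examples.
import hashlib
import os

content = "<html>...</html>"     # example HTTP body
cachedir = "/tmp/extract-cache"  # hypothetical cache directory
cachekey = hashlib.md5(content).hexdigest()
cache_path = os.path.join(cachedir, cachekey + '.extract.js')
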
def create_fetcher(profile_path, uri_base, config_file):
    """
    Given a fetcher type, creates, imports, and instantiates the appropriate
    Fetcher subclass.
    """
    def _create_ia_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.ia_fetcher import IAFetcher
        return IAFetcher(profile, uri_base, config_file)

    def _create_uva_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.uva_fetcher import UVAFetcher
        return UVAFetcher(profile, uri_base, config_file)

    def _create_nypl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.nypl_fetcher import NYPLFetcher
        return NYPLFetcher(profile, uri_base, config_file)

    def _create_nara_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.nara_fetcher import NARAFetcher
        return NARAFetcher(profile, uri_base, config_file)

    def _create_edan_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.edan_fetcher import EDANFetcher
        return EDANFetcher(profile, uri_base, config_file)

    def _create_mwdl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.mwdl_fetcher import MWDLFetcher
        return MWDLFetcher(profile, uri_base, config_file)

    def _create_getty_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.getty_fetcher import GettyFetcher
        return GettyFetcher(profile, uri_base, config_file)

    def _create_hathi_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.hathi_fetcher import HathiFetcher
        return HathiFetcher(profile, uri_base, config_file)

    def _create_oai_verbs_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.oai_verbs_fetcher import OAIVerbsFetcher
        return OAIVerbsFetcher(profile, uri_base, config_file)

    def _create_mdl_api_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.mdl_api_fetcher import MDLAPIFetcher
        return MDLAPIFetcher(profile, uri_base, config_file)

    def _create_cdl_fetcher(profile, uri_base, config_file):
        from dplaingestion.fetchers.cdl_fetcher import CDLFetcher
        return CDLFetcher(profile, uri_base, config_file)

    # Dispatch table mapping a profile "type" to its creator; the imports
    # stay inside the creators so only the needed fetcher module is loaded.
    fetchers = {
        'ia': _create_ia_fetcher,
        'uva': _create_uva_fetcher,
        'nypl': _create_nypl_fetcher,
        'nara': _create_nara_fetcher,
        'edan': _create_edan_fetcher,
        'mwdl': _create_mwdl_fetcher,
        'getty': _create_getty_fetcher,
        'hathi': _create_hathi_fetcher,
        'oai_verbs': _create_oai_verbs_fetcher,
        'mdl': _create_mdl_api_fetcher,
        'cdl': _create_cdl_fetcher
    }

    with open(profile_path, "r") as f:
        profile = json.load(f)
    fetcher_type = profile.get("type")
    return fetchers.get(fetcher_type)(profile, uri_base, config_file)

def test_multiple_ingestions():
    import copy
    with open(DATA) as f:
        data = json.load(f)

    data_deleted = copy.deepcopy(data)
    add_later = []
    for i in range(10):
        add_later.append(data_deleted["records"].pop(2 * i))

    data_changed = copy.deepcopy(data_deleted)
    for i in range(5):
        data_changed["records"][3 * i]["title"] = "Changed"

    data_added = copy.deepcopy(data_changed)
    data_added["records"] += add_later

    first_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_first = len(dashboard_db_docs)

    second_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_second = len(dashboard_db_docs)

    third_ingestion_doc_id = couch.ingest(data_deleted, PROVIDER,
                                          json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_third = len(dashboard_db_docs)

    fourth_ingestion_doc_id = couch.ingest(data_changed, PROVIDER,
                                           json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_fourth = len(dashboard_db_docs)

    fifth_ingestion_doc_id = couch.ingest(data_added, PROVIDER,
                                          json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_fifth = len(dashboard_db_docs)

    # Second ingestion should have an extra ingestion doc
    assert total_dashboard_docs_first + 1 == total_dashboard_docs_second
    # Third ingestion should have extra ingestion doc + 10 deleted
    assert total_dashboard_docs_second + 11 == total_dashboard_docs_third
    # Fourth ingestion should have extra ingestion doc + 5 changed
    assert total_dashboard_docs_third + 6 == total_dashboard_docs_fourth
    # Fifth ingestion should have extra ingestion doc + 10 added
    assert total_dashboard_docs_fourth + 11 == total_dashboard_docs_fifth

    assert couch.dashboard_db.get(first_ingestion_doc_id)["countAdded"] == 244
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(second_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(third_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countDeleted"] == 10

    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countChanged"] == 5
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countAdded"] == 10
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countDeleted"] == 0

def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    # We need profile in this scope, and the file shouldn't have to
    # be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }

    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1

    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1

        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"

    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1

def test_multiplex():
    records = [
        {
            "id": "uogbuji",
            "label": "Uche Ogbuji",
            "birthstone": "Topaz",
            "country": "US",
            "mystery_code": 1,
            "type": "Person"
        },
        {
            "id": "emiller",
            "label": "Eric Miller",
            "birthstone": "Agate?",
            "country": "US",
            "mystery_code": 2,
            "type": "Person"
        },
        {
            "id": "mbaker",
            "label": "Mark Baker",
            "country": "US",
            "mystery_code": 3,
            "type": "Person"
        }
    ]

    outf1_handle, outf1_file = tempfile.mkstemp(prefix='exhibit_emitter_test_')
    outf2_handle, outf2_file = tempfile.mkstemp(prefix='exhibit_emitter_test_')
    outf1 = open(outf1_file, 'w')
    outf2 = open(outf2_file, 'w')

    emitter1 = emitter.emitter(outf1)
    emitter2 = emitter.emitter(outf2)
    for rec in records:
        emitter1.send(rec)
        rec2 = {u"id": rec[u"id"]}
        emitter2.send(rec2)
    emitter1.send(emitter.ITEMS_DONE_SIGNAL)
    emitter2.send(emitter.ITEMS_DONE_SIGNAL)

    TYPES1 = {
        "Person": {
            "mystery_code": {"valueType": "number"},
        }
    }
    emitter1.send(TYPES1)
    emitter2.send(None)
    emitter1.close()
    emitter2.close()
    outf1.close()
    outf2.close()

    result1 = json.load(open(outf1_file, 'r'))
    result2 = json.load(open(outf2_file, 'r'))
    items1 = result1[u"items"]
    items2 = result2[u"items"]
    assert items1[0] == records[0]
    assert items1[1] == records[1]
    assert items1[2] == records[2]
    assert items2[0] == {u"id": records[0][u"id"]}
    assert items2[1] == {u"id": records[1][u"id"]}
    assert items2[2] == {u"id": records[2][u"id"]}
    return

def test_multiple_ingestions():
    import copy
    with open(DATA) as f:
        data = json.load(f)

    data_deleted = copy.deepcopy(data)
    add_later = []
    for i in range(10):
        add_later.append(data_deleted["items"].pop(2 * i))

    data_changed = copy.deepcopy(data_deleted)
    for i in range(5):
        data_changed["items"][3 * i]["title"] = "Changed"

    data_added = copy.deepcopy(data_changed)
    data_added["items"] += add_later

    first_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_first = len(dashboard_db_docs)

    second_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_second = len(dashboard_db_docs)

    third_ingestion_doc_id = couch.ingest(data_deleted, PROVIDER,
                                          json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_third = len(dashboard_db_docs)

    fourth_ingestion_doc_id = couch.ingest(data_changed, PROVIDER,
                                           json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_fourth = len(dashboard_db_docs)

    fifth_ingestion_doc_id = couch.ingest(data_added, PROVIDER,
                                          json_content=True)
    dashboard_db_docs = [doc for doc in
                         couch._query_all_docs(couch.dashboard_db)]
    total_dashboard_docs_fifth = len(dashboard_db_docs)

    # Second ingestion should have an extra ingestion doc
    assert total_dashboard_docs_first + 1 == total_dashboard_docs_second
    # Third ingestion should have extra ingestion doc + 10 deleted
    assert total_dashboard_docs_second + 11 == total_dashboard_docs_third
    # Fourth ingestion should have extra ingestion doc + 5 changed
    assert total_dashboard_docs_third + 6 == total_dashboard_docs_fourth
    # Fifth ingestion should have extra ingestion doc + 10 added
    assert total_dashboard_docs_fourth + 11 == total_dashboard_docs_fifth

    assert couch.dashboard_db.get(first_ingestion_doc_id)["countAdded"] == 244
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(second_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(third_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countDeleted"] == 10

    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countChanged"] == 5
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countAdded"] == 10
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countDeleted"] == 0

def main(argv):
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    # We need profile in this scope, and the file shouldn't have to
    # be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }

    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1

    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1

        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # Update ingestion document
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"

    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1

def test_multiple_ingestions():
    import copy
    with open(DATA) as f:
        data = json.load(f)

    data_deleted = copy.deepcopy(data)
    add_later = []
    for i in range(10):
        add_later.append(data_deleted.pop(2 * i))

    data_changed = copy.deepcopy(data_deleted)
    for i in range(5):
        data_changed[3 * i]["title"] = "Changed"

    data_added = copy.deepcopy(data_changed)
    data_added += add_later

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    first_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(1)]
    first_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in
                      couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db)
                                   if doc.get("type")])
    assert total_dashboard_records == first_ingestion_dashboard_items + \
                                      ingestions

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    second_ingestion_doc_id = couch.ingest(data, PROVIDER, json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(2)]
    second_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in
                      couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db)
                                   if doc.get("type")])
    assert total_dashboard_records == first_ingestion_dashboard_items + \
                                      second_ingestion_dashboard_items + \
                                      ingestions

    # Verify second backup exists
    second_backup = couch.dashboard_db[second_ingestion_doc_id]["backupDB"]
    assert second_backup in couch.server

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    third_ingestion_doc_id = couch.ingest(data_deleted, PROVIDER,
                                          json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(3)]
    third_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in
                      couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db)
                                   if doc.get("type")])
    assert total_dashboard_records == first_ingestion_dashboard_items + \
                                      second_ingestion_dashboard_items + \
                                      third_ingestion_dashboard_items + \
                                      ingestions

    # Verify second and third backups exist
    third_backup = couch.dashboard_db[third_ingestion_doc_id]["backupDB"]
    assert second_backup in couch.server
    assert third_backup in couch.server

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    fourth_ingestion_doc_id = couch.ingest(data_changed, PROVIDER,
                                           json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(4)]
    fourth_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in
                      couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db)
                                   if doc.get("type")])
    assert total_dashboard_records == second_ingestion_dashboard_items + \
                                      third_ingestion_dashboard_items + \
                                      fourth_ingestion_dashboard_items + \
                                      ingestions

    # Verify second, third, and fourth backups exist
    fourth_backup = couch.dashboard_db[fourth_ingestion_doc_id]["backupDB"]
    assert second_backup in couch.server
    assert third_backup in couch.server
    assert fourth_backup in couch.server

    # Verify only item-level documents for the most recent three ingestions
    # exist in the dashboard database
    fifth_ingestion_doc_id = couch.ingest(data_added, PROVIDER,
                                          json_content=True)
    dashboard_docs = [doc for doc in
                      couch._query_records_by_ingestion_sequence_include_status(5)]
    fifth_ingestion_dashboard_items = len(dashboard_docs)
    ingestions = len([doc for doc in
                      couch._query_all_provider_ingestion_docs(PROVIDER)])
    # Exclude design docs
    total_dashboard_records = len([doc for doc in
                                   couch._query_all_docs(couch.dashboard_db)
                                   if doc.get("type")])
    assert total_dashboard_records == third_ingestion_dashboard_items + \
                                      fourth_ingestion_dashboard_items + \
                                      fifth_ingestion_dashboard_items + \
                                      ingestions

    # Verify second backup was removed
    assert second_backup not in couch.server

    # Verify third, fourth, and fifth backups exist
    fifth_backup = couch.dashboard_db[fifth_ingestion_doc_id]["backupDB"]
    assert third_backup in couch.server
    assert fourth_backup in couch.server
    assert fifth_backup in couch.server

    # Verify count fields for each ingestion document
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countAdded"] == 243
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(first_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(second_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(second_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(third_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(third_ingestion_doc_id)["countDeleted"] == 10

    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countAdded"] == 0
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countChanged"] == 5
    assert couch.dashboard_db.get(fourth_ingestion_doc_id)["countDeleted"] == 0

    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countAdded"] == 10
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countChanged"] == 0
    assert couch.dashboard_db.get(fifth_ingestion_doc_id)["countDeleted"] == 0