def test_all_oai_verb_fetchers():
    """Smoke-test one representative, unrestricted profile per OAI
    metadata format: build the fetcher and check that the first fetched
    response has records and no errors.
    """
    # One profile per metadata format; none of these are access-restricted.
    profiles = [
        "harvard.pjs",    # mods
        "clemson.pjs",    # qdc
        "texas.pjs",      # untl
        "uiuc.pjs",       # oai_qdc
        "uiuc_book.pjs",  # marc
        "artstor.pjs"     # oai_dc
    ]
    for profile in profiles:
        try:
            profile_path = "profiles/" + profile
            with open(profile_path, "r") as profile_file:
                profile_data = json.load(profile_file)
            if profile_data.get("type") != "oai_verbs":
                continue
            fetcher = create_fetcher(profile_path, uri_base, config_file)
            assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
            for response in fetcher.fetch_all_data():
                if response["errors"]:
                    print >> sys.stderr, response["errors"]
                assert not response["errors"]
                assert response["records"]
                break  # the first response is enough for a smoke test
        except Exception as e:
            print >> sys.stderr, "\nError with %s: %s" % (profile, e.message)
            assert False
def test_all_oai_verb_fetchers():
    """Exercise each OAI-verbs fetcher flavor through one representative,
    unrestricted profile, validating the first response of each.
    """
    # (profile file, metadata format) -- kept in fetch order.
    profile_formats = [
        ("harvard.pjs", "mods"),
        ("clemson.pjs", "qdc"),
        ("texas.pjs", "untl"),
        ("uiuc.pjs", "oai_qdc"),
        ("uiuc_book.pjs", "marc"),
        ("artstor.pjs", "oai_dc"),
    ]
    for profile, _format in profile_formats:
        try:
            profile_path = "profiles/" + profile
            with open(profile_path, "r") as f:
                prof = json.loads(f.read())
            if prof.get("type") == "oai_verbs":
                fetcher = create_fetcher(profile_path, uri_base, config_file)
                assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
                for response in fetcher.fetch_all_data():
                    errors = response["errors"]
                    if errors:
                        print >> sys.stderr, errors
                    assert not errors
                    assert response["records"]
                    break  # only the first response matters here
        except Exception as e:
            print >> sys.stderr, "\nError with %s: %s" % (profile, e.message)
            assert False
def test_oai_fetcher_with_blacklist():
    """Blacklisted sets must be skipped: after a full fetch, the fetched
    collections must cover every non-blacklisted Clemson set.
    """
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"

    fetcher.blacklist = ["ctm", "spg", "jfb", "jbt", "pre", "dnc", "scp",
                         "swl", "weg", "ghs", "wsb", "mbe", "gcj", "cwp",
                         "nev", "hfp", "big"]
    # Drain the fetcher; only fetcher.collections is inspected afterward.
    for _response in fetcher.fetch_all_data():
        pass

    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    list_sets = xmltodict.parse(urlopen(url).read())
    scdl_all_sets = [entry["setSpec"]
                     for entry in list_sets["OAI-PMH"]["ListSets"]["set"]]
    expected = list(set(scdl_all_sets) - set(fetcher.blacklist))
    missing = [s for s in expected if s not in fetcher.collections]
    assert missing == []
def test_absolute_url_fetcher_mwdl():
    """The MWDL profile must yield an MWDLFetcher whose first response
    carries records and no errors.
    """
    fetcher = create_fetcher("profiles/mwdl.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "MWDLFetcher"
    for resp in fetcher.fetch_all_data(set=None):
        assert not resp["errors"]
        assert resp["records"]
        break  # first response only
def test_absolute_url_fetcher_uva2():
    """The Virginia books profile must yield a UVAFetcher whose first
    response carries records and no errors.
    """
    fetcher = create_fetcher("profiles/virginia_books.pjs", uri_base,
                             config_file)
    assert fetcher.__class__.__name__ == "UVAFetcher"
    for resp in fetcher.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]
        break  # first response only
def test_absolute_url_fetcher_nypl():
    """The NYPL profile must yield a NYPLFetcher; the first response for
    the given collection UUID must carry records and no errors.
    """
    # NOTE(review): passes "akara.ini" literally rather than the
    # module-level config_file -- confirm this is intentional.
    fetcher = create_fetcher("profiles/nypl.pjs", uri_base, "akara.ini")
    assert fetcher.__class__.__name__ == "NYPLFetcher"
    collection_uuid = "cd4c3430-c6cb-012f-ccf3-58d385a7bc34"
    for response in fetcher.fetch_all_data(collection_uuid):
        assert not response["errors"]
        assert response["records"]
        break  # first response only
def test_absolute_url_fetcher_ia():
    """The Internet Archive profile must yield an IAFetcher whose first
    response carries records and no errors.
    """
    fetcher = create_fetcher("profiles/ia.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "IAFetcher"
    fetcher.endpoint_url_params["rows"] = 10  # keep the request small
    for resp in fetcher.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]
        break  # first response only
def test_file_fetcher_smithsonian():
    """The Smithsonian profile must yield an EDANFetcher that can read
    local fixture files; the first response must be error-free.
    """
    fetcher = create_fetcher("profiles/smithsonian.pjs", uri_base,
                             config_file)
    assert fetcher.__class__.__name__ == "EDANFetcher"
    # Point the fetcher at local fixture data instead of the live endpoint.
    fetcher.endpoint_url = "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for resp in fetcher.fetch_all_data():
        assert resp["errors"] == []
        assert resp["records"]
        break  # first response only
def test_absolute_url_fetcher_nypl():
    """Verify the NYPL profile builds a NYPLFetcher and that fetching the
    given collection UUID yields an error-free first response.
    """
    profile_path = "profiles/nypl.pjs"
    # NOTE(review): "akara.ini" is hardcoded here instead of the
    # module-level config_file -- confirm this is intentional.
    nypl_fetcher = create_fetcher(profile_path, uri_base, "akara.ini")
    assert nypl_fetcher.__class__.__name__ == "NYPLFetcher"
    responses = nypl_fetcher.fetch_all_data(
        "cd4c3430-c6cb-012f-ccf3-58d385a7bc34"
    )
    for response in responses:
        assert not response["errors"]
        assert response["records"]
        break  # first response only
def test_oai_fetcher_invalid_set():
    """A nonexistent set spec must produce only error responses, no
    records, and leave the fetcher's collections empty.
    """
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"

    fetcher.sets = ["banana"]  # a set spec that does not exist
    for resp in fetcher.fetch_all_data():
        assert resp["errors"]
        assert not resp["records"]

    assert fetcher.collections.keys() == []
def test_oai_fetcher_all_sets():
    """Fetching with no set restriction must cover every set advertised by
    the Clemson OAI ListSets endpoint.
    """
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"

    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    list_sets = xmltodict.parse(urlopen(url).read())
    scdl_all_sets = [entry["setSpec"]
                     for entry in list_sets["OAI-PMH"]["ListSets"]["set"]]

    for response in fetcher.fetch_all_data():
        assert not response["errors"]
        assert response["records"]

    missing = [s for s in scdl_all_sets if s not in fetcher.collections]
    assert missing == []
def test_oai_fetcher_all_sets():
    """An unrestricted fetch must visit every set listed by Clemson's
    OAI-PMH ListSets verb.
    """
    profile_path = "profiles/clemson.pjs"
    scdl_fetcher = create_fetcher(profile_path, uri_base, config_file)
    assert scdl_fetcher.__class__.__name__ == "OAIVerbsFetcher"

    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    parsed = xmltodict.parse(urlopen(url).read())
    advertised = [s["setSpec"] for s in parsed["OAI-PMH"]["ListSets"]["set"]]

    for response in scdl_fetcher.fetch_all_data():
        assert not response["errors"]
        assert response["records"]

    diff = [spec for spec in advertised
            if spec not in scdl_fetcher.collections]
    assert diff == []
def queue_and_errors(num_threads, in_doc, config_file, fetch_dir, stats):
    """Create the shared work queue plus error lists and start
    ``num_threads`` daemonized FetcherThreads consuming from the queue.

    Returns a (queue, thread_errors, document_errors) tuple; the two
    error lists are shared with (and appended to by) the worker threads.
    """
    work_queue = Queue.Queue(num_threads)
    t_errors = []
    d_errors = []
    workers = []
    for _ in range(num_threads):
        # Each worker gets its own fetcher built from the ingestion doc.
        worker_fetcher = create_fetcher(in_doc["profile_path"],
                                        in_doc["uri_base"],
                                        config_file)
        workers.append(FetcherThread(work_queue, worker_fetcher, t_errors,
                                     d_errors, fetch_dir, stats))
    for worker in workers:
        worker.daemon = True  # don't block interpreter exit
        worker.start()
    return work_queue, t_errors, d_errors
def test_oai_fetcher_with_blacklist():
    """After a full fetch, the fetched collections must cover exactly the
    Clemson sets that are not on the blacklist.
    """
    profile_path = "profiles/clemson.pjs"
    fetcher = create_fetcher(profile_path, uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"

    blacklisted = ["ctm", "spg", "jfb", "jbt", "pre", "dnc", "scp", "swl",
                   "weg", "ghs", "wsb", "mbe", "gcj", "cwp", "nev", "hfp",
                   "big"]
    fetcher.blacklist = blacklisted
    # Consume everything; we only inspect fetcher.collections afterward.
    for _response in fetcher.fetch_all_data():
        pass

    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    parsed = xmltodict.parse(urlopen(url).read())
    all_sets = [entry["setSpec"]
                for entry in parsed["OAI-PMH"]["ListSets"]["set"]]
    expected_sets = list(set(all_sets) - set(blacklisted))
    not_fetched = [s for s in expected_sets if s not in fetcher.collections]
    assert not_fetched == []
def queue_and_errors(num_threads, in_doc, config_file, fetch_dir, stats):
    """Build the work queue and error lists, then launch ``num_threads``
    daemon FetcherThreads that pull set specs from the queue.

    Returns (queue, thread_errors, document_errors); both error lists are
    shared with the workers.
    """
    queue = Queue.Queue(num_threads)
    thread_errors = []
    doc_errors = []
    threads = [
        FetcherThread(queue,
                      create_fetcher(in_doc["profile_path"],
                                     in_doc["uri_base"],
                                     config_file),
                      thread_errors,
                      doc_errors,
                      fetch_dir,
                      stats)
        for _ in range(num_threads)
    ]
    for thread in threads:
        thread.daemon = True  # workers must not block process exit
        thread.start()
    return queue, thread_errors, doc_errors
def main(argv):
    """Fetch all records for the provider named in an ingestion document.

    Looks up the ingestion document in CouchDB, runs the provider's
    fetcher (multi-threaded when the profile requests more than one
    thread and the fetcher supports sets), writes fetched data under a
    fresh fetch directory, and records progress and final status back on
    the ingestion document.  Returns 0 on success, -1 on any failure.
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    # We need profile in this scope, and the file shouldn't have to
    # be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Mark the fetch as "running" on the ingestion document before any
    # work starts, clearing fields from a previous run.
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    # The Akara config location may be overridden via the environment.
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }

    # Thread count comes from the profile; fall back to one thread when
    # the value is missing or not numeric (int(None) raises).
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1

    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        # Multi-threaded path: enumerate the provider's sets and feed the
        # set specs to worker threads through a bounded queue.
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    # Abort as soon as any worker thread has raised.
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    # All set specs have been queued.
                    break
            # Wait for the workers to drain the queue before returning.
            # NOTE(review): `threads_working` is not defined anywhere in
            # this view -- confirm it is provided elsewhere (it appears
            # it should indicate whether any FetcherThread is still busy).
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:
        # Single-threaded path: fetch everything in this process.
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # os.rmdir succeeds only on an empty directory, so success here means
    # nothing was fetched -- that is the error case; failure to remove
    # (directory non-empty) means the fetch produced data.
    try:
        os.rmdir(fetch_dir)
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"

    # Record the final status on the ingestion document.
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
def main(argv):
    """Fetch all records for the provider named in an ingestion document.

    Looks up the ingestion document in CouchDB, runs the provider's
    fetcher (multi-threaded when the profile requests more than one
    thread and the fetcher supports sets), writes fetched data under a
    fresh fetch directory, and records progress and final status back on
    the ingestion document.  Returns 0 on success, -1 on any failure.

    NOTE(review): this file defines main() more than once; this later
    definition shadows the earlier one at import time.  It also hardcodes
    config_file = "akara.ini" where the other definition consults the
    DPLA_CONFIG_FILE environment variable -- confirm which is intended.
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])
    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO: profile should be passed to create_fetcher, below.
    # We need profile in this scope, and the file shouldn't have to
    # be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Mark the fetch as "running" on the ingestion document before any
    # work starts, clearing fields from a previous run.
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    config_file = "akara.ini"
    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }

    # Thread count comes from the profile; fall back to one thread when
    # the value is missing or not numeric (int(None) raises).
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1

    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        # Multi-threaded path: enumerate the provider's sets and feed the
        # set specs to worker threads through a bounded queue.
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            while True:
                time.sleep(0.1)
                try:
                    # Abort as soon as any worker thread has raised.
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    # All set specs have been queued.
                    break
            # Wait for the workers to drain the queue before returning.
            # NOTE(review): `threads_working` is not defined anywhere in
            # this view -- confirm it is provided elsewhere (it appears
            # it should indicate whether any FetcherThread is still busy).
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:
        # Single-threaded path: fetch everything in this process.
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]

    # os.rmdir succeeds only on an empty directory, so success here means
    # nothing was fetched -- that is the error case; failure to remove
    # (directory non-empty) means the fetch produced data.
    try:
        os.rmdir(fetch_dir)
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"

    # Record the final status on the ingestion document.
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1