Beispiel #1
0
def test_all_oai_verb_fetchers():
    """Smoke-test one representative, unrestricted profile per OAI
    metadata format.

    For each profile of type "oai_verbs", the created fetcher must be an
    OAIVerbsFetcher and its first response must contain records and no
    errors.  Fails the test on any exception.
    """
    # Profiles that are representative of each type and are not restricted:
    profiles = [
        "harvard.pjs",   # mods
        "clemson.pjs",   # qdc
        "texas.pjs",     # untl
        "uiuc.pjs",      # oai_qdc
        "uiuc_book.pjs", # marc
        "artstor.pjs"    # oai_dc
    ]
    for profile in profiles:
        try:
            profile_path = "profiles/" + profile
            with open(profile_path, "r") as f:
                prof = json.loads(f.read())
            # Only OAI-verbs profiles are exercised by this test.
            if prof.get("type") == "oai_verbs":
                fetcher = create_fetcher(profile_path,
                                         uri_base,
                                         config_file)
                assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
                for response in fetcher.fetch_all_data():
                    if response["errors"]:
                        sys.stderr.write("%s\n" % response["errors"])
                    assert not response["errors"]
                    assert response["records"]
                    # One successful batch per profile is enough.
                    break
        except Exception as e:
            # str(e) rather than the deprecated e.message: e.message is
            # unset for many exception types and gone in Python 3.
            # sys.stderr.write works on both Python 2 and 3, unlike
            # "print >> sys.stderr".
            sys.stderr.write("\nError with %s: %s\n" % (profile, str(e)))
            assert False
Beispiel #2
0
def test_all_oai_verb_fetchers():
    """Exercise every OAI metadata format via one representative,
    unrestricted profile each; expect an OAIVerbsFetcher whose first
    batch has records and no errors.
    """
    # Profiles that are representative of each type and are not restricted:
    profiles = [
        "harvard.pjs",  # mods
        "clemson.pjs",  # qdc
        "texas.pjs",  # untl
        "uiuc.pjs",  # oai_qdc
        "uiuc_book.pjs",  # marc
        "artstor.pjs",  # oai_dc
    ]
    for profile in profiles:
        try:
            profile_path = "profiles/" + profile
            with open(profile_path, "r") as f:
                prof = json.loads(f.read())
            # Skip profiles that are not OAI-verbs based.
            if prof.get("type") == "oai_verbs":
                fetcher = create_fetcher(profile_path, uri_base, config_file)
                assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
                for response in fetcher.fetch_all_data():
                    if response["errors"]:
                        sys.stderr.write("%s\n" % response["errors"])
                    assert not response["errors"]
                    assert response["records"]
                    break
        except Exception as e:
            # Use str(e) instead of deprecated e.message, and
            # sys.stderr.write instead of Python-2-only "print >>".
            sys.stderr.write("\nError with %s: %s\n" % (profile, str(e)))
            assert False
Beispiel #3
0
def test_oai_fetcher_with_blacklist():
    """Fetch Clemson with a set blacklist; every set advertised by the
    repository that is not blacklisted must end up in
    fetcher.collections."""
    profile_path = "profiles/clemson.pjs"
    fetcher = create_fetcher(profile_path, uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.blacklist = ["ctm", "spg", "jfb", "jbt", "pre", "dnc", "scp",
                         "swl", "weg", "ghs", "wsb", "mbe", "gcj", "cwp",
                         "nev", "hfp", "big"]
    # Drain the fetcher; only the collections it records matter here.
    for _ in fetcher.fetch_all_data():
        pass
    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    parsed = xmltodict.parse(urlopen(url).read())
    advertised = [entry["setSpec"]
                  for entry in parsed["OAI-PMH"]["ListSets"]["set"]]
    expected = list(set(advertised) - set(fetcher.blacklist))
    missing = [spec for spec in expected
               if spec not in fetcher.collections]
    assert missing == []
Beispiel #4
0
def test_absolute_url_fetcher_mwdl():
    """The mwdl profile must yield an MWDLFetcher whose first batch
    (with no set filter) has records and no errors."""
    fetcher = create_fetcher("profiles/mwdl.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "MWDLFetcher"

    for batch in fetcher.fetch_all_data(set=None):
        assert not batch["errors"]
        assert batch["records"]
        break
Beispiel #5
0
def test_absolute_url_fetcher_uva2():
    """virginia_books profile must yield a UVAFetcher; its first batch
    has records and no errors."""
    path = "profiles/virginia_books.pjs"
    fetcher = create_fetcher(path, uri_base, config_file)
    assert fetcher.__class__.__name__ == "UVAFetcher"

    for batch in fetcher.fetch_all_data():
        assert not batch["errors"]
        assert batch["records"]
        break
Beispiel #6
0
def test_absolute_url_fetcher_uva2():
    """Building a fetcher from the virginia_books profile gives a
    UVAFetcher returning an error-free, non-empty first batch."""
    fetcher = create_fetcher("profiles/virginia_books.pjs",
                             uri_base,
                             config_file)
    assert fetcher.__class__.__name__ == "UVAFetcher"

    responses = fetcher.fetch_all_data()
    for resp in responses:
        assert not resp["errors"]
        assert resp["records"]
        break
Beispiel #7
0
def test_absolute_url_fetcher_mwdl():
    """mwdl profile -> MWDLFetcher; first batch fetched without a set
    filter is error-free and contains records."""
    profile = "profiles/mwdl.pjs"
    fetcher = create_fetcher(profile, uri_base, config_file)
    assert fetcher.__class__.__name__ == "MWDLFetcher"

    for data in fetcher.fetch_all_data(set=None):
        assert not data["errors"]
        assert data["records"]
        break
Beispiel #8
0
def test_absolute_url_fetcher_nypl():
    """nypl profile (using the akara.ini config) must yield a
    NYPLFetcher; the first batch for the given collection UUID has
    records and no errors."""
    collection_uuid = "cd4c3430-c6cb-012f-ccf3-58d385a7bc34"
    fetcher = create_fetcher("profiles/nypl.pjs", uri_base, "akara.ini")
    assert fetcher.__class__.__name__ == "NYPLFetcher"

    for batch in fetcher.fetch_all_data(collection_uuid):
        assert not batch["errors"]
        assert batch["records"]
        break
Beispiel #9
0
def test_absolute_url_fetcher_ia():
    """ia profile must yield an IAFetcher; with a 10-row page size the
    first batch has records and no errors."""
    fetcher = create_fetcher("profiles/ia.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "IAFetcher"

    # Shrink the page size so the test only pulls a small batch.
    fetcher.endpoint_url_params["rows"] = 10
    for batch in fetcher.fetch_all_data():
        assert not batch["errors"]
        assert batch["records"]
        break
Beispiel #10
0
def test_file_fetcher_smithsonian():
    """smithsonian profile must yield an EDANFetcher; pointed at the
    checked-in test-data directory, its first batch has records and an
    empty error list."""
    fetcher = create_fetcher("profiles/smithsonian.pjs", uri_base,
                             config_file)
    assert fetcher.__class__.__name__ == "EDANFetcher"

    # Read from local test data instead of the live endpoint.
    fetcher.endpoint_url = "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for batch in fetcher.fetch_all_data():
        assert batch["errors"] == []
        assert batch["records"]
        break
Beispiel #11
0
def test_absolute_url_fetcher_ia():
    """ia profile -> IAFetcher; limit rows to 10 and verify the first
    batch is error-free and non-empty."""
    profile = "profiles/ia.pjs"
    fetcher = create_fetcher(profile, uri_base, config_file)
    assert fetcher.__class__.__name__ == "IAFetcher"

    # Small page size keeps the test quick.
    fetcher.endpoint_url_params["rows"] = 10
    responses = fetcher.fetch_all_data()
    for resp in responses:
        assert not resp["errors"]
        assert resp["records"]
        break
Beispiel #12
0
def test_file_fetcher_smithsonian():
    """EDANFetcher built from the smithsonian profile, reading local
    test data, must return a first batch with records and no errors."""
    profile = "profiles/smithsonian.pjs"
    fetcher = create_fetcher(profile, uri_base, config_file)
    assert fetcher.__class__.__name__ == "EDANFetcher"

    # Point at the on-disk fixture directory rather than the service.
    fetcher.endpoint_url = "file:/%s/test/test_data/smithsonian/" % os.getcwd()
    for resp in fetcher.fetch_all_data():
        assert resp["errors"] == []
        assert resp["records"]
        break
Beispiel #13
0
def test_absolute_url_fetcher_nypl():
    """nypl profile with akara.ini must produce a NYPLFetcher whose
    first batch for the sample collection UUID is error-free and
    non-empty."""
    uuid = "cd4c3430-c6cb-012f-ccf3-58d385a7bc34"
    fetcher = create_fetcher("profiles/nypl.pjs", uri_base, "akara.ini")
    assert fetcher.__class__.__name__ == "NYPLFetcher"

    for resp in fetcher.fetch_all_data(uuid):
        assert not resp["errors"]
        assert resp["records"]
        break
Beispiel #14
0
def test_oai_fetcher_invalid_set():
    """A nonexistent set must produce errors, no records, and leave the
    fetcher's collections empty."""
    profile_path = "profiles/clemson.pjs"
    fetcher = create_fetcher(profile_path, uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"

    fetcher.sets = ["banana"]  # deliberately invalid setSpec
    for response in fetcher.fetch_all_data():
        assert response["errors"]
        assert not response["records"]

    # Empty-container check instead of "collections.keys() == []":
    # comparing keys() to a list is always False on Python 3 (dict view
    # vs. list) and needlessly materializes the keys on Python 2.
    assert not fetcher.collections
Beispiel #15
0
def test_oai_fetcher_invalid_set():
    """Fetching an invalid set ("banana") must yield errors and no
    records, and must not register any collections."""
    profile_path = "profiles/clemson.pjs"
    fetcher = create_fetcher(profile_path, uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"

    fetcher.sets = ["banana"]  # no such set on the server
    for response in fetcher.fetch_all_data():
        assert response["errors"]
        assert not response["records"]

    # len(...) == 0 is robust on both Python 2 and 3, unlike comparing
    # dict.keys() to a list (always False against a Py3 view object).
    assert len(fetcher.collections) == 0
Beispiel #16
0
def test_oai_fetcher_all_sets():
    """With no set restriction, every set advertised by the repository's
    ListSets response must appear in fetcher.collections, and every
    batch must be error-free and non-empty."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    parsed = xmltodict.parse(urlopen(url).read())
    advertised = [entry["setSpec"]
                  for entry in parsed["OAI-PMH"]["ListSets"]["set"]]
    for batch in fetcher.fetch_all_data():
        assert not batch["errors"]
        assert batch["records"]

    missing = [spec for spec in advertised
               if spec not in fetcher.collections]
    assert missing == []
Beispiel #17
0
def test_oai_fetcher_all_sets():
    """Unrestricted fetch: all batches succeed and the fetcher ends up
    knowing every set the repository advertises via ListSets."""
    profile = "profiles/clemson.pjs"
    fetcher = create_fetcher(profile, uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    listing = xmltodict.parse(urlopen(url).read())
    specs = [s['setSpec'] for s in listing['OAI-PMH']['ListSets']['set']]
    for resp in fetcher.fetch_all_data():
        assert not resp["errors"]
        assert resp["records"]

    unseen = [spec for spec in specs if spec not in fetcher.collections]
    assert unseen == []
Beispiel #18
0
def queue_and_errors(num_threads, in_doc, config_file, fetch_dir, stats):
    """Create a bounded work queue and num_threads daemon FetcherThreads
    consuming it, each with its own fetcher built from in_doc.

    Returns (queue, thread_errors, doc_errors); the two error lists are
    shared with the threads and collect failures as they occur.
    """
    queue = Queue.Queue(num_threads)
    t_errors = []
    d_errors = []
    for _ in range(num_threads):
        # Each worker gets a private fetcher instance.
        fetcher = create_fetcher(in_doc["profile_path"],
                                 in_doc["uri_base"],
                                 config_file)
        worker = FetcherThread(queue, fetcher, t_errors, d_errors,
                               fetch_dir, stats)
        worker.daemon = True  # don't block interpreter shutdown
        worker.start()
    return queue, t_errors, d_errors
Beispiel #19
0
def test_oai_fetcher_with_blacklist():
    """After fetching Clemson with a blacklist, fetcher.collections must
    cover every advertised set that was not blacklisted."""
    fetcher = create_fetcher("profiles/clemson.pjs", uri_base, config_file)
    assert fetcher.__class__.__name__ == "OAIVerbsFetcher"
    fetcher.blacklist = [
        "ctm", "spg", "jfb", "jbt", "pre", "dnc",
        "scp", "swl", "weg", "ghs", "wsb", "mbe",
        "gcj", "cwp", "nev", "hfp", "big",
    ]
    # Consume the generator; we only inspect state afterwards.
    for _ in fetcher.fetch_all_data():
        pass
    url = "http://repository.clemson.edu/cgi-bin/oai.exe?verb=ListSets"
    listing = xmltodict.parse(urlopen(url).read())
    specs = [s['setSpec'] for s in listing['OAI-PMH']['ListSets']['set']]
    wanted = list(set(specs) - set(fetcher.blacklist))
    unseen = [spec for spec in wanted if spec not in fetcher.collections]
    assert unseen == []
Beispiel #20
0
def queue_and_errors(num_threads, in_doc, config_file, fetch_dir, stats):
    """Build the bounded set queue plus num_threads daemon worker
    threads that drain it.

    Returns (queue, thread_errors, doc_errors); the error lists are
    shared with every FetcherThread so callers can inspect failures.
    """
    queue = Queue.Queue(num_threads)
    thread_errors = []
    doc_errors = []

    def _make_worker():
        # One fetcher per thread; they share the queue and error lists.
        return FetcherThread(
            queue,
            create_fetcher(in_doc["profile_path"], in_doc["uri_base"],
                           config_file),
            thread_errors, doc_errors, fetch_dir, stats)

    workers = [_make_worker() for _ in range(num_threads)]
    for worker in workers:
        worker.daemon = True
        worker.start()
    return queue, thread_errors, doc_errors
Beispiel #21
0
def main(argv):
    """Fetch all records for the provider named in a Couch ingestion
    document, writing them under a fresh fetch directory and updating
    the document's fetch_process/* fields before and after the run.

    argv -- full argument vector; argv[1:] must carry the
            ingestion_document_id understood by define_arguments().
    Returns 0 when the fetch completed, -1 on any error.
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        # If we cannot even mark the document "running", abort the run.
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    # Config file location may be overridden through the environment.
    config_file = os.environ.get("DPLA_CONFIG_FILE", "akara.ini")

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    # fetcher_threads may be missing or non-numeric in the profile;
    # fall back to a single thread in that case.
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    # Threaded fetching is only attempted when the provider supports
    # sets and the fetcher can enumerate them; otherwise fetch serially.
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            # Feed set names to the worker threads until exhausted,
            # bailing out if any thread has reported an error.
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            # NOTE(review): threads_working is not defined in this
            # function -- presumably a module-level flag/count updated by
            # the worker threads; confirm it exists elsewhere in the file.
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        # Serial path: fetch everything in this thread, no set filter.
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    # rmdir only succeeds on an empty directory, i.e. when nothing was
    # fetched -- that case is treated as a failed run.
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1
Beispiel #22
0
def main(argv):
    """Drive a full fetch for one provider.

    Loads the ingestion document identified by argv[1:], marks it
    running, fetches all records (threaded per set when supported,
    serial otherwise), then records final status and totals back into
    the document.  Returns 0 on success, -1 on error.
    """
    parser = define_arguments()
    args = parser.parse_args(argv[1:])

    couch = Couch()
    ingestion_doc = couch.dashboard_db[args.ingestion_document_id]

    # TODO:  profile should be passed to create_fetcher, below.
    #        We need profile in this scope, and the file shouldn't have to
    #        be opened again by create_fetcher.
    with open(ingestion_doc["profile_path"], "r") as f:
        profile = json.load(f)

    # Update ingestion document
    fetch_dir = create_fetch_dir(ingestion_doc["provider"])
    kwargs = {
        "fetch_process/status": "running",
        "fetch_process/data_dir": fetch_dir,
        "fetch_process/start_time": iso_utc_with_tz(),
        "fetch_process/end_time": None,
        "fetch_process/error": None,
        "fetch_process/total_items": None,
        "fetch_process/total_collections": None
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        # Failing to record the "running" state aborts the whole run.
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    error_msg = []
    # Akara configuration is hard-coded here (no environment override).
    config_file = "akara.ini"

    fetcher = create_fetcher(ingestion_doc["profile_path"],
                             ingestion_doc["uri_base"],
                             config_file)

    print "Fetching records for %s" % ingestion_doc["provider"]
    stats = {
        "total_items": 0,
        "total_collections": 0
    }
    # Profile may not specify a usable fetcher_threads value; default 1.
    try:
        threads = int(profile.get("fetcher_threads")) or 1
        print "Threads: %d" % threads
    except:
        print >> sys.stderr, "Can not determine fetcher threads, so using 1"
        threads = 1
    sets = None
    sets_supported = (profile.get("sets") != "NotSupported")
    # Multi-threaded path needs set support plus a fetch_sets method.
    if threads > 1 and sets_supported and hasattr(fetcher, "fetch_sets"):
        sets_err, s = fetcher.fetch_sets()
        if s:
            sets = iter(s)
        else:
            print >> sys.stderr, "Could not get sets: ", sets_err
            return -1
        queue, th_errors, d_errors = queue_and_errors(threads,
                                                      ingestion_doc,
                                                      config_file,
                                                      fetch_dir,
                                                      stats)
        status = None
        try:
            # Keep the queue topped up with set names until exhausted,
            # aborting if any worker thread reported an error.
            while True:
                time.sleep(0.1)
                try:
                    if print_errors_thrown(th_errors):
                        d_errors.extend(th_errors)
                        raise Exception()
                    if not queue.full():
                        queue.put(sets.next())
                except StopIteration:
                    break
            # Wait for queue to be empty before returning
            # NOTE(review): threads_working is not defined here --
            # presumably maintained at module level by the workers;
            # verify it exists elsewhere in this file.
            while True:
                if queue.empty() and not threads_working:
                    if not print_errors_thrown(th_errors):
                        break
                    else:
                        d_errors.extend(th_errors)
                        raise Exception()
                time.sleep(0.1)
        except KeyboardInterrupt:
            status = "error"
            msg = "\nCaught keyboard interrupt"
            print >> sys.stderr, msg
            d_errors.append(msg)
        except Exception as e:
            if e.message:
                print >> sys.stderr, e.message
            status = "error"
            d_errors.append(e.message)
        finally:
            if not status == "error":
                status = "complete"
    else:  # not threads
        # Single-threaded fallback: fetch everything with no set filter.
        rv = fetch_all_for_set(None, fetcher, fetch_dir)
        stats["total_items"] += rv["total_items"]
        stats["total_collections"] += rv["total_collections"]
        error_msg += rv["errors"]

    print "Total items: %s" % stats["total_items"]
    print "Total collections: %s" % stats["total_collections"]


    # Update ingestion document
    # An empty fetch_dir (rmdir succeeds) means nothing was fetched,
    # which is reported as an error.
    try:
        os.rmdir(fetch_dir)
        # Error if fetch_dir was empty
        status = "error"
        error_msg.append("Error, no records fetched")
        logger.error(error_msg)
    except:
        status = "complete"
    kwargs = {
        "fetch_process/status": status,
        "fetch_process/error": error_msg,
        "fetch_process/end_time": iso_utc_with_tz(),
        "fetch_process/total_items": stats["total_items"],
        "fetch_process/total_collections": stats["total_collections"]
    }
    try:
        couch.update_ingestion_doc(ingestion_doc, **kwargs)
    except:
        logger.error("Error updating ingestion doc %s in %s" %
                     (ingestion_doc["_id"], __name__))
        return -1

    return 0 if status == "complete" else -1