Example #1
def extract_package(package, batch_size, delete_zip, directory):
    try:
        return extractAll(package, delete_zip, directory)
    except BadZipfile:
        write_message("Error BadZipfile %s", (package, ))
        task_update_status("CERROR")
        remove(package)
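
Example #1's extract_package wraps Invenio's extractAll helper and discards the archive when it turns out to be corrupt. Below is a minimal stand-alone sketch of the same "extract or discard" pattern using only the standard-library zipfile module; extract_or_discard is a hypothetical name, and write_message/task_update_status are replaced by plain prints for illustration.

import os
import zipfile


def extract_or_discard(package, directory, delete_zip=False):
    """Extract a zip package into `directory`; remove it if it is corrupt."""
    try:
        with zipfile.ZipFile(package) as archive:
            archive.extractall(directory)
            extracted = [os.path.join(directory, name)
                         for name in archive.namelist()]
        if delete_zip:
            os.remove(package)
        return extracted
    except zipfile.BadZipfile:
        # Corrupt archive: report it and get rid of the file, as above.
        print("Error BadZipfile %s" % (package,))
        os.remove(package)
        return []
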
Example #2
def generate_xml_for_records(records,
                             directory,
                             prefix="apsharvest_result_",
                             suffix=".xml",
                             pretty=True):
    """
    Given a list of APSRecord objects, generate a MARCXML containing Metadata
    and FFT for all of them.
    """
    new_filename = get_temporary_file(prefix=prefix,
                                      suffix=suffix,
                                      directory=directory)

    generated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                    "<collection>\n%s\n</collection>" % \
                    ("\n".join([record.to_xml() for record in records]),)

    try:
        fd = open(new_filename, 'w')
        fd.write(generated_xml)
        fd.close()
    except IOError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
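
generate_xml_for_records joins each record's MARCXML and wraps it in a collection envelope before writing it to a temporary file. A small illustration of the envelope it produces, using a stub class in place of APSRecord (which is not shown in these examples):

class StubRecord(object):
    """Stand-in for APSRecord, only to illustrate the output envelope."""

    def __init__(self, recid):
        self.recid = recid

    def to_xml(self):
        return ('<record><controlfield tag="001">%d</controlfield></record>'
                % self.recid)


records = [StubRecord(1), StubRecord(2)]
print('<?xml version="1.0" encoding="UTF-8"?>\n'
      "<collection>\n%s\n</collection>"
      % ("\n".join([record.to_xml() for record in records]),))
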
Example #3
def bst_consyn_harvest(CONSYNATOMURL="https://consyn.elsevier.com/batch/atom?key=QUhvbHRrYW1wOzM0Mjc%253d"):
    """
    Task to download metadata given an ATOM feed from consyn.elsevier.com
    and a folder to store the files.

    @param CONSYNATOMURL: The URL of the atom feed to download.
    """
    if not os.path.exists(CFG_CONSYN_OUT_DIRECTORY):
        folders = CFG_CONSYN_OUT_DIRECTORY.split("/")
        folder = "/"
        for i in range(1, len(folders)):
            folder = os.path.join(folder, folders[i]).strip()
            if not os.path.exists(folder):
                os.mkdir(folder)
    try:
        run_sql("SELECT filename FROM CONSYNHARVEST")
    except:
        run_sql("CREATE TABLE CONSYNHARVEST ("
                "filename VARCHAR(100) NOT NULL PRIMARY KEY,"
                "date VARCHAR(50),"
                "size VARCHAR(30) );")
    # Get list of entries from XML document
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_file = download_url(url=CONSYNATOMURL,
                                   retry_count=5,
                                   timeout=60.0)
        xmlString = open(result_file, 'r').read()
    except InvenioFileDownloadError, err:
        write_message("URL could not be opened: %s" % (CONSYNATOMURL,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
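
The directory-creation loop at the top of Example #3 builds the output path one component at a time. Assuming the intent is simply "create the folder and any missing parents", the standard-library call below does the same thing in one step; later revisions of this tasklet (Examples #26 and #29) call makedirs directly.

import os

# out_dir stands in for CFG_CONSYN_OUT_DIRECTORY.
out_dir = "/tmp/consyn-out"
if not os.path.exists(out_dir):
    os.makedirs(out_dir)  # creates intermediate folders as needed
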
Example #4
def extract_package(package, batch_size, delete_zip, directory):
    try:
        extractAll(package, delete_zip, directory)
    except BadZipfile:
        write_message("Error BadZipfile %s", (package,))
        task_update_status("CERROR")
        remove(package)
Example #5
def extract_package(package, delete_zip, directory, new_sources):
    try:
        new_sources.append(package)
        return extractAll(package, delete_zip, directory)
    except BadZipfile as err:
        _errors_detected.append(err)
        write_message("Error BadZipfile %s", (package, ))
        task_update_status("CERROR")
        remove(package)
Example #6
def extract_package(package, delete_zip, directory, new_sources):
    try:
        new_sources.append(package)
        return extractAll(package, delete_zip, directory)
    except BadZipfile as err:
        _errors_detected.append(err)
        write_message("Error BadZipfile %s", (package,))
        task_update_status("CERROR")
        remove(package)
Example #7
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        write_message("ERROR: cannot find %s." % cmd, stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s" % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME))
    dummy1, dummy2, dummy3 = run_shell_command(cmd, None,
                                               dirname + os.sep + filename)
    if dummy1:
        write_message("ERROR: mysqldump exit code is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if dummy2:
        write_message("ERROR: mysqldump stdout is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if dummy3:
        write_message("ERROR: mysqldump stderr is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
Example #8
def _dump_database(dirname, filename):
    """
    Dump Invenio database into SQL file called FILENAME living in
    DIRNAME.
    """
    write_message("... writing %s" % dirname + os.sep + filename)
    cmd = CFG_PATH_MYSQL + 'dump'
    if not os.path.exists(cmd):
        write_message("ERROR: cannot find %s." % cmd, stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    cmd += " --skip-opt --add-drop-table --add-locks --create-options " \
           " --quick --extended-insert --set-charset --disable-keys " \
           " --host=%s --user=%s --password=%s %s" % \
           (escape_shell_arg(CFG_DATABASE_HOST),
            escape_shell_arg(CFG_DATABASE_USER),
            escape_shell_arg(CFG_DATABASE_PASS),
            escape_shell_arg(CFG_DATABASE_NAME))
    dummy1, dummy2, dummy3 = run_shell_command(cmd, None, dirname + os.sep + filename)
    if dummy1:
        write_message("ERROR: mysqldump exit code is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if dummy2:
        write_message("ERROR: mysqldump stdout is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
    if dummy3:
        write_message("ERROR: mysqldump stderr is %s." % repr(dummy1),
                      stream=sys.stderr)
        task_update_status("ERROR")
        sys.exit(1)
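
Both _dump_database versions assemble a mysqldump command line with escape_shell_arg and redirect its output into the dump file via run_shell_command. As a rough sketch of the same step without Invenio's helpers, subprocess can take an argument list (no shell quoting needed) and write stdout straight to the target file; the connection values below are placeholders.

import subprocess


def dump_database_sketch(outfile, host="localhost", user="invenio",
                         password="secret", dbname="invenio"):
    """Rough stand-alone equivalent of the dump step above (placeholders only)."""
    cmd = ["mysqldump", "--skip-opt", "--add-drop-table", "--add-locks",
           "--create-options", "--quick", "--extended-insert",
           "--set-charset", "--disable-keys",
           "--host=%s" % host, "--user=%s" % user,
           "--password=%s" % password, dbname]
    with open(outfile, "w") as out:
        exit_code = subprocess.call(cmd, stdout=out)
    if exit_code:
        raise RuntimeError("mysqldump exit code is %r" % exit_code)
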
Example #9
def prettify_xml(filepath):
    """
    Will prettify an XML file for better readability.

    Returns the new, pretty, file.
    """
    new_filename = "%s_pretty.xml" % (os.path.splitext(filepath)[0],)
    cmd = "xmllint --format %s" % (filepath,)
    exit_code, std_out, err_msg = run_shell_command(cmd=cmd, filename_out=new_filename)
    if exit_code:
        write_message("\nError caught: %s" % (err_msg,))
        task_update_status("CERROR")
        return

    return new_filename
Example #10
def prettify_xml(filepath):
    """
    Will prettify an XML file for better readability.

    Returns the new, pretty, file.
    """
    new_filename = "%s_pretty.xml" % (os.path.splitext(filepath)[0],)
    cmd = "xmllint --format %s" % (filepath,)
    exit_code, std_out, err_msg = run_shell_command(cmd=cmd,
                                                    filename_out=new_filename)
    if exit_code:
        write_message("\nError caught: %s" % (err_msg,))
        task_update_status("CERROR")
        return

    return new_filename
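
Both prettify_xml versions shell out to xmllint --format. If xmllint is not available, the standard library can produce a comparable result; the sketch below is an alternative, not what the tasklet itself does (encoding handling is omitted).

import os
import xml.dom.minidom


def prettify_xml_minidom(filepath):
    """Pretty-print an XML file without xmllint (illustrative alternative)."""
    new_filename = "%s_pretty.xml" % (os.path.splitext(filepath)[0],)
    dom = xml.dom.minidom.parse(filepath)
    with open(new_filename, "w") as out:
        out.write(dom.toprettyxml(indent="  "))
    return new_filename
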
Example #11
def generate_xml_for_records(records, directory, prefix="apsharvest_result_", suffix=".xml", pretty=True):
    """
    Given a list of APSRecord objects, generate a MARCXML containing Metadata
    and FFT for all of them.
    """
    new_filename = get_temporary_file(prefix=prefix, suffix=suffix, directory=directory)

    generated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' "<collection>\n%s\n</collection>" % (
        "\n".join([record.to_xml() for record in records]),
    )

    try:
        fd = open(new_filename, "w")
        fd.write(generated_xml)
        fd.close()
    except IOError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example #12
def download_feed(feed, batch_size, delete_zip, new_sources,
                  directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed,
                                   retry_count=5,
                                   timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError, err:
        write_message("URL could not be opened: %s" % (feed,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example #14
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing id, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        task_set_option(
            "run", [name[0] for name in run_sql("SELECT name from rnkMETHOD")])

    try:
        for key in task_get_option("run"):
            task_sleep_now_if_required(can_stop_too=True)
            write_message("")
            filename = CFG_ETCDIR + "/bibrank/" + key + ".cfg"
            write_message("Getting configuration from file: %s" % filename,
                          verbose=9)
            config = ConfigParser.ConfigParser()
            try:
                config.readfp(open(filename))
            except StandardError, e:
                write_message(
                    "Cannot find configurationfile: %s. "
                    "The rankmethod may also not be registered using "
                    "the BibRank Admin Interface." % filename, sys.stderr)
                raise StandardError

            #Using the function variable to call the function related to the
            #rank method
            cfg_function = config.get("rank_method", "function")
            func_object = globals().get(cfg_function)
            if func_object:
                func_object(key)
            else:
                write_message("Cannot run method '%s', no function to call" %
                              key)
    except StandardError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        register_exception()
        task_update_status("ERROR")
        sys.exit(1)
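
task_run_core reads one .cfg file per rank method and dispatches to the function named under [rank_method] via globals().get. A self-contained Python 2 sketch of that config-driven dispatch, with a made-up method name:

import ConfigParser
from StringIO import StringIO

CFG_TEXT = """
[rank_method]
function = demo_rank_method
"""


def demo_rank_method(key):
    print("running rank method for %s" % key)


config = ConfigParser.ConfigParser()
config.readfp(StringIO(CFG_TEXT))
cfg_function = config.get("rank_method", "function")
func_object = globals().get(cfg_function)
if func_object:
    func_object("demo")
else:
    print("Cannot run method 'demo', no function to call")
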
Example #15
def download_feed(feed, batch_size, delete_zip, new_sources,
                  directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed,
                                   retry_count=5,
                                   timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    dom = xml.dom.minidom.parseString(xmlString)
    entries = dom.getElementsByTagName("entry")

    # Loop through entries
    for entry in entries:
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                write_message("URL could not be opened: %s" % (fileUrl,))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
                write_message("Error BadZipfile %s", (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
Example #16
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing id, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        task_set_option("run", [name[0] for name in run_sql("SELECT name from rnkMETHOD")])

    try:
        for key in task_get_option("run"):
            task_sleep_now_if_required(can_stop_too=True)
            write_message("")
            filename = CFG_ETCDIR + "/bibrank/" + key + ".cfg"
            write_message("Getting configuration from file: %s" % filename,
                verbose=9)
            config = ConfigParser.ConfigParser()
            try:
                config.readfp(open(filename))
            except StandardError, e:
                write_message("Cannot find configurationfile: %s. "
                    "The rankmethod may also not be registered using "
                    "the BibRank Admin Interface." % filename, sys.stderr)
                raise StandardError

            #Using the function variable to call the function related to the
            #rank method
            cfg_function = config.get("rank_method", "function")
            func_object = globals().get(cfg_function)
            if func_object:
                func_object(key)
            else:
                write_message("Cannot run method '%s', no function to call"
                    % key)
    except StandardError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        register_exception()
        task_update_status("ERROR")
        sys.exit(1)
Example #17
def prettify_xml(filepath):
    """
    Will prettify an XML file for better readability.

    Returns the new, pretty, file.
    """
    cmd = "xmllint --format %s" % (filepath, )
    exit_code, std_out, err_msg = run_shell_command(cmd=cmd)
    if exit_code:
        write_message("\nError caught: %s" % (err_msg, ))
        task_update_status("CERROR")
        return

    new_filename = "%s.pretty" % (filepath, )
    try:
        fd = open(new_filename, 'w')
        fd.write(std_out)
        fd.close()
    except IOError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example #18
def prettify_xml(filepath):
    """
    Will prettify an XML file for better readability.

    Returns the new, pretty, file.
    """
    cmd = "xmllint --format %s" % (filepath,)
    exit_code, std_out, err_msg = run_shell_command(cmd=cmd)
    if exit_code:
        write_message("\nError caught: %s" % (err_msg,))
        task_update_status("CERROR")
        return

    new_filename = "%s.pretty" % (filepath,)
    try:
        fd = open(new_filename, 'w')
        fd.write(std_out)
        fd.close()
    except IOError, e:
        write_message("\nException caught: %s" % e, sys.stderr)
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
Example #19
def bst_webcoll_postprocess(recids=[]):
    """Parse recids to POST to remote server to alert that records are visible."""
    if isinstance(recids, str):
        recids = recids.split(",")
    cache = get_redis()
    cached_ids = cache.get("webcoll_pending_recids") or []
    recids += cached_ids

    if not CFG_WEBCOLL_POST_REQUEST_URL:
        write_message("CFG_WEBCOLL_POST_REQUEST_URL is not set.")
        task_update_status('ERROR')
        return 1

    if recids:
        write_message("Going to POST callback to {0}: {1} (total: {2})".format(
            CFG_WEBCOLL_POST_REQUEST_URL,
            recids[:10],
            len(recids))
        )
        session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        session.mount(CFG_WEBCOLL_POST_REQUEST_URL, adapter)
        response = session.post(CFG_WEBCOLL_POST_REQUEST_URL,
                                data={'recids': recids})
        if response.ok:
            write_message("Post request sent successfully")
            cache.set("webcoll_pending_recids", [])
        else:
            write_message("Post request failed!")
            write_message(response.text)
            task_update_status('ERROR')
            cache.set("webcoll_pending_recids", recids)
        session.close()
    else:
        write_message("No recids to POST callback for to {0}.".format(
            CFG_WEBCOLL_POST_REQUEST_URL,
        ))
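
bst_webcoll_postprocess mounts a requests HTTPAdapter with retries before POSTing the record IDs to the callback URL. The same retry-mount pattern in isolation, with a placeholder URL standing in for CFG_WEBCOLL_POST_REQUEST_URL:

import requests

callback_url = "https://example.org/webcoll-callback"  # placeholder
session = requests.Session()
adapter = requests.adapters.HTTPAdapter(max_retries=3)
session.mount(callback_url, adapter)  # retries apply to this URL prefix
try:
    response = session.post(callback_url, data={"recids": [1, 2, 3]})
    print("POST ok" if response.ok else "POST failed: %s" % response.status_code)
finally:
    session.close()
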
Example #20
def download_feed(feed, batch_size, delete_zip, new_sources, directory):
    """ Get list of entries from XML document """
    xmlString = ""
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed, retry_count=5, timeout=60.0)
        try:
            result_file = open(result_path, 'r')
            xmlString = result_file.read()
        finally:
            result_file.close()
            remove(result_path)
    except InvenioFileDownloadError as err:
        write_message("URL could not be opened: %s" % (feed, ))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return

    dom = parseString(xmlString)
    entries = dom.getElementsByTagName("entry")

    # Loop through entries
    for entry in entries:
        # Get URL and filename
        fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
        fileName = entry.getElementsByTagName("title")[0].firstChild.data

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" %
                              (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                write_message("URL could not be opened: %s" % (fileUrl, ))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
                write_message("Error BadZipfile %s", (outFilename, ))
                task_update_status("CERROR")
                remove(outFilename)
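
download_feed pulls each Atom entry's link href and title with xml.dom.minidom. The extraction step on its own, run against a tiny inline feed fragment made up for illustration:

from xml.dom.minidom import parseString

ATOM_SAMPLE = """<feed>
  <entry>
    <title>package-one.zip</title>
    <link href="http://example.org/package-one.zip"/>
  </entry>
  <entry>
    <title>package-two.zip</title>
    <link href="http://example.org/package-two.zip"/>
  </entry>
</feed>"""

dom = parseString(ATOM_SAMPLE)
for entry in dom.getElementsByTagName("entry"):
    fileUrl = entry.getElementsByTagName("link")[0].getAttribute("href")
    fileName = entry.getElementsByTagName("title")[0].firstChild.data
    print("%s -> %s" % (fileUrl, fileName))
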
Example #21
def download_feed(feed_url, batch_size, delete_zip, new_sources,
                  directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)
    for fileUrl, fileName in entries:
        task_sleep_now_if_required()
        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n" %
                          (fileUrl, existing_files[0], outFilename))
        else:
            fileUrl = fileUrl.replace(' ', '%20')
            try:
                write_message("Downloading %s to %s\n" % (fileUrl,
                                                          outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                _errors_detected.append(err)
                write_message("URL could not be opened: %s" % fileUrl)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                xml_files.extend(extractAll(outFilename,
                                            delete_zip,
                                            directory))
            except BadZipfile as err:
                _errors_detected.append(err)
                write_message("Error BadZipfile %s", (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
    return xml_files
Example #23
        outFilename = outFilename.lstrip()

        #file has already been fetched
        if outFilename in downloaded_files:
            write_message("Not downloading %s, already found %s\n" %
                          (fileUrl, outFilename))
        else:
            try:
                write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError, err:
                write_message("URL could not be opened: %s" % (fileUrl,))
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            size = getsize(outFilename)
            run_sql("INSERT INTO CONSYNHARVEST"
                    "(filename,date,size)"
                    "VALUES (%s,%s,%s)",
                    (outFilename, updated, size))
            try:
                extractAll(outFilename, delete_zip, directory)
            except BadZipfile:
                write_message("Error BadZipfile %s", (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
                run_sql("DELETE FROM CONSYNHARVEST"
                        "WHERE filename =%s",
                        (outFilename,))
Example #24
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
        * input_uri - Link to new URI data
            DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
            NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
        * log_dir - Directory to store log files in
        * logging - True or False, default True
    """

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal', user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print(
                    "* Now we will run the bibupload and bibindex for " +
                    str(rec_id) + " record", 5)
                _print(
                    "** We will upload the following xml code " +
                    repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME +
                         " finished. %s DOIs processed, %s to add" %
                         (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
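
bst_arxiv_doi_update walks the DOI feed with ElementTree, reading the doi, preprint_id and published attributes of each article element. The iteration on its own over an inline sample (attribute names follow the code above; the values are invented):

import xml.etree.ElementTree as ET

SAMPLE_FEED = """<articles>
  <article doi="10.0000/example.1" preprint_id="arXiv:0000.00001"
           published="2014-01-01"/>
  <article doi="10.0000/example.2" preprint_id="arXiv:0000.00002"
           published="2014-01-02"/>
</articles>"""

root = ET.fromstring(SAMPLE_FEED)
# Element.iter() is the non-deprecated replacement for getiterator().
for item in root.iter('article'):
    print(item.get('doi'), item.get('preprint_id'), item.get('published'))
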
Example #25
def download_feed(feed_url, delete_zip, new_sources, directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url, ))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)

    if not entries:
        return xml_files

    # look what files already exist
    # there are currently O(10^5) files in the directory tree rooted
    # at CFG_CONSYN_OUT_DIRECTORY and it is on AFS and takes upwards
    # of 5 minutes to walk.
    # might make sense to have a db table with already harvested files
    task_sleep_now_if_required()
    allfilenames = find_names_of_existing_files(CFG_CONSYN_OUT_DIRECTORY)
    task_sleep_now_if_required()

    for fileUrl, fileName in entries:
        if fileName in allfilenames:
            write_message(
                "Not downloading %s, found file with same name in %s" % (
                    fileName,
                    CFG_CONSYN_OUT_DIRECTORY,
                ))
            continue
        task_sleep_now_if_required()

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        fileUrl = fileUrl.replace(' ', '%20')
        try:
            write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            _errors_detected.append(err)
            write_message("URL could not be opened: %s" % fileUrl)
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue
        try:
            xml_files.extend(extractAll(outFilename, delete_zip, directory))
        except BadZipfile as err:
            _errors_detected.append(err)
            write_message("Error BadZipfile %s", (outFilename, ))
            task_update_status("CERROR")
            remove(outFilename)

    return xml_files
Example #26
def bst_consyn_harvest(feed_url=None, package=None, feed_file=None,
                       package_list_file=None, batch_size='500',
                       delete_zip='False', submit='False'):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                       should be submitted by email and uploaded
                       to the FTP server.
    :type submit: string representation of a boolean.
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        write_message('Warning upload_FTP parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    els = ElsevierPackage(CONSYN=True)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, batch_size, delete_zip,
                                    out_folder, new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(
            package_list, batch_size,
            delete_zip, new_sources,
            out_folder
        )
    elif feed_file:
        entries = parse_feed(feed_file)
        links = map(lambda a: a[0], entries)
        package_list = map(lambda a: a[1], entries)
        package_list = map(lambda a: join(CFG_CONSYN_OUT_DIRECTORY, a),
                           package_list)
        for package in package_list:
            if not exists(package):
                index = package_list.index(package)
                link = links[index]
                try:
                    message = ("Downloading %s to %s\n" % (link,
                                                           package))
                    write_message(message)
                    download_url(link, "zip", package, 5, 60.0)
                    package_list.append(package)
                except InvenioFileDownloadError as err:
                    message = "URL could not be opened: " + link
                    write_message(message)
                    write_message(str(err))
                    write_message(traceback.format_exc()[:-1])
                    task_update_status("CERROR")
                    continue
            xml_files = extract_multiple_packages(
                package_list, batch_size,
                delete_zip, new_sources,
                out_folder
            )
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, batch_size, delete_zip,
                                  new_sources, out_folder, feed_location)
    task_update_progress("Converting files 2/3...")
    results = convert_files(xml_files, els, prefix=consyn_files)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    create_collection(batch_size, new_files, new_sources,
                      out_folder, submit)
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
Example #27
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR, logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
        * input_uri - Link to new URI data
            DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
            NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
        * log_dir - Directory to store log files in
        * logging - True or False, default True
    """

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal',
                               user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload and bibindex for " +
                       str(rec_id) + " record", 5)
                _print("** We will upload the following xml code " +
                       repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME + " finished. %s DOIs processed, %s to add"
                         % (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
Example #28
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
    :param asana_key: The Asana API key, by default the value of CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip writing
        to Asana and instead email the instance admin
    :param asana_parent_id: The task ID of the task in Asana to log subtasks to
    :param skip_result_types: Error messages to skip during reporting,
        given as comma-separated values (CSV)
        Possible values: missing, ambiguous, incorrect
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri,))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal',
                               user=SCRIPT_NAME)
    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retreiving DOI data")
        task_update_status("FAILED")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        task_update_status("FAILED")
        return False

    root = tree.getroot()

    try:
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs with have issues with in structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload and bibindex for " +
                       "%s record" % rec_id, 5)
                _print("** We will upload the following xml code %s" %
                       repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s' %
                   (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print("Arxiv IDs without corresponding records: %d"
           % len(problem_dois['missing']), 1)
    _print("Arxiv IDs corresponding to multiple records (duplicates): %d"
           % len(problem_dois['ambiguous']), 1)
    _print("Inspire records with an incorrect DOI: %d"
           % len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    notify_on_errors(problem_dois, log_dir, doi_count, new_count,
                     asana_key, asana_parent_id, skip_results)

    task_update_progress("%s finished. %s DOIs processed, %s to add"
                         % (SCRIPT_NAME, str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True
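
This revision gathers problem DOIs into a dict keyed by missing/ambiguous/incorrect and lets skip_result_types suppress categories when reporting. verify_skip_results and notify_on_errors are not shown in these examples, so the sketch below is only a guess at the parsing idea, not their actual implementation.

VALID_RESULT_TYPES = ('missing', 'ambiguous', 'incorrect')


def parse_skip_results(skip_result_types):
    """Turn a comma-separated string into the set of categories to skip."""
    parts = [part.strip().lower() for part in skip_result_types.split(',')]
    return set(part for part in parts if part in VALID_RESULT_TYPES)


problem_dois = {'missing': [('10.0000/example.1', 'arXiv:0000.00001', '2014-01-01')],
                'ambiguous': [],
                'incorrect': []}
skip_results = parse_skip_results('missing, incorrect')
for category in sorted(problem_dois):
    if category in skip_results:
        continue
    print("%s: %d problem DOI(s)" % (category, len(problem_dois[category])))
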
Example #29
def bst_consyn_harvest(feed_url=None,
                       package=None,
                       feed_file=None,
                       package_list_file=None,
                       batch_size='500',
                       delete_zip='False',
                       submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed_url: string.

    :param package: A path to a zip package.
    :type package: string.

    :param feed_file: A path to an atom feed file.
    :type feed_file: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                       should be submitted by email and uploaded
                       to the FTP server.
    :type submit: string representation of a boolean.
    :param threshold_date: only convert records published after this date.
    :type threshold_date: string in the format YYYY-MM-DD
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        write_message('Warning upload_FTP parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if threshold_date:
        import time
        date_format = "%Y-%m-%d"
        try:
            date = datetime(*(time.strptime(threshold_date, date_format)[0:6]))
            threshold_date = date.strftime('%Y-%m-%d')
        except ValueError:
            write_message('Error threshold_date parameter is not '
                          'in the right format. It should be in '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True, journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(package_list, delete_zip,
                                              new_sources, out_folder)
    elif feed_file:
        entries = parse_feed(feed_file)
        links = [a[0] for a in entries]
        package_list = [a[1] for a in entries]
        package_list = [
            join(CFG_CONSYN_OUT_DIRECTORY, a) for a in package_list
        ]
        for package in package_list:
            task_sleep_now_if_required()
            if not exists(package):
                index = package_list.index(package)
                link = links[index]
                link = link.replace(' ', '%20')
                try:
                    message = ("Downloading %s to %s\n" % (link, package))
                    write_message(message)
                    download_url(link, "zip", package, 5, 60.0)
                    package_list.append(package)
                except InvenioFileDownloadError as err:
                    message = "URL could not be opened: " + link
                    write_message(message)
                    write_message(str(err))
                    write_message(traceback.format_exc()[:-1])
                    task_update_status("CERROR")
                    continue
            xml_files = extract_multiple_packages(package_list, delete_zip,
                                                  new_sources, out_folder)
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)
    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files,
                            els,
                            prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources, out_folder, submit)
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
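
Example #29 validates threshold_date by round-tripping it through time.strptime. For the fixed YYYY-MM-DD format used here, datetime.strptime gives the same validation in one call; a small sketch:

from datetime import datetime


def normalize_threshold_date(threshold_date, date_format="%Y-%m-%d"):
    """Return the date reformatted as YYYY-MM-DD, or None if it is invalid."""
    try:
        return datetime.strptime(threshold_date, date_format).strftime(date_format)
    except ValueError:
        return None


print(normalize_threshold_date("2014-02-28"))  # '2014-02-28'
print(normalize_threshold_date("2014-02-30"))  # None, not a real date
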
Example #30
def download_feed(feed_url, delete_zip, new_sources, directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)

    if not entries:
        return xml_files

    # look what files already exist
    # there are currently O(10^5) files in the directory tree rooted
    # at CFG_CONSYN_OUT_DIRECTORY and it is on AFS and takes upwards
    # of 5 minutes to walk.
    # might make sense to have a db table with already harvested files
    task_sleep_now_if_required()
    allfilenames = find_names_of_existing_files(CFG_CONSYN_OUT_DIRECTORY)
    task_sleep_now_if_required()

    for fileUrl, fileName in entries:
        if fileName in allfilenames:
            write_message("Not downloading %s, found file with same name in %s"
                          % (fileName, CFG_CONSYN_OUT_DIRECTORY,))
            continue
        task_sleep_now_if_required()

        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        fileUrl = fileUrl.replace(' ', '%20')
        try:
            write_message("Downloading %s to %s\n" % (fileUrl,
                                                      outFilename))
            download_url(fileUrl, "zip", outFilename, 5, 60.0)
            new_sources.append(outFilename)
        except InvenioFileDownloadError as err:
            _errors_detected.append(err)
            write_message("URL could not be opened: %s" % fileUrl)
            write_message(str(err))
            write_message(traceback.format_exc()[:-1])
            task_update_status("CERROR")
            continue
        try:
            xml_files.extend(extractAll(outFilename,
                                        delete_zip,
                                        directory))
        except BadZipfile as err:
            _errors_detected.append(err)
            write_message("Error BadZipfile %s", (outFilename,))
            task_update_status("CERROR")
            remove(outFilename)

    return xml_files
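
All of the examples above rely on Invenio's BibTask helpers (write_message, task_update_status, task_update_progress, task_sleep_now_if_required). To exercise the snippets outside a BibSched worker, minimal stand-in stubs such as the following can be used; they only print and are not the Invenio implementations.

import sys


def write_message(msg, stream=sys.stdout, verbose=0):
    """Stand-in: just print the message (the real helper logs to the task)."""
    stream.write("%s\n" % msg)


def task_update_status(status):
    write_message("[task status] %s" % status)


def task_update_progress(progress):
    write_message("[task progress] %s" % progress)


def task_sleep_now_if_required(can_stop_too=False):
    # Stand-in: the real helper honours BibSched sleep/stop requests.
    pass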