Example #1
0
def get_defaults(tarball, sdir, refno_url):
    """
    Work out the scratch directory and the reference number for a tarball.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for untarring,
        conversions, and the ultimate destination of the MARCXML; None
        selects CFG_TMPDIR
    @param: refno_url (string): server location on where to look for refno;
        an empty string skips the lookup entirely

    @return sdir, refno (string, string): the value returned by
        make_single_directory for the per-tarball subdirectory, and the
        reference number (the tarball's base name when the lookup is
        skipped or returns None)
    """
    tarball_dir, tarball_name = os.path.split(tarball)

    if sdir is None:
        # Missing sdir: using default directory: CFG_TMPDIR
        sdir = CFG_TMPDIR
    else:
        # NOTE(review): a caller-supplied sdir is not used as given; it is
        # replaced by the directory that contains the tarball. Behaviour is
        # kept unchanged here -- confirm that this is intended.
        sdir = tarball_dir

    # make a subdir in the scratch directory for each tarball
    sdir = make_single_directory(sdir, tarball_name + '_' + PLOTS_DIR)

    if refno_url != "":
        refno = get_reference_number(tarball, refno_url)
        if refno is None:
            # Fall back to the tarball's file name so the caller still gets
            # a usable identifier.
            refno = tarball_name
            write_message('Error: can\'t find record id for %s' % (refno,))
    else:
        refno = tarball_name
        write_message("Skipping ref-no check")
    return sdir, refno
Example #2
0
def get_defaults(tarball, sdir):
    """
    Work out the scratch directory and the reference number for a tarball.

    @param: tarball (string): the location of the tarball to be extracted
    @param: sdir (string): the location of the scratch directory for untarring,
        conversions, and the ultimate destination of the MARCXML; None
        selects CFG_TMPDIR

    @return sdir, refno (string, string): the value returned by
        make_single_directory for the per-tarball subdirectory, and whatever
        get_reference_number returned for the tarball
    """
    tarball_dir, arXiv_id = os.path.split(tarball)

    if sdir is None:
        write_message('using default directory: ' + CFG_TMPDIR +
                      ' for scratchwork')
        sdir = CFG_TMPDIR
    else:
        # NOTE(review): a caller-supplied sdir is not used as given; it is
        # replaced by the directory that contains the tarball. Behaviour is
        # kept unchanged here -- confirm that this is intended.
        sdir = tarball_dir

    # make a subdir in the scratch directory for each tarball
    sdir = make_single_directory(sdir, arXiv_id + '_' + PLOTS_DIR)

    refno = get_reference_number(tarball)
    # presumably get_reference_number hands the tarball path back when it
    # finds no record id -- verify against that helper
    if refno == tarball:
        write_message('can\'t find record id for ' + arXiv_id)

    return sdir, refno
def plotextractor_harvest(identifier, active_file):
    """
    Function that calls plotextractor library to download and extract tarball
    and fulltext pdf for each record.

    A failure in the plot step does not stop the fulltext step: both are
    attempted and every failure is appended to the error message.

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: a tuple (exitcode, err_msg, fulltext_xml, plotextracted_xml):
             exitcode is 0 on success and 1 if any step failed, err_msg holds
             one line per failure, fulltext_xml is an FFT datafield string
             pointing at the harvested pdf (None if no pdf), and
             plotextracted_xml is the content of the file produced by
             process_single (None if unavailable)
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None
    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)

    if tarball is not None:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is not None:
            # 'with' closes the file even if read() raises.
            with open(plotextracted_xml_path, 'r') as plotsxml_fd:
                plotextracted_xml = plotsxml_fd.read()
        else:
            err_msg += ("Error extracting plots from id: %s %s\n" %
                        (identifier, tarball))
            exitcode = 1
    else:
        err_msg += ("Error harvesting plots from id: %s %s\n" %
                    (identifier, extract_path))
        exitcode = 1

    if pdf is not None:
        # NOTE(review): pdf is inserted unescaped; confirm harvested paths
        # can never contain XML special characters such as '&' or '<'.
        fulltext_xml = ('<datafield tag="FFT" ind1=" " ind2=" ">'
                        '<subfield code="a">' + pdf + '</subfield>'
                        '<subfield code="t"></subfield>'
                        '</datafield>')
    else:
        err_msg += ("Error harvesting fulltext from id: %s %s\n" %
                    (identifier, extract_path))
        exitcode = 1
    return exitcode, err_msg, fulltext_xml, plotextracted_xml
def plotextractor_harvest(identifier, active_file):
    """
    Function that calls plotextractor library to download and extract tarball
    and fulltext pdf for each record.

    The plot extraction and the fulltext handling are independent: a failure
    in one is recorded and the other is still attempted.

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: exitcode, errormessages and generated MARCXML for fulltext and
             plots as a tuple
             (exitcode, err_msg, fulltext_xml, plotextracted_xml).
             exitcode is 1 as soon as any step fails, otherwise 0;
             fulltext_xml and plotextracted_xml are None when the
             corresponding step produced nothing.
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None

    # Work in a dedicated directory next to the file being processed.
    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)

    if tarball is None:
        err_msg += ("Error harvesting plots from id: %s %s\n" %
                    (identifier, extract_path))
        exitcode = 1
    else:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is None:
            err_msg += ("Error extracting plots from id: %s %s\n" %
                        (identifier, tarball))
            exitcode = 1
        else:
            # Context manager guarantees the descriptor is released even
            # when reading fails.
            with open(plotextracted_xml_path, 'r') as plotsxml_fd:
                plotextracted_xml = plotsxml_fd.read()

    if pdf is None:
        err_msg += ("Error harvesting fulltext from id: %s %s\n" %
                    (identifier, extract_path))
        exitcode = 1
    else:
        # NOTE(review): pdf is concatenated into the XML without escaping;
        # confirm harvested paths never contain '&', '<' or '>'.
        fulltext_xml = ('<datafield tag="FFT" ind1=" " ind2=" ">'
                        '<subfield code="a">' + pdf + '</subfield>'
                        '<subfield code="t"></subfield>'
                        '</datafield>')
    return exitcode, err_msg, fulltext_xml, plotextracted_xml