def plotextractor_harvest(identifier, active_file):
    """
    Call the plotextractor library to download and extract the tarball
    and the fulltext pdf of a single record.

    The extraction path is obtained from make_single_directory(), called
    with the directory of active_file and the file name of active_file
    suffixed with "_plotextraction".

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: tuple (exitcode, err_msg, fulltext_xml, plotextracted_xml)
             - exitcode: 0 if every step succeeded, 1 otherwise
             - err_msg: one newline-terminated line per failed step
               (empty string when nothing failed)
             - fulltext_xml: FFT datafield string referencing the pdf,
               or None if no pdf was harvested
             - plotextracted_xml: content of the file whose path is
               returned by process_single(), or None if unavailable
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None
    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)
    if tarball is not None:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is not None:
            # The context manager closes the file even if read() raises,
            # which the manual open()/close() pair did not guarantee.
            with open(plotextracted_xml_path, 'r') as plotsxml_fd:
                plotextracted_xml = plotsxml_fd.read()
        else:
            err_msg += "Error extracting plots from id: %s %s\n" % \
                     (identifier, tarball)
            exitcode = 1
    else:
        err_msg += "Error harvesting plots from id: %s %s\n" % \
                     (identifier, extract_path)
        exitcode = 1

    # The fulltext step is evaluated independently of the plot step, so a
    # single call can report both failures in err_msg.
    if pdf is not None:
        # NOTE(review): pdf is inserted verbatim; a path containing XML
        # special characters (&, <) would produce malformed XML -- confirm
        # whether callers need it escaped.
        fulltext_xml = '<datafield tag="FFT" ind1=" " ind2=" ">' + \
                   '<subfield code="a">' + pdf + '</subfield>' + \
                   '<subfield code="t"></subfield>' + \
                   '</datafield>'
    else:
        err_msg += "Error harvesting fulltext from id: %s %s\n" % \
                     (identifier, extract_path)
        exitcode = 1
    return exitcode, err_msg, fulltext_xml, plotextracted_xml
def plotextractor_harvest(identifier, active_file):
    """
    Call the plotextractor library to download and extract the tarball
    and the fulltext pdf of a single record.

    The extraction path is obtained from make_single_directory(), called
    with the directory of active_file and the file name of active_file
    suffixed with "_plotextraction".

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: tuple (exitcode, err_msg, fulltext_xml, plotextracted_xml)
             - exitcode: 0 if every step succeeded, 1 otherwise
             - err_msg: one newline-terminated line per failed step
               (empty string when nothing failed)
             - fulltext_xml: FFT datafield string referencing the pdf,
               or None if no pdf was harvested
             - plotextracted_xml: content of the file whose path is
               returned by process_single(), or None if unavailable
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None
    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)
    if tarball is not None:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is not None:
            # Read through a context manager so the descriptor is released
            # even when read() raises; the manual close() was skipped then.
            with open(plotextracted_xml_path, 'r') as plotsxml_fd:
                plotextracted_xml = plotsxml_fd.read()
        else:
            err_msg += "Error extracting plots from id: %s %s\n" % \
                     (identifier, tarball)
            exitcode = 1
    else:
        err_msg += "Error harvesting plots from id: %s %s\n" % \
                     (identifier, extract_path)
        exitcode = 1

    # The fulltext check runs regardless of the plot outcome, so both
    # error lines can be accumulated in one call.
    if pdf is not None:
        # NOTE(review): pdf is concatenated unescaped into the XML string;
        # confirm that harvested paths can never contain &, < or >.
        fulltext_xml = '<datafield tag="FFT" ind1=" " ind2=" ">' + \
                   '<subfield code="a">' + pdf + '</subfield>' + \
                   '<subfield code="t"></subfield>' + \
                   '</datafield>'
    else:
        err_msg += "Error harvesting fulltext from id: %s %s\n" % \
                     (identifier, extract_path)
        exitcode = 1
    return exitcode, err_msg, fulltext_xml, plotextracted_xml
 def test_harvest_single(self):
     """plotextractor - check harvest_single

     Calls harvest_single() for 'arXiv:1204.6260' with '/tmp' and the
     ('pdf', 'tarball') tuple, then checks that neither returned value
     is None.
     """
     wanted = ('pdf', 'tarball')
     harvested = harvest_single('arXiv:1204.6260', '/tmp', wanted)
     # harvest_single() hands back the pair in (tarball, pdf) order.
     tarball_path, pdf_path = harvested
     self.assertTrue(pdf_path is not None, "PDF is of unknown type")
     self.assertTrue(tarball_path is not None, "Tarball is of unknown type")
Beispiel #4
0
 def test_harvest_single(self):
     """plotextractor - check harvest_single

     Harvests record 'arXiv:1204.6260' with '/tmp' as second argument and
     asserts that both the pdf and the tarball results are not None.
     """
     record_id = 'arXiv:1204.6260'
     target_dir = '/tmp'
     tarball, pdf = harvest_single(record_id, target_dir,
                                   ('pdf', 'tarball'))
     # Evaluate both conditions up front; the assertions keep the
     # original order (pdf first, then tarball).
     pdf_found = pdf is not None
     tarball_found = tarball is not None
     self.assertTrue(pdf_found, "PDF is of unknown type")
     self.assertTrue(tarball_found, "Tarball is of unknown type")