def plotextractor_harvest(identifier, active_file):
    """
    Function that calls plotextractor library to download and extract
    tarball and fulltext pdf for each record.

    A working directory named after active_file with the suffix
    "_plotextraction" is created next to active_file and passed to
    harvest_single as the download location.

    Failures are not raised: each failed step appends a line to the error
    message and sets the exitcode to 1, while the remaining steps still run.

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: exitcode, errormessages and the generated MARCXML for fulltext
        and plots as a tuple
        (exitcode, err_msg, fulltext_xml, plotextracted_xml).
        fulltext_xml is an FFT datafield string and plotextracted_xml is the
        content of the file produced by process_single; either one is None
        when the corresponding step failed.
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None

    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)

    if tarball is not None:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is not None:
            # Context manager guarantees the handle is closed even when
            # read() raises.
            with open(plotextracted_xml_path, 'r') as plotsxml_fd:
                plotextracted_xml = plotsxml_fd.read()
        else:
            err_msg += "Error extracting plots from id: %s %s\n" % \
                       (identifier, tarball)
            exitcode = 1
    else:
        err_msg += "Error harvesting plots from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1

    if pdf is not None:
        # NOTE(review): pdf is inserted verbatim, without XML escaping --
        # confirm that harvested paths can never contain '&' or '<'.
        fulltext_xml = ('<datafield tag="FFT" ind1=" " ind2=" ">'
                        + '<subfield code="a">' + pdf + '</subfield>'
                        + '<subfield code="t"></subfield>'
                        + '</datafield>')
    else:
        err_msg += "Error harvesting fulltext from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1

    return exitcode, err_msg, fulltext_xml, plotextracted_xml
def plotextractor_harvest(identifier, active_file):
    """
    Function that calls plotextractor library to download and extract
    tarball and fulltext pdf for each record.

    The download location handed to harvest_single is a directory created
    beside active_file, named after it with "_plotextraction" appended.

    No exception is raised for a failed step; instead a line is appended to
    the error message, the exitcode becomes 1, and processing continues with
    the next step.

    @param identifier: OAI identifier of the record to harvest
    @param active_file: path to the currently processed file

    @return: exitcode, errormessages and the generated MARCXML for fulltext
        and plots as a tuple
        (exitcode, err_msg, fulltext_xml, plotextracted_xml).
        fulltext_xml is an FFT datafield string and plotextracted_xml is the
        content of the file produced by process_single; each is None when
        its step failed.
    """
    err_msg = ""
    exitcode = 0
    plotextracted_xml = None
    fulltext_xml = None

    active_dir, active_name = os.path.split(active_file)
    extract_path = make_single_directory(active_dir,
                                         active_name + "_plotextraction")
    tarball, pdf = harvest_single(identifier, extract_path)

    if tarball is not None:
        plotextracted_xml_path = process_single(tarball, clean=True)
        if plotextracted_xml_path is not None:
            # Use a context manager so the file is closed even if read()
            # fails part-way.
            with open(plotextracted_xml_path, 'r') as plotsxml_fd:
                plotextracted_xml = plotsxml_fd.read()
        else:
            err_msg += "Error extracting plots from id: %s %s\n" % \
                       (identifier, tarball)
            exitcode = 1
    else:
        err_msg += "Error harvesting plots from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1

    if pdf is not None:
        # NOTE(review): the pdf value is concatenated as-is, with no XML
        # escaping -- verify that it cannot contain markup characters.
        fulltext_xml = ('<datafield tag="FFT" ind1=" " ind2=" ">'
                        + '<subfield code="a">' + pdf + '</subfield>'
                        + '<subfield code="t"></subfield>'
                        + '</datafield>')
    else:
        err_msg += "Error harvesting fulltext from id: %s %s\n" % \
                   (identifier, extract_path)
        exitcode = 1

    return exitcode, err_msg, fulltext_xml, plotextracted_xml
def test_harvest_single(self):
    """plotextractor - check harvest_single

    Calls harvest_single for one fixed arXiv identifier, using /tmp as the
    target directory and requesting both the pdf and the tarball, then
    checks that neither returned value is None.
    """
    record_id = 'arXiv:1204.6260'
    target_dir = '/tmp'
    requested = ('pdf', 'tarball')

    harvested = harvest_single(record_id, target_dir, requested)
    # The result unpacks as (tarball, pdf), in that order.
    tarball, pdf = harvested

    self.assertTrue(pdf is not None, "PDF is of unknown type")
    self.assertTrue(tarball is not None, "Tarball is of unknown type")