def test_extract_captions(self):
    '''Test caption extraction, image-data cleaning and context extraction.

    The test only does work when ``GRAPHICS_ENABLE_UPDATES`` is truthy in
    the application config; otherwise it returns ``True`` immediately.

    The stub archive is passed through ``untar`` and ``convert_images``,
    and the results of ``extract_captions``, ``prepare_image_data`` and
    ``extract_context`` are compared against known values. The ``NN``
    directory under ``GRAPHICS_TMP_DIR`` is removed at the end, also when
    one of the steps or assertions fails.
    '''
    if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False):
        return True
    from file_ops import untar
    from file_ops import convert_images
    from invenio_tools import extract_captions
    from invenio_tools import prepare_image_data
    from invenio_tools import extract_context
    # Assuming the previous tests succeeded, we know we can extract
    # the images from the archive
    archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
    TMP = self.app.config.get('GRAPHICS_TMP_DIR')
    # Determined up front so the cleanup below can run no matter where
    # the test stops.
    extract_dir = "%s/NN" % TMP
    try:
        tex, imgs, _ = untar(archive)
        # Now use the update machinery to convert the images to PNGs
        converted_images = convert_images(imgs)
        # Pick the right TeX file
        tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0]
        # Extract captions
        im_data = extract_captions(tex_file, TMP, converted_images)
        # Did we get what we expected
        expected = ('', 'noimgDistance to M~51', '')
        self.assertEqual(im_data[0], expected)
        # Check cleaned data
        cleaned = prepare_image_data(im_data, tex_file, converted_images)
        self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png')
        self.assertEqual(cleaned[-1][1], 'figure05.ps')
        self.assertEqual(cleaned[-1][2], '')
        # Check extracted context
        context = extract_context(tex_file, cleaned)
        expected = ('', 'noimgDistance to M~51', '', [])
        self.assertEqual(context[0], expected)
    finally:
        # Cleanup the extracted data, even after a failed assertion, so a
        # broken run does not leave files behind for the next one. The
        # existence check keeps a failure that happened before extraction
        # from being masked by an error raised here.
        if os.path.isdir(extract_dir):
            shutil.rmtree(extract_dir)
Esempio n. 2
0
 def test_extract_captions(self):
     '''Test caption extraction, image-data cleaning and context extraction.

     The test only does work when ``GRAPHICS_ENABLE_UPDATES`` is truthy in
     the application config; otherwise it returns ``True`` immediately.

     The stub archive is passed through ``untar`` and ``convert_images``,
     and the results of ``extract_captions``, ``prepare_image_data`` and
     ``extract_context`` are compared against known values. The ``NN``
     directory under ``GRAPHICS_TMP_DIR`` is removed at the end, also when
     one of the steps or assertions fails.
     '''
     if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False):
         return True
     from file_ops import untar
     from file_ops import convert_images
     from invenio_tools import extract_captions
     from invenio_tools import prepare_image_data
     from invenio_tools import extract_context
     # Assuming the previous tests succeeded, we know we can extract
     # the images from the archive
     archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
     TMP = self.app.config.get('GRAPHICS_TMP_DIR')
     # Determined up front so the cleanup below can run no matter where
     # the test stops.
     extract_dir = "%s/NN" % TMP
     try:
         tex, imgs, _ = untar(archive)
         # Now use the update machinery to convert the images to PNGs
         converted_images = convert_images(imgs)
         # Pick the right TeX file
         tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0]
         # Extract captions
         im_data = extract_captions(tex_file, TMP, converted_images)
         # Did we get what we expected
         expected = ('', 'noimgDistance to M~51', '')
         self.assertEqual(im_data[0], expected)
         # Check cleaned data
         cleaned = prepare_image_data(im_data, tex_file, converted_images)
         self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png')
         self.assertEqual(cleaned[-1][1], 'figure05.ps')
         self.assertEqual(cleaned[-1][2], '')
         # Check extracted context
         context = extract_context(tex_file, cleaned)
         expected = ('', 'noimgDistance to M~51', '', [])
         self.assertEqual(context[0], expected)
     finally:
         # Cleanup the extracted data, even after a failed assertion, so a
         # broken run does not leave files behind for the next one. The
         # existence check keeps a failure that happened before extraction
         # from being masked by an error raised here.
         if os.path.isdir(extract_dir):
             shutil.rmtree(extract_dir)
Esempio n. 3
0
            pass
        return
    # We now have a list with successfully converted (PNG) images
    extracted_image_data = []
    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, xdir,
                                                       img_files)
        if not partly_extracted_image_data == []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(
                partly_extracted_image_data, tex_file, converted_images)

            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend(
                (extract_context(tex_file, cleaned_image_data)))
    extracted_image_data = remove_dups(extracted_image_data)
    fid = 1
    source2target = {}
    for item in extracted_image_data:
        if not os.path.exists(item[0]) or not item[0].strip():
            continue
        fig_data = {}
        if arx_id.find('arXiv') > -1:
            figure_id = 'arxiv%s_f%s' % (arx_id.replace('arXiv:', ''), fid)
            subdir = arx_id.replace('arXiv:', '').split('.')[0]
            eprdir = arx_id.replace('arXiv:', '').split('.')[1]
        else:
            figure_id = '%s_f%s' % (arx_id.replace('/', '_'), fid)
            subdir = arx_id.split('/')[1][:4]
            eprdir = arx_id.split('/')[1][4:]
Esempio n. 4
0
        try:
            shutil.rmtree(xdir)
        except:
            pass
        return
    # We now have a list with successfully converted (PNG) images
    extracted_image_data = []
    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, xdir, img_files)
        if not partly_extracted_image_data == []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(partly_extracted_image_data, tex_file, converted_images)

            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend((extract_context(tex_file, cleaned_image_data)))
    extracted_image_data = remove_dups(extracted_image_data)
    fid = 1
    source2target = {}
    for item in extracted_image_data:
        if not os.path.exists(item[0]) or not item[0].strip():
            continue
        fig_data = {}
        if arx_id.find("arXiv") > -1:
            figure_id = "arxiv%s_f%s" % (arx_id.replace("arXiv:", ""), fid)
            subdir = arx_id.replace("arXiv:", "").split(".")[0]
            eprdir = arx_id.replace("arXiv:", "").split(".")[1]
        else:
            figure_id = "%s_f%s" % (arx_id.replace("/", "_"), fid)
            subdir = arx_id.split("/")[1][:4]
            eprdir = arx_id.split("/")[1][4:]