コード例 #1
0
 def test_extract_captions(self):
     '''Test extracting captions'''
     if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False):
         return True
     from file_ops import untar
     from file_ops import convert_images
     from invenio_tools import extract_captions
     from invenio_tools import prepare_image_data
     from invenio_tools import extract_context
     # Assuming the previous tests succeeded, we know we can extract
     # the images fro the archive
     archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
     tex, imgs, sdir = untar(archive)
     # Now use the update machinery to convert the images to PNGs
     converted_images = convert_images(imgs)
     # Pick the right TeX file
     tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0]
     # Extract captions
     TMP = self.app.config.get('GRAPHICS_TMP_DIR')
     im_data = extract_captions(tex_file, TMP, converted_images)
     # Did we get what we expected
     expected = ('', 'noimgDistance to M~51', '')
     self.assertEqual(im_data[0], expected)
     # Check cleaned data
     cleaned = prepare_image_data(im_data, tex_file, converted_images)
     self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png')
     self.assertEqual(cleaned[-1][1], 'figure05.ps')
     self.assertEqual(cleaned[-1][2], '')
     # Check extracted context
     context = extract_context(tex_file, cleaned)
     expected = ('', 'noimgDistance to M~51', '', [])
     self.assertEqual(context[0], expected)
     # Cleanup the extracted data
     extract_dir = "%s/NN" % TMP
     shutil.rmtree(extract_dir)
コード例 #2
0
 def test_extract_captions(self):
     '''Test extracting captions'''
     if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False):
         return True
     from file_ops import untar
     from file_ops import convert_images
     from invenio_tools import extract_captions
     from invenio_tools import prepare_image_data
     from invenio_tools import extract_context
     # Assuming the previous tests succeeded, we know we can extract
     # the images fro the archive
     archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME
     tex, imgs, sdir = untar(archive)
     # Now use the update machinery to convert the images to PNGs
     converted_images = convert_images(imgs)
     # Pick the right TeX file
     tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0]
     # Extract captions
     TMP = self.app.config.get('GRAPHICS_TMP_DIR')
     im_data = extract_captions(tex_file, TMP, converted_images)
     # Did we get what we expected
     expected = ('', 'noimgDistance to M~51', '')
     self.assertEqual(im_data[0], expected)
     # Check cleaned data
     cleaned = prepare_image_data(im_data, tex_file, converted_images)
     self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png')
     self.assertEqual(cleaned[-1][1], 'figure05.ps')
     self.assertEqual(cleaned[-1][2], '')
     # Check extracted context
     context = extract_context(tex_file, cleaned)
     expected = ('', 'noimgDistance to M~51', '', [])
     self.assertEqual(context[0], expected)
     # Cleanup the extracted data
     extract_dir = "%s/NN" % TMP
     shutil.rmtree(extract_dir)
コード例 #3
0
    try:
        img_files, converted_images = file_ops.convert_images(img_files)
    except Exception, exc:
        sys.stderr.write('Image conversion barfed for %s. Skipping.\n' %
                         bibcode)
        # Remove the temporary directory
        try:
            shutil.rmtree(xdir)
        except:
            pass
        return
    # We now have a list with successfully converted (PNG) images
    extracted_image_data = []
    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, xdir,
                                                       img_files)
        if not partly_extracted_image_data == []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(
                partly_extracted_image_data, tex_file, converted_images)

            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend(
                (extract_context(tex_file, cleaned_image_data)))
    extracted_image_data = remove_dups(extracted_image_data)
    fid = 1
    source2target = {}
    for item in extracted_image_data:
        if not os.path.exists(item[0]) or not item[0].strip():
            continue
        fig_data = {}
コード例 #4
0
    # removed from the list of originals
    try:
        img_files, converted_images = file_ops.convert_images(img_files)
    except Exception, exc:
        sys.stderr.write("Image conversion barfed for %s. Skipping.\n" % bibcode)
        # Remove the temporary directory
        try:
            shutil.rmtree(xdir)
        except:
            pass
        return
    # We now have a list with successfully converted (PNG) images
    extracted_image_data = []
    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, xdir, img_files)
        if not partly_extracted_image_data == []:
            # Add proper filepaths and do various cleaning
            cleaned_image_data = prepare_image_data(partly_extracted_image_data, tex_file, converted_images)

            # Using prev. extracted info, get contexts for each image found
            extracted_image_data.extend((extract_context(tex_file, cleaned_image_data)))
    extracted_image_data = remove_dups(extracted_image_data)
    fid = 1
    source2target = {}
    for item in extracted_image_data:
        if not os.path.exists(item[0]) or not item[0].strip():
            continue
        fig_data = {}
        if arx_id.find("arXiv") > -1:
            figure_id = "arxiv%s_f%s" % (arx_id.replace("arXiv:", ""), fid)