def test_extract_captions(self): '''Test extracting captions''' if not self.app.config.get('GRAPHICS_ENABLE_UPDATES', False): return True from file_ops import untar from file_ops import convert_images from invenio_tools import extract_captions from invenio_tools import prepare_image_data from invenio_tools import extract_context # Assuming the previous tests succeeded, we know we can extract # the images fro the archive archive = "%s/tests/stubdata/arXiv/YY/NN.tar.gz" % PROJECT_HOME tex, imgs, sdir = untar(archive) # Now use the update machinery to convert the images to PNGs converted_images = convert_images(imgs) # Pick the right TeX file tex_file = [f for f in tex if f.split('.')[-1] == 'tex'][0] # Extract captions TMP = self.app.config.get('GRAPHICS_TMP_DIR') im_data = extract_captions(tex_file, TMP, converted_images) # Did we get what we expected expected = ('', 'noimgDistance to M~51', '') self.assertEqual(im_data[0], expected) # Check cleaned data cleaned = prepare_image_data(im_data, tex_file, converted_images) self.assertEqual(os.path.basename(cleaned[-1][0]), 'figure09.png') self.assertEqual(cleaned[-1][1], 'figure05.ps') self.assertEqual(cleaned[-1][2], '') # Check extracted context context = extract_context(tex_file, cleaned) expected = ('', 'noimgDistance to M~51', '', []) self.assertEqual(context[0], expected) # Cleanup the extracted data extract_dir = "%s/NN" % TMP shutil.rmtree(extract_dir)
try: img_files, converted_images = file_ops.convert_images(img_files) except Exception, exc: sys.stderr.write('Image conversion barfed for %s. Skipping.\n' % bibcode) # Remove the temporary directory try: shutil.rmtree(xdir) except: pass return # We now have a list with successfully converted (PNG) images extracted_image_data = [] for tex_file in tex_files: # Extract images, captions and labels partly_extracted_image_data = extract_captions(tex_file, xdir, img_files) if not partly_extracted_image_data == []: # Add proper filepaths and do various cleaning cleaned_image_data = prepare_image_data( partly_extracted_image_data, tex_file, converted_images) # Using prev. extracted info, get contexts for each image found extracted_image_data.extend( (extract_context(tex_file, cleaned_image_data))) extracted_image_data = remove_dups(extracted_image_data) fid = 1 source2target = {} for item in extracted_image_data: if not os.path.exists(item[0]) or not item[0].strip(): continue fig_data = {}
# removed from the list of originals try: img_files, converted_images = file_ops.convert_images(img_files) except Exception, exc: sys.stderr.write("Image conversion barfed for %s. Skipping.\n" % bibcode) # Remove the temporary directory try: shutil.rmtree(xdir) except: pass return # We now have a list with successfully converted (PNG) images extracted_image_data = [] for tex_file in tex_files: # Extract images, captions and labels partly_extracted_image_data = extract_captions(tex_file, xdir, img_files) if not partly_extracted_image_data == []: # Add proper filepaths and do various cleaning cleaned_image_data = prepare_image_data(partly_extracted_image_data, tex_file, converted_images) # Using prev. extracted info, get contexts for each image found extracted_image_data.extend((extract_context(tex_file, cleaned_image_data))) extracted_image_data = remove_dups(extracted_image_data) fid = 1 source2target = {} for item in extracted_image_data: if not os.path.exists(item[0]) or not item[0].strip(): continue fig_data = {} if arx_id.find("arXiv") > -1: figure_id = "arxiv%s_f%s" % (arx_id.replace("arXiv:", ""), fid)