def test_dup_images(self):
    """plotextractor - remove_dups images"""
    # Same image, label and fourth field; only the captions differ.
    images_and_captions = [
        ('img1', 'caption1', 'label1', 'FIXME1'),
        ('img1', 'caption2', 'label1', 'FIXME1'),
    ]
    expected = [('img1', 'caption1 : caption2', 'label1', 'FIXME1')]

    pared_images_and_captions = remove_dups(images_and_captions)

    # assertEqual (rather than assertTrue(a == b)) lets the runner report
    # both operands when the comparison fails.
    self.assertEqual(pared_images_and_captions, expected,
                     "didn't merge captions correctly")
def test_dup_captions(self):
    """plotextractor - remove_dups captions"""
    # 'caption1' appears twice for the same image and must be kept only once
    # in the merged caption.
    images_and_captions = [
        ('img1', 'caption1'),
        ('img1', 'caption1'),
        ('img1', 'caption2'),
    ]
    expected = [('img1', 'caption1 : caption2')]

    pared_images_and_captions = remove_dups(images_and_captions)

    # A bare ``assert`` is compiled away under ``python -O``, which would
    # make this test pass unconditionally; use the TestCase assertion.
    self.assertEqual(pared_images_and_captions, expected,
                     "didn't merge captions correctly")
def test_no_dups(self):
    """plotextractor - remove_dups no dupes"""
    images_and_captions = [
        ('img1', 'caption1', 'label1', 'FIXME1'),
        ('img2', 'caption2', 'label1', 'FIXME1'),
    ]
    # Snapshot the expectation before the call: comparing the result with
    # the very list that was passed in could not detect an in-place
    # modification of the input.
    expected = list(images_and_captions)

    pared_images_and_captions = remove_dups(images_and_captions)

    self.assertEqual(pared_images_and_captions, expected, 'removed nondup')
def test_dup_captions(self):
    """plotextractor - remove_dups captions"""
    # Two identical entries plus one with a different caption, all for the
    # same image: the repeated caption must appear only once after merging.
    images_and_captions = [
        ('img1', 'caption1', 'label1', 'FIXME1'),
        ('img1', 'caption1', 'label1', 'FIXME1'),
        ('img1', 'caption2', 'label1', 'FIXME1'),
    ]
    expected = [('img1', 'caption1 : caption2', 'label1', 'FIXME1')]

    pared_images_and_captions = remove_dups(images_and_captions)

    # assertEqual (rather than assertTrue(a == b)) lets the runner report
    # both operands when the comparison fails.
    self.assertEqual(pared_images_and_captions, expected,
                     "didn't merge captions correctly")
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False,
                   upload_plots=False, force=False, squash="",
                   yes_i_know=False, refno_url="",
                   clean=False):
    """
    Processes one tarball end-to-end.

    @param: tarball (string): the absolute location of the tarball we wish
        to process
    @param: sdir (string): where we should put all the intermediate files for
        the processing.  if you're uploading, this directory should be one of
        the ones specified in CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS, else
        the upload won't work
    @param: xtract_text (boolean): true iff you want to run pdftotext on the
        pdf versions of the tarfiles.  this programme assumes that the pdfs
        are named the same as the tarballs but with a .pdf extension.
    @param: upload_plots (boolean): true iff you want to bibupload the plots
        extracted by this process
    @param: force (boolean): force creation of new xml file
    @param: squash: write MARCXML output into a specified 'squash' file
        instead of single files.
    @param: yes_i_know: if True, no user interaction if upload_plots is True
    @param: refno_url: URL to the invenio-instance to query for refno.
    @param: clean: if True, everything except the original tarball, plots and
        context-files will be removed

    @return: marc_name (string): path to the MARCXML file written to (the
        squash file when one was given).  None is returned instead when the
        tarball extraction times out or when no TeX files are found in it.
    """
    sub_dir, refno = get_defaults(tarball, sdir, refno_url)

    if not squash:
        marc_name = os.path.join(sub_dir, '%s.xml' % (refno,))
        if force or not os.path.exists(marc_name):
            # Start a fresh per-tarball file with the collection header.
            with open(marc_name, 'w') as marc_fd:
                marc_fd.write(
                    '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
        # NOTE(review): when the file already exists and force is False, the
        # records and the closing tag written further down are appended to
        # the existing file as-is -- confirm that this is intended.
    else:
        marc_name = squash

    if xtract_text:
        extract_text(tarball)

    try:
        extracted_files_list, image_list, tex_files = untar(tarball, sub_dir)
    except Timeout:
        write_message('Timeout during tarball extraction on %s' % (tarball,))
        return None

    # Covers both the empty list and None.
    if not tex_files:
        write_message('%s is not a tarball' % (os.path.split(tarball)[-1],))
        run_shell_command('rm -r %s', (sub_dir,))
        return None

    converted_image_list = convert_images(image_list)
    write_message('converted %d of %d images found for %s'
                  % (len(converted_image_list),
                     len(image_list),
                     os.path.basename(tarball)))

    extracted_image_data = []
    for tex_file in tex_files:
        # Extract images, captions and labels
        partly_extracted_image_data = extract_captions(tex_file, sub_dir,
                                                       converted_image_list)
        if not partly_extracted_image_data:
            continue
        # Add proper filepaths and do various cleaning
        cleaned_image_data = prepare_image_data(partly_extracted_image_data,
                                                tex_file, converted_image_list)
        # Using prev. extracted info, get contexts for each image found
        extracted_image_data.extend(
            extract_context(tex_file, cleaned_image_data))

    extracted_image_data = remove_dups(extracted_image_data)

    if not extracted_image_data:
        write_message('No plots detected in %s' % (refno,))
    else:
        if refno_url == "":
            # No instance was queried for the reference number, so none is
            # handed on to the MARC generation.
            refno = None
        create_contextfiles(extracted_image_data)
        marc_xml = create_MARC(extracted_image_data, tarball, refno)
        if not squash:
            # Per-tarball files are self-contained: close the collection
            # opened by the header above.
            marc_xml += "\n</collection>"
        if marc_name is not None:
            with open(marc_name, 'a') as marc_fd:
                marc_fd.write('%s\n' % (marc_xml,))
            if not squash:
                write_message('generated %s' % (marc_name,))
            if upload_plots:
                upload_to_site(marc_name, yes_i_know)

    if clean:
        clean_up(extracted_files_list, image_list)

    write_message('work complete on %s' % (os.path.split(tarball)[-1],))
    return marc_name
def test_no_dups(self):
    """plotextractor - remove_dups no dupes"""
    images_and_captions = [('img1', 'caption1'), ('img2', 'caption2')]
    # Snapshot the expectation before the call: comparing the result with
    # the very list that was passed in could not detect an in-place
    # modification of the input.
    expected = list(images_and_captions)

    pared_images_and_captions = remove_dups(images_and_captions)

    # A bare ``assert`` is compiled away under ``python -O``, which would
    # make this test pass unconditionally; use the TestCase assertion.
    self.assertEqual(pared_images_and_captions, expected, 'removed nondup')
def test_dup_images(self):
    """plotextractor - remove_dups images"""
    # Same image and label; only the captions differ.
    images_and_captions = [
        ('img1', 'caption1', 'label1'),
        ('img1', 'caption2', 'label1'),
    ]
    expected = [('img1', 'caption1 : caption2', 'label1')]

    pared_images_and_captions = remove_dups(images_and_captions)

    # assertEqual (rather than assertTrue(a == b)) lets the runner report
    # both operands when the comparison fails.
    self.assertEqual(pared_images_and_captions, expected,
                     "didn't merge captions correctly")