def test_wrong_option_name_format(self):
    """Option names lacking the required leading dashes are rejected."""
    malformed_names = (
        # short name must have format '-XXX'
        ('opt-name-wo-dash',),
        # long name must have format '--XXX'
        ('-myproc-opt1', '-myproc-option1'),
    )
    for names in malformed_names:
        with pytest.raises(ValueError):
            Argument(*names)
class CSSCleaner(BaseProcessor):
    """A processor for cleaning up CSS parts of HTML code.

    Normal converters leave CSS inside an HTML document. This
    processor first aggregates these style parts and then puts them
    into an external CSS file, leaving only a link to that file.

    This processor requires HTML/XHTML input.
    """
    prefix = 'css_cleaner'

    args = [
        Argument(
            '-css-cleaner-min', '--css-cleaner-minified',
            type=boolean, default=True,
            metavar='YES|NO',
            help='Whether to minify generated CSS (when handling HTML) '
            'Default: yes',
        ),
        Argument(
            '-css-cleaner-prettify', '--css-cleaner-prettify-html',
            type=boolean, default=False,
            metavar='YES|NO',
            help='Prettify generated HTML (may lead to gaps in '
            'rendered output) Default: no',
        ),
    ]

    #: only files with one of these extensions are processed
    supported_extensions = ['.html', '.xhtml']

    def process(self, path, metadata):
        """Move CSS found in the HTML file at `path` to a separate file.

        Files whose extension is not in `supported_extensions` are
        passed through unchanged. Otherwise the input is copied with
        `copy_to_secure_location`, the original is removed with
        `remove_file_dir`, and the copy is rewritten with the HTML
        returned by `extract_css`. If `cleanup_css` yields CSS (not
        ``None``), it is written next to the HTML file with a ``.css``
        extension.

        Returns a tuple ``(path_of_html_file, metadata)``.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(copy_to_secure_location(path), basename)
        remove_file_dir(path)

        # Read inside a context manager so the handle is closed
        # deterministically instead of being left to the GC.
        with open(src_path, 'rb') as fd:
            html = fd.read().decode('utf-8')

        new_html, css = extract_css(
            html, basename,
            prettify_html=self.options['css_cleaner_prettify_html'])
        # The second return value (errors) is currently not evaluated.
        css, _ = cleanup_css(
            css, minified=self.options['css_cleaner_minified'])

        if css is not None:
            css_file = os.path.splitext(src_path)[0] + '.css'
            with open(css_file, 'wb') as fd:
                fd.write(css.encode('utf-8'))
        with open(src_path, 'wb') as fd:
            fd.write(new_html.encode('utf-8'))
        return src_path, metadata
def test_default_string(self):
    """The default of an argument can be retrieved as a string."""
    expectations = (
        (1, '1'),                # ints
        ('foo', 'foo'),          # strings
        (True, 'yes'),           # bools
        (['a', 'b'], 'a, b'),    # lists
        (('a', 'b'), 'a, b'),    # tuples
    )
    for default, expected in expectations:
        arg = Argument('-my-opt1', None, default=default)
        assert arg.default_string == expected
    # None (no default passed in at all)
    assert Argument('-my-opt1', None).default_string == 'None'
def test_regular(self):
    """Names and extra keywords passed in are kept on the argument."""
    short_name = '-myproc-opt1'
    long_name = '--myproc-option1'
    argument = Argument(short_name, long_name, choice=[1, 2, 3])
    assert argument.short_name == short_name
    assert argument.long_name == long_name
    assert argument.keywords['choice'] == [1, 2, 3]
class HTMLCleaner(BaseProcessor):
    """A processor for cleaning up HTML produced by OO.org.

    Fixes minor issues with HTML code produced by OO.org.

    This processor expects XHTML input.
    """
    prefix = 'html_cleaner'

    args = [
        Argument(
            '-html-cleaner-fix-head-nums',
            '--html-cleaner-fix-heading-numbers',
            type=boolean, default=True,
            metavar='YES|NO',
            help='Whether to fix heading numbers in generated HTML '
            'Default: yes',
        ),
        Argument(
            '-html-cleaner-fix-img-links',
            '--html-cleaner-fix-image-links',
            type=boolean, default=True,
            metavar='YES|NO',
            # This text used to be a verbatim copy of the heading-numbers
            # help and therefore described the wrong option.
            help='Whether to fix image links in generated HTML '
            'Default: yes',
        ),
        Argument(
            '-html-cleaner-fix-sd-fields',
            '--html-cleaner-fix-sd-fields',
            type=boolean, default=True,
            metavar='YES|NO',
            help='Whether to fix SD fields in HTML generated by '
            'LibreOffice. Default: yes',
        ),
    ]

    #: only files with one of these extensions are processed
    supported_extensions = ['.html', '.xhtml']

    def process(self, path, metadata):
        """Clean up the HTML file at `path`.

        Files whose extension is not in `supported_extensions` are
        passed through unchanged. Otherwise the input is copied with
        `copy_to_secure_location`, the original is removed with
        `remove_file_dir`, and the copy is overwritten with the HTML
        returned by `cleanup_html`. Image files in the same directory
        are then renamed according to the name mapping also returned
        by `cleanup_html`.

        Returns a tuple ``(path_of_html_file, metadata)``.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(copy_to_secure_location(path), basename)
        src_dir = os.path.dirname(src_path)
        remove_file_dir(path)

        # Read inside a context manager so the handle is closed
        # deterministically instead of being left to the GC.
        with codecs.open(src_path, 'r', 'utf-8') as fd:
            html = fd.read()

        new_html, img_name_map = cleanup_html(
            html, basename,
            fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
            fix_img_links=self.options['html_cleaner_fix_image_links'],
            fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
        )
        with codecs.open(src_path, 'wb', 'utf-8') as fd:
            fd.write(new_html)
        # Rename images
        self.rename_img_files(src_dir, img_name_map)
        return src_path, metadata

    def rename_img_files(self, src_dir, img_name_map):
        """Rename files in `src_dir` as given by `img_name_map`.

        `img_name_map` maps old file names to new ones. Entries whose
        old file does not exist, or whose new name is already taken,
        are silently skipped.
        """
        for old_img, new_img in img_name_map.items():
            old_path = os.path.join(src_dir, old_img)
            new_path = os.path.join(src_dir, new_img)
            if not os.path.isfile(old_path):
                # XXX: Update error messages
                continue
            if os.path.exists(new_path):
                # XXX: Update error messages
                continue
            shutil.move(old_path, new_path)
class OOConvProcessor(BaseProcessor):
    """A processor that converts office docs into different formats.

    XXX: we could support far more options. See

         http://wiki.services.openoffice.org/wiki/API/Tutorials/
                PDF_export#How_to_use_it_from_OOo_Basic

         only for a list of PDF export options.
    """
    prefix = 'oocp'

    #: mapping: extension <-> format (as accepted by unoconv)
    formats = OUTPUT_FORMATS

    options = {}

    args = [
        Argument(
            '-oocp-out-fmt', '--oocp-output-format',
            choices=OUTPUT_FORMATS.keys(),
            default='html',
            # Trailing space after the first sentence: the two literals
            # are concatenated and used to render "LibreOffice.Pick".
            help=('Output format to create via LibreOffice. '
                  'Pick from: %s' % ', '.join(OUTPUT_FORMATS.keys())),
            metavar='FORMAT',
        ),
        Argument(
            '-oocp-pdf-version', '--oocp-pdf-version',
            type=boolean, default=False,
            metavar='YES|NO',
            help='Create versioned PDF (aka PDF/A)? Default: no',
        ),
        Argument(
            '-oocp-pdf-tagged', '--oocp-pdf-tagged',
            type=boolean, default=False,
            metavar='YES|NO',
            help='Create tagged PDF document? Default: no',
        ),
        Argument('-oocp-host', '--oocp-hostname',
                 default='localhost',
                 help='Host to contact for LibreOffice document '
                 'conversion. Default: "localhost"'),
        Argument(
            '-oocp-port', '--oocp-port',
            type=int, default=2002,
            help='Port of host to contact for LibreOffice document '
            'conversion. Default: 2002',
        ),
    ]

    def _get_filter_props(self):
        """Return a list of ``(name, value)`` export filter properties.

        Properties are only generated for PDF output; for any other
        output format the list is empty.
        """
        props = []
        if self.options['oocp_output_format'] == 'pdf':
            pdf_version = '1' if self.options['oocp_pdf_version'] else '0'
            props.append(("SelectPdfVersion", pdf_version))
            pdf_tagged = '1' if self.options['oocp_pdf_tagged'] else '0'
            props.append(("UseTaggedPDF", pdf_tagged))
        return props

    def process(self, path, metadata):
        """Convert the document at `path` into the requested format.

        The input is copied with `copy_to_secure_location` and the
        original location is removed. The copy is then passed to
        `convert`, whose status is stored in
        ``metadata['oocp_status']``.

        Returns ``(result_path, metadata)``. On a non-zero status the
        working copy is removed, ``metadata['error']`` and
        ``metadata['error-descr']`` are set, and ``None`` is returned
        in place of the path.
        """
        basename = os.path.basename(path)
        src = os.path.join(copy_to_secure_location(path), basename)
        # Remove the original input; for a file path that means its
        # whole containing directory.
        if os.path.isfile(path):
            path = os.path.dirname(path)
        shutil.rmtree(path)

        extension = self.options['oocp_output_format']
        filter_name = self.formats[extension]
        url = 'socket,host=%s,port=%d;urp;StarOffice.ComponentContext' % (
            self.options['oocp_hostname'], self.options['oocp_port'])

        filter_props = self._get_filter_props()
        # The path returned by `convert` is not used; the result path
        # is recomputed from `src` further below.
        status, result_path = convert(
            url=url,
            out_format=filter_name,
            filter_props=filter_props,
            path=src,
            out_dir=os.path.dirname(src),
        )
        metadata['oocp_status'] = status
        if status != 0:
            metadata['error'] = True
            metadata['error-descr'] = 'conversion problem'
            if os.path.isfile(src):
                src = os.path.dirname(src)
            shutil.rmtree(src)
            return None, metadata

        # 'xhtml' results are looked up under an '.html' extension.
        if extension == 'xhtml':
            extension = 'html'
        result_path = '%s.%s' % (os.path.splitext(src)[0], extension)

        # Remove input file if different from output
        if os.path.exists(src):
            if os.path.basename(result_path) != basename:
                os.unlink(src)
        return result_path, metadata
class MetaProcessor(BaseProcessor):
    """The meta processor handles general workflow.

    When getting certain options, it constructs a pipeline of document
    processors.

    The :class:`MetaProcessor` is a kind of processor dispatcher that
    finds, setups and calls all requested processors in the requested
    order.
    """
    #: the meta processor is named 'meta'
    prefix = 'meta'

    #: We support a ``-meta-procord`` option which stands for
    #: ``processororder``. The current default order is:
    #: ``'unzip,oocp,zip'`` which means: maybe unzip the input, then
    #: convert it into HTML and afterwards zip the results.
    args = [
        Argument(
            '-meta-procord', '--meta-processor-order',
            default=string_to_stringtuple(DEFAULT_PROCORDER),
            type=processor_order,
            help='Comma-separated list of processors to run. '
            'Default: "%s"' % DEFAULT_PROCORDER,
            metavar='PROC_LIST',
        ),
    ]

    @property
    def avail_procs(self):
        """Processors registered under ``ulif.openoffice.processors``."""
        return get_entry_points('ulif.openoffice.processors')

    def __init__(self, options=None):
        """Set up the processor with `options`.

        `options` may be an `Options` instance or a dict, which is
        turned into an `Options` instance via ``string_dict``. If
        omitted, an empty dict is used.
        """
        # Imported locally, as in the original code (NOTE(review):
        # presumably to avoid a circular import -- confirm).
        from ulif.openoffice.options import Options
        if options is None:
            # A fresh dict per call instead of a shared mutable default.
            options = {}
        if not isinstance(options, Options):
            options = Options(string_dict=options)
        self.all_options = options
        self.options = options
        self.metadata = {}

    def process(self, input=None, metadata=None):
        """Run all processors defined in options.

        If all processors run successful, the output of the last along
        with (maybe modified) metadata is returned.

        Each processor is fed with the `metadata` dict and an `input`
        (normally a filepath). Feeding a processor means to call its
        `process` method.

        If a processor sets the ``error`` entry of `metadata` to
        ``True`` this indicates some problem and the whole process is
        aborted returning ``None`` as output and the `metadata`, maybe
        containing some smart hints about the reasons.

        If all processors work correctly, the output of the last
        processor is returned along with the last `metadata`.

        The set and order of processors called depends on the
        ``procord`` option passed in. If this option is set to some
        value like ``oocp,oocp`` then the ``oocp`` processor (which is
        the :class:`OOConvProcessor`, registered under ``oocp`` in
        `setup.py`) is called two times.

        If `metadata` is omitted, ``{'error': False}`` is used. A
        passed-in dict is copied and never modified in place.

        .. note:: after each processing, the (then old) input is
                  removed.
        """
        if metadata is None:
            metadata = {'error': False}
        else:
            metadata = metadata.copy()
        pipeline = self._build_pipeline()
        output = None
        for processor in pipeline:
            proc_instance = processor(self.all_options)
            output, metadata = proc_instance.process(input, metadata)
            if metadata['error'] is True:
                metadata = self._handle_error(
                    processor, input, output, metadata)
                return None, metadata
            if input != output:
                remove_file_dir(input)
            input = output
        return input, metadata

    def _handle_error(self, proc, input, output, metadata):
        """Clean up after a failed processor and return `metadata`.

        Sets a generic ``error-descr`` entry naming the processor's
        prefix unless one is already present, and removes both `input`
        and `output` via `remove_file_dir`.
        """
        metadata['error-descr'] = metadata.get(
            'error-descr',
            'problem while processing %s' % proc.prefix)
        remove_file_dir(input)
        remove_file_dir(output)
        return metadata

    def _build_pipeline(self):
        """Build a pipeline of processors according to options.

        Returns a tuple of the available processors looked up by the
        names in the ``meta_processor_order`` option, in that order.
        """
        procs = self.avail_procs
        return tuple(
            procs[proc_name]
            for proc_name in self.options['meta_processor_order'])