Esempio n. 1
0
    def convert(self, cache_key=None):
        """Convert the document to HTML.

        Returns the main document content as string and a cache_key
        for quick later retrieval. Additional documents (images, etc.)
        which are result of the conversion are placed in the `tmpdir`
        of this `Document`.

        If `cache_key` is given (and a `cache_dir` set before) we will
        lookup the cache before performing any real conversion.

        Lookup order: cache by `cache_key`, then cache by source
        document, and only if both miss a real conversion is done.

        Raises `IOError` if conversion fails.
        """
        name = self.name()
        src_path = os.path.join(self.tmpdir, name)
        # Lookup cached doc by cache key (fast)
        resultpath = self.client.get_cached(cache_key)
        if resultpath is None:
            # Lookup cached doc by source (expensive)
            resultpath, cache_key = self.client.get_cached_by_source(
                src_path, OPTIONS_HTML)
        if resultpath is not None:
            # Cache hit (by key or by source): continue with a copy of
            # the cached result instead of the cached file itself.
            newdir = copy_to_secure_location(resultpath)
            resultpath = os.path.join(newdir, os.path.basename(resultpath))
        else:
            # Convert to HTML, new doc will be in resultpath
            resultpath, cache_key, metadata = self.client.convert(
                src_path, OPTIONS_HTML)
            if metadata['error']:
                descr = metadata.get('error-descr', 'Descr. not avail.')
                raise IOError('Could not convert: %s [%s]' % (name, descr))
            newdir = os.path.dirname(resultpath)
        # Close the result file deterministically instead of relying on
        # garbage collection.
        with open(resultpath, 'r') as fd:
            html = fd.read()
        # The old working dir is not needed any more; switch to the
        # directory holding the result.
        self.cleanDir(self.tmpdir)
        self.tmpdir = newdir
        return html, cache_key
Esempio n. 2
0
def convert_doc(src_doc, options, cache_dir):
    """Convert `src_doc` according to the other parameters.

    `src_doc` is the path to the source document. `options` is a dict
    of options for processing, passed to the processors.

    `cache_dir` may be ``None`` in which case no caching is requested
    during processing.

    Generates a converted representation of `src_doc` by calling
    :class:`ulif.openoffice.processor.MetaProcessor` with `options` as
    parameters.

    Afterwards the conversion result is stored in cache (if
    allowed/possible) for speedup of upcoming requests.

    Returns a triple:

      ``(<PATH>, <CACHE_KEY>, <METADATA>)``

    where ``<PATH>`` is the path to the resulting document,
    ``<CACHE_KEY>`` an identifier (string) to retrieve a generated doc
    from cache on future requests, and ``<METADATA>`` is a dict of values
    returned during request (and set by the document processors,
    notably setting the `error` keyword).

    If errors happen or caching is disabled, ``<CACHE_KEY>`` is
    ``None``.
    """
    result_path = None
    cache_key = None
    repr_key = get_marker(options)  # Create unique marker out of options
    metadata = dict(error=False)

    # Generate result
    # Processing happens on a copy of the source document.
    input_copy_dir = copy_to_secure_location(os.path.abspath(src_doc))
    input_copy = os.path.join(input_copy_dir, os.path.basename(src_doc))
    try:
        proc = MetaProcessor(options=options)  # Removes original doc
        result_path, metadata = proc.process(input_copy)
    except Exception:
        # Do not leave the copy behind on failure. The bare ``raise``
        # re-raises the active exception with its original traceback.
        shutil.rmtree(input_copy_dir)
        raise
Esempio n. 3
0
 def process(self, path, metadata):
     """Clean up the HTML document at `path`.

     Documents whose extension is not listed in
     `self.supported_extensions` are passed through unchanged.

     Otherwise the document is copied via `copy_to_secure_location`,
     the original location is removed with `remove_file_dir`, the
     copy is rewritten with the output of `cleanup_html` (controlled
     by the ``html_cleaner_*`` options) and image files are renamed
     according to the name map returned by `cleanup_html`.

     Returns a tuple ``(result_path, metadata)``; `metadata` is
     returned as passed in.
     """
     ext = os.path.splitext(path)[1]
     if ext not in self.supported_extensions:
         return path, metadata
     basename = os.path.basename(path)
     src_path = os.path.join(copy_to_secure_location(path), basename)
     src_dir = os.path.dirname(src_path)
     remove_file_dir(path)
     # Read inside a context manager so the handle is closed before
     # the same file is reopened for writing below.
     with codecs.open(src_path, 'r', 'utf-8') as fd:
         html = fd.read()
     new_html, img_name_map = cleanup_html(
         html,
         basename,
         fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
         fix_img_links=self.options['html_cleaner_fix_image_links'],
         fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
     )
     with codecs.open(src_path, 'wb', 'utf-8') as fd:
         fd.write(new_html)
     # Rename images
     self.rename_img_files(src_dir, img_name_map)
     return src_path, metadata
Esempio n. 4
0
 def process(self, path, metadata):
     """Run the HTML cleaner over the document at `path`.

     If the file extension of `path` is not contained in
     `self.supported_extensions`, `path` and `metadata` are returned
     untouched.

     For supported documents a copy is created (see
     `copy_to_secure_location`), the source location is removed (see
     `remove_file_dir`) and the copy is overwritten with the HTML
     returned by `cleanup_html`. Which fixes are applied is taken
     from the ``html_cleaner_*`` entries of `self.options`. Finally
     `rename_img_files` is called with the directory of the copy and
     the image name map returned by `cleanup_html`.

     Returns ``(result_path, metadata)``.
     """
     ext = os.path.splitext(path)[1]
     if ext not in self.supported_extensions:
         return path, metadata
     basename = os.path.basename(path)
     src_path = os.path.join(
         copy_to_secure_location(path), basename)
     src_dir = os.path.dirname(src_path)
     remove_file_dir(path)
     # Make sure the read handle is closed before the file is opened
     # again for writing.
     with codecs.open(src_path, 'r', 'utf-8') as fd:
         old_html = fd.read()
     new_html, img_name_map = cleanup_html(
         old_html,
         basename,
         fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
         fix_img_links=self.options['html_cleaner_fix_image_links'],
         fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
         )
     with codecs.open(src_path, 'wb', 'utf-8') as fd:
         fd.write(new_html)
     # Rename images
     self.rename_img_files(src_dir, img_name_map)
     return src_path, metadata
Esempio n. 5
0
    def process(self, path, metadata):
        """Run the ``tidy`` command line tool on the document at `path`.

        Documents with an extension not in `self.supported_extensions`
        are returned unchanged.

        Otherwise the document is copied (`copy_to_secure_location`),
        the original location is removed (`remove_file_dir`), the
        copy is passed through `rename_sdfield_tags` and then handed
        to ``tidy``, which is called with ``-modify`` on the copy.
        Messages of ``tidy`` are directed to a file ``tidy-errors``
        (``-f``) in the same directory, which is removed afterwards.
        The exit status of ``tidy`` is not evaluated.

        Returns a tuple ``(result_path, metadata)``.
        """
        import subprocess

        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(copy_to_secure_location(path), basename)
        src_dir = os.path.dirname(src_path)
        remove_file_dir(path)

        # Pass the document through rename_sdfield_tags() to treat
        # <SDFIELD> tags, if any.
        with open(src_path, 'rb') as fd:
            raw_html = fd.read().decode('utf-8')
        cleaned_html = rename_sdfield_tags(raw_html)
        with open(src_path, 'wb') as fd:
            fd.write(cleaned_html.encode('utf-8'))

        error_file = os.path.join(src_dir, 'tidy-errors')
        # Pass arguments as a list (no shell involved): file names
        # containing blanks or shell metacharacters are handed over to
        # tidy verbatim instead of being interpreted by a shell.
        subprocess.call([
            'tidy', '-asxhtml', '-clean', '-indent', '-modify', '-utf8',
            '-f', error_file, src_path])
        os.unlink(error_file)
        return src_path, metadata
Esempio n. 6
0
    def process(self, path, metadata):
        """Tidy up the HTML document at `path` with ``tidy``.

        If the extension of `path` is not in
        `self.supported_extensions` nothing is done and `path` and
        `metadata` are returned as passed in.

        Supported documents are copied with `copy_to_secure_location`
        and the source location is removed with `remove_file_dir`.
        The content of the copy is filtered by `rename_sdfield_tags`
        and written back before ``tidy`` is run on the file. ``tidy``
        is told (``-f``) to write its messages into a file
        ``tidy-errors`` next to the document; this file is deleted
        after the run. The return code of ``tidy`` is ignored.

        Returns ``(result_path, metadata)``.
        """
        import subprocess

        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(
            copy_to_secure_location(path), basename)
        src_dir = os.path.dirname(src_path)
        remove_file_dir(path)

        # Treat <SDFIELD> tags (if any) via rename_sdfield_tags().
        # Files are handled by context managers to get them closed
        # before tidy modifies the document.
        with open(src_path, 'rb') as fd:
            html = fd.read().decode('utf-8')
        cleaned_html = rename_sdfield_tags(html)
        with open(src_path, 'wb') as fd:
            fd.write(cleaned_html.encode('utf-8'))

        error_file = os.path.join(src_dir, 'tidy-errors')
        # An argument list avoids the shell, so paths with spaces or
        # shell metacharacters cannot break or alter the command.
        cmd = [
            'tidy', '-asxhtml', '-clean', '-indent', '-modify', '-utf8',
            '-f', error_file, src_path,
        ]
        subprocess.call(cmd)
        os.unlink(error_file)
        return src_path, metadata
Esempio n. 7
0
    def process(self, path, metadata):
        """Extract and clean up CSS of the HTML document at `path`.

        Documents with an extension not in `self.supported_extensions`
        are returned unchanged.

        Otherwise the document is copied (`copy_to_secure_location`)
        and the original location removed (`remove_file_dir`).
        `extract_css` delivers new HTML and the CSS, which is then
        passed to `cleanup_css`. If the resulting CSS is not `None`,
        it is written to a ``.css`` file named after the document;
        the document itself is overwritten with the new HTML.

        Respected options: ``css_cleaner_prettify_html`` and
        ``css_cleaner_minified``.

        Returns a tuple ``(result_path, metadata)``.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(
            copy_to_secure_location(path), basename)
        remove_file_dir(path)

        # Close the read handle before the document is rewritten below.
        with open(src_path, 'rb') as fd:
            html = fd.read().decode('utf-8')
        new_html, css = extract_css(
            html, basename,
            prettify_html=self.options['css_cleaner_prettify_html'])
        # Errors reported by cleanup_css() are not evaluated here.
        css, _errors = cleanup_css(
            css, minified=self.options['css_cleaner_minified'])

        css_file = os.path.splitext(src_path)[0] + '.css'
        if css is not None:
            with open(css_file, 'wb') as fd:
                fd.write(css.encode('utf-8'))
        with open(src_path, 'wb') as fd:
            fd.write(new_html.encode('utf-8'))

        return src_path, metadata
Esempio n. 8
0
    def process(self, path, metadata):
        """Separate CSS from the HTML document at `path`.

        If the extension of `path` is not in
        `self.supported_extensions`, `path` and `metadata` are
        returned as passed in.

        For supported documents a copy is made with
        `copy_to_secure_location`, the source location is removed with
        `remove_file_dir`, and `extract_css` is run on the decoded
        content of the copy. The CSS returned is post-processed by
        `cleanup_css` and (if not `None`) stored in a file with the
        same name as the document but extension ``.css``. The copy is
        overwritten with the HTML returned by `extract_css`.

        Options used: ``css_cleaner_prettify_html``,
        ``css_cleaner_minified``.

        Returns ``(result_path, metadata)``.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(copy_to_secure_location(path), basename)
        remove_file_dir(path)

        # Use a context manager: the file is written again further
        # down and should not be kept open for reading until then.
        with open(src_path, 'rb') as fd:
            old_html = fd.read().decode('utf-8')
        new_html, css = extract_css(
            old_html,
            basename,
            prettify_html=self.options['css_cleaner_prettify_html'])
        # The error list returned by cleanup_css() is not used.
        css, _errors = cleanup_css(
            css, minified=self.options['css_cleaner_minified'])

        css_file = os.path.splitext(src_path)[0] + '.css'
        if css is not None:
            with open(css_file, 'wb') as fd:
                fd.write(css.encode('utf-8'))
        with open(src_path, 'wb') as fd:
            fd.write(new_html.encode('utf-8'))

        return src_path, metadata
Esempio n. 9
0
    def process(self, path, metadata):
        """Do PSJ-specific adaptions of generated HTML input.

        `path` gives any (beforehand) generated HTML document. The
        path might be located in a directory with additional files
        (images, etc.) that could also be processed.

        `metadata` is a dictionary of metadata concerning the
        conversion process. It contains at least a key ``error`` with
        a boolean value (should always be `False`, otherwise the
        document conversion failed), and a key ``error-descr`` which
        contains some error message in case of failures.

        The ``error`` and ``error-descr`` should be set when
        unresolvable processing problems occur.

        Documents with an extension not in `self.supported_extensions`
        are returned unchanged. For all others the fixed HTML is
        written back to a copy of the document and the fixed CSS is
        written to a file ``psj.css`` in the same directory.

        Returns a tuple (``result_path``, ``metadata``) with
        ``result_path`` containing the path to the modified document
        and ``metadata`` containing the updated ``metadata`` dictionary
        passed in.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        src_path = os.path.join(
            copy_to_secure_location(path), basename)
        remove_file_dir(path)
        src_dir = os.path.dirname(src_path)

        # All files are handled by context managers so they are closed
        # (and written data flushed) right away instead of whenever
        # the file objects happen to be garbage collected.
        with open(src_path, 'r') as fd:
            html = self.fix_html(fd.read())
        with open(src_path, 'w') as fd:
            fd.write(html.encode('utf-8'))

        css = self.get_css(src_dir)
        css = self.fix_css(css)
        with open(os.path.join(src_dir, 'psj.css'), 'w') as fd:
            fd.write(css)
        return src_path, metadata
Esempio n. 10
0
    def process(self, path, metadata):
        """Convert the document at `path` by calling `convert`.

        The document is copied via `copy_to_secure_location` and the
        original (its directory, if `path` is a file) is removed. The
        output format is taken from option ``oocp_output_format``; the
        connection URL is built from ``oocp_hostname`` and
        ``oocp_port``.

        The status returned by `convert` is stored in `metadata` as
        ``oocp_status``. On a non-zero status ``error`` and
        ``error-descr`` are set in `metadata`, the copy is removed and
        ``(None, metadata)`` is returned.

        Otherwise returns ``(result_path, metadata)`` where
        ``result_path`` is the path of the copy with the extension
        replaced by the output format (``html`` for ``xhtml``).
        """
        basename = os.path.basename(path)
        src = os.path.join(copy_to_secure_location(path), basename)
        # Get rid of the original; for files, the containing dir goes.
        old_location = os.path.dirname(path) if os.path.isfile(path) else path
        shutil.rmtree(old_location)

        extension = self.options['oocp_output_format']
        filter_name = self.formats[extension]
        connection = (
            self.options['oocp_hostname'], self.options['oocp_port'])
        url = 'socket,host=%s,port=%d;urp;StarOffice.ComponentContext' % (
            connection)
        filter_props = self._get_filter_props()

        # The path returned by convert() is not used; the result path
        # is computed from `src` below.
        status, _ = convert(
            url=url,
            out_format=filter_name,
            filter_props=filter_props,
            path=src,
            out_dir=os.path.dirname(src),
            )
        metadata['oocp_status'] = status
        if status != 0:
            metadata['error'] = True
            metadata['error-descr'] = 'conversion problem'
            leftover = os.path.dirname(src) if os.path.isfile(src) else src
            shutil.rmtree(leftover)
            return None, metadata

        if extension == 'xhtml':
            extension = 'html'
        result_path = '%s.%s' % (os.path.splitext(src)[0], extension)

        # Remove input file if different from output
        differs = os.path.basename(result_path) != basename
        if os.path.exists(src) and differs:
            os.unlink(src)
        return result_path, metadata
Esempio n. 11
0
    def process(self, path, metadata):
        """Run `convert` on a copy of the document at `path`.

        A copy of the document is created with
        `copy_to_secure_location`; afterwards `path` (or the directory
        containing it, if it is a file) is removed.

        Options read: ``oocp_output_format``, ``oocp_hostname`` and
        ``oocp_port``.

        `metadata` gets the status returned by `convert` as
        ``oocp_status``. If the status is not zero, ``error`` and
        ``error-descr`` are set as well, the copy is removed and
        ``(None, metadata)`` is returned.

        On success ``(result_path, metadata)`` is returned, with
        ``result_path`` being the path of the copy carrying the output
        format as extension (``xhtml`` is turned into ``html``).
        """
        basename = os.path.basename(path)
        src = os.path.join(copy_to_secure_location(path), basename)
        # The original is not needed any more.
        if os.path.isfile(path):
            shutil.rmtree(os.path.dirname(path))
        else:
            shutil.rmtree(path)

        extension = self.options['oocp_output_format']
        filter_name = self.formats[extension]
        url = 'socket,host=%s,port=%d;urp;StarOffice.ComponentContext' % (
            self.options['oocp_hostname'], self.options['oocp_port'])

        convert_args = dict(
            url=url,
            out_format=filter_name,
            filter_props=self._get_filter_props(),
            path=src,
            out_dir=os.path.dirname(src),
        )
        # Only the status is of interest; the result path is derived
        # from `src` further down.
        status, _ = convert(**convert_args)
        metadata['oocp_status'] = status
        if status != 0:
            metadata['error'] = True
            metadata['error-descr'] = 'conversion problem'
            if os.path.isfile(src):
                shutil.rmtree(os.path.dirname(src))
            else:
                shutil.rmtree(src)
            return None, metadata

        out_ext = 'html' if extension == 'xhtml' else extension
        result_path = '%s.%s' % (os.path.splitext(src)[0], out_ext)

        # Remove input file if different from output
        if os.path.exists(src) and os.path.basename(result_path) != basename:
            os.unlink(src)
        return result_path, metadata
Esempio n. 12
0
 def test_copy_to_secure_location_path(self):
     # A directory passed to copy_to_secure_location() is copied
     # including the files it contains.
     sample_path = os.path.join(self.workdir, 'sample.txt')
     # Close the file before copying so the content is on disk; a
     # bytes literal matches the binary mode of the file.
     with open(sample_path, 'wb') as fd:
         fd.write(b"Hi from sample")
     sample_dir = os.path.dirname(sample_path)
     self.resultpath = copy_to_secure_location(sample_dir)
     assert os.path.isfile(os.path.join(self.resultpath, 'sample.txt'))
Esempio n. 13
0
 def test_copy_to_secure_location_path(self, workdir):
     """Directories can be copied to a secure location."""
     sample_file = workdir.join("src").join("sample.txt")
     sample_file.write("Hey there!")
     source_dir = str(workdir / "src")
     result_path = copy_to_secure_location(source_dir)
     copied_file = os.path.join(result_path, 'sample.txt')
     assert os.path.isfile(copied_file)