Example #1
0
 def setMetadata(self, metadata):
   """Return the content of the document rewritten with new metadata.

   The metadata is serialised as "InfoKey"/"InfoValue" pairs into a text
   file, "pdftk <document> update_info <text file> output <new file>" is
   run, and the document is reloaded from the file pdftk was asked to
   write. The document is trashed before returning, whether or not the
   update succeeded.

   Keyword arguments:
   metadata -- expected a dictionary with metadata.
   """
   text_template = "InfoKey: %s\nInfoValue: %s\n"
   try:
     # items() exists on every Python version, unlike iteritems().
     text_list = [text_template % (key.capitalize(), value)
                  for key, value in metadata.items()]
     metadata_file = File(self.document.directory_name,
                          "".join(text_list),
                          "txt")
     # Only the generated path is kept; the temporary file object itself
     # is dropped right away, presumably so that pdftk creates the
     # output file on its own.
     output_url = NamedTemporaryFile(suffix=".pdf",
                         dir=self.document.directory_name).name
     command = ["pdftk",
                self.document.getUrl(),
                "update_info",
                metadata_file.getUrl(),
                "output",
                output_url]
     # The captured output is not inspected; communicate() is only used
     # to wait for pdftk to finish.
     Popen(command,
           stdout=PIPE,
           stderr=PIPE,
           close_fds=True,
           env=self.environment).communicate()
     self.document.reload(output_url)
     return self.document.getContent()
   finally:
     # Runs on every exit path, so a failing pdftk call or reload no
     # longer leaves the document behind.
     self.document.trash()
Example #2
0
 def setMetadata(self, metadata):
     """Return the content of the document rewritten with new metadata.

     The metadata is serialised as "InfoKey"/"InfoValue" pairs into a text
     file, "pdftk <document> update_info <text file> output <new file>" is
     run, and the document is reloaded from the file pdftk was asked to
     write. The document is trashed before returning, whether or not the
     update succeeded.

     Keyword arguments:
     metadata -- expected a dictionary with metadata.
     """
     text_template = "InfoKey: %s\nInfoValue: %s\n"
     try:
         # items() exists on every Python version, unlike iteritems().
         text_list = [text_template % (key.capitalize(), value)
                      for key, value in metadata.items()]
         metadata_file = File(self.document.directory_name,
                              "".join(text_list), "txt")
         # mktemp() only reserves a name; pdftk is the one creating the file.
         output_url = mktemp(suffix=".pdf",
                             dir=self.document.directory_name)
         command = [
             "pdftk",
             self.document.getUrl(), "update_info",
             metadata_file.getUrl(), "output", output_url
         ]
         # The captured output is not inspected; communicate() is only
         # used to wait for pdftk to finish.
         Popen(command,
               stdout=PIPE,
               stderr=PIPE,
               close_fds=True,
               env=self.environment).communicate()
         self.document.reload(output_url)
         return self.document.getContent()
     finally:
         # Runs on every exit path, so a failing pdftk call or reload no
         # longer leaves the document behind.
         self.document.trash()
Example #3
0
 def __init__(self, base_folder_url, data, source_format, **kw):
   """Store the input document and the conversion environment.

   base_folder_url(string)
     The requested url for the data base folder
   data(string)
     The content of the input file, already read into a string
   source_format(string)
     The source format of the input file
   kw
     Optional "env" entry, kept as self.environment (an empty
     dictionary when it is absent)
   """
   environment = kw["env"] if "env" in kw else {}
   self.base_folder_url = base_folder_url
   self.input = File(base_folder_url, data, source_format)
   self.environment = environment
Example #4
0
class Handler(object):
    """ImageMagick handler, used to convert images and read their metadata."""

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Store the image in a File under base_folder_url.

        The optional "env" keyword is the environment handed to the
        subprocesses started by this handler (empty dictionary by default).
        """
        self.base_folder_url = base_folder_url
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Run "convert" on the image and return the resulting content.

        destination_format -- extension given to the output file name.

        The file is trashed before returning, even when the conversion or
        the reload of its output fails.
        """
        logger.debug("ImageMagickConvert: %s > %s" %
                     (self.file.source_format, destination_format))
        try:
            # mktemp() only reserves a name; "convert" creates the file.
            output_url = mktemp(suffix='.%s' % destination_format,
                                dir=self.base_folder_url)
            command = ["convert", self.file.getUrl(), output_url]
            # The captured output is not inspected; communicate() is only
            # used to wait for the command to finish.
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.file.reload(output_url)
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        """Return a dictionary built from "identify -verbose" output.

        Every output line that starts with a letter and contains a colon
        becomes one entry. base_document is accepted but not used. The
        file is trashed before returning.
        """
        command = ["identify", "-verbose", self.file.getUrl()]
        try:
            stdout = Popen(command,
                           stdout=PIPE,
                           stderr=PIPE,
                           close_fds=True,
                           env=self.environment).communicate()[0]
        finally:
            self.file.trash()
        metadata_dict = {}
        for line in stdout.split("\n"):
            line = line.strip()
            # Lines without a colon carry no "key: value" pair; skipping
            # them avoids the unpacking error they used to trigger.
            if ":" not in line or not re.search("^[a-zA-Z]", line):
                continue
            if ": " in line:
                # Keys may themselves contain colons ("date:create: ..."),
                # so cut at the first ": " and keep the real key instead of
                # filing every such line under the empty string.
                key, _, value = line.partition(": ")
            else:
                key, _, value = line.partition(":")
            metadata_dict[key] = value.strip()
        return metadata_dict

    def setMetadata(self, metadata={}):
        """Not implemented for images: always raises NotImplementedError.

        Keyword arguments:
        metadata -- expected a dictionary with metadata.
        """
        raise NotImplementedError
 def testSendZipFile(self):
   """Tests if the htm is extracted from zipfile"""
   zip_input_url = 'data/test.zip'
   # Binary mode because a zip archive is not text; the with-block also
   # closes the handle that used to be left open.
   with open(zip_input_url, 'rb') as zip_input:
     data = zip_input.read()
   zipdocument = File(self.tmp_url, data, 'zip')
   mime = magic.Magic(mime=True)
   # getContent(True) must give back the archive itself ...
   mimetype = mime.from_buffer(zipdocument.getContent(True))
   self.assertEqual(mimetype, "application/zip")
   # ... while getContent() must give the html found inside it.
   mimetype = mime.from_buffer(zipdocument.getContent())
   self.assertEqual(mimetype, "text/html")
   archive = ZipFile(StringIO(zipdocument.getContent(True)))
   self.assertEqual(sorted(archive.namelist()),
                    sorted(['logo.gif', 'test.htm']))
Example #6
0
class Handler(object):
  """ImageMagick handler, used to convert images and read their metadata."""

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """Store the image in a File under base_folder_url.

    The optional "env" keyword is the environment handed to the
    subprocesses started by this handler (empty dictionary by default).
    """
    self.base_folder_url = base_folder_url
    self.file = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format=None, **kw):
    """Run "convert" on the image and return the resulting content.

    destination_format -- extension given to the output file name.

    The file is trashed before returning, even when the conversion or
    the reload of its output fails.
    """
    logger.debug("ImageMagickConvert: %s > %s" % (self.file.source_format, destination_format))
    try:
      # mktemp() only reserves a name; "convert" creates the file.
      output_url = mktemp(suffix='.%s' % destination_format,
                          dir=self.base_folder_url)
      command = ["convert", self.file.getUrl(), output_url]
      # The captured output is not inspected; communicate() is only
      # used to wait for the command to finish.
      Popen(command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment).communicate()
      self.file.reload(output_url)
      return self.file.getContent()
    finally:
      self.file.trash()

  def getMetadata(self, base_document=False):
    """Return a dictionary built from "identify -verbose" output.

    Every output line that starts with a letter and contains a colon
    becomes one entry. base_document is accepted but not used. The
    file is trashed before returning.
    """
    command = ["identify", "-verbose", self.file.getUrl()]
    try:
      stdout = Popen(command,
                     stdout=PIPE,
                     stderr=PIPE,
                     close_fds=True,
                     env=self.environment).communicate()[0]
    finally:
      self.file.trash()
    metadata_dict = {}
    for line in stdout.split("\n"):
      line = line.strip()
      # Lines without a colon carry no "key: value" pair; skipping
      # them avoids the unpacking error they used to trigger.
      if ":" not in line or not re.search("^[a-zA-Z]", line):
        continue
      if ": " in line:
        # Keys may themselves contain colons ("date:create: ..."), so
        # cut at the first ": " and keep the real key instead of filing
        # every such line under the empty string.
        key, _, value = line.partition(": ")
      else:
        key, _, value = line.partition(":")
      metadata_dict[key] = value.strip()
    return metadata_dict

  def setMetadata(self, metadata={}):
    """Not implemented for images: always raises NotImplementedError.

    Keyword arguments:
    metadata -- expected a dictionary with metadata.
    """
    raise NotImplementedError
Example #7
0
class TestUnoConverter(HandlerTestCase):
  """Test case to test all features of the unoconverter script"""

  file_msg_list = ["Microsoft Office Document",
                  "CDF V2 Document, Little Endian, Os: Windows, Version 1.0,"]

  def afterSetUp(self):
    """Acquire openoffice and load the odt test document."""
    openoffice.acquire()
    self.hostname, self.port = openoffice.getAddress()
    # Binary mode because an odt file is not text; the with-block also
    # closes the handle that used to be left open.
    with open("data/test.odt", 'rb') as odt_file:
      data = odt_file.read()
    self.document = File(self.tmp_url, data, 'odt')

  def tearDown(self):
    """Called to unlock the openoffice"""
    openoffice.release()

  def testUnoConverterOdtToDoc(self):
    """Run the unoconverter script to convert the odt document to doc."""
    mimemapper = dict(filter_list=[('doc',
                                    'com.sun.star.text.TextDocument',
                                    'MS Word 97')],
                      doc_type_list_by_extension=dict(
                          doc=['com.sun.star.text.TextDocument']))
    mimemapper_json = json.dumps(mimemapper)
    # Prefer the python found in the office binary path when there is
    # one, otherwise fall back to whatever "python" resolves to.
    office_python = join(self.office_binary_path, "python")
    interpreter = office_python if exists(office_python) else "python"
    command = [interpreter,
               pkg_resources.resource_filename("cloudooo.handler.ooo",
                                               "/helper/unoconverter.py"),
               "--convert",
               "--uno_path=%s" % self.uno_path,
               "--office_binary_path=%s" % self.office_binary_path,
               "--hostname=%s" % self.hostname,
               "--port=%s" % self.port,
               "--document_url=%s" % self.document.getUrl(),
               "--destination_format=doc",
               "--source_format=odt",
               "--mimemapper=%s" % mimemapper_json]
    stdout, stderr = Popen(command,
                           stdout=PIPE,
                           stderr=PIPE).communicate()
    self.assertEqual(stderr, '')
    # The script prints the path of the converted file on stdout.
    output_url = stdout.replace('\n', '')
    self.assertTrue(exists(output_url), stdout)
    mime = magic.Magic(mime=True)
    self.assertEqual(mime.from_file(output_url), 'application/msword')
    # Trashing the document must remove the converted file as well.
    self.document.trash()
    self.assertEqual(exists(output_url), False)
Example #8
0
 def __init__(self, base_folder_url, data, source_format, **kw):
   """Creates the document in the file system and reads the options.

   Keyword arguments read from kw: zip, uno_path, office_binary_path,
   timeout (600 by default) and refresh. When uno_path or
   office_binary_path is missing or empty, the value is looked up in
   environ under the same name.
   """
   option = kw.get
   self.document = File(base_folder_url, data, source_format)
   self.zip = option('zip', False)
   self.uno_path = option("uno_path", None) or environ.get("uno_path")
   self.office_binary_path = (option("office_binary_path", None) or
                              environ.get("office_binary_path"))
   self.timeout = option("timeout", 600)
   self.refresh = option('refresh', False)
   self.source_format = source_format
Example #9
0
 def __init__(self, base_folder_url, data, source_format, **kw):
     """Store the input file and create a directory for the grains.

     The grain directory is a new temporary directory created inside the
     directory of the stored file. The optional "env" keyword is kept as
     self.environment (empty dictionary by default).
     """
     environment = kw["env"] if "env" in kw else {}
     source_file = File(base_folder_url, data, source_format)
     self.file = source_file
     self.environment = environment
     self.grain_directory = mkdtemp(dir=source_file.directory_name)
Example #10
0
class Handler(object):
  """PDF Handler is used to handle input pdf (and ps) documents."""

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """Store the document in a File under base_folder_url.

    The optional "env" keyword is the environment handed to the
    subprocesses started by this handler (empty dictionary by default).
    """
    self.base_folder_url = base_folder_url
    self.document = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format=None, **kw):
    """Convert the document and return the resulting content.

    A document whose source format is 'ps' is passed to "ps2pdf"; any
    other document is passed to "pdftotext".

    destination_format -- extension given to the output file name.

    The document is trashed before returning, even when the conversion
    or the reload of its output fails.
    """
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format,
                                        destination_format))
    try:
      # Only the generated path is kept; the temporary file object itself
      # is dropped right away, presumably so that the external command
      # creates the output file on its own.
      output_url = NamedTemporaryFile(suffix=".%s" % destination_format,
                          dir=self.document.directory_name).name
      if self.document.source_format == 'ps':
        command = ["ps2pdf",
                   "-dASCII85EncodePages=false",
                   "-dLanguageLevel=1",
                   self.document.getUrl(),
                   output_url]
      else:
        command = ["pdftotext", self.document.getUrl(), output_url]
      # The captured output is not inspected; communicate() is only used
      # to wait for the command to finish.
      Popen(command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment).communicate()
      self.document.reload(output_url)
      return self.document.getContent()
    finally:
      self.document.trash()

  def getMetadata(self, base_document=False):
    """Return a dictionary built from the output of "pdfinfo".

    Every non-empty output line becomes one entry: the lower-cased text
    before the first colon is the key, the stripped remainder is the
    value. base_document is accepted but not used. The document is
    trashed before returning.
    """
    command = ["pdfinfo", self.document.getUrl()]
    try:
      stdout = Popen(command,
                     stdout=PIPE,
                     stderr=PIPE,
                     close_fds=True,
                     env=self.environment).communicate()[0]
      metadata = {}
      for info in stdout.split("\n"):
        if not info:
          continue
        # Cut at the first colon only: values such as dates contain
        # colons and runs of spaces, which made the previous
        # split("  ") based parsing fail to unpack.
        info_name, _, info_value = info.partition(":")
        metadata[info_name.lower()] = info_value.strip()
      return metadata
    finally:
      self.document.trash()

  def setMetadata(self, metadata):
    """Return the content of the document rewritten with new metadata.

    The metadata is serialised as "InfoKey"/"InfoValue" pairs into a text
    file, "pdftk <document> update_info <text file> output <new file>" is
    run, and the document is reloaded from the file pdftk was asked to
    write. The document is trashed before returning, whether or not the
    update succeeded.

    Keyword arguments:
    metadata -- expected a dictionary with metadata.
    """
    text_template = "InfoKey: %s\nInfoValue: %s\n"
    try:
      # items() exists on every Python version, unlike iteritems().
      text_list = [text_template % (key.capitalize(), value)
                   for key, value in metadata.items()]
      metadata_file = File(self.document.directory_name,
                           "".join(text_list),
                           "txt")
      output_url = NamedTemporaryFile(suffix=".pdf",
                          dir=self.document.directory_name).name
      command = ["pdftk",
                 self.document.getUrl(),
                 "update_info",
                 metadata_file.getUrl(),
                 "output",
                 output_url]
      Popen(command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment).communicate()
      self.document.reload(output_url)
      return self.document.getContent()
    finally:
      self.document.trash()
Example #11
0
class Handler(object):
    """wkhtmltopdf Handler is used to convert html documents to pdf."""

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Store the input document in a File under base_folder_url.

        The optional "env" keyword is the environment handed to the
        wkhtmltopdf subprocess (empty dictionary by default).
        """
        self.base_folder_url = base_folder_url
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    @staticmethod
    def _stripSuffix(name, suffix):
        """Return name without its trailing suffix, or name unchanged
        when it does not end with suffix."""
        if name.endswith(suffix):
            return name[:-len(suffix)]
        return name

    def makeTempFile(self, destination_format=None):
        """Return an unused path, with the destination_format extension,
        inside the directory of the input file. The file is not created."""
        path = mktemp(
            suffix='.%s' % destination_format,
            dir=self.file.directory_name,
        )
        return path

    def makeTempDir(self, *args, **kw):
        """Create a temporary directory inside the directory of the input
        file and return its path. Arguments are passed on to mkdtemp."""
        return mkdtemp(*args, dir=self.file.directory_name, **kw)

    def convertPathToUrl(self, path):
        """Return the "file://" url of an absolute path.

        Raises ValueError when path does not start with "/".
        """
        if path.startswith("/"):
            return "file://" + path
        raise ValueError("path %r is not absolute" % path)

    def convert(self, destination_format=None, **kw):
        """Run wkhtmltopdf on the document and return the resulting content.

        destination_format -- extension given to the output file name.
        kw -- conversion options, see makeWkhtmltopdfCommandList.

        The file is trashed before returning, even when building the
        command, running it or reloading its output fails.
        """
        logger.debug("wkhtmltopdf convert: %s > %s" %
                     (self.file.source_format, destination_format))
        try:
            output_path = self.makeTempFile(destination_format)
            command = self.makeWkhtmltopdfCommandList(
                self.convertPathToUrl(self.file.getUrl()),
                output_path,
                conversion_kw=kw,
            )
            # The captured output is not inspected; communicate() is only
            # used to wait for the command to finish.
            Popen(
                command,
                stdout=PIPE,
                stderr=PIPE,
                close_fds=True,
                env=self.environment,
                cwd=self.file.directory_name,
            ).communicate()
            self.file.reload(output_path)
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        """Not implemented for this handler: always raises
        NotImplementedError (the exception class used to be returned
        instead of raised)."""
        raise NotImplementedError

    def setMetadata(self, metadata={}):
        """Not implemented for this handler: always raises
        NotImplementedError.

        Keyword arguments:
        metadata -- expected a dictionary with metadata.
        """
        raise NotImplementedError

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('application/pdf', 'PDF - Portable Document Format'),
         ...
        ]
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        if source_mimetype in ("text/html", "htm", "html"):
            return [("application/pdf", "PDF - Portable Document Format")]
        return []

    def makeSwitchOptionList(self, allowed_option_list, option_dict):
        """
        A switch option is enabled if it exists.

        Ex: for         : --grayscale
            option_dict : {"grayscale": True}
            result      : ["--grayscale"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value:
                option_list.append(keyNameToOption(option_name))
        return option_list

    def makeNoPrefixedOptionList(self, allowed_option_list, option_dict):
        """
        A "no" prefixed option is an option that, if disabled, carries a
        "no" prefix.

        Ex: for         : --images (and --no-images)
            option_dict : {"images": False}
            result      : ["--no-images"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                option_list.append(
                    keyNameToOption(option_name,
                                    prefix="" if value else "no-"))
        return option_list

    def makeEnablePrefixedOptionList(self, allowed_option_list, option_dict):
        """
        An "enable" prefixed option is an option that, if enabled, carries
        an "enable" prefix and otherwise a "disable" prefix.

        Ex: for         : --enable-external-links (and --disable-external-links)
            option_dict : {"enable_external_links": False}
            result      : ["--disable-external-links"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                if value:
                    option_list.append(keyNameToOption(option_name))
                else:
                    # Drop the leading "enable_" before adding "disable-".
                    option_list.append(
                        keyNameToOption(option_name[len("enable_"):],
                                        prefix="disable-"))
        return option_list

    def makeIncludeInPrefixedOptionList(self, allowed_option_list,
                                        option_dict):
        """
        An "include-in" prefixed option is an option that, if enabled,
        carries an "include-in" prefix and otherwise an "exclude-from"
        prefix.

        Ex: for         : --include-in-outline (and --exclude-from-outline)
            option_dict : {"include_in_outline": False}
            result      : ["--exclude-from-outline"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                if value:
                    option_list.append(keyNameToOption(option_name))
                else:
                    # Drop the leading "include_in_" before adding
                    # "exclude-from-".
                    option_list.append(
                        keyNameToOption(option_name[len("include_in_"):],
                                        prefix="exclude-from-"))
        return option_list

    def makeOneStringArgumentOptionList(self, allowed_option_list,
                                        option_dict):
        """
        A one-string-argument option is an option that requires one
        argument, which is a string.

        Ex: for         : --title <text>
            option_dict : {"title": "Hello World!"}
            result      : ["--title", "Hello World!"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                option_list += [keyNameToOption(option_name), str(value)]
        return option_list

    def makeRepeatableOneStringArgumentOptionList(self, allowed_option_list,
                                                  option_dict):
        """
        A repeatable one-string-argument option is an option that requires
        one string argument; this option can be set several times.

        Ex: for         : --allow <path>
            option_dict : {"allow_list": ["a", "b"]}
            result      : ["--allow", "a", "--allow", "b"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value_list = option_dict.get(option_name)
            if value_list:
                # Only a real "_list" suffix is removed, so a key given
                # without it is no longer truncated.
                option = keyNameToOption(
                    self._stripSuffix(option_name, "_list"))
                for value in value_list:
                    option_list += [option, str(value)]
        return option_list

    def makeRepeatableTwoStringArgumentOptionList(self, allowed_option_list,
                                                  option_dict):
        """
        A repeatable two-string-argument option is an option that requires
        two string arguments; this option can be set several times.

        Ex: for         : --cookie <name> <value>
            option_dict : {"cookie_list": [("a", "b"), ("c", "d")]}
            result      : ["--cookie", "a", "b", "--cookie", "c", "d"]
        """
        option_list = []
        for option_name in allowed_option_list:
            tuple_list = option_dict.get(option_name)
            if tuple_list:
                # Only a real "_list" suffix is removed: the "replace" key
                # used to be cut down to "re" by a blind [:-5].
                option = keyNameToOption(
                    self._stripSuffix(option_name, "_list"))
                for name, value in tuple_list:
                    option_list += [option, str(name), str(value)]
        return option_list

    def makeDataUrlArgumentOptionList(self,
                                      allowed_option_list,
                                      option_dict,
                                      url_type="url",
                                      destination_format=None,
                                      use_switch=True):
        """
        A data-file-argument option is an option that requires an url
        argument.

        Here, we don't want the option value to be an url but data, so that
        we can put the data in a temp file and use its url as option value.
        The data is expected to be base64 encoded.

        url_type selects what is put on the command line: "url" gives a
        "file://" url, "file" gives the bare file name, anything else gives
        the path. When use_switch is false only that value is emitted,
        without the option name in front of it.

        Ex: for         : --user-style-sheet <url> (and url_type="url")
            option_dict : {"user_style_sheet_data": b64encode("body { background-color: black; }")}
            result      : ["--user-style-sheet", "file:///tmp/tmp.XYZ.css"]

        Ex: for         : --checkbox-svg <path> (and url_type="path")
            option_dict : {"checkbox_svg_data": b64encode("<svg>....</svg>")}
            result      : ["--checkbox-svg", "/tmp/tmp.XYZ.svg"]

        Ex: for         : --xsl-style-sheet <file> (and url_type="file")
            option_dict : {"xsl_style_sheet_data": b64encode("table { border: none; }")}
            result      : ["--xsl-style-sheet", "tmp.XYZ.css"]
        """
        option_list = []
        for option_name in allowed_option_list:
            value = option_dict.get(option_name)
            if value is not None:
                # creates a tmp file in the directory which will be trashed
                path = self.makeTempFile(destination_format=destination_format)
                # Close the file explicitly so the data is flushed to disk
                # before wkhtmltopdf is asked to read it.
                with open(path, "wb") as data_file:
                    data_file.write(b64decode(value))
                if url_type == "url":
                    path = self.convertPathToUrl(path)
                elif url_type == "file":
                    path = basename(path)
                if use_switch:
                    option_list += [
                        keyNameToOption(
                            self._stripSuffix(option_name, "_data")),
                        path,
                    ]
                else:
                    option_list.append(path)
        return option_list

    def makeDataPathArgumentOptionList(self, *args, **kw):
        """Same as makeDataUrlArgumentOptionList with url_type="path"."""
        return self.makeDataUrlArgumentOptionList(*args, url_type="path", **kw)

    def makeDataFileArgumentOptionList(self, *args, **kw):
        """Same as makeDataUrlArgumentOptionList with url_type="file"."""
        return self.makeDataUrlArgumentOptionList(*args, url_type="file", **kw)

    def makeRepeatableDataUrlArgumentOptionList(self, allowed_option_list,
                                                option_dict, **kw):
        """
        Repeatable variant of makeDataUrlArgumentOptionList: each allowed
        option maps to a list of data items and every item is written to
        its own temp file. Extra keyword arguments are passed on to
        makeDataUrlArgumentOptionList.

        Ex: for         : <url> (and use_switch=False)
            option_dict : {"before_toc_data_list": [b64encode("<html>...")]}
            result      : ["file:///tmp/tmp.XYZ.html"]
        """
        option_list = []
        for option_name in allowed_option_list:
            data_list = option_dict.get(option_name)
            if data_list:
                # Remove "_list" once per option: doing it inside the loop
                # over the items shortened the name a bit more for every
                # additional item.
                single_name = self._stripSuffix(option_name, "_list")
                for data in data_list:
                    option_list += self.makeDataUrlArgumentOptionList(
                        [single_name], {single_name: data}, **kw)
        return option_list

    def makeWkhtmltopdfCommandList(self, *args, **kw):
        """Return the wkhtmltopdf command line as a list of strings.

        args -- the input url(s) followed by the output path, which must
                be the last positional argument.
        kw   -- "conversion_kw" is the dictionary of conversion options;
                only the option names listed below are taken into account.
        """
        # http://wkhtmltopdf.org/usage/wkhtmltopdf.txt
        conversion_kw = kw.get("conversion_kw", {})
        command = ["wkhtmltopdf"]

        # Global Options
        command += self.makeNoPrefixedOptionList(["collate"], conversion_kw)
        command += self.makeSwitchOptionList(
            [
                #"extended-help",
                "grayscale",
                #"help",
                #"htmldoc",
                #"licence",
                "lowquality",
                #"manpage",
                "no_pdf_compression",
                #"quiet",  # we decide
                #"read_args_from_stdin",  # only for several command line at a time
                #"readme",
                #"version",
            ],
            conversion_kw)
        command += self.makeOneStringArgumentOptionList(
            [
                #"cookie_jar",  # no cookie jar
                "copies",
                "dpi",
                "image_dpi",
                "image_quality",
                "margin_bottom",
                "margin_left",
                "margin_right",
                "margin_top",
                "orientation",
                "page_height",
                "page_size",
                "page_width",
                "title",
            ],
            conversion_kw)

        # Outline Options
        command += self.makeNoPrefixedOptionList(["outline"], conversion_kw)
        #"dump_default_toc_xsl",
        command += self.makeOneStringArgumentOptionList(
            [
                #"dump_outline",
                "outline_depth",
            ],
            conversion_kw)

        # Page Options
        command += self.makeNoPrefixedOptionList(
            [
                "background",
                "custom_header_propagation",
                "images",
                "print_media_type",
                #"debug_javascript",  # we decide
                #"stop_slow_scripts",  # we decide
            ],
            conversion_kw)
        command += self.makeEnablePrefixedOptionList(
            [
                "enable_external_links",
                "enable_forms",
                "enable_internal_links",
                "enable_javascript",
                #"enable_local_file_access",  # we decide
                #"enable_plugins",
                "enable_smart_shrinking",
                "enable_toc_back_links",
            ],
            conversion_kw)
        command += ["--disable-local-file-access"]
        command += self.makeIncludeInPrefixedOptionList(
            [
                "include_in_outline",
            ],
            conversion_kw)
        command += self.makeSwitchOptionList(["default_header"], conversion_kw)
        # put cache in the temp dir - to disable cache
        command += ["--cache-dir", self.makeTempDir(prefix="cache")]
        command += self.makeOneStringArgumentOptionList(
            [
                #"cache_dir",  # we decide
                "encoding",
                "javascript_delay",
                "load_error_handling",
                "load_media_error_handling",
                "minimum_font_size",
                "page_offset",
                #"password",  # too dangerous
                #"proxy",  # we decide
                #"username",  # too dangerous
                "viewport_size",
                "window_status",
                "zoom",
            ],
            conversion_kw)
        #"allow",  # we decide
        command += self.makeDataPathArgumentOptionList(
            [
                # <option_name>_data
                "checkbox_checked_svg_data",
                "checkbox_svg_data",
                "radiobutton_checked_svg_data",
                "radiobutton_svg_data",
            ],
            conversion_kw,
            destination_format="svg")
        command += self.makeDataUrlArgumentOptionList(
            [
                "user_style_sheet_data",
            ],
            conversion_kw,
            destination_format="css")
        #"run_script_list",  # too dangerous, fills --run-script
        command += self.makeRepeatableTwoStringArgumentOptionList(
            [
                # <option_name>_list
                "cookie_list",
                "custom_header_list",
                #"post_list",
                #"post_file_list",
            ],
            conversion_kw)

        # Headers and Footer Options
        command += self.makeNoPrefixedOptionList(
            [
                "footer_line",
                "header_line",
            ],
            conversion_kw)
        command += self.makeOneStringArgumentOptionList(
            [
                "footer_center",
                "footer_font_name",
                "footer_font_size",
                "footer_left",
                "footer_right",
                "footer_spacing",
                "header_center",
                "header_font_name",
                "header_font_size",
                "header_left",
                "header_right",  # there's a --top option (not documented)
                # may be we can do header_right_top option
                "header_spacing",
            ],
            conversion_kw)
        command += self.makeDataUrlArgumentOptionList(
            [
                # <option_name>_data
                "footer_html_data",
                "header_html_data",
            ],
            conversion_kw,
            destination_format="html")
        command += self.makeRepeatableTwoStringArgumentOptionList(
            [
                # this key has no "_list" suffix and is emitted as
                # "--replace"
                "replace",
            ],
            conversion_kw)

        # Custom Options
        command += self.makeRepeatableDataUrlArgumentOptionList(
            [
                "before_toc_data_list",
            ],
            conversion_kw,
            destination_format="html",
            use_switch=False)

        # TOC Options
        value = conversion_kw.get("toc")
        if value:
            command += ["toc"]
            command += self.makeEnablePrefixedOptionList(
                [
                    "enable_dotted_lines",
                    "enable_toc_links",
                ],
                conversion_kw)
            command += self.makeOneStringArgumentOptionList(
                [
                    "toc_header_text",
                    "toc_level_indentation",
                    "toc_text_size_shrink",
                ],
                conversion_kw)
            command += self.makeDataFileArgumentOptionList(
                [
                    "xsl_style_sheet_data",
                ],
                conversion_kw,
                destination_format="xsl")

        # Custom Options
        command += self.makeRepeatableDataUrlArgumentOptionList(
            [
                "after_toc_data_list",
                "before_body_data_list",
            ],
            conversion_kw,
            destination_format="html",
            use_switch=False)
        command += args[:-1]  # input_url
        command += self.makeRepeatableDataUrlArgumentOptionList(
            [
                "after_body_data_list",
            ],
            conversion_kw,
            destination_format="html",
            use_switch=False)
        command += args[-1:]  # output_path

        return command
Example #12
0
class Handler(object):
  """PDF Handler is used to handle an input pdf document.

  The document is stored on disk through File. Text conversion and metadata
  extraction run the ``pdftotext`` and ``pdfinfo`` commands; metadata
  updates are done in-process with PdfFileReader / PdfFileWriter.
  """

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """Load the pdf document.

    Keyword arguments:
    base_folder_url -- folder handed to File to store the document
    data -- content of the document
    source_format -- format of the document, handed to File
    env -- optional environment used when running external commands
    """
    self.base_folder_url = base_folder_url
    self.document = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format=None, **kw):
    """Convert the pdf document with pdftotext and return the new content.

    The document is trashed afterwards, whether the conversion worked or not.
    """
    # TODO: use pyPdf
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format, destination_format))
    output_url = mktemp(suffix=".%s" % destination_format,
                        dir=self.document.directory_name)
    command = ["pdftotext", self.document.getUrl(), output_url]
    try:
      Popen(command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment).communicate()
      self.document.reload(output_url)
      return self.document.getContent()
    finally:
      self.document.trash()

  def getMetadata(self, base_document=False):
    """Returns a dictionary with all metadata of document.

    Every non empty line printed by pdfinfo is split on its first colon:
    the lowercased left part is the key, the stripped right part the value.
    The document is trashed once pdfinfo has run, even if it failed.
    """
    # TODO: use pyPdf and not use lower()
    command = ["pdfinfo", self.document.getUrl()]
    try:
      stdout, _stderr = Popen(command,
                              stdout=PIPE,
                              stderr=PIPE,
                              close_fds=True,
                              env=self.environment).communicate()
    finally:
      self.document.trash()
    metadata = {}
    for info in stdout.split("\n"):
      if not info:
        continue
      info_name, _separator, info_value = info.partition(":")
      metadata[info_name.lower()] = info_value.strip()
    return metadata

  def setMetadata(self, metadata):
    """Returns a document with new metadata.

    Keyword arguments:
    metadata -- expected an dictionary with metadata. A "ModificationDate"
                entry is stored as "ModDate"; a list of "Keywords" is joined
                with spaces. The dictionary given by the caller is not
                modified.
    """
    # TODO: date as "D:20090401124817-04'00'" ASN.1 for ModDate and CreationDate
    # Work on a copy so the renaming below does not leak to the caller.
    metadata = dict(metadata)
    modification_date = metadata.pop("ModificationDate", None)
    if modification_date:
      metadata['ModDate'] = modification_date
    keywords = metadata.get('Keywords')
    if isinstance(keywords, list):
      metadata['Keywords'] = ' '.join(keywords)

    # NOTE(review): capitalize() lowercases everything after the first
    # letter, so 'ModDate' is written as '/Moddate' - confirm this is wanted.
    info = {}
    for key, value in metadata.items():
      info[NameObject('/' + key.capitalize())] = createStringObject(value)

    output_pdf = PdfFileWriter()
    output_pdf._info.getObject().update(info)

    output_stream = io.BytesIO()
    # Keep the input file open until the output is written: the pages added
    # to the writer come from the reader bound to this file.
    with open(self.document.getUrl(), "rb") as input_file:
      input_pdf = PdfFileReader(input_file)
      for page_num in range(input_pdf.getNumPages()):
        output_pdf.addPage(input_pdf.getPage(page_num))
      output_pdf.write(output_stream)
    return output_stream.getvalue()

  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):
    """Returns a list content_type and their titles which are supported
    by enabled handlers.

    [('text/plain', 'Plain Text'),
     ...
    ]
    """
    source_mimetype = parseContentType(source_mimetype).gettype()
    if source_mimetype in ("application/pdf", "pdf"):
      return [("text/plain", "Plain Text")]
    return []
class TestFile(unittest.TestCase):
  """Tests for the File class"""

  def setUp(self):
    """Create the test data and instantiate a File"""
    self.tmp_url = '/tmp'
    self.data = decodestring("cloudooo Test")
    self.fsdocument = File(self.tmp_url, self.data, 'txt')

  def tearDown(self):
    """Remove the file from the file system, unless a test already did"""
    if self.fsdocument.getUrl() is not None:
      self.fsdocument.trash()

  def testRestoreOriginal(self):
    """Test that, after reloading another document, restoreOriginal brings
    the document back to its original state"""
    old_document_url = self.fsdocument.getUrl()
    document_filename = "document"
    document_test_url = path.join(self.fsdocument.directory_name,
                                  document_filename)
    with open(document_test_url, 'wb') as document_file:
      document_file.write(decodestring("Test Document"))
    self.fsdocument.reload(document_test_url)
    self.assertEqual(path.exists(old_document_url), False)
    self.assertNotEqual(self.fsdocument.original_data,
        self.fsdocument.getContent())
    old_document_url = self.fsdocument.getUrl()
    self.fsdocument.restoreOriginal()
    self.assertEqual(path.exists(old_document_url), False)
    self.assertNotEqual(old_document_url, self.fsdocument.getUrl())
    self.assertTrue(path.exists(self.fsdocument.getUrl()))
    self.assertEqual(self.fsdocument.getContent(), self.data)

  def testgetContent(self):
    """Test if returns the data correctly"""
    self.assertEqual(self.fsdocument.getContent(), self.data)

  def testgetUrl(self):
    """Check if the url is correct"""
    url = self.fsdocument.getUrl()
    self.assertTrue(path.exists(url))

  def testLoadDocumentFile(self):
    """Test if the document is created correctly"""
    url = self.fsdocument.getUrl()
    with open(url, 'r') as document_file:
      tmp_document = document_file.read()
    self.assertEqual(self.data, tmp_document)
    self.fsdocument.trash()
    self.assertEqual(path.exists(url), False)

  def testReload(self):
    """Change url and check if occurs correctly"""
    old_document_url = self.fsdocument.getUrl()
    document_filename = "document"
    document_test_url = path.join(self.fsdocument.directory_name,
                                  document_filename)
    with open(document_test_url, 'wb') as document_file:
      document_file.write(self.data)
    self.fsdocument.reload(document_test_url)
    url = self.fsdocument.getUrl()
    self.assertEqual(path.exists(old_document_url), False)
    self.assertEqual(self.fsdocument.getContent(), self.data)
    self.fsdocument.trash()
    self.assertEqual(path.exists(url), False)

  def testZipDocumentList(self):
    """Tests if the zip file is returned correctly"""
    second_document_url = path.join(self.fsdocument.directory_name, 'document2')
    with open(second_document_url, 'w') as second_document:
      second_document.write('test')
    zip_file = self.fsdocument.getContent(True)
    mime = magic.Magic(mime=True)
    mimetype = mime.from_buffer(zip_file)
    self.assertEqual(mimetype, 'application/zip')
    ziptest = ZipFile(StringIO(zip_file), 'r')
    self.assertEqual(len(ziptest.filelist), 2)
    for zip_info in ziptest.filelist:
      if zip_info.filename.endswith("document2"):
        # 'test' written above
        self.assertEqual(zip_info.file_size, 4)
      else:
        # presumably the document created in setUp
        self.assertEqual(zip_info.file_size, 9)

  def testSendZipFile(self):
    """Tests if the htm is extracted from zipfile"""
    zip_input_url = 'data/test.zip'
    # The archive is binary data, so read it in binary mode.
    with open(zip_input_url, 'rb') as zip_input:
      data = zip_input.read()
    zipdocument = File(self.tmp_url, data, 'zip')
    mime = magic.Magic(mime=True)
    mimetype = mime.from_buffer(zipdocument.getContent(True))
    self.assertEqual(mimetype, "application/zip")
    mimetype = mime.from_buffer(zipdocument.getContent())
    self.assertEqual(mimetype, "text/html")
    zipfile = ZipFile(StringIO(zipdocument.getContent(True)))
    self.assertEqual(sorted(zipfile.namelist()),
                sorted(['logo.gif', 'test.htm']))
Example #14
0
class Handler(object):
  """PDF Handler is used to handle an input pdf document.

  The document is stored on disk through File; the work itself is done by
  the ``pdftotext``, ``pdfinfo`` and ``pdftk`` commands.
  """

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """Load the pdf document.

    Keyword arguments:
    base_folder_url -- folder handed to File to store the document
    data -- content of the document
    source_format -- format of the document, handed to File
    env -- optional environment used when running external commands
    """
    self.base_folder_url = base_folder_url
    self.document = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format=None, **kw):
    """Convert the pdf document with pdftotext and return the new content.

    The document is trashed afterwards, whether the conversion worked or not.
    """
    logger.debug("PDFConvert: %s > %s" % (self.document.source_format, destination_format))
    output_url = mktemp(suffix=".%s" % destination_format,
                        dir=self.document.directory_name)
    command = ["pdftotext", self.document.getUrl(), output_url]
    try:
      Popen(command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment).communicate()
      self.document.reload(output_url)
      return self.document.getContent()
    finally:
      self.document.trash()

  def getMetadata(self, base_document=False):
    """Returns a dictionary with all metadata of document.

    Every non empty line printed by pdfinfo is split on its first colon:
    the lowercased left part is the key, the stripped right part the value.
    Splitting on the first colon only keeps values that themselves contain
    colons or runs of spaces (dates for instance) in one piece.
    The document is trashed once pdfinfo has run, even if it failed.
    """
    command = ["pdfinfo", self.document.getUrl()]
    try:
      stdout, _stderr = Popen(command,
                              stdout=PIPE,
                              stderr=PIPE,
                              close_fds=True,
                              env=self.environment).communicate()
    finally:
      self.document.trash()
    metadata = {}
    for info in stdout.split("\n"):
      if not info:
        continue
      info_name, _separator, info_value = info.partition(":")
      metadata[info_name.lower()] = info_value.strip()
    return metadata

  def setMetadata(self, metadata):
    """Returns a document with new metadata.

    Keyword arguments:
    metadata -- expected an dictionary with metadata.

    The metadata is written to a text file made of InfoKey/InfoValue pairs
    which is given to ``pdftk ... update_info``. The document is trashed
    afterwards, whether the update worked or not.
    """
    text_template = "InfoKey: %s\nInfoValue: %s\n"
    # items() rather than iteritems(): works on both Python 2 and Python 3.
    text_list = [text_template % (key.capitalize(), value)
                 for key, value in metadata.items()]
    metadata_file = File(self.document.directory_name,
                         "".join(text_list),
                         "txt")
    output_url = mktemp(suffix=".pdf",
                        dir=self.document.directory_name)
    command = ["pdftk",
               self.document.getUrl(),
               "update_info",
               metadata_file.getUrl(),
               "output",
               output_url
               ]
    try:
      Popen(command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment).communicate()
      self.document.reload(output_url)
      return self.document.getContent()
    finally:
      self.document.trash()

  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):
    """Returns a list content_type and their titles which are supported
    by enabled handlers.

    [('text/plain', 'Plain Text'),
     ...
    ]
    """
    source_mimetype = parseContentType(source_mimetype).gettype()
    if source_mimetype in ("application/pdf", "pdf"):
      return [("text/plain", "Plain Text")]
    return []
Example #15
0
class Handler(object):
  """
  X2T Handler is used to convert Microsoft Office 2007 documents to OnlyOffice
  documents.
  """

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """
    base_folder_url(string)
      The requested url for data base folder
    data(string)
      The opened and readed file into a string
    source_format(string)
      The source format of the inputed file
    Extra keyword arguments are kept and handed to every OOoHandler created
    by this handler; "env" is the environment used to run x2t.
    """
    self.base_folder_url = base_folder_url
    self._data = data
    self._source_format = source_format
    self._init_kw = kw
    self.file = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format=None, **kw):
    """ Convert the inputed file to output as format that were informed

    An xml configuration file is written next to the document and given to
    the x2t binary. Metadata found in a zipped y-format input is applied to
    the result through OOoHandler; when converting to a y-format, the
    metadata is stored as "metadata.json" inside the produced archive.

    Raises RuntimeError when the input archive has neither body.txt nor
    Editor.bin, or when x2t exits with a non zero code.
    """
    source_format = self.file.source_format
    logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

    # init vars and xml configuration file
    in_format = format_code_map[source_format]
    # Look into format_code_map only when the output map has no entry, so a
    # format known by the output map alone does not raise KeyError.
    if destination_format in format_code_map_output:
      out_format = format_code_map_output[destination_format]
    else:
      out_format = format_code_map[destination_format]
    root_dir = self.file.directory_name
    input_dir = os.path.join(root_dir, "input")
    input_file_name = self.file.getUrl()
    output_file_name = os.path.join(root_dir, "document.%s" % destination_format)
    config_file_name = os.path.join(root_dir, "config.xml")
    metadata = None
    output_data = None

    # "PK\x03\x04" is the signature the data must start with to be unzipped
    if source_format in yformat_tuple and self._data.startswith("PK\x03\x04"):
      os.mkdir(input_dir)
      unzip(self.file.getUrl(), input_dir)
      input_file_name = os.path.join(input_dir, "body.txt")
      if not os.path.isfile(input_file_name):
        input_file_name = os.path.join(input_dir, "Editor.bin")
        if not os.path.isfile(input_file_name):
          raise RuntimeError("input format incorrect: Editor.bin absent in zip archive")
      metadata_file_name = os.path.join(input_dir, "metadata.json")
      if os.path.isfile(metadata_file_name):
        with open(metadata_file_name) as metadata_file:
          metadata = json.loads(metadata_file.read())

    with open(config_file_name, "w") as config_file:
      config = {
        # 'm_sKey': 'from',
        'm_sFileFrom': input_file_name,
        'm_nFormatFrom': str(in_format),
        'm_sFileTo': output_file_name,
        'm_nFormatTo': str(out_format),
        # 'm_bPaid': 'true',
        # 'm_bEmbeddedFonts': 'false',
        # 'm_bFromChanges': 'false',
        # 'm_sFontDir': '/usr/share/fonts',
        # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
      }
      root = ElementTree.Element('root')
      for key, value in config.items():
        ElementTree.SubElement(root, key).text = value
      ElementTree.ElementTree(root).write(config_file, encoding='utf-8', xml_declaration=True,
                                          default_namespace=None, method="xml")

    # run convertion binary
    command = ["x2t", config_file_name]
    p = Popen(
      command,
      stdout=PIPE,
      stderr=PIPE,
      close_fds=True,
      env=self.environment,
    )
    stdout, stderr = p.communicate()
    if p.returncode != 0:
      # Include the configuration in the error to ease debugging.
      with open(config_file_name) as config_file:
        config_xml = config_file.read()
      raise RuntimeError("x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s"
                         % (p.returncode, " ".join(command), stdout, stderr,
                            "  " + config_xml.replace("\n", "\n  ")))

    self.file.reload(output_file_name)
    try:
      if source_format in yformat_tuple:
        if metadata:
          output_data = OOoHandler(self.base_folder_url, self.file.getContent(), source_format, **self._init_kw)\
            .setMetadata(metadata)
        else:
          output_data = self.file.getContent()
      elif destination_format in yformat_tuple:
        if not metadata:
          if source_format not in yformat_tuple:
            metadata = OOoHandler(self.base_folder_url, self._data, source_format, **self._init_kw).getMetadata()
          if not metadata:
            metadata = {}
          # these entries are dropped before the metadata is stored
          metadata.pop('MIMEType', None)
          metadata.pop('Generator', None)
          metadata.pop('AppVersion', None)
          metadata.pop('ImplementationName', None)
        with ZipFile(output_file_name, mode="a") as zipfile:
          zipfile.writestr("metadata.json", json.dumps(metadata))
        output_data = self.file.getContent()
    finally:
      self.file.trash()
    return output_data

  def _getContentType(self):
    """Return the mimetype guessed from the source format, or the source
    format itself when it already contains a "/" or nothing can be guessed."""
    mimetype_type = None
    if "/" not in self._source_format:
      mimetype_type = guess_type('a.' + self._source_format)[0]
    if mimetype_type is None:
      mimetype_type = self._source_format
    return mimetype_type

  def getMetadata(self, base_document=False):
    r"""Returns a dictionary with all metadata of document.

    For a zipped y-format document the metadata is read from the
    "metadata.json" entry of the archive (an empty dictionary when absent)
    and completed with "MIMEType"; with base_document the document is first
    converted to its openxml counterpart and OOoHandler is asked instead.
    Any other document is handed to OOoHandler as is.
    """
    if self._source_format in yformat_tuple and self._data.startswith("PK\x03\x04"):
      if base_document:
        openxml_format = yformat_map[self._source_format]
        data = self.convert(openxml_format)
        return OOoHandler(self.base_folder_url, data, openxml_format, **self._init_kw).getMetadata(base_document)
      else:
        with io.BytesIO(self._data) as memfile, ZipFile(memfile) as zipfile:
          try:
            metadata = zipfile.read("metadata.json")
          except KeyError:
            metadata = '{}'
          metadata = json.loads(metadata)
          metadata['MIMEType'] = self._getContentType()
          return metadata
    else:
      return OOoHandler(self.base_folder_url, self._data, self._source_format, **self._init_kw)\
        .getMetadata(base_document)

  def setMetadata(self, metadata=None):
    r"""Returns document with new metadata.
    Keyword arguments:
    metadata -- expected an dictionary with metadata.

    For a zipped y-format document the archive is unpacked, its
    "metadata.json" is rewritten and everything is zipped again; any other
    document is handed to OOoHandler.
    """
    if metadata is None:
      metadata = {}
    if self._source_format in yformat_tuple and self._data.startswith("PK\x03\x04"):
      root_dir = self.file.directory_name
      output_file_name = os.path.join(root_dir, "tmp")
      try:
        input_dir = os.path.join(root_dir, "input")
        os.mkdir(input_dir)
        unzip(self.file.getUrl(), input_dir)
        with open(os.path.join(input_dir, "metadata.json"), "w") as metadata_file:
          metadata_file.write(json.dumps(metadata))
        with ZipFile(output_file_name, "w") as zipfile:
          for root, _, files in os.walk(input_dir):
            relative_root = root.replace(input_dir, '')
            for file_name in files:
              absolute_path = os.path.join(root, file_name)
              file_name = os.path.join(relative_root, file_name)
              zipfile.write(absolute_path, file_name)
        # The archive is binary data, so read it in binary mode.
        with open(output_file_name, "rb") as output_file:
          output_data = output_file.read()
      finally:
        # The archive may not exist when an earlier step failed; removing it
        # blindly would hide the original error behind an OSError.
        if os.path.exists(output_file_name):
          os.unlink(output_file_name)
      return output_data
    else:
      return OOoHandler(self.base_folder_url, self._data, self._source_format, **self._init_kw).setMetadata(metadata)

  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):
    """Returns a list content_type and their titles which are supported
    by enabled handlers.

    [('application/x-asc-text', 'OnlyOffice Text Document'),
     ...
    ]
    """
    source_mimetype = parseContentType(source_mimetype).gettype()
    if source_mimetype in ("docy", "application/x-asc-text"):
      return [
        ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document"),
        ("application/vnd.oasis.opendocument.text", "ODF Text Document"),
      ]
    if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"):
      return [
        ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet"),
        ("application/vnd.oasis.opendocument.spreadsheet", "ODF Spreadsheet Document"),
      ]
    if source_mimetype in ("ppty", "application/x-asc-presentation"):
      return [
        ("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation"),
        ("application/vnd.oasis.opendocument.presentation", "ODF Presentation Document"),
      ]

    # Not an OnlyOffice format: take what OOoHandler supports and add the
    # OnlyOffice counterpart of the first openxml format found in that list.
    get_format_list = OOoHandler.getAllowedConversionFormatList
    format_list = get_format_list(source_mimetype)
    format_list_append = format_list.append
    for f_type, _ in format_list:
      if f_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        format_list_append(("application/x-asc-text", "OnlyOffice Text Document"))
        break
      if f_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        format_list_append(("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet"))
        break
      if f_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        format_list_append(("application/x-asc-presentation", "OnlyOffice Presentation"))
        break
    return format_list
Example #16
0
 def afterSetUp(self):
   """Acquire openoffice, keep its address and load data/test.odt in a File"""
   openoffice.acquire()
   self.hostname, self.port = openoffice.getAddress()
   # The odt file is binary data: read it in binary mode and close it.
   with open("data/test.odt", 'rb') as odt_file:
     data = odt_file.read()
   self.document = File(self.tmp_url, data, 'odt')
Example #17
0
class Handler(object):
  """FFMPEG Handler is used to handle input audio and video files"""

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """
    base_folder_url(string)
      The requested url for data base folder
    data(string)
      The opened and readed file into a string
    source_format(string)
      The source format of the inputed file
    The optional "env" keyword is the environment used to run the commands.
    """
    self.base_folder_url = base_folder_url
    self.input = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format):
    """ Convert the inputed file to output as format that were informed

    The last non empty line printed by ffmpeg on stderr is logged as an
    error when the produced content is empty. The input is trashed
    afterwards, whether the conversion worked or not.
    """
    # XXX This implementation could use ffmpeg -i pipe:0, but
    # XXX seems super unreliable currently and it generates currupted files in
    # the end
    logger.debug("FfmpegConvert: %s > %s" % (self.input.source_format, destination_format))
    output_url = mktemp(suffix=".%s" % destination_format,
                        dir=self.input.directory_name)
    command = ["ffmpeg",
               "-i",
               self.input.getUrl(),
               "-y",
               output_url]
    # XXX ffmpeg has a bug that needs this options to work with webm format
    if destination_format == "webm":
      command[3:3] = ["-ab", "32k"]
    try:
      stdout, stderr = Popen(command,
                             stdout=PIPE,
                             stderr=PIPE,
                             close_fds=True,
                             env=self.environment).communicate()
      self.input.reload(output_url)
      content = self.input.getContent()
      if len(content) == 0:
        # Do not assume stderr has at least two lines.
        error_lines = [line for line in stderr.split("\n") if line]
        logger.error(error_lines[-1] if error_lines else stderr)
      return content
    finally:
      self.input.trash()

  def getMetadata(self, base_document=False):
    """Returns a dictionary with all metadata of the file.

    The text ffprobe prints on stderr after the first "Metadata:" marker is
    read line by line; each line is split on its first colon, the
    capitalized key and the value being stripped. Lines without a colon are
    ignored and an empty dictionary is returned when there is no
    "Metadata:" marker. The input is trashed once ffprobe has run.
    """
    command = ["ffprobe", self.input.getUrl()]
    try:
      stdout, stderr = Popen(command,
                             stdout=PIPE,
                             stderr=PIPE,
                             close_fds=True,
                             env=self.environment).communicate()
    finally:
      self.input.trash()
    metadata_dict = {}
    sections = stderr.split('Metadata:')
    if len(sections) < 2:
      return metadata_dict
    for data in sections[1].split('\n'):
      # Split on the first colon only: values such as times contain colons.
      key, separator, value = data.partition(':')
      if not separator:
        continue
      metadata_dict[key.strip().capitalize()] = value.strip()
    return metadata_dict

  def setMetadata(self, metadata_dict=None):
    """Returns a document with new metadata.
    Keyword arguments:
    metadata_dict -- expected an dictionary with metadata.

    Every entry is given to ffmpeg as a "-metadata key=value" option. The
    input is trashed afterwards, whether the update worked or not.
    """
    if metadata_dict is None:
      metadata_dict = {}
    output_url = mktemp(suffix=".%s" % self.input.source_format,
                        dir=self.input.directory_name)
    command = ["ffmpeg",
               "-i",
               self.input.getUrl(),
               "-y",
               output_url]
    for metadata in metadata_dict:
      # options go right before "-y"
      command[3:3] = ["-metadata", "%s=%s" % (metadata, metadata_dict[metadata])]
    try:
      stdout, stderr = Popen(command,
                             stdout=PIPE,
                             stderr=PIPE,
                             close_fds=True,
                             env=self.environment).communicate()
      self.input.reload(output_url)
      return self.input.getContent()
    finally:
      self.input.trash()

  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):
    """Returns a list content_type and their titles which are supported
    by enabled handlers.

    [('audio/ogg;codecs=opus', 'Opus Audio File Format'),
     ('video/webm', 'Webm Video File Format'),
     ...
    ]
    """
    # XXX NotImplemented
    return []
Example #18
0
 def __init__(self, base_folder_url, data, source_format, **kw):
     """Keep the base folder, store the pdf data in a File and remember the
     optional "env" keyword (an empty dictionary when it is not given)."""
     environment = kw.get("env", {})
     document = File(base_folder_url, data, source_format)
     self.base_folder_url = base_folder_url
     self.document = document
     self.environment = environment
 def setUp(self):
   """Prepare the decoded test data and a File holding it under /tmp"""
   self.tmp_url = tmp_url = '/tmp'
   self.data = payload = decodestring("cloudooo Test")
   self.fsdocument = File(tmp_url, payload, 'txt')
Example #20
0
class PDFGranulator(object):
    """Extracts images and tables from a pdf document using pdftohtml."""

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Store the document in a File and create the directory that
        receives the extracted images.

        The optional "env" keyword is the environment used to run pdftohtml.
        """
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})
        self.grain_directory = mkdtemp(dir=self.file.directory_name)

    # XXX - It should have another name for returning all images
    def getImageItemList(self):
        """Run pdftohtml into the grain directory and return the result of
        getImages on the produced files, or False when pdftohtml printed an
        error."""
        logger.debug("PDFImageGrainExtract")
        command = ["pdftohtml", self.file.getUrl(), "%s/" % self.grain_directory]
        stdout, stderr = Popen(command, stdout=PIPE, stderr=PIPE, close_fds=True, env=self.environment).communicate()
        # XXX - PDF can be protect
        if "Erro" in stderr:
            return False
        removeEqualImages(self.grain_directory)
        images = glob("%s/*.*" % self.grain_directory)
        return getImages(images)

    def getTableItemList(self):
        """Returns the list of table title, or an explanation message when
        the tables cannot be extracted"""
        tables = self.getTablesMatrix()
        if tables is False:
            return "PDF Protect or have no Table Item List"
        return list(tables)

    def getTable(self, id, format="html"):
        """Returns the table into html format.

        A cell that is a list is rendered with one line per element. A
        message is returned instead of the html when the table cannot be
        produced (protected pdf, unknown id, ...).
        """
        try:
            table_matrix = self.getTablesMatrix()[id]
            parts = ["<html><body><h1> %s </h1><table>" % id]
            for line in table_matrix:
                parts.append("<tr>")
                for column in line:
                    if not isinstance(column, list):
                        parts.append("<td> %s </td>" % column)
                    else:
                        parts.append("<td>")
                        for element in column:
                            parts.append("%s </br>" % element)
                        parts.append("</td>")
                parts.append("</tr>")
            parts.append("</table></body></html>")
            return "".join(parts)
        except Exception:
            # Best effort on purpose, but let KeyboardInterrupt and
            # SystemExit go through.
            return "PDF Protect or have no table with this id"

    def getTablesMatrix(self):
        """Returns the tables as a dictionary mapping a table name to its
        matrix (list of lines), or False when pdftohtml printed an error.

        The "text" elements of the xml produced by ``pdftohtml -xml`` are
        grouped by their "top" (same line) and "left" (same column)
        attributes.
        """
        logger.debug("PDFTableGrainExtract")
        output_url = NamedTemporaryFile(suffix=".xml", dir=self.file.directory_name).name
        command = ["pdftohtml", "-xml", self.file.getUrl(), output_url]
        stdout, stderr = Popen(command, stdout=PIPE, stderr=PIPE, close_fds=True, env=self.environment).communicate()
        # XXX - PDF can be protect
        if "Erro" in stderr:
            return False
        with open(output_url) as xml_file:
            output = etree.fromstring(xml_file.read())
        row_list = output.xpath("//text")
        name, previous_text, next_text = "", "", ""
        tables = {}
        element = []
        line = []
        matrix = []
        i, j, l, m = 0, 0, 0, 0
        # NOTE(review): "left" attributes are strings while this initial
        # value is an int, so the first comparison below mixes types - confirm.
        old_x_left = 600
        for x in row_list:
            base_line = x.attrib["top"]
            base_column = x.attrib["left"]
            i += 1
            # row_list[i:] is a copy, so removing from row_list is safe here
            for y in row_list[i:]:
                if base_line == y.attrib["top"]:
                    l += 1
                    line.append(get_text(y))
                    base_column = y.attrib["left"]
                    row_list.remove(y)
                elif base_column == y.attrib["left"]:
                    m = l
                    if len(element) > 0:
                        element.append(get_text(y))
                    # In case name of the table is after table
                    if len(line) == 0:
                        next_text = get_text(x)
                        if next_text is not None and len(next_text.split(":")) == 2:
                            name = next_text
                            next_text = ""
                    elif len(line) > 0:
                        element.append(line.pop())
                        element.append(get_text(y))
                else:
                    if len(element) > 0:
                        line.insert(m - 1, element)
                    l = 0
                    element = []
                    base_column = 0
                    break

            if len(line) > 0:
                # In case name of the table is before table
                previous_text = get_text(x.getprevious())
                if previous_text is not None and len(previous_text.split(":")) == 2:
                    name = previous_text
                    previous_text = ""
                line.insert(0, get_text(x))
                if len(line) > 1:
                    matrix.append(line)
            line = []
            if x.attrib["left"] < old_x_left and len(matrix) > 0:
                # The text moved back to the left: the current table is over.
                j += 1
                if name == "":
                    name = "Tabela %d" % j
                name += " - pag %s" % x.getparent().attrib["number"]
                tables[name] = matrix
                name = ""
                matrix = []
            old_x_left = x.attrib["left"]
        return tables

    def trash(self):
        """Trash the File holding the document"""
        self.file.trash()
Example #21
0
class Handler(object):
    """PDF Handler is used to handle an input pdf document.

    The document is stored on disk through File; the work itself is done by
    the ``pdftotext``, ``pdfinfo`` and ``pdftk`` commands.
    """

    implements(IHandler)

    def __init__(self, base_folder_url, data, source_format, **kw):
        """Load the pdf document.

        Keyword arguments:
        base_folder_url -- folder handed to File to store the document
        data -- content of the document
        source_format -- format of the document, handed to File
        env -- optional environment used when running external commands
        """
        self.base_folder_url = base_folder_url
        self.document = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Convert the pdf document with pdftotext and return the new content.

        The document is trashed afterwards, whether the conversion worked or
        not.
        """
        logger.debug("PDFConvert: %s > %s" %
                     (self.document.source_format, destination_format))
        output_url = mktemp(suffix=".%s" % destination_format,
                            dir=self.document.directory_name)
        command = ["pdftotext", self.document.getUrl(), output_url]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()

    def getMetadata(self, base_document=False):
        """Returns a dictionary with all metadata of document.

        Every non empty line printed by pdfinfo is split on its first colon:
        the lowercased left part is the key, the stripped right part the
        value. Splitting on the first colon only keeps values that
        themselves contain colons or runs of spaces (dates for instance) in
        one piece. The document is trashed once pdfinfo has run, even if it
        failed.
        """
        command = ["pdfinfo", self.document.getUrl()]
        try:
            stdout, _stderr = Popen(command,
                                    stdout=PIPE,
                                    stderr=PIPE,
                                    close_fds=True,
                                    env=self.environment).communicate()
        finally:
            self.document.trash()
        metadata = {}
        for info in stdout.split("\n"):
            if not info:
                continue
            info_name, _separator, info_value = info.partition(":")
            metadata[info_name.lower()] = info_value.strip()
        return metadata

    def setMetadata(self, metadata):
        """Returns a document with new metadata.

        Keyword arguments:
        metadata -- expected an dictionary with metadata.

        The metadata is written to a text file made of InfoKey/InfoValue
        pairs which is given to ``pdftk ... update_info``. The document is
        trashed afterwards, whether the update worked or not.
        """
        text_template = "InfoKey: %s\nInfoValue: %s\n"
        # items() rather than iteritems(): works on Python 2 and Python 3.
        text_list = [text_template % (key.capitalize(), value)
                     for key, value in metadata.items()]
        metadata_file = File(self.document.directory_name, "".join(text_list),
                             "txt")
        output_url = mktemp(suffix=".pdf", dir=self.document.directory_name)
        command = [
            "pdftk",
            self.document.getUrl(), "update_info",
            metadata_file.getUrl(), "output", output_url
        ]
        try:
            Popen(command,
                  stdout=PIPE,
                  stderr=PIPE,
                  close_fds=True,
                  env=self.environment).communicate()
            self.document.reload(output_url)
            return self.document.getContent()
        finally:
            self.document.trash()
Example #22
0
class Handler(object):
  """
  X2T Handler is used to convert Microsoft Office 2007 documents to OnlyOffice
  documents.

  Conversions are done by the external "x2t" binary, which is driven through
  an XML configuration file written in the document directory.
  """

  implements(IHandler)

  # Source formats whose metadata handling is delegated to the OOo handler.
  _ooo_metadata_format_tuple = (
    "docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  )

  def __init__(self, base_folder_url, data, source_format, **kw):
    """
    base_folder_url(string)
      The requested url for data base folder
    data(string)
      The content of the input file, already read into a string
    source_format(string)
      The source format of the input file
    kw
      Extra options. "env" (default: empty dict) is used as the environment
      of the x2t process; the whole mapping is kept so that it can be
      forwarded to the OOo handler for metadata operations.
    """
    self.base_folder_url = base_folder_url
    self._data = data
    self._source_format = source_format
    self._init_kw = kw
    self.file = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def convert(self, destination_format=None, **kw):
    """Convert the input file to destination_format and return its content.

    The source and destination formats are translated through
    format_code_map, an XML configuration file is written for x2t and the
    binary is run on it. Formats listed in yformat_tuple are zip containers:
    such an input is unzipped first, and such an output is zipped from the
    produced "body.txt" and "media" entries.

    Raises RuntimeError if x2t exits with a non-zero status.
    """
    source_format = self.file.source_format
    logger.debug("x2t convert: %s > %s" % (source_format, destination_format))

    # init vars and xml configuration file
    in_format = format_code_map[source_format]
    out_format = format_code_map[destination_format]
    root_dir = self.file.directory_name
    input_dir = os.path.join(root_dir, "input")
    output_dir = os.path.join(root_dir, "output")
    final_file_name = os.path.join(root_dir, "document.%s" % destination_format)
    input_file_name = self.file.getUrl()
    output_file_name = final_file_name
    config_file_name = os.path.join(root_dir, "config.xml")

    if source_format in yformat_tuple:
      os.mkdir(input_dir)
      unzip(self.file.getUrl(), input_dir)
      # the top level of the archive must contain exactly one file
      for _, _, files in os.walk(input_dir):
        input_file_name, = files
        break
      input_file_name = os.path.join(input_dir, input_file_name)
    if destination_format in yformat_tuple:
      os.mkdir(output_dir)
      output_file_name = os.path.join(output_dir, "body.txt")

    config = {
      # 'm_sKey': 'from',
      'm_sFileFrom': input_file_name,
      'm_nFormatFrom': in_format,
      'm_sFileTo': output_file_name,
      'm_nFormatTo': out_format,
      # 'm_bPaid': 'true',
      # 'm_bEmbeddedFonts': 'false',
      # 'm_bFromChanges': 'false',
      # 'm_sFontDir': '/usr/share/fonts',
      # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
    }
    root = ElementTree.Element('root')
    for key, value in config.items():
      ElementTree.SubElement(root, key).text = value
    # "with" guarantees the file is flushed and closed even if writing fails
    with open(config_file_name, "w") as config_file:
      ElementTree.ElementTree(root).write(config_file, encoding='utf-8', xml_declaration=True, default_namespace=None, method="xml")

    # run convertion binary
    command = ["x2t", config_file_name]
    p = Popen(
      command,
      stdout=PIPE,
      stderr=PIPE,
      close_fds=True,
      env=self.environment,
    )
    stdout, stderr = p.communicate()
    if p.returncode != 0:
      # include the configuration in the error to ease debugging
      with open(config_file_name) as config_file:
        config_xml = config_file.read()
      raise RuntimeError("x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s" % (p.returncode, " ".join(command), stdout, stderr, "  " + config_xml.replace("\n", "\n  ")))

    if destination_format in yformat_tuple:
      zipTree(
        final_file_name,
        (output_file_name, ""),
        (os.path.join(os.path.dirname(output_file_name), "media"), ""),
      )

    self.file.reload(final_file_name)
    try:
      return self.file.getContent()
    finally:
      self.file.trash()

  def getMetadata(self, base_document=False):
    r"""Returns a dictionary with all metadata of document.
    /!\ Not Implemented: no format are handled correctly.

    docx, xlsx and pptx sources are delegated to the OOo handler; an empty
    dictionary is returned for any other source format.
    """
    # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
    #     However, docx documents metadata can only be "handled" by the ooo handler.
    #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
    #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
    #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
    if self._source_format in self._ooo_metadata_format_tuple:
      return OOoHandler(self.base_folder_url, self._data, self._source_format, **self._init_kw).getMetadata(base_document)
    return {}

  def setMetadata(self, metadata=None):
    r"""Returns document with new metadata.
    /!\ Not Implemented: no format are handled correctly.
    Keyword arguments:
    metadata -- expected an dictionary with metadata (default: empty).

    docx, xlsx and pptx sources are delegated to the OOo handler; for any
    other source format the current content is returned as is.
    """
    # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
    #     However, docx documents metadata can only be "handled" by the ooo handler.
    #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
    #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
    #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
    if metadata is None:
      # avoid sharing one mutable default dictionary between calls
      metadata = {}
    if self._source_format in self._ooo_metadata_format_tuple:
      return OOoHandler(self.base_folder_url, self._data, self._source_format, **self._init_kw).setMetadata(metadata)
    return self.file.getContent()

  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):
    """Returns a list content_type and their titles which are supported
    by enabled handlers.

    [('application/x-asc-text', 'OnlyOffice Text Document'),
     ...
    ]

    An empty list is returned for an unsupported source mimetype.
    """
    source_mimetype = parseContentType(source_mimetype).gettype()
    if source_mimetype in ("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
      return [("application/x-asc-text", "OnlyOffice Text Document")]
    if source_mimetype in ("docy", "application/x-asc-text"):
      return [("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document")]
    if source_mimetype in ("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
      return [("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet")]
    if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"):
      return [("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet")]
    if source_mimetype in ("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"):
      return [("application/x-asc-presentation", "OnlyOffice Presentation")]
    if source_mimetype in ("ppty", "application/x-asc-presentation"):
      return [("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation")]
    return []
Example #23
0
class Handler(object):
    """
    X2T Handler is used to convert Microsoft Office 2007 documents to
    OnlyOffice documents.

    Conversions are done by the external "x2t" binary, which is driven
    through an XML configuration file written in the document directory.
    """

    implements(IHandler)

    # Source formats whose metadata handling is delegated to the OOo handler.
    _ooo_metadata_format_tuple = (
        "docx",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "xlsx",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "pptx",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    )

    def __init__(self, base_folder_url, data, source_format, **kw):
        """
        base_folder_url(string)
          The requested url for data base folder
        data(string)
          The content of the input file, already read into a string
        source_format(string)
          The source format of the input file
        kw
          Extra options. "env" (default: empty dict) is used as the
          environment of the x2t process; the whole mapping is kept so that
          it can be forwarded to the OOo handler for metadata operations.
        """
        self.base_folder_url = base_folder_url
        self._data = data
        self._source_format = source_format
        self._init_kw = kw
        self.file = File(base_folder_url, data, source_format)
        self.environment = kw.get("env", {})

    def convert(self, destination_format=None, **kw):
        """Convert the input file to destination_format and return its content.

        The source and destination formats are translated through
        format_code_map, an XML configuration file is written for x2t and
        the binary is run on it. For a source format listed in
        yformat_tuple, data starting with the zip signature is unzipped
        first; any other data is given to x2t untouched. An output format
        listed in yformat_tuple is zipped from the produced "body.txt" and
        "media" entries.

        Raises RuntimeError if x2t exits with a non-zero status.
        """
        source_format = self.file.source_format
        logger.debug("x2t convert: %s > %s" %
                     (source_format, destination_format))

        # init vars and xml configuration file
        in_format = format_code_map[source_format]
        out_format = format_code_map[destination_format]
        root_dir = self.file.directory_name
        input_dir = os.path.join(root_dir, "input")
        output_dir = os.path.join(root_dir, "output")
        final_file_name = os.path.join(root_dir,
                                       "document.%s" % destination_format)
        input_file_name = self.file.getUrl()
        output_file_name = final_file_name
        config_file_name = os.path.join(root_dir, "config.xml")

        # "PK\x03\x04" is the signature found at the start of a zip archive
        if (source_format in yformat_tuple
                and self._data.startswith("PK\x03\x04")):
            os.mkdir(input_dir)
            unzip(self.file.getUrl(), input_dir)
            # the top level of the archive must contain exactly one file
            for _, _, files in os.walk(input_dir):
                input_file_name, = files
                break
            input_file_name = os.path.join(input_dir, input_file_name)
        if destination_format in yformat_tuple:
            os.mkdir(output_dir)
            output_file_name = os.path.join(output_dir, "body.txt")

        config = {
            # 'm_sKey': 'from',
            'm_sFileFrom': input_file_name,
            'm_nFormatFrom': in_format,
            'm_sFileTo': output_file_name,
            'm_nFormatTo': out_format,
            # 'm_bPaid': 'true',
            # 'm_bEmbeddedFonts': 'false',
            # 'm_bFromChanges': 'false',
            # 'm_sFontDir': '/usr/share/fonts',
            # 'm_sThemeDir': '/var/www/onlyoffice/documentserver/FileConverterService/presentationthemes',
        }
        root = ElementTree.Element('root')
        for key, value in config.items():
            ElementTree.SubElement(root, key).text = value
        # "with" guarantees the file is flushed and closed even if writing
        # fails
        with open(config_file_name, "w") as config_file:
            ElementTree.ElementTree(root).write(config_file,
                                                encoding='utf-8',
                                                xml_declaration=True,
                                                default_namespace=None,
                                                method="xml")

        # run convertion binary
        command = ["x2t", config_file_name]
        p = Popen(
            command,
            stdout=PIPE,
            stderr=PIPE,
            close_fds=True,
            env=self.environment,
        )
        stdout, stderr = p.communicate()
        if p.returncode != 0:
            # include the configuration in the error to ease debugging
            with open(config_file_name) as config_file:
                config_xml = config_file.read()
            raise RuntimeError(
                "x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s"
                % (p.returncode, " ".join(command), stdout, stderr,
                   "  " + config_xml.replace("\n", "\n  ")))

        if destination_format in yformat_tuple:
            zipTree(
                final_file_name,
                (output_file_name, ""),
                (os.path.join(os.path.dirname(output_file_name), "media"), ""),
            )

        self.file.reload(final_file_name)
        try:
            return self.file.getContent()
        finally:
            self.file.trash()

    def getMetadata(self, base_document=False):
        r"""Returns a dictionary with all metadata of document.
        /!\ Not Implemented: no format are handled correctly.

        docx, xlsx and pptx sources are delegated to the OOo handler; an
        empty dictionary is returned for any other source format.
        """
        # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
        #     However, docx documents metadata can only be "handled" by the ooo handler.
        #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
        #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
        #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
        if self._source_format in self._ooo_metadata_format_tuple:
            return OOoHandler(self.base_folder_url, self._data,
                              self._source_format,
                              **self._init_kw).getMetadata(base_document)
        return {}

    def setMetadata(self, metadata=None):
        r"""Returns document with new metadata.
        /!\ Not Implemented: no format are handled correctly.
        Keyword arguments:
        metadata -- expected an dictionary with metadata (default: empty).

        docx, xlsx and pptx sources are delegated to the OOo handler; for
        any other source format the current content is returned as is.
        """
        # XXX Cloudooo takes the first handler that can "handle" source_mimetype.
        #     However, docx documents metadata can only be "handled" by the ooo handler.
        #     Handlers should provide a way to tell if such capability is available for the required source mimetype.
        #     We have to define a precise direction on how to know/get what are handlers capabilities according to Cloudooo configuration.
        #     And then, this method MUST raise on unhandled format. Here xformats are "handled" by cheating.
        if metadata is None:
            # avoid sharing one mutable default dictionary between calls
            metadata = {}
        if self._source_format in self._ooo_metadata_format_tuple:
            return OOoHandler(self.base_folder_url, self._data,
                              self._source_format,
                              **self._init_kw).setMetadata(metadata)
        return self.file.getContent()

    @staticmethod
    def getAllowedConversionFormatList(source_mimetype):
        """Returns a list content_type and their titles which are supported
        by enabled handlers.

        [('application/x-asc-text', 'OnlyOffice Text Document'),
         ...
        ]

        An empty list is returned for an unsupported source mimetype.
        """
        source_mimetype = parseContentType(source_mimetype).gettype()
        if source_mimetype in (
                "docx",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            return [("application/x-asc-text", "OnlyOffice Text Document")]
        if source_mimetype in ("docy", "application/x-asc-text"):
            return [(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "Word 2007 Document")]
        if source_mimetype in (
                "xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ):
            return [("application/x-asc-spreadsheet",
                     "OnlyOffice Spreadsheet")]
        if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"):
            return [(
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                "Excel 2007 Spreadsheet")]
        if source_mimetype in (
                "pptx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        ):
            return [("application/x-asc-presentation",
                     "OnlyOffice Presentation")]
        if source_mimetype in ("ppty", "application/x-asc-presentation"):
            return [(
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "PowerPoint 2007 Presentation")]
        return []
Example #24
0
class Handler(object):
  """wkhtmltopdf Handler is used to convert HTML documents to PDF.

  The conversion is done by the external "wkhtmltopdf" command; most methods
  of this class translate conversion keywords into its command line options.
  """

  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """Store the document in a File.

    base_folder_url -- passed to File as the base folder
    data -- content of the document
    source_format -- format of the document
    kw -- extra options; "env" (default: empty dict) is used as the
          environment of the wkhtmltopdf process
    """
    self.base_folder_url = base_folder_url
    self.file = File(base_folder_url, data, source_format)
    self.environment = kw.get("env", {})

  def makeTempFile(self, destination_format=None):
    """Return a fresh temporary path, suffixed with ".<destination_format>",
    inside the directory of the document. The file itself is not created.
    """
    path = mktemp(
      suffix='.%s' % destination_format,
      dir=self.file.directory_name,
    )
    return path

  def convertPathToUrl(self, path):
    """Return the "file://" url of an absolute path.

    Raises ValueError if the path is not absolute.
    """
    if path.startswith("/"):
      return "file://" + path
    raise ValueError("path %r is not absolute" % path)

  def convert(self, destination_format=None, **kw):
    """Convert the document with wkhtmltopdf and return the result.

    Keyword arguments are the conversion options, see
    makeWkhtmltopdfCommandList for the ones that are taken into account.
    """
    logger.debug("wkhtmltopdf convert: %s > %s" % (self.file.source_format, destination_format))
    output_path = self.makeTempFile(destination_format)
    command = self.makeWkhtmltopdfCommandList(
      self.convertPathToUrl(self.file.getUrl()),
      output_path,
      conversion_kw=kw,
    )
    # Wait for the command to finish; its output streams are not inspected.
    Popen(
      command,
      stdout=PIPE,
      stderr=PIPE,
      close_fds=True,
      env=self.environment,
      cwd=self.file.directory_name,
    ).communicate()
    self.file.reload(output_path)
    try:
      return self.file.getContent()
    finally:
      self.file.trash()

  def getMetadata(self, base_document=False):
    """Returns a dictionary with all metadata of document.

    Not implemented by this handler: always raises NotImplementedError.
    """
    # The exception must be raised: returning the exception class would hand
    # callers a class object where they expect a dictionary.
    raise NotImplementedError

  def setMetadata(self, metadata=None):
    """Returns document with new metadata.
    Keyword arguments:
    metadata -- expected an dictionary with metadata.

    Not implemented by this handler: always raises NotImplementedError.
    """
    raise NotImplementedError

  @staticmethod
  def getAllowedConversionFormatList(source_mimetype):
    """Returns a list content_type and their titles which are supported
    by enabled handlers.

    [('application/pdf', 'PDF - Portable Document Format'),
     ...
    ]
    """
    source_mimetype = parseContentType(source_mimetype).gettype()
    if source_mimetype in ("text/html", "htm", "html"):
      return [("application/pdf", "PDF - Portable Document Format")]
    return []

  def makeSwitchOptionList(self, allowed_option_list, option_dict):
    """
      A switch option is enable if it exists.

      Ex: for         : --grayscale
          option_dict : {"grayscale": True}
          result      : ["--grayscale"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value = option_dict.get(option_name)
      if value:
        option_list.append(keyNameToOption(option_name))
    return option_list

  def makeNoPrefixedOptionList(self, allowed_option_list, option_dict):
    """
      A "no" prefixed option is an option that if disable contains a
      "no" prefix.

      Ex: for         : --images (and --no-images)
          option_dict : {"images": False}
          result      : ["--no-images"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value = option_dict.get(option_name)
      if value is not None:
        option_list.append(keyNameToOption(option_name, prefix="" if value else "no-"))
    return option_list

  def makeEnablePrefixedOptionList(self, allowed_option_list, option_dict):
    """
      An "enable" prefixed option is an option that if enable contains a
      "enable" prefix else contains a "disable" prefix.

      Ex: for         : --enable-external-links (and --disable-external-links)
          option_dict : {"enable_external_links": False}
          result      : ["--disable-external-links"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value = option_dict.get(option_name)
      if value is not None:
        if value:
          option_list.append(keyNameToOption(option_name))
        else:
          # option_name[7:] drops the leading "enable_"
          option_list.append(keyNameToOption(option_name[7:], prefix="disable-"))
    return option_list

  def makeIncludeInPrefixedOptionList(self, allowed_option_list, option_dict):
    """
      An "include-in" prefixed option is an option that if enable contains a
      "include-in" prefix else contains a "exclude-from" prefix.

      Ex: for         : --include-in-outline (and --exclude-from-outline)
          option_dict : {"include_in_outline": False}
          result      : ["--exclude-from-outline"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value = option_dict.get(option_name)
      if value is not None:
        if value:
          option_list.append(keyNameToOption(option_name))
        else:
          # option_name[11:] drops the leading "include_in_"
          option_list.append(keyNameToOption(option_name[11:], prefix="exclude-from-"))
    return option_list

  def makeOneStringArgumentOptionList(self, allowed_option_list, option_dict):
    """
      A one-string-argument option is a option that require an argument
      which is a string.

      Ex: for         : --title <text>
          option_dict : {"title": "Hello World!"}
          result      : ["--title", "Hello World!"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value = option_dict.get(option_name)
      if value is not None:
        option_list += [keyNameToOption(option_name), str(value)]
    return option_list

  def makeRepeatableOneStringArgumentOptionList(self, allowed_option_list, option_dict):
    """
      A repeatable one-string-argument option is a option that require one
      string argument, this option can be set several times.

      Ex: for         : --allow <path>
          option_dict : {"allow_list": ["a", "b"]}
          result      : ["--allow", "a", "--allow", "b"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value_list = option_dict.get(option_name)
      if value_list:
        for value in value_list:
          # option_name[:-5] drops the trailing "_list"
          option_list += [keyNameToOption(option_name[:-5]), str(value)]
    return option_list

  def makeRepeatableTwoStringArgumentOptionList(self, allowed_option_list, option_dict):
    """
      A repeatable two-string-argument option is a option that require two
      string arguments, this option can be set several times.

      Ex: for         : --cookie <name> <value>
          option_dict : {"cookie_list": [("a", "b"), ("c", "d")]}
          result      : ["--cookie", "a", "b", "--cookie", "c", "d"]
    """
    option_list = []
    for option_name in allowed_option_list:
      tuple_list = option_dict.get(option_name)
      if tuple_list:
        for name, value in tuple_list:
          # option_name[:-5] drops the trailing "_list"
          option_list += [keyNameToOption(option_name[:-5]), str(name), str(value)]
    return option_list

  def makeDataUrlArgumentOptionList(self, allowed_option_list, option_dict,
                                    url_type="url", destination_format=None,
                                    use_switch=True):
    """
      A data-file-argument option is a option that require an url argument.

      Here, we don't want option value to be an url but data, so that
      we can put the data to a temp file an use it's url as option value.

      url_type selects what is given to the command: the "file://" url of
      the temp file ("url"), its base name ("file"), or its path (any other
      value). If use_switch is false, only the location is added, without
      the option switch.

      Ex: for         : --user-style-sheet <url> (and url_type="url")
          option_dict : {"user_style_sheet_data": b64encode("body { background-color: black; }")}
          result      : ["--user-style-sheet", "file:///tmp/tmp.XYZ.css"]

      Ex: for         : --checkbox-svg <path> (and url_type="path")
          option_dict : {"checkbox_svg_data": b64encode("<svg>....</svg>")}
          result      : ["--checkbox-svg", "/tmp/tmp.XYZ.svg"]

      Ex: for         : --xsl-style-sheet <file> (and url_type="file")
          option_dict : {"xsl_style_sheet_data": b64encode("table { border: none; }")}
          result      : ["--xsl-style-sheet", "tmp.XYZ.css"]
    """
    option_list = []
    for option_name in allowed_option_list:
      value = option_dict.get(option_name)
      if value is not None:
        # creates a tmp file in the directory which will be trashed
        path = self.makeTempFile(destination_format=destination_format)
        # close the file explicitly so the data is flushed before the
        # command reads it
        with open(path, "wb") as data_file:
          data_file.write(b64decode(value))
        if url_type == "url":
          path = self.convertPathToUrl(path)
        elif url_type == "file":
          path = basename(path)
        if use_switch:
          # option_name[:-5] drops the trailing "_data"
          option_list += [keyNameToOption(option_name[:-5]), path]
        else:
          option_list.append(path)
    return option_list

  def makeDataPathArgumentOptionList(self, *args, **kw):
    """Same as makeDataUrlArgumentOptionList with url_type="path"."""
    return self.makeDataUrlArgumentOptionList(*args, url_type="path", **kw)

  def makeDataFileArgumentOptionList(self, *args, **kw):
    """Same as makeDataUrlArgumentOptionList with url_type="file"."""
    return self.makeDataUrlArgumentOptionList(*args, url_type="file", **kw)

  def makeRepeatableDataUrlArgumentOptionList(self, allowed_option_list,
                                              option_dict, **kw):
    """
      A repeatable data-url-argument option is a "<option_name>_data_list"
      option whose value is a list of data; each data is handled as by
      makeDataUrlArgumentOptionList, to which the extra keyword arguments
      are forwarded.
    """
    option_list = []
    for option_name in allowed_option_list:
      data_list = option_dict.get(option_name)
      if data_list:
        # Drop the trailing "_list" only once: doing it for each data would
        # keep shortening the name from the second data on.
        data_option_name = option_name[:-5]
        for data in data_list:
          option_list += self.makeDataUrlArgumentOptionList([
            data_option_name,
          ], {data_option_name: data}, **kw)
    return option_list

  def makeWkhtmltopdfCommandList(self, *args, **kw):
    """Return the wkhtmltopdf command line as a list.

    args -- the inputs followed by the output path: every positional
            argument but the last is placed before the "after body" pages,
            the last one ends the command
    kw -- "conversion_kw" (default: empty dict) is the dictionary of
          conversion options, only the allowed ones below are used
    """
    # http://wkhtmltopdf.org/usage/wkhtmltopdf.txt
    conversion_kw = kw.get("conversion_kw", {})
    command = ["wkhtmltopdf"]

    # Global Options
    command += self.makeNoPrefixedOptionList(["collate"], conversion_kw)
    command += self.makeSwitchOptionList([
      #"extended-help",
      "grayscale",
      #"help",
      #"htmldoc",
      #"licence",
      "lowquality",
      #"manpage",
      "no_pdf_compression",
      #"quiet",  # we decide
      #"read_args_from_stdin",  # only for several command line at a time
      #"readme",
      #"version",
    ], conversion_kw)
    command += self.makeOneStringArgumentOptionList([
      #"cookie_jar",  # no cookie jar
      "copies",
      "dpi",
      "image_dpi",
      "image_quality",
      "margin_bottom",
      "margin_left",
      "margin_right",
      "margin_top",
      "orientation",
      "page_height",
      "page_size",
      "page_width",
      "title",
    ], conversion_kw)

    # Outline Options
    command += self.makeNoPrefixedOptionList(["outline"], conversion_kw)
    #"dump_default_toc_xsl",
    command += self.makeOneStringArgumentOptionList([
      #"dump_outline",
      "outline_depth",
    ], conversion_kw)

    # Page Options
    command += self.makeNoPrefixedOptionList([
      "background",
      "custom_header_propagation",
      "images",
      "print_media_type",
      #"debug_javascript",  # we decide
      #"stop_slow_scripts",  # we decide
    ], conversion_kw)
    command += self.makeEnablePrefixedOptionList([
      "enable_external_links",
      "enable_forms",
      "enable_internal_links",
      "enable_javascript",
      #"enable_local_file_access",  # we decide
      #"enable_plugins",
      "enable_smart_shrinking",
      "enable_toc_back_links",
    ], conversion_kw)
    command += ["--disable-local-file-access"]
    command += self.makeIncludeInPrefixedOptionList([
      "include_in_outline",
    ], conversion_kw)
    command += self.makeSwitchOptionList(["default_header"], conversion_kw)
    command += self.makeOneStringArgumentOptionList([
      #"cache_dir",  # we decide
      "encoding",
      "javascript_delay",
      "load_error_handling",
      "load_media_error_handling",
      "minimum_font_size",
      "page_offset",
      #"password",  # too dangerous
      #"proxy",  # we decide
      #"username",  # too dangerous
      "viewport_size",
      "window_status",
      "zoom",
    ], conversion_kw)
    #"allow",  # we decide
    command += self.makeDataPathArgumentOptionList([
      # <option_name>_data
      "checkbox_checked_svg_data",
      "checkbox_svg_data",
      "radiobutton_checked_svg_data",
      "radiobutton_svg_data",
    ], conversion_kw, destination_format="svg")
    command += self.makeDataUrlArgumentOptionList([
      "user_style_sheet_data",
    ], conversion_kw, destination_format="css")
    #"run_script_list",  # too dangerous, fills --run-script
    command += self.makeRepeatableTwoStringArgumentOptionList([
      # <option_name>_list
      "cookie_list",
      "custom_header_list",
      #"post_list",
      #"post_file_list",
    ], conversion_kw)

    # Headers and Footer Options
    command += self.makeNoPrefixedOptionList([
      "footer_line",
      "header_line",
    ], conversion_kw)
    command += self.makeOneStringArgumentOptionList([
      "footer_center",
      "footer_font_name",
      "footer_font_size",
      "footer_left",
      "footer_right",
      "footer_spacing",
      "header_center",
      "header_font_name",
      "header_font_size",
      "header_left",
      "header_right",  # there's a --top option (not documented)
                       # may be we can do header_right_top option
      "header_spacing",
    ], conversion_kw)
    command += self.makeDataUrlArgumentOptionList([
      # <option_name>_data
      "footer_html_data",
      "header_html_data",
    ], conversion_kw, destination_format="html")
    # The helper drops the last five characters ("_list") of the key to
    # build the switch, so the key has to be "replace_list"; the bare
    # "replace" key used before produced a truncated "--re" switch. It is
    # still read as an alias of "replace_list".
    replace_list = conversion_kw.get("replace_list")
    if replace_list is None:
      replace_list = conversion_kw.get("replace")
    command += self.makeRepeatableTwoStringArgumentOptionList([
      "replace_list",
    ], {"replace_list": replace_list})

    # Custom Options
    command += self.makeRepeatableDataUrlArgumentOptionList([
      "before_toc_data_list",
    ], conversion_kw, destination_format="html", use_switch=False)

    # TOC Options
    value = conversion_kw.get("toc")
    if value:
      command += ["toc"]
      command += self.makeEnablePrefixedOptionList([
        "enable_dotted_lines",
        "enable_toc_links",
      ], conversion_kw)
      command += self.makeOneStringArgumentOptionList([
        "toc_header_text",
        "toc_level_indentation",
        "toc_text_size_shrink",
      ], conversion_kw)
      command += self.makeDataFileArgumentOptionList([
        "xsl_style_sheet_data",
      ], conversion_kw, destination_format="xsl")

    # Custom Options
    command += self.makeRepeatableDataUrlArgumentOptionList([
      "after_toc_data_list",
      "before_body_data_list",
    ], conversion_kw, destination_format="html", use_switch=False)
    command += args[:-1]  # input_url
    command += self.makeRepeatableDataUrlArgumentOptionList([
      "after_body_data_list",
    ], conversion_kw, destination_format="html", use_switch=False)
    command += args[-1:]  # output_path

    return command
Example #25
0
 def __init__(self, base_folder_url, data, source_format, **kw):
   """Keep the base folder, wrap the data in a File and store the environment.

   The environment is the "env" keyword argument, or an empty dictionary
   when it is not given.
   """
   self.base_folder_url = base_folder_url
   document_file = File(base_folder_url, data, source_format)
   self.file = document_file
   environment = kw.get("env", {})
   self.environment = environment
Example #26
0
class Handler(object):
  """OOo Handler gives access to one Document through OpenOffice.
  One instance of this class is created for each input Document in order to
  manipulate it. The Document must be able to create and remove a temporary
  document on the file system, and to load and export it.
  """
  implements(IHandler)

  def __init__(self, base_folder_url, data, source_format, **kw):
    """Creates the document in the file system and stores the settings.

    Keyword arguments:
    base_folder_url -- passed to File
    data -- passed to File
    source_format -- passed to File and kept as self.source_format
    zip -- optional keyword, defaults to False
    refresh -- optional keyword, defaults to False
    timeout -- optional keyword, defaults to 600
    uno_path -- optional keyword; when missing or empty the value is looked
                up in environ under the same name
    office_binary_path -- optional keyword; same fallback as uno_path
    """
    self.source_format = source_format
    self.document = File(base_folder_url, data, source_format)
    self.zip = kw.get('zip', False)
    self.refresh = kw.get('refresh', False)
    self.timeout = kw.get("timeout", 600)
    # An explicit keyword value wins; environ is only consulted as a fallback.
    self.uno_path = kw.get("uno_path") or environ.get("uno_path")
    self.office_binary_path = (kw.get("office_binary_path") or
                               environ.get("office_binary_path"))

  def _getCommand(self, *args, **kw):
    """Builds the argument list that runs the unoconverter helper script.

    Keyword arguments:
    args -- feature names, each one emitted as a bare "--<name>" flag
    kw -- options, each one emitted as "--<key>=<value>"; hostname and port
          are always set from openoffice.getAddress()
    Returns the command as a list of strings.
    """
    kw['hostname'], kw['port'] = openoffice.getAddress()
    # Use the "python" found in the office binary folder when there is one.
    bundled_python = path.join(self.office_binary_path, "python")
    if path.exists(bundled_python):
      interpreter = bundled_python
    else:
      interpreter = "python"
    helper_script = pkg_resources.resource_filename(
      __name__, path.join("helper", "unoconverter.py"))
    command_list = [interpreter,
                    helper_script,
                    "--uno_path=%s" % self.uno_path,
                    "--office_binary_path=%s" % self.office_binary_path,
                    '--document_url=%s' % self.document.getUrl()]
    # The feature flags go right after --uno_path (index 3); the last feature
    # of args ends up first, exactly as repeated insertion at index 3 would do.
    command_list[3:3] = ["--%s" % arg for arg in reversed(args)]
    command_list.extend(["--%s=%s" % (k, v) for k, v in kw.iteritems()])
    return command_list

  def _startTimeout(self):
    """Creates a MonitorTimeout for openoffice using self.timeout, keeps it
    as self.monitor and starts it."""
    timeout_monitor = MonitorTimeout(openoffice, self.timeout)
    self.monitor = timeout_monitor
    timeout_monitor.start()

  def _stopTimeout(self):
    """Terminates the monitor kept in self.monitor."""
    active_monitor = self.monitor
    active_monitor.terminate()

  def _subprocess(self, command_list):
    """Runs command_list as a child process guarded by the timeout monitor.

    Keyword arguments:
    command_list -- program and arguments, given to Popen unchanged
    Returns the (stdout, stderr) pair captured from the process.
    """
    if monitor_sleeping_time is not None:
      monitor_sleeping_time.touch()
    # The monitor is started before entering the try block: when it cannot be
    # started there is nothing to stop, and the cleanup must not replace the
    # original error with a failure of its own.
    self._startTimeout()
    # Stays None when Popen itself raises, so that the cleanup below can tell
    # that no process exists instead of failing on an unbound name.
    process = None
    try:
      process = Popen(command_list, stdout=PIPE, stderr=PIPE, close_fds=True,
                      env=openoffice.environment_dict.copy())
      stdout, stderr = process.communicate()
    finally:
      self._stopTimeout()
      # Only reached with a live pid when communicate() was interrupted.
      if process is not None and pid_exists(process.pid):
        process.terminate()
    return stdout, stderr

  def _callUnoConverter(self, *feature_list, **kw):
    """Runs the unoconverter helper, retrying once after an OpenOffice restart.

    Keyword arguments:
    feature_list -- feature names forwarded to _getCommand
    kw -- options forwarded to _getCommand
    Returns the (stdout, stderr) pair of the last run.
    Raises Exception carrying stderr when the retry writes anything to stderr.
    """
    if not openoffice.status():
      openoffice.start()
    command_list = self._getCommand(*feature_list, **kw)
    stdout, stderr = self._subprocess(command_list)
    # An empty stdout together with an exception or error name in stderr is
    # treated as a failed run: restore the document, restart OpenOffice and
    # try one more time.
    if not stdout and re.search(r"\w*Exception|\w*Error", stderr):
      logger.debug(stderr)
      self.document.restoreOriginal()
      openoffice.restart()
      # NOTE(review): _getCommand already emits --document_url, so passing it
      # through kw appends a second occurrence - confirm that the helper is
      # expected to receive the option twice.
      kw['document_url'] = self.document.getUrl()
      command = self._getCommand(*feature_list, **kw)
      stdout, stderr = self._subprocess(command)
      if stderr != "":
        raise Exception(stderr)

    return stdout, stderr

  def _serializeMimemapper(self,
                           source_extension=None,
                           destination_extension=None):
    """Dumps parts of mimemapper as a JSON string.

    Keyword arguments:
    source_extension -- key used to look up the service types in
                        mimemapper._doc_type_list_by_extension; when it has
                        no entry mimemapper.document_service_list is used
    destination_extension -- when None only mimetype_by_filter_type is dumped;
                             otherwise doc_type_list_by_extension and a
                             filter_list holding one (destination_extension,
                             service_type, filter name) entry per service
                             type are dumped as well
    """
    if destination_extension is None:
      return json.dumps(dict(mimetype_by_filter_type=mimemapper._mimetype_by_filter_type))

    service_type_list = mimemapper._doc_type_list_by_extension.get(
      source_extension, mimemapper.document_service_list)
    filter_list = [
      (destination_extension,
       service_type,
       mimemapper.getFilterName(destination_extension, service_type))
      for service_type in service_type_list]
    logger.debug("Filter List: %r" % filter_list)
    serializable = dict(
      doc_type_list_by_extension=mimemapper._doc_type_list_by_extension,
      filter_list=filter_list,
      mimetype_by_filter_type=mimemapper._mimetype_by_filter_type)
    return json.dumps(serializable)

  def convert(self, destination_format=None, **kw):
    """Convert a document to another format supported by the OpenOffice
    Keyword Arguments:
    destination_format -- extension of document as String
    kw -- extra options forwarded to the unoconverter helper
    Returns the content of the document reloaded from the URL that the
    helper wrote to its standard output.
    """
    logger.debug("OooConvert: %s > %s" % (self.source_format, destination_format))
    kw['source_format'] = self.source_format
    if destination_format:
      kw['destination_format'] = destination_format
    kw['mimemapper'] = self._serializeMimemapper(self.source_format,
                                                 destination_format)
    kw['refresh'] = json.dumps(self.refresh)
    openoffice.acquire()
    try:
      stdout, _stderr = self._callUnoConverter(*['convert'], **kw)
    finally:
      openoffice.release()
    try:
      # stdout holds the URL of the result; newlines are stripped from it.
      url = stdout.replace('\n', '')
      self.document.reload(url)
      return self.document.getContent(self.zip)
    finally:
      # Trash the temporary document even when reading the result fails.
      self.document.trash()

  def getMetadata(self, base_document=False):
    """Returns a dictionary with all metadata of document.
    Keywords Arguments:
    base_document -- Boolean variable. if true, the document is also returned
    along with the metadata.

    When the decoded metadata holds a 'document_url' entry, the document is
    reloaded from it, its content is stored under 'Data' and the
    'document_url' entry is removed.
    """
    logger.debug("getMetadata")
    kw = dict(mimemapper=self._serializeMimemapper())
    if base_document:
      feature_list = ['getmetadata', 'convert']
    else:
      feature_list = ['getmetadata']
    openoffice.acquire()
    try:
      stdout, _stderr = self._callUnoConverter(*feature_list, **kw)
    finally:
      openoffice.release()
    try:
      metadata = json.loads(decodestring(stdout))
      if 'document_url' in metadata:
        self.document.reload(metadata['document_url'])
        metadata['Data'] = self.document.getContent()
        del metadata['document_url']
      return metadata
    finally:
      # Trash the temporary document even when decoding or reloading fails.
      self.document.trash()

  def setMetadata(self, metadata):
    """Returns a document with new metadata.
    Keyword arguments:
    metadata -- expected an dictionary with metadata.

    The metadata is serialized with json and encoded with encodestring
    before it is handed to the unoconverter helper.
    """
    metadata_pickled = json.dumps(metadata)
    logger.debug("setMetadata")
    kw = dict(metadata=encodestring(metadata_pickled))
    openoffice.acquire()
    try:
      stdout, _stderr = self._callUnoConverter(*['setmetadata'], **kw)
    finally:
      openoffice.release()
    try:
      return self.document.getContent()
    finally:
      # Trash the temporary document even when reading its content fails.
      self.document.trash()