Beispiel #1
0
    def __init__(self, context):
        """Set up the XML-RPC proxy to the document conversion server.

        The server URL is read from the preference tool found through
        ``context``.  When no URL is configured, the deprecated address
        and port preferences are used instead and a warning is logged.

        Raises ConversionError when neither kind of configuration is
        available, or when the configured URL is neither http nor https.
        """
        preferences = getToolByName(context, 'portal_preferences')

        # ``str`` stands in for a missing accessor: calling it yields ''.
        get_server_url = getattr(
            preferences, "getPreferredDocumentConversionServerUrl", str)
        uri = get_server_url()

        if uri not in ('', None):
            # The transport needs to know which scheme the URL uses.
            if uri.startswith("http://"):
                scheme = "http"
            elif uri.startswith("https://"):
                scheme = "https"
            else:
                raise ConversionError(
                    'OOoDocument: cannot proceed with conversion:'
                    ' preferred conversion server url is invalid')
        else:
            # Legacy configuration: separate host and port preferences.
            address = preferences.getPreferredOoodocServerAddress()
            port = preferences.getPreferredOoodocServerPortNumber()
            if address in ('', None) or port in ('', None):
                raise ConversionError(
                    'OOoDocument: cannot proceed with conversion:'
                    ' conversion server url is not defined in preferences')

            LOG('OOoDocument', WARNING,
                'PreferredOoodocServer{Address,PortNumber}'
                ' are DEPRECATED please use PreferredDocumentServerUrl instead',
                error=True)
            scheme = "http"
            uri = 'http://%s:%d' % (address, port)

        timeout = (preferences.getPreferredOoodocServerTimeout()
                   or OOO_SERVER_PROXY_TIMEOUT)
        transport = TimeoutTransport(timeout=timeout, scheme=scheme)

        ServerProxy.__init__(self, uri, allow_none=True, transport=transport)
Beispiel #2
0
  def __init__(self, context):
    """Create one XML-RPC proxy per configured conversion server URL.

    The URL list and the retry count come from the preference tool found
    through ``context``.  Without any configured URL, the deprecated
    address and port preferences are combined into a single http URL and
    a warning is logged.  The (uri, proxy) pairs are collected in
    ``self._serverproxy_list``.

    Raises ConversionError when no server is configured at all, or when
    a URL is neither http nor https.
    """
    self._serverproxy_list = []
    preferences = getToolByName(context, 'portal_preferences')
    self._ooo_server_retry = (
      preferences.getPreferredDocumentConversionServerRetry()
      or OOO_SERVER_RETRY)

    uri_list = preferences.getPreferredDocumentConversionServerUrlList()
    if not uri_list:
      # Legacy configuration: separate host and port preferences.
      address = preferences.getPreferredOoodocServerAddress()
      port = preferences.getPreferredOoodocServerPortNumber()
      if not address or not port:
        raise ConversionError('OOoDocument: cannot proceed with conversion:'
              ' conversion server url is not defined in preferences')

      LOG('OOoDocument', WARNING,
          'PreferredOoodocServer{Address,PortNumber}'
          ' are DEPRECATED please use PreferredDocumentServerUrl instead',
          error=True)

      uri_list = ['http://%s:%s' % (address, port)]

    timeout = (preferences.getPreferredOoodocServerTimeout()
               or OOO_SERVER_PROXY_TIMEOUT)
    for uri in uri_list:
      # The transport needs to know which scheme each URL uses.
      if uri.startswith("https://"):
        scheme = "https"
      elif uri.startswith("http://"):
        scheme = "http"
      else:
        raise ConversionError('OOoDocument: cannot proceed with conversion:'
              ' preferred conversion server url is invalid')

      proxy = ServerProxy(
        uri, allow_none=True,
        transport=TimeoutTransport(timeout=timeout, scheme=scheme))
      self._serverproxy_list.append((uri, proxy))
Beispiel #3
0
        def cached_getTargetFormatItemList(content_type):
            """Ask the conversion server which target items are allowed
            for ``content_type``.

            ``self`` is taken from the enclosing scope and used to build
            the OOoServerProxy.

            Raises ConversionError when the server answers with a
            response code other than 200.

            NOTE(review): the part of this function that returns (or
            caches) ``allowed`` is not visible in this excerpt.
            """
            server_proxy = OOoServerProxy(self)
            try:
                allowed_target_item_list = server_proxy.getAllowedTargetItemList(
                    content_type)
                try:
                    # Current servers answer with a (code, dict, message) triple.
                    response_code, response_dict, response_message = \
                                                       allowed_target_item_list
                except ValueError:
                    # Compatibility with older oood where getAllowedTargetItemList only
                    # returned response_dict
                    response_code, response_dict, response_message = \
                                   200, dict(response_data=allowed_target_item_list), ''

                if response_code == 200:
                    allowed = response_dict['response_data']
                else:
                    # This is very temporary code - XXX needs to be changed
                    # so that the system can retry
                    raise ConversionError(
                        "OOoDocument: can not get list of allowed acceptable"
                        " formats for conversion (Code %s: %s)" %
                        (response_code, response_message))

            except Fault, f:
                # The XML-RPC call itself faulted: fall back to the older
                # getAllowedTargets method and warn about the old server.
                allowed = server_proxy.getAllowedTargets(content_type)
                warn(
                    'Your oood version is too old, using old method '
                    'getAllowedTargets instead of getAllowedTargetList',
                    DeprecationWarning)
Beispiel #4
0
    def updateBaseMetadata(self, **kw):
        """
      Push the metadata given as keyword arguments into the converted
      (base format) document by calling the conversion server, then
      store the document returned by the server as the new base data.

      Raises NotConvertedError when there is no base data yet, and
      ConversionError when the server does not answer with code 200.
    """
        if not self.hasBaseData():
            # XXX please pass a meaningful description of error as argument
            raise NotConvertedError()

        proxy = OOoServerProxy(self)
        status, payload, detail = proxy.run_setmetadata(
            self.getId(), enc(str(self.getBaseData())), kw)

        if status != 200:
            # Explicitly raise the exception!
            raise ConversionError(
                "OOoDocument: error getting document metadata (Code %s: %s)" %
                (status, detail))

        # The server accepted the metadata: keep the updated document.
        self._setBaseData(dec(payload['data']))
        # record in workflow history # XXX must put appropriate comments.
        self.updateFileMetadata()
Beispiel #5
0
  def _convertToHTML(self):
    """Convert the PDF text content to HTML with pdftohtml
    """
    if not self.hasData():
      return ''
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)

    command_result = None
    try:
      command = ['pdftohtml', '-enc', 'UTF-8', '-stdout',
                 '-noframes', '-i', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdftohtml was not found')
        raise

    finally:
      tmp.close()
    # Quick hack to remove bg color - XXX
    h = command_result.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
    # Make links relative
    h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1],
                                                          'href="asEntireHTML')
    return h
Beispiel #6
0
def getDataURI(url):
    """Open ``url`` with urllib2.urlopen.

    Any exception raised while opening the url is turned into a
    ConversionError whose message contains the url and the error.

    NOTE(review): only the opening step is visible in this excerpt; what
    is done with ``data`` afterwards cannot be told from here.
    """
    try:
        data = urllib2.urlopen(url)
    except Exception, e:
        # Exception(e) only serves to render the caught error as text
        # inside the message.
        raise ConversionError(
            "Error to transform url (%s) into data uri. ERROR = %s" %
            (url, Exception(e)))
Beispiel #7
0
 def _convertToBaseFormat(self):
   """
     Send the original document to the conversion server to obtain its
     ODF base format, store that result on the object and record the
     metadata returned by the server.

     Raises ConversionError when the server does not answer with
     code 200.
   """
   proxy = OOoServerProxy(self)
   status, payload, detail = proxy.run_convert(
     self.getFilename() or self.getId(),
     enc(str(self.getData())),
     None,
     None,
     self.getContentType())

   if status != 200:
     # Explicitly raise the exception!
     raise ConversionError(
               "OOoDocument: Error converting document to base format. (Code %s: %s)"
                                      % (status, detail))

   # successfully converted document
   self._setBaseData(dec(payload['data']))
   metadata = payload['meta']
   self._base_metadata = metadata
   # Only override the base content type when the server reported one.
   mime_type = metadata.get('MIMEType')
   if mime_type is not None:
     self._setBaseContentType(mime_type)
Beispiel #8
0
  def getContentInformation(self):
    """Returns the information about the PDF document with pdfinfo.

    Returns an empty dict when the document has no data, and a copy of
    ``self._content_information`` when that attribute already exists.
    Otherwise the output of ``pdfinfo`` is parsed into a dict, which is
    then completed with the document info read by PyPDF2 when that
    package can be imported.

    Raises ConversionError when the pdfinfo executable is not found.

    NOTE(review): this excerpt stops inside the outer ``try`` block; the
    cleanup of the temporary file and the final return are not visible.
    """
    if not self.hasData():
      return dict()
    try:
      # Serve a copy so that callers cannot alter the stored value.
      return self._content_information.copy()
    except AttributeError:
      pass
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)
    command_result = None
    try:

      # First, we use pdfinfo to get standard metadata
      command = ['pdfinfo', '-meta', '-box', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdfinfo was not found')
        raise

      result = {}
      for line in command_result.splitlines():
        # Everything before the first ':' is the key; the remainder is
        # re-joined so that values containing ':' stay intact.
        item_list = line.split(':')
        key = item_list[0].strip()
        value = ':'.join(item_list[1:]).strip()
        result[key] = value

      # Then we use PyPDF2 to get extra metadata
      try:
        from PyPDF2 import PdfFileReader
        from PyPDF2.utils import PdfReadError
      except ImportError:
        # if PyPDF2 not found, pass
        pass
      else:
        try:
          pdf_file = PdfFileReader(tmp)
          for info_key, info_value in (pdf_file.getDocumentInfo() or {}).iteritems():
            info_key = info_key.lstrip("/")
            if isinstance(info_value, unicode):
              info_value = info_value.encode("utf-8")

            # Ignore values that cannot be pickled ( such as AAPL:Keywords )
            try:
              pickle.dumps(info_value)
            except pickle.PicklingError:
              LOG("PDFDocument.getContentInformation", INFO,
                "Ignoring non picklable document info on %s: %s (%r)" % (
                self.getRelativeUrl(), info_key, info_value))
            else:
              # setdefault: a key already filled from pdfinfo is kept.
              result.setdefault(info_key, info_value)
        except PdfReadError:
          LOG("PDFDocument.getContentInformation", PROBLEM,
            "PyPDF2 is Unable to read PDF, probably corrupted PDF here : %s" % \
            (self.getRelativeUrl(),))
 def _getFormatFromMimetype(self, mimetype):
     """
 XXX: This should not be done here but Conversion Server API to get
      supported Format/Extension is deprecated (topic under discussion)
 """
     import mimetypes
     extension = mimetypes.guess_extension(mimetype)
     if extension is None:
         raise ConversionError(
             "Could not guess extension from mimetype '%s'" % mimetype)
     return extension.split('.', 1)[1]
Beispiel #10
0
    def _resize(self, quality, width, height, format, resolution, frame):
        """Resize and resample photo.

        The image data is piped through the ``convert`` command, which
        reads from stdin and writes the result to stdout.  The command
        is started with '/' as working directory.

        Returns a StringIO holding the converted image.
        Raises ConversionError when ``format`` contains '/', when '/' is
        writable, or when the command produced no output.
        """
        argv = [
            'convert',
            '-colorspace', 'sRGB',
            '-depth', '8',
            '-quality', str(quality),
            '-geometry', '%sx%s' % (width, height),
        ]
        if format not in VALID_TRANSPARENT_IMAGE_FORMAT_LIST:
            # ImageMagick way to remove transparent that works with multiple
            # images. http://www.imagemagick.org/Usage/masking/#remove
            argv.extend(['-bordercolor', 'white', '-border', '0'])
        if resolution:
            argv.extend(['-density', '%sx%s' % (resolution, resolution)])
        # Input is stdin, optionally restricted to a single frame.
        argv.append('-' if frame is None else '-[%s]' % frame)

        if format:
            # Is there a way to make 'convert' fail if the format is unknown,
            # instead of treating this whole parameter as an output file path?
            # As a workaround, we run 'convert' in a non-writeable directory.
            if '/' in format or os.access('/', os.W_OK):
                raise ConversionError
            argv.append('%s:-' % format)
        else:
            argv.append('-')

        data = str(self.getData())
        if self.getContentType() == "image/svg+xml":
            data = transformUrlToDataURI(data)

        # Same environment as the current process, with LC_NUMERIC forced to C.
        env = dict(os.environ, LC_NUMERIC='C')
        # XXX: The only portable way is to pass what stdin.write can accept,
        #      which is a string for PIPE.
        image, err = subprocess.Popen(argv,
                                      env=env,
                                      stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      cwd='/',
                                      close_fds=True).communicate(data)
        if not image:
            raise ConversionError('Image conversion failed (%s).' % err)
        return StringIO(image)
Beispiel #11
0
 def _convertToText(self):
     """
   Convert the PDF text content to text with pdftotext
 """
     if not self.hasData():
         return ''
     mime_type = 'text/plain'
     portal_transforms = self.getPortalObject().portal_transforms
     filename = self.getFilename()
     result = portal_transforms.convertToData(
         mime_type,
         str(self.getData()),
         context=self,
         filename=filename,
         mimetype=self.getContentType())
     if result:
         return result
     else:
         # Try to use OCR
         # As high dpi images are required, it may take some times to convert the
         # pdf.
         # It may be required to use activities to fill the cache and at the end,
         # to calculate the final result
         text = ''
         content_information = self.getContentInformation()
         page_count = int(content_information.get('Pages', 0))
         for page_number in range(page_count):
             src_mimetype, png_data = self._convert('png',
                                                    quality=100,
                                                    resolution=300,
                                                    frame=page_number,
                                                    display='identical')
             if not src_mimetype.endswith('png'):
                 continue
             content = str(png_data)
             if content is not None:
                 filename = self.getStandardFilename(format='png')
                 result = portal_transforms.convertToData(
                     mime_type,
                     content,
                     context=self,
                     filename=filename,
                     mimetype=src_mimetype)
                 if result is None:
                     raise ConversionError(
                         'PDFDocument conversion error. '
                         'portal_transforms failed to convert to %s: %r' %
                         (mime_type, self))
                 text += result
         return text
Beispiel #12
0
 def convertTo(self, format):
     # Convert self.data to ``format`` through the conversion server and
     # return the decoded result.  Raises ConversionError when the server
     # does not list ``format`` among the targets allowed for
     # self.mimetype.
     proxy = OOoServerProxy(self.context)
     _code, answer, _message = proxy.getAllowedTargetItemList(self.mimetype)
     allowed_format_dict = dict(answer['response_data'])
     if format not in allowed_format_dict:
         raise ConversionError('Format not allowed %s' % format)

     _code, answer, _message = proxy.run_generate(
         '', enc(self.data), None, format, self.mimetype)
     converted = dec(answer['data'])
     # HTML sources get their images included in the result.
     if self.mimetype == 'text/html':
         converted = self.includeImageList(converted)
     return converted
Beispiel #13
0
    def __init__(self, context):
        """Set up the XML-RPC proxy to the conversion server.

        Address, port and timeout are read from the preference tool
        found through ``context``; the server is always contacted over
        http.

        Raises ConversionError when the address or the port is not set.
        """
        preferences = getToolByName(context, 'portal_preferences')

        address = preferences.getPreferredOoodocServerAddress()
        port = preferences.getPreferredOoodocServerPortNumber()
        unset_values = ('', None)
        if address in unset_values or port in unset_values:
            raise ConversionError(
                'OOoDocument: cannot proceed with conversion:'
                ' conversion server host and port is not defined in preferences'
            )

        uri = 'http://%s:%d' % (address, port)
        timeout = (preferences.getPreferredOoodocServerTimeout()
                   or OOO_SERVER_PROXY_TIMEOUT)

        ServerProxy.__init__(
            self, uri, allow_none=True,
            transport=TimeoutTransport(timeout=timeout, scheme='http'))
Beispiel #14
0
    def __call__(self, *args, **kw):
        """
    Catch Protocol Errors (transport layer) and specifically
    identify them as OOo server network/communication error

    xml-rpc application level errors still go through: if a wrong method
    is called, or with wrong parameters, xmlrpclib.Fault will be raised.

    Returns whatever the wrapped callable returns.
    Raises ConversionError in place of ProtocolError.
    """
        try:
            return self.__callable(*args, **kw)
        except ProtocolError as e:
            message = "%s %s" % (e.errcode, e.errmsg)
            # An error code of -1 is reported as a refused connection.
            if e.errcode == -1:
                message = "Connection refused"
            raise ConversionError(
                "Protocol error while contacting OOo conversion"
                " server: %s" % (message))
Beispiel #15
0
    def _resize(self, quality, width, height, format, resolution, frame):
        """Resize and resample photo.

        The image data is piped through the ``convert`` command, which
        reads from stdin and writes the result to stdout.

        Returns a StringIO holding the converted image.
        Raises ConversionError when the command produced no output.
        """
        geometry = '%sx%s' % (width, height)
        command = ['convert', '-colorspace', 'sRGB', '-depth', '8']
        command += ['-quality', str(quality), '-geometry', geometry]
        if format not in VALID_TRANSPARENT_IMAGE_FORMAT_LIST:
            # ImageMagick way to remove transparent that works with multiple
            # images. http://www.imagemagick.org/Usage/masking/#remove
            command += ['-bordercolor', 'white', '-border', '0']
        if resolution:
            command += ['-density', '%sx%s' % (resolution, resolution)]

        # Input: stdin, optionally restricted to a single frame.
        if frame is None:
            command.append('-')
        else:
            command.append('-[%s]' % frame)

        # Output: stdout, optionally with an explicit format prefix.
        if not format:
            command.append('-')
        else:
            command.append('%s:-' % format)

        data = str(self.getData())
        if self.getContentType() == "image/svg+xml":
            data = transformUrlToDataURI(data)

        # XXX: The only portable way is to pass what stdin.write can accept,
        #      which is a string for PIPE.
        image, err = subprocess.Popen(command,
                                      stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      close_fds=True).communicate(data)
        if not image:
            raise ConversionError('Image conversion failed (%s).' % err)
        return StringIO(image)
Beispiel #16
0
  def _convertToDJVU(self):
    """Convert the PDF text content to DJVU with pdf2djvu
    """
    if not self.hasData():
      return ''
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(self.getData())
    tmp.seek(0)

    command_result = None
    try:
      command = ['pdf2djvu', tmp.name]
      try:
        command_result = Popen(command, stdout=PIPE).communicate()[0]
      except OSError, e:
        if e.errno == errno.ENOENT:
          raise ConversionError('pdf2djvu was not found')
        raise

    finally:
      tmp.close()
    return command_result
    def convert(self, orig, data, context=None, **kwargs):
        # Convert ``orig`` on the document conversion server and store the
        # decoded result in ``data``, which is returned.
        # Raises ConversionError when the server allows none of the input
        # formats of this transform.
        proxy = DocumentConversionServerProxy(context)

        allowed_source = self._getAllowedSourceMimetypeFromConversionServer(
            proxy)
        if allowed_source is None:
            raise ConversionError(
                "Format(s) not allowed on Conversion Server %r" % self.inputs)
        source_format = self._getFormatFromMimetype(allowed_source)
        destination_format = self._getFormatFromMimetype(self.output)

        # Default values are ConversionServer default ones
        encoded_result = proxy.convertFile(
            enc(orig),
            source_format,
            destination_format,
            kwargs.get('zip', False),
            kwargs.get('refresh', False),
            kwargs.get('conversion_kw', {}))
        data.setData(dec(encoded_result))

        return data
Beispiel #18
0
    def _convert(self, format, substitution_method_parameter_dict=None,
                safe_substitute=True, charset=None, text_content=None, substitute=True, **kw):
      """
        Convert text using portal_transforms or oood
      """
      # XXX 'or DEFAULT_CONTENT_TYPE' is compaptibility code used for old
      # web_page that have neither content_type nor text_format. Migration
      # should be done to make all web page having content_type property
      src_mimetype = self.getContentType() or DEFAULT_CONTENT_TYPE
      if not format and src_mimetype == 'text/html':
        format = 'html' # Force safe_html
      if not format:
        # can return document without conversion
        return src_mimetype, self.getTextContent()
      portal = self.getPortalObject()
      mime_type = portal.mimetypes_registry.lookupExtension('name.%s' % format)
      original_mime_type = mime_type = str(mime_type)
      if text_content is None:
        # check if document has set text_content and convert if necessary
        text_content = self.getTextContent()
      if text_content:
        kw['format'] = format
        convert_kw = {}
        # PortalTransforms does not accept empty values for 'encoding' parameter
        if charset:
          kw['charset'] = convert_kw['encoding'] = charset
        if not self.hasConversion(**kw):
          portal_transforms = portal.portal_transforms
          filename = self.getFilename()
          if mime_type == 'text/html':
            mime_type = 'text/x-html-safe'
          if src_mimetype != "image/svg+xml":
            result = portal_transforms.convertToData(mime_type, text_content,
                                                   object=self, context=self,
                                                   filename=filename,
                                                   mimetype=src_mimetype,
                                                   **convert_kw)
            if result is None:
              raise ConversionError('TextDocument conversion error. '
                                    'portal_transforms failed to convert '
                                    'from %r to %s: %r' %
                                    (src_mimetype, mime_type, self))
          else:
            result = text_content
          if format in VALID_IMAGE_FORMAT_LIST:
            # Include extra parameter for image conversions
            temp_image = self.portal_contributions.newContent(
                                       portal_type='Image',
                                       file=cStringIO.StringIO(),
                                       filename=self.getId(),
                                       temp_object=1)
            temp_image._setData(result)
            mime, result = temp_image.convert(**kw)

          self.setConversion(result, original_mime_type, **kw)
        else:
          mime_type, result = self.getConversion(**kw)
        if substitute and format in VALID_TEXT_FORMAT_LIST:
          # only textual content can be sustituted
          if substitution_method_parameter_dict is None:
            substitution_method_parameter_dict = {}
          result = self._substituteTextContent(result, safe_substitute=safe_substitute,
                                               **substitution_method_parameter_dict)
        return original_mime_type, result
      else:
        # text_content is not set, return empty string instead of None
        return original_mime_type, ''
Beispiel #19
0
    def _convert(self, format, frame=0, **kw):
        """Convert the document to the given format.

    If a conversion is already stored for this format, it is returned
    directly, otherwise the conversion is stored for the next time.

    frame: Only used for image conversion

    Returns a (mime type, data) pair.  Raises NotConvertedError when the
    document has no base data yet, and ConversionError when the resolved
    target format is not allowed.

    XXX Cascading conversions must be delegated to conversion server,
    not by OOoDocument._convert (ie: convert to pdf, then convert to image, then resize)
    *OR* as an optimisation we can read cached intermediate conversions
    instead of compute them each times.
      1- odt->pdf->png
      2- odt->cached(pdf)->jpg
    """
        #XXX if document is empty, stop to try to convert.
        #XXX but I don't know what is a appropriate mime-type.(Yusei)
        if not self.hasData():
            return 'text/plain', ''
        # if no conversion asked (format empty)
        # return raw data
        if not format:
            return self.getContentType(), self.getData()
        # Check if we have already a base conversion
        if not self.hasBaseData():
            # XXX please pass a meaningful description of error as argument
            raise NotConvertedError()
        # Make sure we can support html and pdf by default
        is_html = 0
        requires_pdf_first = 0
        # ``format`` is rewritten below to the name used by the conversion
        # server; the cache is always keyed on the format that was asked for.
        original_format = format
        allowed_format_list = self.getTargetFormatList()
        if format == 'base-data':
            return self.getBaseContentType(), str(self.getBaseData())
        if format == 'pdf':
            # NOTE(review): format_list[0] raises IndexError when no allowed
            # format ends with 'pdf'.
            format_list = [x for x in allowed_format_list if x.endswith('pdf')]
            format = format_list[0]
        elif format in VALID_IMAGE_FORMAT_LIST:
            format_list = [
                x for x in allowed_format_list if x.endswith(format)
            ]
            if len(format_list):
                format = format_list[0]
            else:
                # We must fist make a PDF which will be used to produce an image out of it
                requires_pdf_first = 1
                format_list = [
                    x for x in allowed_format_list if x.endswith('pdf')
                ]
                format = format_list[0]
        elif format == 'html':
            format_list = [
                x for x in allowed_format_list
                if x.startswith('html') or x.endswith('html')
            ]
            format = format_list[0]
            is_html = 1
        elif format in ('txt', 'text', 'text-content'):
            # if possible, we try to get utf8 text. ('enc.txt' will encode to utf8)
            if 'enc.txt' in allowed_format_list:
                format = 'enc.txt'
            elif format not in allowed_format_list:
                #Text conversion is not supported by oood, do it in other way
                if not self.hasConversion(format=original_format):
                    #Do real conversion for text
                    mime, data = self._getConversionFromProxyServer(
                        format='text-content')
                    self.setConversion(data, mime, format=original_format)
                    return mime, data
                return self.getConversion(format=original_format)
        # Raise an error if the format is not supported
        if not self.isTargetFormatAllowed(format):
            raise ConversionError(
                "OOoDocument: target format %s is not supported" % format)
        has_format = self.hasConversion(format=original_format, **kw)
        if not has_format:
            # Do real conversion
            mime, data = self._getConversionFromProxyServer(format)
            if is_html:
                # Extra processing required since
                # we receive a zip file
                cs = cStringIO.StringIO()
                cs.write(str(data))
                z = zipfile.ZipFile(
                    cs)  # A disk file would be more RAM efficient
                # Use the first html member of the archive; for Presentation
                # documents only members whose name contains 'impr' qualify.
                for f in z.infolist():
                    fn = f.filename
                    if fn.endswith('html'):
                        if self.getPortalType() == 'Presentation'\
                              and not (fn.find('impr') >= 0):
                            continue
                        data = z.read(fn)
                        break
                mime = 'text/html'
                self._populateConversionCacheWithHTML(
                    zip_file=z)  # Maybe some parts should be asynchronous for
                # better usability
                z.close()
                cs.close()
            if original_format not in VALID_IMAGE_FORMAT_LIST \
              and not requires_pdf_first:
                self.setConversion(data, mime, format=original_format, **kw)
            else:
                # create temporary image and use it to resize accordingly
                temp_image = self.portal_contributions.newContent(
                    portal_type='Image',
                    file=cStringIO.StringIO(),
                    filename=self.getId(),
                    temp_object=1)
                temp_image._setData(data)
                # we care for first page only but as well for image quality
                mime, data = temp_image.convert(original_format,
                                                frame=frame,
                                                **kw)
                # store conversion
                self.setConversion(data, mime, format=original_format, **kw)

        # Always answer from the cache, whether it was just filled or not.
        return self.getConversion(format=original_format, **kw)
Beispiel #20
0
        # avoid using same server again
        global_server_proxy_uri_failure_time[uri] = int(DateTime())

      # All servers are failed
      if count == self._ooo_server_retry or len(retry_server_list) == 0:
        break
      count += 1
      serverproxy_list = retry_server_list

    # Check error type
    # Return only one error result for compability
    if len(result_error_set_list):
      return result_error_set_list[0]

    if len(protocol_error_list):
      raise ConversionError("Protocol error while contacting OOo conversion: "
                          "%s" % (','.join(protocol_error_list)))
    if len(socket_error_list):
      raise SocketError("%s" % (','.join(socket_error_list)))
    if len(fault_error_list):
      raise fault_error_list[0]
   
  def __getattr__(self, attr):
    return partial(self._proxy_function, attr)

class OOoDocument(OOoDocumentExtensibleTraversableMixin, BaseConvertableFileMixin, File,
                  TextConvertableMixin, Document):
  """
    A file document able to convert OOo compatible files to
    any OOo supported format, to capture metadata and to
    update metadata in OOo documents.