def cached_getTargetFormatItemList(content_type):
    """Return the target format items allowed for ``content_type``.

    The conversion server is asked through ``getAllowedTargetItemList``;
    if that call raises an XML-RPC ``Fault`` the legacy
    ``getAllowedTargets`` method is used instead and a
    ``DeprecationWarning`` is emitted.

    Every ``(x, y)`` pair received from the server is returned as
    ``(y, x)``.

    ``self`` is not a parameter: it is read from the enclosing scope,
    which is not visible in this block.

    Raises ConversionError when the server answers with a response code
    other than 200.
    """
    from xmlrpclib import Fault
    proxy = DocumentConversionServerProxy(self)
    try:
        answer = proxy.getAllowedTargetItemList(content_type)
        try:
            code, payload, message = answer
        except ValueError:
            # Compatibility with older oood where getAllowedTargetItemList
            # only returned response_dict
            code, payload, message = 200, {'response_data': answer}, ''
        if code != 200:
            # This is very temporary code - XXX needs to be changed
            # so that the system can retry
            raise ConversionError(
                "OOoDocument: can not get list of allowed acceptable"
                " formats for conversion (Code %s: %s)" % (code, message))
        allowed = payload['response_data']
    except Fault:
        allowed = proxy.getAllowedTargets(content_type)
        warn(
            'Your oood version is too old, using old method '
            'getAllowedTargets instead of getAllowedTargetList',
            DeprecationWarning)
    # tuple order is reversed to be compatible with ERP5 Form
    return [(second, first) for first, second in allowed]
def _convertToHTML(self): """Convert the PDF text content to HTML with pdftohtml """ if not self.hasData(): return '' tmp = tempfile.NamedTemporaryFile() tmp.write(self.getData()) tmp.seek(0) command_result = None try: command = [ 'pdftohtml', '-enc', 'UTF-8', '-stdout', '-noframes', '-i', tmp.name ] try: command_result = Popen(command, stdout=PIPE).communicate()[0] except OSError, e: if e.errno == errno.ENOENT: raise ConversionError('pdftohtml was not found') raise finally: tmp.close() # Quick hack to remove bg color - XXX h = command_result.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Make links relative h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1], 'href="asEntireHTML') return h
def updateBaseMetadata(self, **kw):
    """
    Updates metadata information in the converted OOo document
    based on the values provided by the user. This is implemented
    through the invocation of the conversion server.

    ``kw`` is forwarded as-is to the server's ``run_setmetadata`` call.
    On a 200 response the returned data replaces the stored base data and
    ``updateFileMetadata`` is called.

    Raises NotConvertedError when no base data is available, and
    ConversionError when the server answers with a code other than 200.
    """
    if not self.hasBaseData():
        # XXX please pass a meaningful description of error as argument
        raise NotConvertedError()
    proxy = DocumentConversionServerProxy(self)
    document_id = self.getId()
    encoded_base_data = enc(str(self.getBaseData()))
    status, payload, message = proxy.run_setmetadata(
        document_id, encoded_base_data, kw)
    if status != 200:
        # Explicitly raise the exception!
        raise ConversionError(
            "OOoDocument: error getting document metadata (Code %s: %s)"
            % (status, message))
    # The server accepted the metadata: store the updated document.
    self._setBaseData(dec(payload['data']))
    # record in workflow history
    # XXX must put appropriate comments.
    self.updateFileMetadata()
def getDataURI(url):
    """Open ``url`` with urllib2, reporting any failure as ConversionError.

    NOTE(review): as visible here the opened response is bound to ``data``
    but never read, and nothing is returned. The name suggests a data URI
    is meant to be built from it - confirm whether the end of this
    function is missing.
    """
    try:
        data = urllib2.urlopen(url)
    except Exception as error:
        raise ConversionError(
            "Error to transform url (%s) into data uri. ERROR = %s"
            % (url, Exception(error)))
def _getFormatFromMimetype(self, mimetype): """ XXX: This should not be done here but Conversion Server API to get supported Format/Extension is deprecated (topic under discussion) """ import mimetypes extension = mimetypes.guess_extension(mimetype) if extension is None: raise ConversionError( "Could not guess extension from mimetype '%s'" % mimetype) return extension.split('.', 1)[1]
def _convertToText(self, format='txt'): # pylint: disable=redefined-builtin """ Convert the PDF text content to text with pdftotext """ if not self.hasData(): return '' mime_type = 'text/plain' portal_transforms = self.getPortalObject().portal_transforms filename = self.getFilename() result = portal_transforms.convertToData( mime_type, str(self.getData()), context=self, filename=filename, mimetype=self.getContentType()) if result: return result else: # Try to use OCR # As high dpi images are required, it may take some times to convert the # pdf. # It may be required to use activities to fill the cache and at the end, # to calculate the final result text = '' content_information = self.getContentInformation() page_count = int(content_information.get('Pages', 0)) for page_number in range(page_count): src_mimetype, png_data = self._convert('png', quality=100, resolution=300, frame=page_number, display='identical') if not src_mimetype.endswith('png'): continue content = str(png_data) if content is not None: filename = self.getStandardFilename(format='png') result = portal_transforms.convertToData( mime_type, content, context=self, filename=filename, mimetype=src_mimetype) if result is None: raise ConversionError( 'PDFDocument conversion error. ' 'portal_transforms failed to convert to %s: %r' % (mime_type, self)) text += result return text
def convertTo(self, format):
    """Convert ``self.data`` to ``format`` through the conversion server.

    The server is first asked which target formats it allows for
    ``self.mimetype``; the conversion itself is only requested when
    ``format`` is one of them. When ``self.mimetype`` is 'text/html' the
    converted data is post-processed by ``self.includeImageList``.

    Returns the decoded converted data.
    Raises ConversionError when ``format`` is not allowed, or when either
    server call answers with a response code other than 200 (the server
    message is included so that the failure can be diagnosed).
    """
    # XXX Must be replaced by portal_data_adapters soon
    from erp5.component.document.Document import (
        DocumentConversionServerProxy, enc, dec)
    server_proxy = DocumentConversionServerProxy(self.context)
    response_code, response_dict, message = \
        server_proxy.getAllowedTargetItemList(self.mimetype)
    if response_code != 200:
        raise ConversionError(
            'Can not get list of allowed target formats for %s '
            '(Code %s: %s)' % (self.mimetype, response_code, message))
    allowed_extension_list = response_dict['response_data']
    if format not in dict(allowed_extension_list):
        raise ConversionError('Format not allowed %s' % format)
    response_code, response_dict, message = server_proxy.run_generate(
        '', enc(self.data), None, format, self.mimetype)
    if response_code != 200:
        raise ConversionError(
            'Conversion to %s failed (Code %s: %s)'
            % (format, response_code, message))
    data = dec(response_dict['data'])
    if self.mimetype == 'text/html':
        data = self.includeImageList(data)
    return data
def _convertToDJVU(self): """Convert the PDF text content to DJVU with pdf2djvu """ if not self.hasData(): return '' tmp = tempfile.NamedTemporaryFile() tmp.write(self.getData()) tmp.seek(0) command_result = None try: command = ['pdf2djvu', tmp.name] try: command_result = Popen(command, stdout=PIPE).communicate()[0] except OSError, e: if e.errno == errno.ENOENT: raise ConversionError('pdf2djvu was not found') raise finally: tmp.close() return command_result
def _convertToBaseFormat(self):
    """
    Converts the original document into ODF
    by invoking the conversion server. Store the result
    on the object. Update metadata information.

    The file name sent to the server is ``getFilename()``, or ``getId()``
    when no file name is set.

    Raises ConversionError when the server answers with a response code
    other than 200.
    """
    proxy = DocumentConversionServerProxy(self)
    source_name = self.getFilename() or self.getId()
    encoded_data = enc(str(self.getData()))
    status, payload, message = proxy.run_convert(
        source_name, encoded_data, None, None, self.getContentType())
    if status != 200:
        # Explicitly raise the exception!
        raise ConversionError(
            "OOoDocument: Error converting document to base format. (Code %s: %s)"
            % (status, message))
    # successfully converted document
    self._setBaseData(dec(payload['data']))
    metadata = payload['meta']
    self._base_metadata = metadata
    base_content_type = metadata.get('MIMEType', None)
    if base_content_type is not None:
        self._setBaseContentType(base_content_type)
def convert(self, orig, data, context=None, **kwargs):
    """Convert ``orig`` through the conversion server and store it in ``data``.

    The source format is derived from the mimetype returned by
    ``_getAllowedSourceMimetypeFromConversionServer`` and the destination
    format from ``self.output``. The optional ``zip``, ``refresh`` and
    ``conversion_kw`` keyword arguments are forwarded to the server's
    ``convertFile`` call.

    Returns ``data`` after calling ``data.setData`` with the decoded
    result.
    Raises ConversionError when no source mimetype is allowed on the
    server.
    """
    server_proxy = DocumentConversionServerProxy(context)
    source_mimetype = self._getAllowedSourceMimetypeFromConversionServer(
        server_proxy)
    if source_mimetype is None:
        raise ConversionError(
            "Format(s) not allowed on Conversion Server %r" % self.inputs)
    source_format = self._getFormatFromMimetype(source_mimetype)
    destination_format = self._getFormatFromMimetype(self.output)
    encoded_source = enc(orig)
    # Default values are ConversionServer default ones
    use_zip = kwargs.get('zip', False)
    refresh = kwargs.get('refresh', False)
    conversion_kw = kwargs.get('conversion_kw', {})
    converted = server_proxy.convertFile(
        encoded_source, source_format, destination_format,
        use_zip, refresh, conversion_kw)
    data.setData(dec(converted))
    return data
def _resize(
    self,
    quality,
    width,
    height,
    format,  # pylint: disable=redefined-builtin
    resolution,
    frame,
    crop=False,
):
    """Resize and resample photo.

    Builds an ImageMagick ``convert`` command line from the arguments,
    feeds the document data to it on standard input and reads the
    converted image from its standard output.

    quality    -- value passed to ``-quality``
    width, height -- target size; used with ``-geometry``, or with
                  ``-thumbnail``/``-extent`` when ``crop`` is true
    format     -- output format prefix (``<format>:-``); when empty the
                  output argument is just ``-``
    resolution -- when true, passed to ``-density`` for both axes
    frame      -- when not None, only this input frame is read
    crop       -- crop to the exact size, centred, instead of fitting

    Returns a StringIO holding the converted image.
    Raises ConversionError when ``format`` contains '/', when '/' is
    writable by the current process, or when ``convert`` produces no
    output.
    """
    # https://github.com/saucecontrol/Compact-ICC-Profiles
    icc_profile = os.path.join(os.path.dirname(Products.ERP5.__file__),
                               'misc', 'sRGB-v2-magic.icc')
    parameter_list = [
        'convert', '-colorspace', 'sRGB', '-depth', '8', '-profile',
        icc_profile
    ]
    size = '%sx%s' % (width, height)
    if crop:
        parameter_list.extend((
            '-thumbnail', size + '^',
            '-gravity', 'center',
            '-extent', size))
    else:
        parameter_list.extend(('-geometry', size))
    parameter_list.extend(('-quality', str(quality)))
    if format not in VALID_TRANSPARENT_IMAGE_FORMAT_LIST:
        # ImageMagick way to remove transparent that works with multiple
        # images. http://www.imagemagick.org/Usage/masking/#remove
        parameter_list.extend(('-bordercolor', 'white', '-border', '0'))
    if resolution:
        parameter_list.extend(
            ('-density', '%sx%s' % (resolution, resolution)))
    # Input comes from stdin, optionally restricted to a single frame.
    if frame is not None:
        parameter_list.append('-[%s]' % frame)
    else:
        parameter_list.append('-')
    if format:
        # Is there a way to make 'convert' fail if the format is unknown,
        # instead of treating this whole parameter as an output file path?
        # As a workaround, we run 'convert' in a non-writeable directory.
        if '/' in format:
            raise ConversionError(
                'Invalid image format %r: it must not contain "/".'
                % (format,))
        if os.access('/', os.W_OK):
            raise ConversionError(
                'Refusing to run convert: its working directory "/" is '
                'writable, so an unknown format could be written as a file.')
        parameter_list.append('%s:-' % format)
    else:
        parameter_list.append('-')

    data = str(self.getData())
    if self.getContentType() == "image/svg+xml":
        data = transformUrlToDataURI(data)

    env = os.environ.copy()
    env['LC_NUMERIC'] = 'C'
    process = subprocess.Popen(parameter_list,
                               env=env,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               cwd='/',
                               close_fds=True)
    try:
        # XXX: The only portable way is to pass what stdin.write can
        # accept, which is a string for PIPE.
        image, err = process.communicate(data)
    finally:
        del process
    if image:
        return StringIO(image)
    raise ConversionError('Image conversion failed (%s).' % err)
def getContentInformation(self): """Returns the information about the PDF document with pdfinfo. """ if not self.hasData(): return {} try: return self._content_information.copy() # pylint: disable=access-member-before-definition except AttributeError: pass tmp = tempfile.NamedTemporaryFile() tmp.write(self.getData()) tmp.seek(0) command_result = None try: # First, we use pdfinfo to get standard metadata command = ['pdfinfo', '-meta', '-box', tmp.name] try: command_result = Popen(command, stdout=PIPE).communicate()[0] except OSError, e: if e.errno == errno.ENOENT: raise ConversionError('pdfinfo was not found') raise result = {} for line in command_result.splitlines(): item_list = line.split(':') key = item_list[0].strip() value = ':'.join(item_list[1:]).strip() result[key] = value # Then we use PyPDF2 to get extra metadata try: from PyPDF2 import PdfFileReader from PyPDF2.utils import PdfReadError except ImportError: # if PyPDF2 not found, pass pass else: try: pdf_file = PdfFileReader(tmp) for info_key, info_value in (pdf_file.getDocumentInfo() or {}).iteritems(): info_key = info_key.lstrip("/") if isinstance(info_value, unicode): info_value = info_value.encode("utf-8") # Ignore values that cannot be pickled ( such as AAPL:Keywords ) try: pickle.dumps(info_value) except pickle.PicklingError: LOG( "PDFDocument.getContentInformation", INFO, "Ignoring non picklable document info on %s: %s (%r)" % (self.getRelativeUrl(), info_key, info_value)) else: result.setdefault(info_key, info_value) except (PdfReadError, AssertionError): LOG("PDFDocument.getContentInformation", PROBLEM, "PyPDF2 is Unable to read PDF, probably corrupted PDF here : %s" % \ (self.getRelativeUrl(),)) except Exception: # an exception of Exception class will be raised when the # document is encrypted. pass
def _convert(self, format, frame=0, **kw): # pylint: disable=redefined-builtin """Convert the document to the given format. If a conversion is already stored for this format, it is returned directly, otherwise the conversion is stored for the next time. frame: Only used for image conversion XXX Cascading conversions must be delegated to conversion server, not by OOoDocument._convert (ie: convert to pdf, then convert to image, then resize) *OR* as an optimisation we can read cached intermediate conversions instead of compute them each times. 1- odt->pdf->png 2- odt->cached(pdf)->jpg """ #XXX if document is empty, stop to try to convert. #XXX but I don't know what is a appropriate mime-type.(Yusei) if not self.hasData(): return 'text/plain', '' # if no conversion asked (format empty) # return raw data if not format: return self.getContentType(), self.getData() # Check if we have already a base conversion if not self.hasBaseData(): # XXX please pass a meaningful description of error as argument raise NotConvertedError() # Make sure we can support html and pdf by default is_html = 0 requires_pdf_first = 0 original_format = format allowed_format_list = self.getTargetFormatList() if format == 'base-data': return self.getBaseContentType(), str(self.getBaseData()) if format == 'pdf': format_list = [x for x in allowed_format_list if x.endswith('pdf')] format = format_list[0] elif format in VALID_IMAGE_FORMAT_LIST: format_list = [ x for x in allowed_format_list if x.endswith(format) ] if len(format_list): format = format_list[0] else: # We must fist make a PDF which will be used to produce an image out of it requires_pdf_first = 1 format_list = [ x for x in allowed_format_list if x.endswith('pdf') ] format = format_list[0] elif format == 'html': format_list = [ x for x in allowed_format_list if x.startswith('html') or x.endswith('html') ] format = format_list[0] is_html = 1 elif format in ('txt', 'text', 'text-content'): # if possible, we try to get utf8 text. 
('enc.txt' will encode to utf8) if 'enc.txt' in allowed_format_list: format = 'enc.txt' elif format not in allowed_format_list: #Text conversion is not supported by oood, do it in other way if not self.hasConversion(format=original_format): #Do real conversion for text mime, data = self._getConversionFromProxyServer( format='text-content') self.setConversion(data, mime, format=original_format) return mime, data return self.getConversion(format=original_format) # Raise an error if the format is not supported if not self.isTargetFormatAllowed(format): raise ConversionError( "OOoDocument: target format %s is not supported" % format) has_format = self.hasConversion(format=original_format, **kw) if not has_format: # Do real conversion mime, data = self._getConversionFromProxyServer(format) if is_html: # Extra processing required since # we receive a zip file cs = cStringIO.StringIO() cs.write(str(data)) z = zipfile.ZipFile( cs) # A disk file would be more RAM efficient for f in z.infolist(): fn = f.filename if fn.endswith('html'): if self.getPortalType() == 'Presentation'\ and not (fn.find('impr') >= 0): continue data = z.read(fn) break mime = 'text/html' self._populateConversionCacheWithHTML( zip_file=z) # Maybe some parts should be asynchronous for # better usability z.close() cs.close() if original_format not in VALID_IMAGE_FORMAT_LIST \ and not requires_pdf_first: self.setConversion(data, mime, format=original_format, **kw) else: # create temporary image and use it to resize accordingly temp_image = self.portal_contributions.newContent( portal_type='Image', file=cStringIO.StringIO(), filename=self.getId(), temp_object=1) temp_image._setData(data) # we care for first page only but as well for image quality mime, data = temp_image.convert(original_format, frame=frame, **kw) # store conversion self.setConversion(data, mime, format=original_format, **kw) return self.getConversion(format=original_format, **kw)
def _convertToText(self, format='txt'): # pylint: disable=redefined-builtin """Convert the PDF to text If the PDF have text, return the text, otherwise try to do OCR using tesseract. """ if not self.hasData(): return '' data = str(self.getData()) try: from PyPDF2 import PdfFileReader from PyPDF2.utils import PdfReadError except ImportError: pass else: try: if PdfFileReader(StringIO(data)).isEncrypted: return '' except PdfReadError: return '' mime_type = 'text/plain' portal_transforms = self.getPortalObject().portal_transforms filename = self.getFilename() result = portal_transforms.convertToData( mime_type, data, context=self, filename=filename, mimetype=self.getContentType()) if result: return result else: # Try to use OCR from ghostscript, but tolerate that the command might # not be available. process = None command = [ 'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE', '-dNOPROMPT', '-sDEVICE=ocr', '-r300x300', '-o', '-', '-f', '-' ] try: process = Popen( command, stdin=PIPE, stdout=PIPE, stderr=PIPE, close_fds=True, ) output, error = process.communicate(data) if process.returncode: raise ConversionError( "Error invoking ghostscript.\noutput:%s\nerror:%s" % (output, error)) return output.strip() except OSError as e: if e.errno != errno.ENOENT: raise finally: del process # We don't have ghostscript, fallback to the expensive pipeline using: # pdf -- (Image._convert imagemagick) --> png # -- (PortalTransforms.png_to_tiff imagemagick) --> tiff # -- (PortalTransforms.tiff_to_text tesseract) --> text # # As high dpi images are required, it may take some times to convert the # pdf. 
# It may be required to use activities to fill the cache and at the end, # to calculate the final result text = '' content_information = self.getContentInformation() page_count = int(content_information.get('Pages', 0)) for page_number in range(page_count): src_mimetype, png_data = self._convert('png', quality=100, resolution=300, frame=page_number, display='identical') if not src_mimetype.endswith('png'): continue content = str(png_data) if content is not None: filename = self.getStandardFilename(format='png') result = portal_transforms.convertToData( mime_type, content, context=self, filename=filename, mimetype=src_mimetype) if result is None: raise ConversionError( 'PDFDocument conversion error. ' 'portal_transforms failed to convert to %s: %r' % (mime_type, self)) text += result return text
def _convert( self, format, substitution_method_parameter_dict=None, # pylint: disable=redefined-builtin safe_substitute=True, charset=None, text_content=None, substitute=True, **kw): """ Convert text using portal_transforms or oood """ # XXX 'or DEFAULT_CONTENT_TYPE' is compaptibility code used for old # web_page that have neither content_type nor text_format. Migration # should be done to make all web page having content_type property src_mimetype = self.getContentType() or DEFAULT_CONTENT_TYPE if not format and src_mimetype == 'text/html': format = 'html' # Force safe_html if not format: # can return document without conversion return src_mimetype, self.getTextContent() portal = self.getPortalObject() mime_type = portal.mimetypes_registry.lookupExtension('name.%s' % format) original_mime_type = mime_type = str(mime_type) if text_content is None: # check if document has set text_content and convert if necessary text_content = self.getTextContent() if text_content: kw['format'] = format convert_kw = {} # PortalTransforms does not accept empty values for 'encoding' parameter if charset: kw['charset'] = convert_kw['encoding'] = charset if not self.hasConversion(**kw): portal_transforms = portal.portal_transforms filename = self.getFilename() if mime_type == 'text/html': mime_type = 'text/x-html-safe' if src_mimetype != "image/svg+xml": result = portal_transforms.convertToData( mime_type, text_content, object=self, context=self, filename=filename, mimetype=src_mimetype, **convert_kw) if result is None: raise ConversionError( 'TextDocument conversion error. 
' 'portal_transforms failed to convert ' 'from %r to %s: %r' % (src_mimetype, mime_type, self)) else: result = text_content if format in VALID_IMAGE_FORMAT_LIST: # Include extra parameter for image conversions temp_image = self.portal_contributions.newContent( portal_type='Image', file=BytesIO(), filename=self.getId(), temp_object=1) temp_image._setData(result) _, result = temp_image.convert(**kw) self.setConversion(result, original_mime_type, **kw) else: mime_type, result = self.getConversion(**kw) if substitute and format in VALID_TEXT_FORMAT_LIST: # only textual content can be sustituted if substitution_method_parameter_dict is None: substitution_method_parameter_dict = {} result = self._substituteTextContent( result, safe_substitute=safe_substitute, **substitution_method_parameter_dict) return original_mime_type, result else: # text_content is not set, return empty string instead of None return original_mime_type, ''