def _get_content_type(self, file, body, id, content_type=None): # Consult self.content_type first, this is either # the default (unknown/unknown) or it got a value from a # .metadata file default_type = 'unknown/unknown' if getattr(self, 'content_type', default_type) != default_type: return self.content_type # Next, look at file headers headers = getattr(file, 'headers', None) if headers and headers.has_key('content-type'): content_type = headers['content-type'] else: # Last resort: Use the (imperfect) content type guessing # mechanism from OFS.Image, which ultimately uses the # Python mimetypes module. if not isinstance(body, basestring): body = body.data content_type, enc = guess_content_type( getattr(file, 'filename', id), body, content_type) if (enc is None and (content_type.startswith('text/') or content_type.startswith('application/')) and body.startswith(codecs.BOM_UTF8)): content_type += '; charset=utf-8' return content_type
def _get_content_type(self, file, body, id, content_type=None): # Consult self.content_type first, this is either # the default (unknown/unknown) or it got a value from a # .metadata file default_type = 'unknown/unknown' if getattr(self, 'content_type', default_type) != default_type: return self.content_type # Next, look at file headers headers=getattr(file, 'headers', None) if headers and headers.has_key('content-type'): content_type=headers['content-type'] else: # Last resort: Use the (imperfect) content type guessing # mechanism from OFS.Image, which ultimately uses the # Python mimetypes module. if not isinstance(body, basestring): body = body.data content_type, enc=guess_content_type( getattr(file, 'filename',id), body, content_type) if (enc is None and (content_type.startswith('text/') or content_type.startswith('application/')) and body.startswith(codecs.BOM_UTF8)): content_type += '; charset=utf-8' return content_type
def __init__(self, path, _prefix=None):
    """Load an image file from disk and record its metadata.

    path    -- path relative to _prefix (default SOFTWARE_HOME)
    _prefix -- a directory string, or a module/package whose home
               directory is resolved via package_home()
    Sets content_type, __name__, cache-control header value (cch) and
    last-modified time/header (lmt/lmh).
    """
    if _prefix is None:
        _prefix = SOFTWARE_HOME
    elif type(_prefix) is not type(''):
        # A module/package object was passed; resolve its directory.
        _prefix = package_home(_prefix)
    path = os.path.join(_prefix, path)
    self.path = path
    if Globals.DevelopmentMode:
        # In development mode, a shorter time is handy
        max_age = 60  # One minute
    else:
        # A longer time reduces latency in production mode
        max_age = 3600  # One hour
    self.cch = 'public,max-age=%d' % max_age

    # BUGFIX: the original leaked the file handle when read() raised,
    # and shadowed the `file` builtin; close deterministically instead.
    f = open(path, 'rb')
    try:
        data = f.read()
    finally:
        f.close()

    content_type, enc = guess_content_type(path, data)
    if content_type:
        self.content_type = content_type
    else:
        # Fall back to the filename extension as an image subtype.
        self.content_type = 'image/%s' % path[path.rfind('.') + 1:]
    self.__name__ = path[path.rfind('/') + 1:]
    # Last-modified: the file's mtime, or "now" when mtime is zero.
    self.lmt = float(stat(path)[8]) or time()
    self.lmh = rfc1123_date(self.lmt)
def _get_content_type(self, file, body, id, content_type=None): headers = getattr(file, 'headers', None) if headers and headers.has_key('content-type'): content_type = headers['content-type'] else: if type(body) is not type(''): body = body.data content_type, enc = guess_content_type( getattr(file, 'filename', id), body, content_type) return content_type
def _get_content_type(self, file, body, id, content_type=None): headers=getattr(file, 'headers', None) if headers and headers.has_key('content-type'): content_type=headers['content-type'] else: if type(body) is not type(''): body=body.data content_type, enc=guess_content_type( getattr(file, 'filename',id), body, content_type) return content_type
def add_file(self, theFile=None, data=None, filename=None,
             content_type=None):
    "add a Zope file or Image to ourselves as an attachment"
    # Reject ambiguous input: either a file-like object OR raw data.
    if theFile and data is not None:
        raise TypeError(
            'A file-like object was passed as well as data to create a file'
        )
    # Raw data and a filename must come as a pair.
    if (data is None) != (not filename):
        raise TypeError('Both data and filename must be specified')

    if data is not None:
        # Raw data path: only the content type may need guessing.
        if content_type is None:
            content_type, _enc = guess_content_type(filename, data)
    elif isinstance(theFile, File):
        # Zope OFS File/Image object.
        filename = theFile.getId()
        data = str(theFile.data)
        content_type = content_type or theFile.content_type
    elif isinstance(theFile, file):
        # Plain filesystem file object.
        filename = cookId(theFile.name)
        data = theFile.read()
        if content_type is None:
            content_type, _enc = guess_content_type(filename, data)
    elif isinstance(theFile, FileUpload):
        # Browser upload: prefer its declared content-type header.
        filename = cookId(theFile.filename)
        data = theFile.read()
        headers = theFile.headers
        if content_type is None:
            if 'content-type' in headers:
                content_type = headers['content-type']
            else:
                content_type, _enc = guess_content_type(filename, data)
    else:
        raise TypeError('Unknown object type found: %r' % theFile)

    major_minor = content_type.split('/')
    msg = MIMEBase(*major_minor)
    msg.set_payload(data)
    encode_base64(msg)
    # Content-ID is built from the ordinals of the filename characters.
    cid = ''.join(['%s' % ord(ch) for ch in filename])
    msg.add_header('Content-ID', '<%s>' % cid)
    msg.add_header('Content-Disposition', 'attachment', filename=filename)
    self.attach(msg)
def add_file(self, theFile=None, data=None, filename=None,
             content_type=None):
    "add a Zope file or Image to ourselves as an attachment"
    # A file-like object and raw data are mutually exclusive.
    if theFile and data is not None:
        raise TypeError(
            'A file-like object was passed as well as data to create a file'
        )
    # data without filename (or vice versa) is an error.
    if (data is None) != (not filename):
        raise TypeError(
            'Both data and filename must be specified'
        )

    if data is not None:
        if content_type is None:
            content_type, enc = guess_content_type(filename, data)
    elif isinstance(theFile, File):
        # A Zope OFS File/Image object was handed in.
        filename = theFile.getId()
        data = str(theFile.data)
        content_type = content_type or theFile.content_type
    elif isinstance(theFile, file):
        # An ordinary filesystem file object.
        filename = cookId(theFile.name)
        data = theFile.read()
        if content_type is None:
            content_type, enc = guess_content_type(filename, data)
    elif isinstance(theFile, FileUpload):
        # A browser upload: its headers may already declare the type.
        filename = cookId(theFile.filename)
        data = theFile.read()
        headers = theFile.headers
        if content_type is None:
            if headers.has_key('content-type'):
                content_type = headers['content-type']
            else:
                content_type, enc = guess_content_type(filename, data)
    else:
        raise TypeError('Unknown object type found: %r' % theFile)

    maintype, subtype = content_type.split('/')
    msg = MIMEBase(maintype, subtype)
    msg.set_payload(data)
    Encoders.encode_base64(msg)
    msg.add_header(
        'Content-ID',
        '<%s>' % ''.join(['%s' % ord(ch) for ch in filename]))
    msg.add_header('Content-Disposition', 'attachment', filename=filename)
    self.attach(msg)
def __init__(self, path, _prefix=None):
    """Load an image resource from a package via pkg_resources.

    path    -- resource path inside the package
    _prefix -- a mapping (e.g. a module __dict__) whose '__name__'
               identifies the package to load from
    Sets content_type, __name__, and last-modified time/header.
    """
    name = _prefix['__name__']
    resource = pkg_resources.resource_stream(name, path)
    # BUGFIX: the original never closed the resource stream; close it
    # even if read() raises.
    try:
        data = resource.read()
    finally:
        resource.close()
    content_type, enc = guess_content_type(path, data)
    if content_type:
        self.content_type = content_type
    else:
        # Fall back to the filename extension as an image subtype.
        self.content_type = 'image/%s' % path[path.rfind('.') + 1:]
    self.__name__ = path[path.rfind('/') + 1:]
    # No filesystem mtime for packaged resources: use "now".
    self.lmt = time.time()
    self.lmh = rfc1123_date(self.lmt)
def htmlValue(self, REQUEST):
    """Render this field's submitted value for display: the guessed
    mimetype and byte count of the upload, or 'No Input' when absent."""
    from ZPublisher.HTTPRequest import FileUpload
    from OFS.content_types import guess_content_type
    upload = REQUEST.form.get('%s_file' % self.fgField.getName())
    if not (isinstance(upload, FileUpload) and upload.filename != ''):
        return 'No Input'
    # Rewind first: the upload may already have been read elsewhere.
    upload.seek(0)
    payload = upload.read()
    mimetype, enc = guess_content_type(upload.filename, payload, None)
    return "%s: %s bytes" % (mimetype, len(payload))
def __call__(self, client=None, REQUEST={}, RESPONSE=None, **kw):
    """Render the document given a client object, REQUEST mapping,
    Response, and key word arguments."""
    # NOTE(review): REQUEST={} is a shared mutable default; it appears
    # to be only read and passed through here -- confirm no callee
    # mutates it.

    # Serve a cached rendering unless caching is namespace-keyed.
    if not self._cache_namespace_keys:
        data = self.ZCacheable_get(default=_marker)
        if data is not _marker:
            # Return cached results.
            return data

    # Expose document identity to the template namespace.
    kw['document_id'] = self.getId()
    kw['document_title'] = self.title
    # Prefer the explicit acquisition wrapper when available.
    if hasattr(self, 'aq_explicit'):
        bself = self.aq_explicit
    else:
        bself = self

    # Evaluate the template inside this document's security context.
    security = getSecurityManager()
    security.addContext(self)
    try:
        if client is None:
            # Called as subtemplate, so don't need error propagation!
            r = apply(HTML.__call__, (self, bself, REQUEST), kw)
            if RESPONSE is None:
                result = r
            else:
                # Strip any leading header block into RESPONSE.
                result = decapitate(r, RESPONSE)
            if not self._cache_namespace_keys:
                self.ZCacheable_set(result)
            return result

        r = apply(HTML.__call__, (self, (client, bself), REQUEST), kw)
        if type(r) is not type('') or RESPONSE is None:
            # Non-string result (or no response to finish): as-is.
            if not self._cache_namespace_keys:
                self.ZCacheable_set(r)
            return r
    finally:
        # Always pop the security context, even on early return/raise.
        security.removeContext(self)

    # String result with a RESPONSE: ensure a Content-Type header is
    # set before decapitating the rendered text.
    have_key = RESPONSE.headers.has_key
    if not (have_key('content-type') or have_key('Content-Type')):
        if self.__dict__.has_key('content_type'):
            # An explicitly stored content type wins.
            c = self.content_type
        else:
            # Otherwise guess from the document name and rendered body.
            c, e = guess_content_type(self.__name__, r)
        RESPONSE.setHeader('Content-Type', c)
    result = decapitate(r, RESPONSE)
    if not self._cache_namespace_keys:
        self.ZCacheable_set(result)
    return result
def _get_content_type(self, filename, body, id, content_type=None): # Consult self.content_type first, this is either # the default (unknown/unknown) or it got a value from a # .metadata file default_type = 'unknown/unknown' if getattr(self, 'content_type', default_type) != default_type: return self.content_type # Use the (imperfect) content type guessing # mechanism from OFS.Image, which ultimately uses the # Python mimetypes module. if not isinstance(body, basestring): body = body.data content_type, enc = guess_content_type(filename, body, content_type) return content_type
def _fileitemkeywords(self, lang):
    """Extract indexable text from the file item for *lang*.

    Best effort: returns '' when converter support is unavailable, no
    converter matches the guessed mimetype, or conversion fails.
    """
    if not txng_converters:
        return ''
    fileitem = self.getFileItem(lang)
    raw = str(fileitem.data)
    mimetype, encoding = guess_content_type(self.getId(), raw)
    converter = ConverterRegistry.get(mimetype)
    if not converter:
        return ''
    try:
        text, encoding = converter.convert2(raw, encoding, mimetype)
        return text
    except:
        # convert2 unsupported or failed; fall back to plain convert().
        try:
            return converter.convert(raw)
        except:
            # Deliberate best-effort: swallow converter failures.
            return ''
def _get_content_type(self, file, body, id, content_type=None): # Consult self.content_type first, this is either # the default (unknown/unknown) or it got a value from a # .metadata file default_type = 'unknown/unknown' if getattr(self, 'content_type', default_type) != default_type: return self.content_type # Next, look at file headers headers = getattr(file, 'headers', None) if headers and headers.has_key('content-type'): content_type = headers['content-type'] else: # Last resort: Use the (imperfect) content type guessing # mechanism from OFS.Image, which ultimately uses the # Python mimetypes module. if type(body) is not type(''): body = body.data content_type, enc = guess_content_type( getattr(file, 'filename', id), body, content_type) return content_type
def _get_content_type(self, file, body, id, content_type=None): # Consult self.content_type first, this is either # the default (unknown/unknown) or it got a value from a # .metadata file default_type = 'unknown/unknown' if getattr(self, 'content_type', default_type) != default_type: return self.content_type # Next, look at file headers headers=getattr(file, 'headers', None) if headers and headers.has_key('content-type'): content_type=headers['content-type'] else: # Last resort: Use the (imperfect) content type guessing # mechanism from OFS.Image, which ultimately uses the # Python mimetypes module. if type(body) is not type(''): body=body.data content_type, enc=guess_content_type( getattr(file, 'filename',id), body, content_type) return content_type
def doFile(context, filename, data): """Create, modify or delete the specified file or image. An Image is created if the file suffix indicates it. Prints a status message and returns a boolean for success/failure. """ dlog('doFile(%s,...)' % (filename)) if options.dryrun: vlog(': dry run') return True folder = context.folder() existing = getattr(folder, filename, None) #if existing and options.ignore: # vlog(': ignored') # return True if existing and options.delete: folder._delObject(filename) get_transaction().commit() vlog(': deleted') return True elif existing and options.replace: folder._getOb(filename).manage_upload(data) get_transaction().commit() vlog(': replaced') return True else: try: if guess_content_type(filename)[0][0:5] == 'image': folder._setObject(filename, OFS.Image.Image(filename, filename, '')) else: folder._setObject(filename, OFS.Image.File(filename, filename, '')) folder._getOb(filename).manage_upload(data) get_transaction().commit() vlog(': created') return True except BadRequest, e: vlog(': failed\n*** (%s)' % e) return False
def doFile(context,filename,data): """Create, modify or delete the specified file or image. An Image is created if the file suffix indicates it. Prints a status message and returns a boolean for success/failure. """ dlog('doFile(%s,...)' % (filename)) if options.dryrun: vlog(': dry run') return True folder = context.folder() existing = getattr(folder,filename,None) #if existing and options.ignore: # vlog(': ignored') # return True if existing and options.delete: folder._delObject(filename) get_transaction().commit() vlog(': deleted') return True elif existing and options.replace: folder._getOb(filename).manage_upload(data) get_transaction().commit() vlog(': replaced') return True else: try: if guess_content_type(filename)[0][0:5] == 'image': folder._setObject(filename, OFS.Image.Image(filename,filename,'')) else: folder._setObject(filename, OFS.Image.File(filename,filename,'')) folder._getOb(filename).manage_upload(data) get_transaction().commit() vlog(': created') return True except BadRequest, e: vlog(': failed\n*** (%s)' % e) return False
def _populateConversionCacheWithHTML(self, zip_file=None):
    """Extract content from the ODF zip file and populate the document.
    Optional parameter zip_file prevents from converting content twice.
    """
    if zip_file is None:
        # No archive supplied: ask the conversion proxy server for an
        # HTML-ish target format and wrap the result as a zip archive.
        format_list = [x for x in self.getTargetFormatList()
                       if x.startswith('html') or x.endswith('html')]
        format = format_list[0]  # NOTE: shadows the `format` builtin
        mime, data = self._getConversionFromProxyServer(format)
        archive_file = cStringIO.StringIO()
        archive_file.write(str(data))
        zip_file = zipfile.ZipFile(archive_file)
        # We created the archive here, so we are responsible for
        # closing it at the end.
        must_close = 1
    else:
        must_close = 0
    for f in zip_file.infolist():
        filename = f.filename
        # Replace any previously cached entry of the same name.
        document = self.get(filename, None)
        if document is not None:
            self.manage_delObjects([filename])
        # For compatibility with old implementation
        if filename.endswith('html'):
            mime = 'text/html'
            # call portal_transforms to strip HTML in safe mode
            portal = self.getPortalObject()
            transform_tool = getToolByName(portal, 'portal_transforms')
            data = transform_tool.convertToData('text/x-html-safe',
                                                zip_file.read(filename),
                                                object=self,
                                                context=self,
                                                mimetype=mime)
        else:
            # Non-HTML member (image, stylesheet, ...): store raw bytes
            # with a guessed mimetype.
            mime = guess_content_type(filename)[0]
            data = Pdata(zip_file.read(filename))
        self.setConversion(data, mime=mime, format=EMBEDDED_FORMAT,
                           filename=filename)
    if must_close:
        # Only close what we opened ourselves above.
        zip_file.close()
        archive_file.close()
def _index_object(self, documentId, obj, threshold=None, attr=''):
    """Extract, convert, decode, normalize and split the text of *obj*
    attribute *attr*, returning a list of word ids from the lexicon
    (or None when nothing indexable could be obtained)."""
    encoding = self.default_encoding
    source = mimetype = None

    # This is to support foreign file formats that
    # are stored as "File" objects when searching
    # through PrincipiaSearchSource
    if hasattr(obj, 'txng_get'):
        # Check if the object has a method txng_get()
        result = obj.txng_get([attr])
        if result is None:
            return None
        source, mimetype, encoding = result
    elif obj.meta_type in ('File', 'Portal File', 'Naaya File') and \
            attr in ('PrincipiaSearchSource', 'SearchableText'):
        source = getattr(obj, attr, None)
        if source and not self.use_converters:
            if callable(source):
                source = source()
        else:
            # No usable attribute (or converters enabled): index the
            # raw object data via str().
            source = str(obj)
        mimetype = obj.content_type
    elif obj.meta_type == 'ExtFile' and \
            attr in ('PrincipiaSearchSource', 'SearchableText'):
        source = obj.index_html()
        mimetype = obj.getContentType()
    elif obj.meta_type in ('ZMSFile', ):
        # ZMS stores per-language file properties; the language is
        # encoded as the suffix of the attribute name.
        lang = attr[attr.rfind('_') + 1:]
        req = {'lang': lang}
        file = obj.getObjProperty('file', req)
        source = ''
        mimetype = None
        if file:
            source = file.getData()
            mimetype = file.getContentType()
    elif obj.meta_type in ('TTWObject', ) and attr not in ('SearchableText', ):
        field = obj.get(attr)
        source = str(field)
        if field.meta_type in ('ZMSFile', 'File'):
            mimetype = field.getContentType()
        else:
            mimetype = None
    else:
        # default behaviour: try to obtain the source from
        # the attribute or method call return value
        try:
            source = getattr(obj, attr)
            if callable(source):
                source = source()
            if not isinstance(source, unicode):
                source = str(source)
        except (AttributeError, TypeError):
            return None

    # If enabled, we try to find a valid document converter
    # and convert the data to get a hopefully text only representation
    # of the data.
    if self.use_converters:
        if mimetype is None or mimetype == 'application/octet-stream':
            # Unknown/generic type: guess from the object id and data.
            mimetype, encoding = guess_content_type(obj.getId(), source)
            if not encoding:
                encoding = self.default_encoding
        try:
            converter = ConverterRegistry.get(mimetype)
        except RegistryException:
            LOG(
                'textindexng', ERROR,
                '%s could not be converted because no converter could be found for %s'
                % (obj.absolute_url(1), mimetype))
            return None
        if converter:
            try:
                # Preferred API: returns (text, encoding).
                source, encoding = converter.convert2(source, encoding,
                                                      mimetype)
            except:
                # Older converters only implement convert().
                try:
                    source = converter.convert(source)
                except:
                    LOG('textindexng', ERROR,
                        '%s could not be converted' % obj.absolute_url(1),
                        error=sys.exc_info())
                    return None
        if obj.meta_type == 'Portal File':
            # Also index the catalog metadata of Portal Files.
            source += ' ' + obj.SearchableText()

    # Now we try to get a valid encoding. For unicode strings
    # we have to perform no action. For string objects we check
    # if the document has an attibute (not a method) '<index>_encoding'.
    # As fallback we also check for the presence of an attribute
    # 'document_encoding'. Checking for the two attributes allows
    # us to define different encodings for different attributes
    # on an object. This is useful when an object stores multiple texts
    # as attributes within the same instance (e.g. for multilingual
    # versions of a text but with different encodings).
    # If no encoding is specified as object attribute, we will use
    # Python's default encoding.
    # After getting the encoding, we convert the data to unicode.
    if isinstance(source, str):
        if encoding is None:
            try:
                encoding = self.default_encoding
            except:
                encoding = self.default_encoding = 'iso-8859-15'
        for k in ['document_encoding', attr + '_encoding']:
            enc = getattr(obj, k, None)
            if enc is not None:
                encoding = enc
        if encoding == 'ascii':
            # ascii is too strict; widen to latin-9.
            encoding = 'iso-8859-15'
        try:
            source = unicode(source, encoding, 'strict')
        except UnicodeDecodeError:
            LOG(
                'textindexng', WARNING,
                'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'
                % obj.absolute_url(1))
            source = unicode(source, encoding, 'ignore')
    elif isinstance(source, unicode):
        pass
    else:
        raise TXNGError, "unknown object type"

    source = source.strip()
    if not source:
        return None

    # Normalization: apply translation table to data
    if self.use_normalizer:
        source = NormalizerRegistry.get(self.use_normalizer).process(source)

    # Split the text into a list of words
    SP = SplitterRegistry.get(self.use_splitter)
    _source = source
    words = SP(casefolding=self.splitter_casefolding,
               separator=self.splitter_separators,
               maxlen=self.splitter_max_len,
               singlechar=self.splitter_single_chars).split(_source)

    # remove stopwords from data
    if self.use_stopwords:
        words = self.use_stopwords.process(words)

    # We pass the list of words to the corresponding lexicon
    # and obtain a list of wordIds. The "old" TextIndex iterated
    # over every single words (overhead).
    return self._lexicon.getWordIdList(words)
def _guessContentType(self):
    """Derive self._content_type from the entry's name and raw data.

    Leaves the current value untouched when nothing can be guessed.
    """
    name = self._filepath or self._entry_subpath
    guessed, _encoding = guess_content_type(name, self._data)
    if guessed:
        self._content_type = guessed
def _index_object(self, documentId, obj, threshold=None, attr=''):
    """Extract, convert, decode, normalize and split the text of *obj*
    attribute *attr*, returning a list of word ids from the lexicon
    (or None when nothing indexable could be obtained)."""
    encoding = self.default_encoding
    source = mimetype = None

    # This is to support foreign file formats that
    # are stored as "File" objects when searching
    # through PrincipiaSearchSource
    if hasattr(obj, 'txng_get'):
        # Check if the object has a method txng_get()
        result = obj.txng_get([attr])
        if result is None:
            return None
        source, mimetype, encoding = result
    elif obj.meta_type in ('File', 'Portal File') and \
            attr in ('PrincipiaSearchSource', 'SearchableText'):
        source = getattr(obj, attr, None)
        if source and not self.use_converters:
            if callable(source):
                source = source()
        else:
            # No usable attribute (or converters enabled): index the
            # raw object data via str().
            source = str(obj)
        mimetype = obj.content_type
    elif obj.meta_type == 'ExtFile' and \
            attr in ('PrincipiaSearchSource', 'SearchableText'):
        source = obj.index_html()
        mimetype = obj.getContentType()
    elif obj.meta_type in ('ZMSFile',):
        # ZMS stores per-language file properties; the language is
        # encoded as the suffix of the attribute name.
        lang = attr[attr.rfind('_')+1:]
        req = {'lang' : lang}
        file = obj.getObjProperty('file', req)
        source = ''
        mimetype = None
        if file:
            source = file.getData()
            mimetype = file.getContentType()
    elif obj.meta_type in ('TTWObject',) and attr not in ('SearchableText', ):
        field = obj.get(attr)
        source = str(field)
        if field.meta_type in ( 'ZMSFile', 'File' ):
            mimetype = field.getContentType()
        else:
            mimetype = None
    else:
        # default behaviour: try to obtain the source from
        # the attribute or method call return value
        try:
            source = getattr(obj, attr)
            if callable(source):
                source = source()
            if not isinstance(source, unicode):
                source = str(source)
        except (AttributeError, TypeError):
            return None

    # If enabled, we try to find a valid document converter
    # and convert the data to get a hopefully text only representation
    # of the data.
    if self.use_converters:
        if mimetype is None or mimetype == 'application/octet-stream':
            # Unknown/generic type: guess from the object id and data.
            mimetype, encoding = guess_content_type(obj.getId(), source)
            if not encoding:
                encoding = self.default_encoding
        try:
            converter = ConverterRegistry.get(mimetype)
        except RegistryException:
            LOG('textindexng', ERROR,
                '%s could not be converted because no converter could be found for %s'
                % (obj.absolute_url(1), mimetype))
            return None
        if converter:
            try:
                # Preferred API: returns (text, encoding).
                source, encoding = converter.convert2(source, encoding,
                                                      mimetype)
            except:
                # Older converters only implement convert().
                try:
                    source = converter.convert(source)
                except:
                    LOG('textindexng', ERROR,
                        '%s could not be converted' % obj.absolute_url(1),
                        error=sys.exc_info())
                    return None
        if obj.meta_type == 'Portal File':
            # Also index the catalog metadata of Portal Files.
            source += ' ' + obj.SearchableText()

    # Now we try to get a valid encoding. For unicode strings
    # we have to perform no action. For string objects we check
    # if the document has an attibute (not a method) '<index>_encoding'.
    # As fallback we also check for the presence of an attribute
    # 'document_encoding'. Checking for the two attributes allows
    # us to define different encodings for different attributes
    # on an object. This is useful when an object stores multiple texts
    # as attributes within the same instance (e.g. for multilingual
    # versions of a text but with different encodings).
    # If no encoding is specified as object attribute, we will use
    # Python's default encoding.
    # After getting the encoding, we convert the data to unicode.
    if isinstance(source, str):
        if encoding is None:
            try:
                encoding = self.default_encoding
            except:
                encoding = self.default_encoding = 'iso-8859-15'
        for k in ['document_encoding', attr + '_encoding']:
            enc = getattr(obj, k, None)
            if enc is not None:
                encoding = enc
        if encoding=='ascii':
            # ascii is too strict; widen to latin-9.
            encoding ='iso-8859-15'
        try:
            source = unicode(source, encoding, 'strict')
        except UnicodeDecodeError:
            LOG('textindexng', WARNING,
                'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'
                % obj.absolute_url(1))
            source = unicode(source, encoding, 'ignore')
    elif isinstance(source, unicode):
        pass
    else:
        raise TXNGError,"unknown object type"

    source = source.strip()
    if not source:
        return None

    # Normalization: apply translation table to data
    if self.use_normalizer:
        source = NormalizerRegistry.get(self.use_normalizer).process(source)

    # Split the text into a list of words
    SP = SplitterRegistry.get(self.use_splitter)
    _source = source
    words = SP(casefolding = self.splitter_casefolding,
               separator = self.splitter_separators,
               maxlen = self.splitter_max_len,
               singlechar = self.splitter_single_chars ).split(_source)

    # remove stopwords from data
    if self.use_stopwords:
        words = self.use_stopwords.process( words )

    # We pass the list of words to the corresponding lexicon
    # and obtain a list of wordIds. The "old" TextIndex iterated
    # over every single words (overhead).
    return self._lexicon.getWordIdList(words)
#if existing and options.ignore: # vlog(': ignored') # return True if existing and options.delete: folder._delObject(filename) get_transaction().commit() vlog(': deleted') return True elif existing and options.replace: folder._getOb(filename).manage_upload(data) get_transaction().commit() vlog(': replaced') return True else: try: if guess_content_type(filename)[0][0:5] == 'image': folder._setObject(filename, OFS.Image.Image(filename,filename,'')) else: folder._setObject(filename, OFS.Image.File(filename,filename,'')) folder._getOb(filename).manage_upload(data) get_transaction().commit() vlog(': created') return True except BadRequest, e: vlog(': failed\n*** (%s)' % e) return False def exportObj(path,dir): """Export a zope folder/wikipage/file/image as one or more files.""" dlog('exportFile(%s,%s)' % (path,dir)) vlog(path,newline=False)