Example #1
0
    def _get_content_type(self, file, body, id, content_type=None):
        # Consult self.content_type first, this is either
        # the default (unknown/unknown) or it got a value from a
        # .metadata file
        default_type = 'unknown/unknown'
        if getattr(self, 'content_type', default_type) != default_type:
            return self.content_type

        # Next, look at file headers
        headers = getattr(file, 'headers', None)
        if headers and headers.has_key('content-type'):
            content_type = headers['content-type']
        else:
            # Last resort: Use the (imperfect) content type guessing
            # mechanism from OFS.Image, which ultimately uses the
            # Python mimetypes module.
            if not isinstance(body, basestring):
                body = body.data
            content_type, enc = guess_content_type(
                getattr(file, 'filename', id), body, content_type)
            if (enc is None and (content_type.startswith('text/')
                                 or content_type.startswith('application/'))
                    and body.startswith(codecs.BOM_UTF8)):
                content_type += '; charset=utf-8'

        return content_type
Example #2
0
    def _get_content_type(self, file, body, id, content_type=None):
        # Consult self.content_type first, this is either
        # the default (unknown/unknown) or it got a value from a
        # .metadata file
        default_type = 'unknown/unknown'
        if getattr(self, 'content_type', default_type) != default_type:
            return self.content_type

        # Next, look at file headers
        headers=getattr(file, 'headers', None)
        if headers and headers.has_key('content-type'):
            content_type=headers['content-type']
        else:
            # Last resort: Use the (imperfect) content type guessing
            # mechanism from OFS.Image, which ultimately uses the
            # Python mimetypes module.
            if not isinstance(body, basestring):
                body = body.data
            content_type, enc=guess_content_type(
                getattr(file, 'filename',id), body, content_type)
            if (enc is None
                and (content_type.startswith('text/') or
                     content_type.startswith('application/'))
                and body.startswith(codecs.BOM_UTF8)):
                content_type += '; charset=utf-8'

        return content_type
Example #3
0
    def __init__(self, path, _prefix=None):
        """Read the file at ``path`` and record the data needed to serve it.

        ``_prefix`` selects the directory ``path`` is relative to: ``None``
        means ``SOFTWARE_HOME``, a string is used as the directory itself,
        and any other value is handed to ``package_home()``.

        Sets ``self.path``, ``self.cch`` (a ``public,max-age=N`` string),
        ``self.content_type``, ``self.__name__``, ``self.lmt`` and
        ``self.lmh`` (``self.lmt`` formatted by ``rfc1123_date``).
        """
        if _prefix is None:
            _prefix = SOFTWARE_HOME
        elif not isinstance(_prefix, str):
            _prefix = package_home(_prefix)
        path = os.path.join(_prefix, path)
        self.path = path

        if Globals.DevelopmentMode:
            # In development mode, a shorter time is handy
            max_age = 60  # One minute
        else:
            # A longer time reduces latency in production mode
            max_age = 3600  # One hour
        self.cch = 'public,max-age=%d' % max_age

        # try/finally releases the handle even when read() raises.
        handle = open(path, 'rb')
        try:
            data = handle.read()
        finally:
            handle.close()

        content_type, enc = guess_content_type(path, data)
        if content_type:
            self.content_type = content_type
        else:
            # Fall back to an image type named after the file extension.
            self.content_type = 'image/%s' % path[path.rfind('.') + 1:]
        self.__name__ = path[path.rfind('/') + 1:]
        # NOTE(review): presumably ``stat`` is os.stat, in which case index 8
        # is st_mtime; a zero value falls back to the current time.
        self.lmt = float(stat(path)[8]) or time()
        self.lmh = rfc1123_date(self.lmt)
Example #4
0
 def _get_content_type(self, file, body, id, content_type=None):
     headers = getattr(file, 'headers', None)
     if headers and headers.has_key('content-type'):
         content_type = headers['content-type']
     else:
         if type(body) is not type(''): body = body.data
         content_type, enc = guess_content_type(
             getattr(file, 'filename', id), body, content_type)
     return content_type
Example #5
0
 def _get_content_type(self, file, body, id, content_type=None):
     headers=getattr(file, 'headers', None)
     if headers and headers.has_key('content-type'):
         content_type=headers['content-type']
     else:
         if type(body) is not type(''): body=body.data
         content_type, enc=guess_content_type(
             getattr(file, 'filename',id), body, content_type)
     return content_type
Example #6
0
    def add_file(self,
                 theFile=None,
                 data=None,
                 filename=None,
                 content_type=None):
        """Attach a file to this message, base64 encoded.

        Either pass ``data`` together with ``filename``, or pass
        ``theFile`` as a ``File``, a ``file`` or a ``FileUpload`` instance.
        When ``content_type`` is not given it is taken from the object
        (``File``), from the upload headers (``FileUpload``) or guessed with
        ``guess_content_type()``.

        Raises ``TypeError`` when both ``theFile`` and ``data`` are given,
        when only one of ``data`` / ``filename`` is given, or when
        ``theFile`` is of an unsupported type.
        """
        if theFile and data is not None:
            raise TypeError(
                'A file-like object was passed as well as data to create a file'
            )
        have_data = data is not None
        have_name = bool(filename)
        if have_data != have_name:
            raise TypeError('Both data and filename must be specified')

        # Set when the content type still has to be guessed from the
        # final filename/data pair.
        needs_guess = False
        if have_data:
            needs_guess = content_type is None
        elif isinstance(theFile, File):
            filename = theFile.getId()
            data = str(theFile.data)
            content_type = content_type or theFile.content_type
        elif isinstance(theFile, file):
            filename = cookId(theFile.name)
            data = theFile.read()
            needs_guess = content_type is None
        elif isinstance(theFile, FileUpload):
            filename = cookId(theFile.filename)
            data = theFile.read()
            headers = theFile.headers
            if content_type is None:
                if 'content-type' in headers:
                    content_type = headers['content-type']
                else:
                    needs_guess = True
        else:
            raise TypeError('Unknown object type found: %r' % theFile)

        if needs_guess:
            content_type, enc = guess_content_type(filename, data)

        part = MIMEBase(*content_type.split('/'))
        part.set_payload(data)
        encode_base64(part)
        # The Content-ID is the filename's character codes, concatenated.
        content_id = ''.join([str(ord(char)) for char in filename])
        part.add_header('Content-ID', '<%s>' % content_id)
        part.add_header('Content-Disposition', 'attachment', filename=filename)
        self.attach(part)
Example #7
0
 def add_file(self,theFile=None,data=None,filename=None,content_type=None):
     """Add a Zope file or Image to ourselves as an attachment.

     Either ``data`` and ``filename`` are given together, or ``theFile``
     is a ``File``, a ``file`` or a ``FileUpload`` instance.  A missing
     ``content_type`` is taken from the object (``File``), from the upload
     headers (``FileUpload``) or guessed with ``guess_content_type()``.
     The payload is base64 encoded before the part is attached.

     Raises ``TypeError`` when both ``theFile`` and ``data`` are passed,
     when only one of ``data`` / ``filename`` is passed, or when
     ``theFile`` is of an unsupported type.
     """
     if theFile and data is not None:
         raise TypeError(
             'A file-like object was passed as well as data to create a file'
             )
     if (data is None) != (not filename):
         raise TypeError(
             'Both data and filename must be specified'
             )
     if data is not None:
         if content_type is None:
             content_type, enc = guess_content_type(filename, data)
     elif isinstance(theFile, File):
         filename = theFile.getId()
         data = str(theFile.data)
         content_type = content_type or theFile.content_type
     elif isinstance(theFile, file):
         filename = cookId(theFile.name)
         data = theFile.read()
         if content_type is None:
             content_type, enc = guess_content_type(filename, data)
     elif isinstance(theFile, FileUpload):
         filename = cookId(theFile.filename)
         data = theFile.read()
         headers = theFile.headers
         if content_type is None:
             # ``in`` instead of the deprecated has_key() method.
             if 'content-type' in headers:
                 content_type = headers['content-type']
             else:
                 content_type, enc = guess_content_type(filename, data)
     else:
         raise TypeError('Unknown object type found: %r' % theFile)

     msg = MIMEBase(*content_type.split('/'))
     msg.set_payload(data)
     Encoders.encode_base64(msg)
     # The Content-ID is built from the character codes of the filename.
     msg.add_header('Content-ID', '<%s>' %
         ''.join(['%s' % ord(i) for i in filename]))
     msg.add_header('Content-Disposition', 'attachment',
                    filename=filename)
     self.attach(msg)
Example #8
0
    def __init__(self, path, _prefix=None):
        """Load the package resource ``path`` and record its metadata.

        ``_prefix`` must be a mapping with a ``'__name__'`` entry naming the
        package that owns the resource; despite the ``None`` default it is
        subscripted unconditionally, so omitting it raises ``TypeError``.

        Sets ``self.content_type``, ``self.__name__``, ``self.lmt`` (the
        current time) and ``self.lmh`` (``self.lmt`` formatted by
        ``rfc1123_date``).
        """
        name = _prefix['__name__']
        resource = pkg_resources.resource_stream(name, path)
        # Close the stream once its content is read, also when read() fails.
        try:
            data = resource.read()
        finally:
            resource.close()

        content_type, enc = guess_content_type(path, data)
        if content_type:
            self.content_type = content_type
        else:
            # Fall back to an image type named after the file extension.
            self.content_type = 'image/%s' % path[path.rfind('.') + 1:]
        self.__name__ = path[path.rfind('/') + 1:]
        self.lmt = time.time()
        self.lmh = rfc1123_date(self.lmt)
Example #9
0
File: fields.py Project: a25kk/stv2
    def htmlValue(self, REQUEST):
        """Describe the file uploaded for this field as a short string.

        Looks up ``'<field name>_file'`` in ``REQUEST.form``.  Returns
        ``'<mimetype>: <size> bytes'`` for a ``FileUpload`` with a non-empty
        filename, and ``'No Input'`` otherwise.
        """
        from ZPublisher.HTTPRequest import FileUpload
        from OFS.content_types import guess_content_type

        form_key = '%s_file' % self.fgField.getName()
        upload = REQUEST.form.get(form_key)
        if not isinstance(upload, FileUpload) or upload.filename == '':
            return 'No Input'

        # Rewind first so the whole upload is read.
        upload.seek(0)
        payload = upload.read()
        mimetype, enc = guess_content_type(upload.filename, payload, None)
        return "%s: %s bytes" % (mimetype, len(payload))
Example #10
0
    def __call__(self, client=None, REQUEST={}, RESPONSE=None, **kw):
        """Render the document given a client object, REQUEST mapping,
        Response, and key word arguments.

        A cached result is returned when ``_cache_namespace_keys`` is empty
        and ``ZCacheable_get`` has one.  Otherwise the template is rendered
        through ``HTML.__call__`` with ``document_id`` and
        ``document_title`` added to the keyword namespace, and the result
        is stored with ``ZCacheable_set`` (again only when
        ``_cache_namespace_keys`` is empty).

        With ``client`` set to ``None`` the rendering is returned directly,
        run through ``decapitate`` when a RESPONSE is given.  Otherwise a
        non-string rendering, or any rendering without a RESPONSE, is
        returned unchanged; a string rendering gets a Content-Type header
        set on RESPONSE (unless one is already present) and is then run
        through ``decapitate``.
        """
        # NOTE: REQUEST keeps its shared ``{}`` default for interface
        # compatibility; here it is only passed on to HTML.__call__.
        if not self._cache_namespace_keys:
            data = self.ZCacheable_get(default=_marker)
            if data is not _marker:
                # Return cached results.
                return data

        kw['document_id'] = self.getId()
        kw['document_title'] = self.title
        if hasattr(self, 'aq_explicit'):
            bself = self.aq_explicit
        else:
            bself = self

        security = getSecurityManager()
        security.addContext(self)

        try:
            if client is None:
                # Called as subtemplate, so don't need error propagation!
                # Extended call syntax replaces the deprecated apply().
                r = HTML.__call__(self, bself, REQUEST, **kw)
                if RESPONSE is None:
                    result = r
                else:
                    result = decapitate(r, RESPONSE)
                if not self._cache_namespace_keys:
                    self.ZCacheable_set(result)
                return result

            r = HTML.__call__(self, (client, bself), REQUEST, **kw)
            if type(r) is not type('') or RESPONSE is None:
                if not self._cache_namespace_keys:
                    self.ZCacheable_set(r)
                return r

        finally:
            # Always pop the security context pushed above.
            security.removeContext(self)

        # Only set a Content-Type when the response does not have one yet.
        headers = RESPONSE.headers
        if not ('content-type' in headers or 'Content-Type' in headers):
            if 'content_type' in self.__dict__:
                c = self.content_type
            else:
                c, e = guess_content_type(self.__name__, r)
            RESPONSE.setHeader('Content-Type', c)
        result = decapitate(r, RESPONSE)
        if not self._cache_namespace_keys:
            self.ZCacheable_set(result)
        return result
Example #11
0
    def _get_content_type(self, filename, body, id, content_type=None):
        # Consult self.content_type first, this is either
        # the default (unknown/unknown) or it got a value from a
        # .metadata file
        default_type = 'unknown/unknown'
        if getattr(self, 'content_type', default_type) != default_type:
            return self.content_type

        # Use the (imperfect) content type guessing
        # mechanism from OFS.Image, which ultimately uses the
        # Python mimetypes module.
        if not isinstance(body, basestring):
            body = body.data

        content_type, enc = guess_content_type(filename, body, content_type)

        return content_type
Example #12
0
 def _fileitemkeywords(self, lang):
     """Return the text extracted from the file item for ``lang``.

     The file item's data is converted with the converter registered for
     its guessed MIME type.  Returns ``''`` when ``txng_converters`` is
     false, when no converter is found, or when conversion fails.
     """
     res = ''
     if not txng_converters:
         return res

     fileitem = self.getFileItem(lang)
     data = str(fileitem.data)
     mimetype, encoding = guess_content_type(self.getId(), data)
     converter = ConverterRegistry.get(mimetype)
     if not converter:
         return res

     # Best effort: try convert2() first and fall back to convert().
     # ``except Exception`` keeps the fallback but no longer swallows
     # KeyboardInterrupt or SystemExit the way a bare ``except:`` does.
     try:
         res, encoding = converter.convert2(data, encoding, mimetype)
     except Exception:
         try:
             res = converter.convert(data)
         except Exception:
             pass
     return res
Example #13
0
    def _get_content_type(self, filename, body, id, content_type=None):
        # Consult self.content_type first, this is either
        # the default (unknown/unknown) or it got a value from a
        # .metadata file
        default_type = 'unknown/unknown'
        if getattr(self, 'content_type', default_type) != default_type:
            return self.content_type

        # Use the (imperfect) content type guessing
        # mechanism from OFS.Image, which ultimately uses the
        # Python mimetypes module.
        if not isinstance(body, basestring):
            body = body.data

        content_type, enc = guess_content_type(filename, body, content_type)

        return content_type
Example #14
0
 def _fileitemkeywords(self, lang):
     """Extract text from the file item stored for ``lang``.

     Returns the converter output, or ``''`` when ``txng_converters`` is
     false, no converter is registered for the guessed MIME type, or both
     conversion attempts fail.
     """
     res = ''
     if txng_converters:
         fileitem = self.getFileItem(lang)
         data = str(fileitem.data)
         mimetype, encoding = guess_content_type(self.getId(), data)
         converter = ConverterRegistry.get(mimetype)
         if converter:
             # convert2() is tried first, convert() is the fallback.
             # Catching Exception instead of using a bare ``except:``
             # lets KeyboardInterrupt and SystemExit propagate.
             try:
                 res, encoding = converter.convert2(data, encoding,
                                                    mimetype)
             except Exception:
                 try:
                     res = converter.convert(data)
                 except Exception:
                     # Deliberate best effort: keep the empty result.
                     pass
     return res
Example #15
0
    def _get_content_type(self, file, body, id, content_type=None):
        # Consult self.content_type first, this is either
        # the default (unknown/unknown) or it got a value from a
        # .metadata file
        default_type = 'unknown/unknown'
        if getattr(self, 'content_type', default_type) != default_type:
            return self.content_type

        # Next, look at file headers
        headers = getattr(file, 'headers', None)
        if headers and headers.has_key('content-type'):
            content_type = headers['content-type']
        else:
            # Last resort: Use the (imperfect) content type guessing
            # mechanism from OFS.Image, which ultimately uses the
            # Python mimetypes module.
            if type(body) is not type(''): body = body.data
            content_type, enc = guess_content_type(
                getattr(file, 'filename', id), body, content_type)

        return content_type
Example #16
0
    def _get_content_type(self, file, body, id, content_type=None):
        # Consult self.content_type first, this is either
        # the default (unknown/unknown) or it got a value from a
        # .metadata file
        default_type = 'unknown/unknown'
        if getattr(self, 'content_type', default_type) != default_type:
            return self.content_type

        # Next, look at file headers
        headers=getattr(file, 'headers', None)
        if headers and headers.has_key('content-type'):
            content_type=headers['content-type']
        else:
            # Last resort: Use the (imperfect) content type guessing
            # mechanism from OFS.Image, which ultimately uses the
            # Python mimetypes module.
            if type(body) is not type(''): body=body.data
            content_type, enc=guess_content_type(
                getattr(file, 'filename',id), body, content_type)

        return content_type
Example #17
0
def doFile(context, filename, data):
    """Create, modify or delete the specified file or image.

    An Image is created if the file suffix indicates it.
    Prints a status message and returns a boolean for success/failure.
    """
    dlog('doFile(%s,...)' % (filename))
    if options.dryrun:
        vlog(': dry run')
        return True
    folder = context.folder()
    existing = getattr(folder, filename, None)
    #if existing and options.ignore:
    #    vlog(': ignored')
    #    return True
    if existing and options.delete:
        folder._delObject(filename)
        get_transaction().commit()
        vlog(': deleted')
        return True
    elif existing and options.replace:
        folder._getOb(filename).manage_upload(data)
        get_transaction().commit()
        vlog(': replaced')
        return True
    else:
        try:
            if guess_content_type(filename)[0][0:5] == 'image':
                folder._setObject(filename,
                                  OFS.Image.Image(filename, filename, ''))
            else:
                folder._setObject(filename,
                                  OFS.Image.File(filename, filename, ''))
            folder._getOb(filename).manage_upload(data)
            get_transaction().commit()
            vlog(': created')
            return True
        except BadRequest, e:
            vlog(': failed\n*** (%s)' % e)
            return False
Example #18
0
def doFile(context,filename,data):
    """Create, modify or delete the specified file or image.

    An Image is created if the file suffix indicates it.
    Prints a status message and returns a boolean for success/failure.
    """
    dlog('doFile(%s,...)' % (filename))
    if options.dryrun:
        vlog(': dry run')
        return True
    folder = context.folder()
    existing = getattr(folder,filename,None)
    #if existing and options.ignore:
    #    vlog(': ignored')
    #    return True
    if existing and options.delete:
        folder._delObject(filename)
        get_transaction().commit()
        vlog(': deleted')
        return True
    elif existing and options.replace:
        folder._getOb(filename).manage_upload(data)
        get_transaction().commit()
        vlog(': replaced')
        return True
    else:
        try:
            if guess_content_type(filename)[0][0:5] == 'image':
                folder._setObject(filename, OFS.Image.Image(filename,filename,''))
            else:
                folder._setObject(filename, OFS.Image.File(filename,filename,''))
            folder._getOb(filename).manage_upload(data)
            get_transaction().commit()
            vlog(': created')
            return True
        except BadRequest, e:
            vlog(': failed\n*** (%s)' % e)
            return False
Example #19
0
 def _populateConversionCacheWithHTML(self, zip_file=None):
   """
   Extract content from the ODF zip file and populate the document.
   Optional parameter zip_file prevents from converting content twice.
   """
   if zip_file is None:
     format_list = [x for x in self.getTargetFormatList()
                               if x.startswith('html') or x.endswith('html')]
     format = format_list[0]
     mime, data = self._getConversionFromProxyServer(format)
     archive_file = cStringIO.StringIO()
     archive_file.write(str(data))
     zip_file = zipfile.ZipFile(archive_file)
     must_close = 1
   else:
     must_close = 0
   for f in zip_file.infolist():
     filename = f.filename
     document = self.get(filename, None)
     if document is not None:
       self.manage_delObjects([filename]) # For compatibility with old implementation
     if filename.endswith('html'):
       mime = 'text/html'
       # call portal_transforms to strip HTML in safe mode
       portal = self.getPortalObject()
       transform_tool = getToolByName(portal, 'portal_transforms')
       data = transform_tool.convertToData('text/x-html-safe',
                                           zip_file.read(filename),
                                           object=self, context=self,
                                           mimetype=mime)
     else:
       mime = guess_content_type(filename)[0]
       data = Pdata(zip_file.read(filename))
     self.setConversion(data, mime=mime, format=EMBEDDED_FORMAT, filename=filename)
   if must_close:
     zip_file.close()
     archive_file.close()
Example #20
0
def _index_object(self, documentId, obj, threshold=None, attr=''):

    encoding = self.default_encoding
    source = mimetype = None

    # This is to support foreign file formats that
    # are stored as "File" objects when searching
    # through PrincipiaSearchSource

    if hasattr(obj, 'txng_get'):
        # Check if the object has a method txng_get()
        result = obj.txng_get([attr])
        if result is None: return None
        source, mimetype, encoding = result

    elif obj.meta_type in ('File', 'Portal File', 'Naaya File') and  \
       attr in ('PrincipiaSearchSource', 'SearchableText'):

        source = getattr(obj, attr, None)
        if source and not self.use_converters:
            if callable(source): source = source()
        else:
            source = str(obj)
        mimetype = obj.content_type

    elif obj.meta_type == 'ExtFile' and \
       attr in ('PrincipiaSearchSource', 'SearchableText'):
        source = obj.index_html()
        mimetype = obj.getContentType()

    elif obj.meta_type in ('ZMSFile', ):
        lang = attr[attr.rfind('_') + 1:]
        req = {'lang': lang}
        file = obj.getObjProperty('file', req)
        source = ''
        mimetype = None
        if file:
            source = file.getData()
            mimetype = file.getContentType()

    elif obj.meta_type in ('TTWObject', ) and attr not in ('SearchableText', ):
        field = obj.get(attr)
        source = str(field)
        if field.meta_type in ('ZMSFile', 'File'):
            mimetype = field.getContentType()
        else:
            mimetype = None

    else:
        # default behaviour: try to obtain the source from
        # the attribute or method call return value

        try:
            source = getattr(obj, attr)
            if callable(source): source = source()
            if not isinstance(source, unicode):
                source = str(source)
        except (AttributeError, TypeError):
            return None

    # If enabled, we try to find a valid document converter
    # and convert the data to get a hopefully text only representation
    # of the data.

    if self.use_converters:
        if mimetype is None or mimetype == 'application/octet-stream':
            mimetype, encoding = guess_content_type(obj.getId(), source)
            if not encoding:
                encoding = self.default_encoding

        try:
            converter = ConverterRegistry.get(mimetype)
        except RegistryException:
            LOG(
                'textindexng', ERROR,
                '%s could not be converted because no converter could be found for %s'
                % (obj.absolute_url(1), mimetype))
            return None

        if converter:
            try:
                source, encoding = converter.convert2(source, encoding,
                                                      mimetype)
            except:
                try:
                    source = converter.convert(source)
                except:
                    LOG('textindexng',
                        ERROR,
                        '%s could not be converted' % obj.absolute_url(1),
                        error=sys.exc_info())
                    return None

        if obj.meta_type == 'Portal File':
            source += ' ' + obj.SearchableText()

    # Now we try to get a valid encoding. For unicode strings
    # we have to perform no action. For string objects we check
    # if the document has an attibute (not a method) '<index>_encoding'.
    # As fallback we also check for the presence of an attribute
    # 'document_encoding'. Checking for the two attributes allows
    # us to define different encodings for different attributes
    # on an object. This is useful when an object stores multiple texts
    # as attributes within the same instance (e.g. for multilingual
    # versions of a text but with different encodings).
    # If no encoding is specified as object attribute, we will use
    # Python's default encoding.
    # After getting the encoding, we convert the data to unicode.

    if isinstance(source, str):
        if encoding is None:
            try:
                encoding = self.default_encoding
            except:
                encoding = self.default_encoding = 'iso-8859-15'

            for k in ['document_encoding', attr + '_encoding']:
                enc = getattr(obj, k, None)
                if enc is not None: encoding = enc

        if encoding == 'ascii': encoding = 'iso-8859-15'
        try:
            source = unicode(source, encoding, 'strict')
        except UnicodeDecodeError:
            LOG(
                'textindexng', WARNING,
                'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'
                % obj.absolute_url(1))
            source = unicode(source, encoding, 'ignore')

    elif isinstance(source, unicode):
        pass
    else:
        raise TXNGError, "unknown object type"

    source = source.strip()
    if not source: return None

    # Normalization: apply translation table to data
    if self.use_normalizer:
        source = NormalizerRegistry.get(self.use_normalizer).process(source)

    # Split the text into a list of words
    SP = SplitterRegistry.get(self.use_splitter)

    _source = source
    words = SP(casefolding=self.splitter_casefolding,
               separator=self.splitter_separators,
               maxlen=self.splitter_max_len,
               singlechar=self.splitter_single_chars).split(_source)

    #  remove stopwords from data
    if self.use_stopwords:
        words = self.use_stopwords.process(words)

    # We pass the list of words to the corresponding lexicon
    # and obtain a list of wordIds. The "old" TextIndex iterated
    # over every single words (overhead).
    return self._lexicon.getWordIdList(words)
Example #21
0
 def _guessContentType(self):
     """Guess the content type and store it in ``self._content_type``.

     The guess is based on ``self._filepath`` (or ``self._entry_subpath``
     when that is empty) and ``self._data``.  ``self._content_type`` is
     left untouched when nothing could be guessed.
     """
     name_hint = self._filepath or self._entry_subpath
     guessed, enc = guess_content_type(name_hint, self._data)
     if not guessed:
         return
     self._content_type = guessed
Example #22
0
    def _index_object(self, documentId, obj, threshold=None, attr=''):

        encoding = self.default_encoding
        source = mimetype = None

        # This is to support foreign file formats that
        # are stored as "File" objects when searching
        # through PrincipiaSearchSource

        if hasattr(obj, 'txng_get'):
            # Check if the object has a method txng_get()
            result = obj.txng_get([attr])
            if result is None: return None
            source, mimetype, encoding = result

        elif obj.meta_type in ('File', 'Portal File') and  \
           attr in ('PrincipiaSearchSource', 'SearchableText'):

            source= getattr(obj, attr, None)
            if source and not self.use_converters:
                if callable(source): source = source()
            else:              
                source = str(obj)
            mimetype = obj.content_type

        elif obj.meta_type == 'ExtFile' and \
           attr in ('PrincipiaSearchSource', 'SearchableText'):
            source = obj.index_html()
            mimetype = obj.getContentType()

        elif obj.meta_type in ('ZMSFile',):
            lang = attr[attr.rfind('_')+1:]
            req = {'lang' : lang}
            file = obj.getObjProperty('file', req)
            source = ''
            mimetype = None
            if file:
                source = file.getData()
                mimetype = file.getContentType()
   
        elif obj.meta_type in ('TTWObject',) and attr not in ('SearchableText', ): 
            field = obj.get(attr)
            source = str(field)
            if field.meta_type in ( 'ZMSFile', 'File' ):
                mimetype = field.getContentType()
            else:
                mimetype = None

        else:
            # default behaviour: try to obtain the source from
            # the attribute or method call return value

            try:
                source = getattr(obj, attr)
                if callable(source): source = source()
                if not isinstance(source, unicode):
                    source = str(source)
            except (AttributeError, TypeError):
                return None
        
        # If enabled, we try to find a valid document converter
        # and convert the data to get a hopefully text only representation
        # of the data.

        if self.use_converters:
            if mimetype is None or mimetype == 'application/octet-stream':
                mimetype, encoding = guess_content_type(obj.getId(), source)
                if not encoding:
                    encoding = self.default_encoding

            try: 
                converter = ConverterRegistry.get(mimetype)
            except RegistryException: 
                LOG('textindexng', ERROR, '%s could not be converted because no converter could be found for %s' % (obj.absolute_url(1), mimetype))
                return None

            if converter:
                try:
                    source, encoding = converter.convert2(source, encoding, mimetype)
                except:
                    try:
                        source = converter.convert(source)
                    except:
                        LOG('textindexng', ERROR, '%s could not be converted' % obj.absolute_url(1), error=sys.exc_info())
                        return None

            if obj.meta_type == 'Portal File': 
                source += ' ' + obj.SearchableText()

        # Now we try to get a valid encoding. For unicode strings
        # we have to perform no action. For string objects we check
        # if the document has an attibute (not a method) '<index>_encoding'.
        # As fallback we also check for the presence of an attribute
        # 'document_encoding'. Checking for the two attributes allows
        # us to define different encodings for different attributes
        # on an object. This is useful when an object stores multiple texts
        # as attributes within the same instance (e.g. for multilingual
        # versions of a text but with different encodings). 
        # If no encoding is specified as object attribute, we will use
        # Python's default encoding.
        # After getting the encoding, we convert the data to unicode.

        if isinstance(source, str):
            if encoding is None:
                try: encoding = self.default_encoding
                except: encoding = self.default_encoding = 'iso-8859-15'

                for k in ['document_encoding', attr + '_encoding']:
                    enc = getattr(obj, k, None)
                    if enc is not None: encoding = enc  

            if encoding=='ascii': encoding ='iso-8859-15'         
            try:
                source = unicode(source, encoding, 'strict')
            except UnicodeDecodeError:
                LOG('textindexng', WARNING, 'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'  % obj.absolute_url(1))
                source = unicode(source, encoding, 'ignore')
 
        elif isinstance(source, unicode):  pass
        else: raise TXNGError,"unknown object type" 

        source = source.strip()
        if not source: return None

        # Normalization: apply translation table to data
        if self.use_normalizer:
            source = NormalizerRegistry.get(self.use_normalizer).process(source)    
 
        # Split the text into a list of words
        SP = SplitterRegistry.get(self.use_splitter)

        _source = source
        words = SP(casefolding  = self.splitter_casefolding,
                   separator    = self.splitter_separators,
                   maxlen       = self.splitter_max_len,
                   singlechar   = self.splitter_single_chars
                   ).split(_source)

        #  remove stopwords from data
        if self.use_stopwords:
            words = self.use_stopwords.process( words ) 

        # We pass the list of words to the corresponding lexicon
        # and obtain a list of wordIds. The "old" TextIndex iterated
        # over every single words (overhead).
        return self._lexicon.getWordIdList(words)
Example #23
0
    #if existing and options.ignore:
    #    vlog(': ignored')
    #    return True
    if existing and options.delete:
        folder._delObject(filename)
        get_transaction().commit()
        vlog(': deleted')
        return True
    elif existing and options.replace:
        folder._getOb(filename).manage_upload(data)
        get_transaction().commit()
        vlog(': replaced')
        return True
    else:
        try:
            if guess_content_type(filename)[0][0:5] == 'image':
                folder._setObject(filename, OFS.Image.Image(filename,filename,''))
            else:
                folder._setObject(filename, OFS.Image.File(filename,filename,''))
            folder._getOb(filename).manage_upload(data)
            get_transaction().commit()
            vlog(': created')
            return True
        except BadRequest, e:
            vlog(': failed\n*** (%s)' % e)
            return False

def exportObj(path,dir):
    """Export a zope folder/wikipage/file/image as one or more files."""
    dlog('exportFile(%s,%s)' % (path,dir))
    vlog(path,newline=False)