Example 1
    def getNormalizerTable(self):
        """ return the normalizer translation table """

        if self.use_normalizer:
            return NormalizerRegistry.get(self.use_normalizer).getTable()
        else:
            return None
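
A translation table of this kind is applied to text before indexing. The table format itself is not shown in these examples, so the following standalone sketch assumes a simple list of (old, new) pairs; SimpleNormalizer and the umlaut table are hypothetical stand-ins, not part of TextIndexNG2:

class SimpleNormalizer:
    """Toy normalizer applying a translation table of (old, new) pairs."""

    def __init__(self, table):
        self._table = table

    def getTable(self):
        # mirror of the getTable() accessor shown above
        return self._table

    def process(self, text):
        # apply every substitution pair in order
        for old, new in self._table:
            text = text.replace(old, new)
        return text

# fold transliterated German umlauts before indexing (illustrative table only)
norm = SimpleNormalizer([('ae', 'a'), ('oe', 'o'), ('ue', 'u')])
assert norm.process('Muenchen') == 'Munchen'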
Example 2
    def lookupWordsByTruncation(self, word, left=0, right=0):
        """ perform right truncation lookup"""

        if self.use_normalizer:
            word = NormalizerRegistry.get(self.use_normalizer).process(word)    

        if not (left or right):
            raise TXNGError("Either 'left' or 'right' truncation must be requested")

        if self.splitter_casefolding: word = word.lower()
        if right:
            words = self._lexicon.getWordsForRightTruncation(word)
        if left:
            if self.truncate_left:
                words = self._lexicon.getWordsForLeftTruncation(word)
            else:
                raise TXNGError("Left truncation not allowed")

        return self._lookup(words, do_autoexpand=0)
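
getWordsForRightTruncation() itself is not shown in these examples; presumably it returns every lexicon word that begins with the given prefix. A minimal standalone sketch of such a prefix scan over a sorted word list (the helper name and the toy lexicon are illustrative only):

import bisect

def words_for_right_truncation(sorted_words, prefix):
    # locate the first candidate, then scan forward while the prefix matches
    lo = bisect.bisect_left(sorted_words, prefix)
    hi = lo
    while hi < len(sorted_words) and sorted_words[hi].startswith(prefix):
        hi += 1
    return sorted_words[lo:hi]

lexicon = ['car', 'care', 'carton', 'cat', 'dog']
assert words_for_right_truncation(lexicon, 'car') == ['car', 'care', 'carton']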
Example 3
    def _lookup(self, words, do_autoexpand=1):
        """ search a word or a list of words in the lexicon and 
            return a ResultSet of found documents.
        """

        docids = IISet()
        used_words = {} 

        #  remove stopwords from data
        if self.use_stopwords:
            words = self.use_stopwords.process( words ) 

        if self.use_thesaurus and self.thesaurus_mode == 'expand_always':
            TH = ThesaurusRegistry.get(self.use_thesaurus)
            for word in words[:]:
                r = TH.getTermsFor(word)
                words.extend(r)

        for word in words:

            # perform casefolding if necessary
            if self.splitter_casefolding:
                word = word.lower()

            if self.use_normalizer:
                word = NormalizerRegistry.get(self.use_normalizer).process(word)    
 
            used_words[word] = 1.0

            wid = self._lexicon.getWordId(word)

            # Retrieve list of docIds for this wordid
            if wid is not None:
                docids.update( self._storage.get(wid) )

            # perform autoexpansion of terms by performing
            # a search using right-truncation
            if do_autoexpand and self.autoexpand and len(word) >= self.autoexpand_limit:
                rs = self.lookupWordsByTruncation(word, right=1)
                docids.update(rs.docIds())
                wlen = len(word)
                for w in rs.words().keys():
                    used_words[w] = TRUNC_WEIGHT[len(w)-wlen]

        return ResultSet(docids, used_words)
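
The heart of the method is plain set arithmetic: docids collects the union of all matching document ids, while used_words records a relevance weight per matched word (1.0 for direct matches, a TRUNC_WEIGHT entry for autoexpanded ones). A toy version of that bookkeeping using plain Python sets and dicts in place of IISet and ResultSet (the weight values are illustrative; the real TRUNC_WEIGHT table is not shown here):

def merge_lookup(direct_hits, expanded_hits, word, trunc_weight):
    # union of all docids, starting from the exact-match hits
    docids = set(direct_hits)
    # the exact match always carries full weight
    used_words = {word: 1.0}
    for w, ids in expanded_hits.items():
        docids.update(ids)
        # autoexpanded words are weighted down by their extra length
        used_words[w] = trunc_weight[len(w) - len(word)]
    return docids, used_words

weights = [1.0, 0.8, 0.6, 0.4]   # illustrative stand-in for TRUNC_WEIGHT
docids, used = merge_lookup({1, 2}, {'cars': {3}}, 'car', weights)
assert docids == {1, 2, 3} and used['cars'] == 0.8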
Example 4
def _index_object(self, documentId, obj, threshold=None, attr=''):

    encoding = self.default_encoding
    source = mimetype = None

    # This is to support foreign file formats that
    # are stored as "File" objects when searching
    # through PrincipiaSearchSource

    if hasattr(obj, 'txng_get'):
        # Check if the object has a method txng_get()
        result = obj.txng_get([attr])
        if result is None: return None
        source, mimetype, encoding = result

    elif obj.meta_type in ('File', 'Portal File', 'Naaya File') and  \
       attr in ('PrincipiaSearchSource', 'SearchableText'):

        source = getattr(obj, attr, None)
        if source and not self.use_converters:
            if callable(source): source = source()
        else:
            source = str(obj)
        mimetype = obj.content_type

    elif obj.meta_type == 'ExtFile' and \
       attr in ('PrincipiaSearchSource', 'SearchableText'):
        source = obj.index_html()
        mimetype = obj.getContentType()

    elif obj.meta_type in ('ZMSFile', ):
        lang = attr[attr.rfind('_') + 1:]
        req = {'lang': lang}
        file = obj.getObjProperty('file', req)
        source = ''
        mimetype = None
        if file:
            source = file.getData()
            mimetype = file.getContentType()

    elif obj.meta_type in ('TTWObject', ) and attr not in ('SearchableText', ):
        field = obj.get(attr)
        source = str(field)
        if field.meta_type in ('ZMSFile', 'File'):
            mimetype = field.getContentType()
        else:
            mimetype = None

    else:
        # default behaviour: try to obtain the source from
        # the attribute or method call return value

        try:
            source = getattr(obj, attr)
            if callable(source): source = source()
            if not isinstance(source, unicode):
                source = str(source)
        except (AttributeError, TypeError):
            return None

    # If enabled, we try to find a valid document converter
    # and convert the data to get a hopefully text only representation
    # of the data.

    if self.use_converters:
        if mimetype is None or mimetype == 'application/octet-stream':
            mimetype, encoding = guess_content_type(obj.getId(), source)
            if not encoding:
                encoding = self.default_encoding

        try:
            converter = ConverterRegistry.get(mimetype)
        except RegistryException:
            LOG(
                'textindexng', ERROR,
                '%s could not be converted because no converter could be found for %s'
                % (obj.absolute_url(1), mimetype))
            return None

        if converter:
            try:
                source, encoding = converter.convert2(source, encoding,
                                                      mimetype)
            except Exception:
                try:
                    source = converter.convert(source)
                except Exception:
                    LOG('textindexng',
                        ERROR,
                        '%s could not be converted' % obj.absolute_url(1),
                        error=sys.exc_info())
                    return None

        if obj.meta_type == 'Portal File':
            source += ' ' + obj.SearchableText()

    # Now we try to get a valid encoding. For unicode strings
    # we have to perform no action. For string objects we check
    # if the document has an attribute (not a method) '<index>_encoding'.
    # As fallback we also check for the presence of an attribute
    # 'document_encoding'. Checking for the two attributes allows
    # us to define different encodings for different attributes
    # on an object. This is useful when an object stores multiple texts
    # as attributes within the same instance (e.g. for multilingual
    # versions of a text but with different encodings).
    # If no encoding is specified as object attribute, we will use
    # Python's default encoding.
    # After getting the encoding, we convert the data to unicode.

    if isinstance(source, str):
        if encoding is None:
            try:
                encoding = self.default_encoding
            except AttributeError:
                encoding = self.default_encoding = 'iso-8859-15'

            for k in ['document_encoding', attr + '_encoding']:
                enc = getattr(obj, k, None)
                if enc is not None: encoding = enc

        if encoding == 'ascii': encoding = 'iso-8859-15'
        try:
            source = unicode(source, encoding, 'strict')
        except UnicodeDecodeError:
            LOG(
                'textindexng', WARNING,
                'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'
                % obj.absolute_url(1))
            source = unicode(source, encoding, 'ignore')

    elif isinstance(source, unicode):
        pass
    else:
        raise TXNGError("unknown object type")

    source = source.strip()
    if not source: return None

    # Normalization: apply translation table to data
    if self.use_normalizer:
        source = NormalizerRegistry.get(self.use_normalizer).process(source)

    # Split the text into a list of words
    SP = SplitterRegistry.get(self.use_splitter)

    _source = source
    words = SP(casefolding=self.splitter_casefolding,
               separator=self.splitter_separators,
               maxlen=self.splitter_max_len,
               singlechar=self.splitter_single_chars).split(_source)

    #  remove stopwords from data
    if self.use_stopwords:
        words = self.use_stopwords.process(words)

    # We pass the list of words to the corresponding lexicon
    # and obtain a list of wordIds. The "old" TextIndex iterated
    # over every single word (overhead).
    return self._lexicon.getWordIdList(words)
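
The hasattr(obj, 'txng_get') branch at the top defines a simple cooperation protocol: an object can take over source extraction itself by returning a (source, mimetype, encoding) triple, or None to skip indexing. A hypothetical content class implementing that hook (the class and its storage are made up; only the txng_get() contract comes from the code above):

class PDFDocument:
    """Hypothetical object that feeds the indexer via txng_get()."""

    def __init__(self, raw_pdf):
        self._raw = raw_pdf

    def txng_get(self, attributes):
        # the indexer passes a list holding the indexed attribute name;
        # return None to skip, or a (source, mimetype, encoding) triple
        if 'SearchableText' not in attributes:
            return None
        # leave encoding to the converter machinery
        return (self._raw, 'application/pdf', None)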
Example 5
    def _index_object(self, documentId, obj, threshold=None, attr=''):

        encoding = self.default_encoding
        source = mimetype = None

        # This is to support foreign file formats that
        # are stored as "File" objects when searching
        # through PrincipiaSearchSource

        if hasattr(obj, 'txng_get'):
            # Check if the object has a method txng_get()
            result = obj.txng_get([attr])
            if result is None: return None
            source, mimetype, encoding = result

        elif obj.meta_type in ('File', 'Portal File') and  \
           attr in ('PrincipiaSearchSource', 'SearchableText'):

            source = getattr(obj, attr, None)
            if source and not self.use_converters:
                if callable(source): source = source()
            else:              
                source = str(obj)
            mimetype = obj.content_type

        elif obj.meta_type == 'ExtFile' and \
           attr in ('PrincipiaSearchSource', 'SearchableText'):
            source = obj.index_html()
            mimetype = obj.getContentType()

        elif obj.meta_type in ('ZMSFile',):
            lang = attr[attr.rfind('_')+1:]
            req = {'lang' : lang}
            file = obj.getObjProperty('file', req)
            source = ''
            mimetype = None
            if file:
                source = file.getData()
                mimetype = file.getContentType()
   
        elif obj.meta_type in ('TTWObject',) and attr not in ('SearchableText', ): 
            field = obj.get(attr)
            source = str(field)
            if field.meta_type in ( 'ZMSFile', 'File' ):
                mimetype = field.getContentType()
            else:
                mimetype = None

        else:
            # default behaviour: try to obtain the source from
            # the attribute or method call return value

            try:
                source = getattr(obj, attr)
                if callable(source): source = source()
                if not isinstance(source, unicode):
                    source = str(source)
            except (AttributeError, TypeError):
                return None
        
        # If enabled, we try to find a valid document converter
        # and convert the data to get a hopefully text only representation
        # of the data.

        if self.use_converters:
            if mimetype is None or mimetype == 'application/octet-stream':
                mimetype, encoding = guess_content_type(obj.getId(), source)
                if not encoding:
                    encoding = self.default_encoding

            try: 
                converter = ConverterRegistry.get(mimetype)
            except RegistryException: 
                LOG('textindexng', ERROR, '%s could not be converted because no converter could be found for %s' % (obj.absolute_url(1), mimetype))
                return None

            if converter:
                try:
                    source, encoding = converter.convert2(source, encoding, mimetype)
                except Exception:
                    try:
                        source = converter.convert(source)
                    except Exception:
                        LOG('textindexng', ERROR, '%s could not be converted' % obj.absolute_url(1), error=sys.exc_info())
                        return None

            if obj.meta_type == 'Portal File': 
                source += ' ' + obj.SearchableText()

        # Now we try to get a valid encoding. For unicode strings
        # we have to perform no action. For string objects we check
        # if the document has an attribute (not a method) '<index>_encoding'.
        # As fallback we also check for the presence of an attribute
        # 'document_encoding'. Checking for the two attributes allows
        # us to define different encodings for different attributes
        # on an object. This is useful when an object stores multiple texts
        # as attributes within the same instance (e.g. for multilingual
        # versions of a text but with different encodings). 
        # If no encoding is specified as object attribute, we will use
        # Python's default encoding.
        # After getting the encoding, we convert the data to unicode.

        if isinstance(source, str):
            if encoding is None:
                try:
                    encoding = self.default_encoding
                except AttributeError:
                    encoding = self.default_encoding = 'iso-8859-15'

                for k in ['document_encoding', attr + '_encoding']:
                    enc = getattr(obj, k, None)
                    if enc is not None: encoding = enc  

            if encoding == 'ascii': encoding = 'iso-8859-15'
            try:
                source = unicode(source, encoding, 'strict')
            except UnicodeDecodeError:
                LOG('textindexng', WARNING, 'UnicodeDecodeError raised from %s - ignoring unknown unicode characters'  % obj.absolute_url(1))
                source = unicode(source, encoding, 'ignore')
 
        elif isinstance(source, unicode):
            pass
        else:
            raise TXNGError("unknown object type")

        source = source.strip()
        if not source: return None

        # Normalization: apply translation table to data
        if self.use_normalizer:
            source = NormalizerRegistry.get(self.use_normalizer).process(source)    
 
        # Split the text into a list of words
        SP = SplitterRegistry.get(self.use_splitter)

        _source = source
        words = SP(casefolding  = self.splitter_casefolding,
                   separator    = self.splitter_separators,
                   maxlen       = self.splitter_max_len,
                   singlechar   = self.splitter_single_chars
                   ).split(_source)

        #  remove stopwords from data
        if self.use_stopwords:
            words = self.use_stopwords.process( words ) 

        # We pass the list of words to the corresponding lexicon
        # and obtain a list of wordIds. The "old" TextIndex iterated
        # over every single word (overhead).
        return self._lexicon.getWordIdList(words)
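
The converter lookup expects objects registered by mimetype that offer convert2(source, encoding, mimetype) returning a (text, encoding) pair, with one-argument convert(source) as the legacy fallback tried on failure. A toy converter satisfying both signatures (the class is illustrative; real entries in the ConverterRegistry do actual format conversion):

class PassThroughConverter:
    """Toy text/plain converter exposing both converter signatures."""

    def convert(self, doc):
        # legacy API: return the extracted text only
        return doc

    def convert2(self, doc, encoding, mimetype):
        # newer API: may also adjust the detected encoding
        return doc, encoding

The nested try/except in _index_object() makes convert2() effectively optional: converters that only implement the older convert() keep working.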
Example 6
# This software is governed by a license. See
# LICENSE.txt for the terms of this license.
#
###########################################################################


# register a normalizer for each bundled translation-table file

import os
from Products.TextIndexNG2.Registry import NormalizerRegistry
from Products.TextIndexNG2.Normalizer import FileNormalizer
from Products.TextIndexNG2 import fast_startup


if not fast_startup:

    _oldcwd = os.getcwd()

    os.chdir(__path__[0])
    files = os.listdir('.')
    files = [x for x in files if x.endswith('.txt')]

    for f in files:
        n = FileNormalizer(f)
        try:
            NormalizerRegistry.register(n.getLanguage(), n)
        except Exception:
            # ignore normalizers that fail to register
            pass

    os.chdir(_oldcwd)
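
The registries used throughout these examples share a small register()/get() surface, keyed by a name such as the normalizer's language. A toy mirror of that interface (SimpleRegistry is illustrative; the real Registry module is not shown here):

class SimpleRegistry:
    """Toy name-to-object registry mirroring register()/get() as used above."""

    def __init__(self):
        self._items = {}

    def register(self, name, obj):
        # refuse duplicates, which the bare try/except above suggests can occur
        if name in self._items:
            raise ValueError('%s is already registered' % name)
        self._items[name] = obj

    def get(self, name):
        return self._items[name]

registry = SimpleRegistry()
registry.register('de', object())
assert registry.get('de') is not None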