def __init__(self, id, extra, caller):
        self.id = id
        self.title = id

        # fields
        fields = [id]  # default
        if get(extra, 'fields', []):
            fields = get(extra, 'fields')

        self.index = Index(
            fields=fields,
            lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
            storage=get(extra, 'storage', DEFAULT_STORAGE),
            splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
            autoexpand=get(extra, 'autoexpand', 'off'),
            autoexpand_limit=get(extra, 'autoexpand_limit', 4),
            query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
            use_stemmer=get(extra, 'use_stemmer', False),
            languages=get(extra, 'languages', ('en', )),
            use_stopwords=bool(get(extra, 'use_stopwords')),
            default_encoding=get(extra, 'default_encoding', DEFAULT_ENCODING),
            use_normalizer=bool(get(extra, 'use_normalizer')),
            dedicated_storage=bool(get(extra, 'dedicated_storage')),
            splitter_casefolding=bool(get(extra, 'splitter_casefolding',
                                          True)),
            splitter_additional_chars=get(extra, 'splitter_additional_chars',
                                          DEFAULT_ADDITIONAL_CHARS),
            index_unknown_languages=bool(
                get(extra, 'index_unknown_languages', True)),
            ranking=bool(get(extra, 'ranking')),
            ranking_method=(get(extra, 'ranking_method', DEFAULT_RANKING)),
        )
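
The `get(extra, name, default)` helper used by this constructor is not shown on this page. Below is a minimal, hypothetical sketch of what such a helper might look like (the helper actually shipped with the package may differ): it reads an option from the ZCatalog `extra` record or mapping and falls back to a default when the option is missing.

# Hypothetical sketch of the `get` helper assumed by the constructor above.
# `extra` is the record-like object (or dict) ZCatalog passes on index creation.
def get(extra, name, default=None):
    if extra is None:
        return default
    if isinstance(extra, dict):
        return extra.get(name, default)
    return getattr(extra, name, default)
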
Example #2
 def testSimpleSplitterOnQuery(self):
     # The simple splitter strips punctuation. Thus foo.bar becomes foobar.
     I = Index(fields=('text', ), languages=('en', ))
     I.index_object(Mock('en', text=u'foo.bar baz'), 1)
     lexicon = I.getLexicon()
     self.assertEquals(['baz', 'foobar'], lexicon.getWordsForLanguage('en'))
     self._test(I, u'foo.bar', 'en', (1, ), 'text')
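
The `_test` helper used throughout these test methods is not shown on this page; it presumably runs a query and compares the resulting docids. A rough equivalent using the public API, reusing the `Mock` helper from these tests, might look like the sketch below (the `language` and `field` keyword names mirror the query options that appear further down this page).

# Usage sketch (assumed equivalent of the _test helper): punctuation is
# stripped at query time as well, so both spellings resolve to the same term.
I = Index(fields=('text',), languages=('en',))
I.index_object(Mock('en', text=u'foo.bar baz'), 1)
print(list(I.search(u'foobar', language='en', field='text').getDocids()))   # [1]
print(list(I.search(u'foo.bar', language='en', field='text').getDocids()))  # [1]
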
Example #5
 def testSplitterWithAdditionalChars(self):
     I = Index(fields=('text', ),
               languages=('en', ),
               splitter_additional_chars='.-+')
     I.index_object(Mock('en', text=u'c++x hello-world algol68'), 1)
     lexicon = I.getLexicon()
     self.assertEquals(['algol68', 'cx', 'hello-world'],
                       lexicon.getWordsForLanguage('en'))
     self._test(I, u'c++x OR xyz', 'en', (1, ), 'text')
     self._test(I, u'c++x', 'en', (1, ), 'text')
Example #6
 def testDE(self):
     I = Index(fields=('text', ),
               languages=('de', ),
               index_unknown_languages=False)
     I.index_object(Mock('de', text=unicode(de1, 'iso-8859-15')), 1)
     # this raises an exception because the index does not know about 'fr' or 'en'
     self.assertRaises(ValueError, I.index_object,
                       Mock('fr', text=unicode(fr1, 'iso-8859-15')), 4)
     self.assertRaises(ValueError, I.index_object,
                       Mock('en', text=unicode(en1, 'iso-8859-15')), 5)
Example #8
 def testStopWords(self):
     I = Index(splitter_casefolding=True,
               use_stopwords=False,
               fields=('foo', ))
     self._test(I, u'the black blue fox', ('the', 'black', 'blue', 'fox'))
     I = Index(splitter_casefolding=True,
               use_stopwords=True,
               fields=('foo', ))
     self._test(I, u'the black blue fox', ('black', 'blue', 'fox'), 'en')
     self._test(I, u'das Auto auf dem garten',
                ('das', 'auto', 'auf', 'dem', 'garten'), 'en')
     self._test(I, u'das Auto auf dem garten',
                ('das', 'auto', 'auf', 'dem', 'garten'), 'xx')
     self._test(I, u'das Auto auf dem garten', ('auto', 'garten'), 'de')
Example #9
 def testIndexWithOneLanguage(self):
     o1 = Mock(text=u'The quick brown fox', language='en')
     o2 = Mock(text=u'der schnelle braune fuchs', language='de')
     o3 = Mock(text=u'je ne sais pas', language='fr')
     I = Index(fields=('text', ),
               dedicated_storage=True,
               languages=('en', ),
               index_unknown_languages=False)
     I.index_object(o1, 1)
     self.assertRaises(ValueError, I.index_object, o2, 2)
     self.assertRaises(ValueError, I.index_object, o2, 3)
     en_words = I._lexicon.getWordsForLanguage('en')
     en_words.sort()
     self.assertEqual(en_words, ['brown', 'fox', 'quick', 'the'])
Example #10
 def testSingleLanguageDependentSearches(self):
     I = Index(fields=('text', ), languages=('de', 'fr', 'en'))
     self.setupIndex(I)
     self._test(I, u'Gleich ihr', 'de', (1, ))
     self._test(I, u'sich', 'de', (1, 2, 3))
     self._test(I, u'"an sich"', 'de', (3, ))
     self._test(I, u'"an sich"', 'fr', ())
     self._test(I, u'"YXXX YYY"', 'de', ())
     self._test(I, u'emanzipation ', 'de', (2, 3))
     self._test(I, u'"denken zur emanzipation" not selbsterhaltung', 'de',
                ())
     self._test(I, u'"conceptualist"', 'en', (
         7,
         9,
     ))
     self._test(I, u'emanzipation -denken', 'de', (3, ))
     self._test(I, u'emanzipation not denken', 'de', (3, ))
     self._test(I, u'emanzipation and not denken', 'de', (3, ))
     self._test(I, u'not denken and emanzipation ', 'de', (3, ))
     self._test(
         I,
         u'"that we have to choose between postpatriarchial conceptualist theory textual and objectivism"',
         'en', ())
     self._test(
         I,
         u'"that we have to choose between postpatriarchial conceptualist theory and textual objectivism"',
         'en', (9, ))
Example #11
    def testWithAndWithoutStopwords(self):
        I = Index(fields=('text', ),
                  languages=('de', 'fr', 'en'),
                  use_stopwords=False)
        self.setupIndex(I)
        self._test(I, u'das opfer wird uns frei machen', 'de', (1, ))

        I = Index(fields=('text', ),
                  languages=('de', 'fr', 'en'),
                  use_stopwords=True)
        self.setupIndex(I)
        # This should give a hit since 'das' should be filtered from the query
        self._test(I, u'das opfer wird uns frei machen', 'de', (1, ))
        self._test(I, u'DaS opfer wird uns frei machen', 'de', (1, ))
        self._test(I, u'sur les pantalons pour homme', 'fr', (5, ))
        self._test(I, u'413 sur les pantalons pour homme', 'fr', (5, ))
Example #14
    def testUnindex2(self):
        """ now with random adding removal """

        index = Index(fields=('text', ),
                      languages=('en', ),
                      splitter_additional_chars='.-+')
        for i in range(1, 200):
            self._addDoc(index, i)

        for iterations in range(100):

            num = random.choice(range(1, 200))
            if random.randint(0, 1) == 0:
                index.unindex_object(num)
            else:
                self._addDoc(index, num)

            result = self.check_storage(index)
            self.assertEqual(result, True)
Example #16
    def testUnindex(self):
        """ check storage consistency with random document removals
        """

        index = Index(fields=('text', ),
                      languages=('en', ),
                      splitter_additional_chars='.-+')
        for i in range(1, 200):
            self._addDoc(index, i)

        # remove all indexed documents in random order
        # and check the consistency of the storage upon
        # each removal

        lst = range(1, 200)
        while lst:
            num = random.choice(lst)
            index.unindex_object(num)
            result = self.check_storage(index)
            self.assertEqual(result, True)
            lst.remove(num)
Example #17
 def testNormalizer(self):
     I = Index(splitter_casefolding=True,
               use_stopwords=False,
               use_normalizer=True,
               fields=('foo', ))
     self._test(I, u'für und über drüben gehen Wir',
                (u'fuer', u'und', u'ueber', u'drueben', u'gehen', u'wir'),
                'de')
     self._test(I, u'fÜr und über drÜben gehen Wir',
                (u'fuer', u'und', u'ueber', u'drueben', u'gehen', u'wir'),
                'de')
     self._test(I, u'für und über drüben gehen Wir',
                (u'für', u'und', u'über', u'drüben', u'gehen', u'wir'),
                'en')
Example #18
    def testRanking(self):
        r = ResultSet(DocidList((2, 3)), (('foo', 5), ))

        called = []
        result = object()

        def ranking_function(*args):
            called.append(args)
            return result

        index = Index()
        r.ranking(ranking_function, index)
        self.assertEquals(1, len(called))
        self.assertEquals((index, r, config.DEFAULT_LANGUAGE, 50), called[0])
        self.assertEquals(result, r.ranked_results)
Example #19
 def testSearchAllFields(self):
     o1 = Mock('en', text=u'The quick brown fox', title=u'Fox')
     o2 = Mock('en', text=u'Mary had a little lamb.', title=u'Quick Mary')
     o3 = Mock('en', text=u'Pop goes the weasel!', title=u'Weasel')
     I = Index(fields=('title', 'text'), languages=('en', ))
     I.index_object(o1, 1)
     I.index_object(o2, 2)
     I.index_object(o3, 3)
     res = I.search(u'quick')
     self.assertEquals([2], list(res.getDocids()))
     res = I.search(u'quick', search_all_fields=True)
     self.assertEquals([1, 2], list(res.getDocids()))
     self.assertRaises(ValueError,
                       I.search,
                       u'quick',
                       field='text',
                       search_all_fields=True)
Example #21
 def testGermanStemmer(self):
     I = Index(fields=('text', ),
               languages=('de', 'fr', 'en'),
               use_stemmer=True)
     self.setupIndex(I)
     self._test(I, u'Gleich ihr', 'de', (1, ))
     self._test(I, u'Gleiche ihren', 'de', (1, ))
     self._test(I, u'existentiellen Eigentlichen', 'de', (1, ))
     self._test(I, u'existentiell Eigentlich', 'de', (1, ))
     self._test(I, u'"existentiellen Eigentlichen"', 'de', (1, ))
     self._test(I, u'"existentiell Eigentlicher"', 'de', (1, ))
     self._test(I, u'"existentiell Eigentliche"', 'de', (1, ))
     # enabled stemming -> no wildcard searches supported
     self.assertRaises(ValueError, I.search, 'existentiell*')
     self.assertRaises(ValueError, I.search, 'existent?foo')
     self.assertRaises(ValueError, I.search, '*a')
Example #23
 def testMultipleFieldsMultipleLanguages(self):
     I = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
     I.index_object(
         Mock('de',
              text=unicode(de1, 'iso-8859-15'),
              author=u'Andreas Jung'), 1)
     I.index_object(
         Mock('de', text=unicode(de2, 'iso-8859-15'),
              author=u'Andrea Jung'), 2)
     I.index_object(
         Mock('de', text=unicode(de3, 'iso-8859-15'),
              author=u'der Nasbär'), 3)
     self._test(I, u'andreas jung', 'en', ())
     self._test(I, u'andreas jung', 'de', ())
     self._test(I, u'andreas jung', 'de', (1, ), 'author')
     self._test(I, u'jung   andreas', 'de', (1, ), 'author')
     self._test(I, u'"jung   andreas"', 'de', (), 'author')
     self._test(I, u'"andreas jung"', 'de', (1, ), 'author')
     self._test(I, u'andrea jung', 'de', (2, ), 'author')
     self._test(I, u'andreas jung', 'de', (1, ), 'author')
     self._test(I, u'na*', 'de', (3, ), 'author')
Example #24
 def testSplitterOnQueryWithDefaultSplitter(self):
     from zopyx.txng3.core.splitter import SplitterFactory
     provideUtility(SplitterFactory,
                    zope.component.interfaces.IFactory,
                    name='txng.splitters.default')
     I = Index(fields=('text', ),
               languages=('en', ),
               splitter_additional_chars='-',
               splitter='txng.splitters.default')
     I.index_object(Mock('en', text=u'asdf abc.de-Efgh bla bla fasel'), 1)
     I.index_object(Mock('en', text=u'asdf abc de-Efgh bla bla fasel'), 2)
     I.index_object(Mock('en', text=u'asdf abc'), 3)
     lexicon = I.getLexicon()
     self._test(I, u'abc.de-Efgh', 'en', (1, 2), 'text')
     # Test with a more complex query
     self._test(I, u'sth OR abc.de-Efgh', 'en', (1, 2), 'text')
     # Test with a "not" query
     self._test(I, u'asdf AND NOT abc.de-Efgh', 'en', (3, ), 'text')
Example #25
    def test_ranking_method(self):
        result = []
        called = []

        def ranking(*args):
            called.append(args)
            return result

        provideUtility(ranking, IRanking, name='testranking')
        provideUtility(
            zopyx.txng3.core.storage.StorageWithTermFrequencyFactory,
            zope.component.interfaces.IFactory,
            name='freq')

        index = Index(fields=('text', ),
                      storage='freq',
                      ranking=True,
                      ranking_method='testranking')
        result = index.search(u'foo')
        self.assertEquals(1, len(called))
        self.assertEquals((index, result, DEFAULT_LANGUAGE, 50), called[0])
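
Building on the registrations made in the test above, a ranking-enabled index could be exercised end to end roughly as sketched below. This is an assumed usage pattern: the `ranking` and `ranking_maxhits` keywords mirror the query options listed further down this page, `'txng.ranking.cosine'` is the default ranking method asserted in a later example, and the sketch presumes the package's component registrations (splitter, parser, ranking utility) are loaded.

# Assumed usage of a ranking-enabled index (reuses the 'freq' storage factory
# registered in the test above and the Mock helper from these tests).
index = Index(fields=('text',),
              storage='freq',
              ranking=True,
              ranking_method='txng.ranking.cosine')
index.index_object(Mock('en', text=u'foo bar foo'), 1)
result = index.search(u'foo', ranking=True, ranking_maxhits=50)
print(result.getRankedResults())   # ranked hits, e.g. (docid, score) pairs -- assumed shape
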
Example #32
 def testIndexAndUnindex(self):
     I = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
     I.index_object(
         Mock('de',
              text=unicode(de1, 'iso-8859-15'),
              author=u'Andreas Jung'), 1)
     I.index_object(
         Mock('de', text=unicode(de2, 'iso-8859-15'),
              author=u'Andrea Jung'), 2)
     I.index_object(
         Mock('de', text=unicode(de3, 'iso-8859-15'),
              author=u'der Nasbär'), 3)
     self._test(I, u'andreas jung', 'de', (1, ), 'author')
     I.unindex_object(1)
     I.unindex_object(2)
     I.unindex_object(3)
     I.unindex_object(9999)
     self._test(I, u'andreas jung', 'de', (), 'author')
     I.index_object(
         Mock('de',
              text=unicode(de1, 'iso-8859-15'),
              author=u'Andreas Jung'), 1)
     self._test(I, u'andreas jung', 'de', (1, ), 'author')
     self._test(I, u'andreas jung', 'de', (), 'text')
     self._test(I, u'das opfer wird', 'de', (1, ), 'text')
     I.index_object(
         Mock('de', text=unicode(de2, 'iso-8859-15'),
              author=u'Andrea Jung'), 1)
     self._test(I, u'andrea jung', 'de', (1, ), 'author')
Example #33
 def testEmptyQuery(self):
     I = Index(fields=('foo', ))
     self.assertRaises(ValueError, I.search, query='')
Example #34
 def testReindex2(self):
     I = Index(fields=('text', ), languages=('de', ))
     I.index_object(Mock('de', text=u'foo bar'), 1)
     I.index_object(Mock('de', text=u'foo'), 1)
     self._test(I, u'bar', 'de', (), 'text')
     self._test(I, u'foo', 'de', (1, ), 'text')
Example #35
 def __init__(self,
              field_name=None,
              interface=None,
              field_callable=False,
              use_stemmer=defaults['use_stemmer'],
              dedicated_storage=defaults['dedicated_storage'],
              ranking=defaults['ranking'],
              use_normalizer=defaults['use_normalizer'],
              languages=defaults['languages'],
              use_stopwords=defaults['use_stopwords'],
              autoexpand_limit=defaults['autoexpand_limit'],
              splitter=defaults['splitter'],
              index_unknown_languages=defaults['index_unknown_languages'],
              query_parser=defaults['query_parser'],
              lexicon=defaults['lexicon'],
              splitter_additional_chars=defaults['splitter_add_chars'],
              storage=defaults['storage'],
              splitter_casefolding=defaults['splitter_casefolding'],
              asIFSet=True):
     if ranking:
         util = createObject(storage)
         if not IStorageWithTermFrequency.providedBy(util):
             raise ValueError("This storage cannot be used for ranking")
     if isinstance(field_name, basestring):
         _fields = field_name.split(' ')
     else:
         _fields = field_name
     zope.catalog.attribute.AttributeIndex.__init__(self, _fields[0],
                                                    interface,
                                                    field_callable)
     if len(_fields) < 2:
         dedicated_storage = False
     _default_fields = [_fields[0]]
     self._index = Index(
         fields=_fields,
         languages=languages.split(' '),
         use_stemmer=use_stemmer,
         dedicated_storage=dedicated_storage,
         ranking=ranking,
         use_normalizer=use_normalizer,
         use_stopwords=use_stopwords,
         storage=storage,
         autoexpand_limit=autoexpand_limit,
         splitter=splitter,
         lexicon=lexicon,
         index_unknown_languages=index_unknown_languages,
         query_parser=query_parser,
         splitter_additional_chars=splitter_additional_chars,
         splitter_casefolding=splitter_casefolding)
     self.languages = languages
     self.use_stemmer = use_stemmer
     self.dedicated_storage = dedicated_storage
     self.ranking = ranking
     self.use_normalizer = use_normalizer
     self.use_stopwords = use_stopwords
     self.interface = interface
     self.storage = storage
     self.autoexpand_limit = autoexpand_limit
     self.default_fields = _default_fields
     self._fields = _fields
     self.splitter = splitter
     self.lexicon = lexicon
     self.index_unknown_languages = index_unknown_languages
     self.query_parser = query_parser
     self.splitter_additional_chars = splitter_additional_chars
     self.splitter_casefolding = splitter_casefolding
     self._asIFSet = asIFSet
Example #37
 def testSettings(self):
     I = Index(fields=('foo', ))
     self.assertEqual(I.splitter, 'txng.splitters.simple')
Example #38
 def test_default_ranking_is_cosine(self):
     index = Index()
     self.assertEquals('txng.ranking.cosine', index.ranking_method)
Example #39
 def testSearchAllFieldsNotSupportWithoutDedicatedStorage(self):
     I = Index(fields=('title', 'text'), dedicated_storage=False)
     self.assertRaises(ValueError,
                       I.search,
                       u'quick',
                       search_all_fields=True)
class TextIndexNG3(SimpleItem, PropertyManager):

    implements(ITextIndexNG3, IPluggableIndex)

    meta_type = 'TextIndexNG3'
    default_encoding = 'iso-8859-15'  # I think we don't need this anymore
    management_page_charset = 'utf-8'  # needed for several ZMI methods
    manage_options = ( {'label' : 'Index', 'action': 'manage_workspace'},
                       {'label' : 'Vocabulary', 'action' : 'vocabularyform'},
                       {'label' : 'Test', 'action' : 'queryform'},
                       {'label' : 'Converters', 'action' : 'converters'},
                       {'label' : 'Thesaurus', 'action' : 'thesaurus'},
                       {'label' : 'Adapters', 'action' : 'adapters'},
                     ) +\
                     SimpleItem.manage_options + \
                     PropertyManager.manage_options

    query_options = ('query', 'encoding', 'parser', 'language', 'field',
                     'autoexpand', 'similarity_ratio', 'ranking',
                     'ranking_maxhits', 'thesaurus', 'search_all_fields')

    def __init__(self, id, extra, caller):
        self.id = id
        self.title = id

        # fields
        fields = [id]  # default
        if get(extra, 'fields', []):
            fields = get(extra, 'fields')

        self.index = Index(
            fields=fields,
            lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
            storage=get(extra, 'storage', DEFAULT_STORAGE),
            splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
            autoexpand=get(extra, 'autoexpand', 'off'),
            autoexpand_limit=get(extra, 'autoexpand_limit', 4),
            query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
            use_stemmer=get(extra, 'use_stemmer', False),
            languages=get(extra, 'languages', ('en', )),
            use_stopwords=bool(get(extra, 'use_stopwords')),
            default_encoding=get(extra, 'default_encoding', DEFAULT_ENCODING),
            use_normalizer=bool(get(extra, 'use_normalizer')),
            dedicated_storage=bool(get(extra, 'dedicated_storage')),
            splitter_casefolding=bool(get(extra, 'splitter_casefolding',
                                          True)),
            splitter_additional_chars=get(extra, 'splitter_additional_chars',
                                          DEFAULT_ADDITIONAL_CHARS),
            index_unknown_languages=bool(
                get(extra, 'index_unknown_languages', True)),
            ranking=bool(get(extra, 'ranking')),
            ranking_method=(get(extra, 'ranking_method', DEFAULT_RANKING)),
        )

    def clear(self):
        """ clear the index """
        self.index.clear()

    def index_object(self, docid, obj, threshold=None):
        result = self.index.index_object(obj, docid)
        return int(result)

    def unindex_object(self, docid):
        self.index.unindex_object(docid)
        return 1

    def getIndexSourceNames(self):
        """ return indexed fields """
        return self.index.fields

    def getIndexQueryNames(self):
        """ Return queryable parameters """
        return [self.id]

    def indexSize(self):
        return len(self.index.getLexicon())

    def getEntryForObject(self, docid, default=None):
        """Get all information contained for 'docid'.

        Returns a string representing a mapping field -> list of indexed words
        for dedicated storages or a list of indexed words for shared storage.
        """
        getWord = self.index.getLexicon().getWord
        d = {}
        for field in self.index.fields:
            try:
                wids = self.index.getStorage(field).getWordIdsForDocId(docid)
            except StorageException:
                wids = ()
            words = [getWord(wid) for wid in wids]
            d[field] = words
        if not self.index.dedicated_storage:
            return repr(d[self.index.fields[0]])
        return repr(d)

    def _apply_index(self, request, cid=''):

        # parse the query options
        record = parseIndexRequest(request, self.getId(), self.query_options)
        if record.keys is None:
            return None

        # prepare query (must be unicode string)
        query = record.keys[0]
        if not isinstance(query, str):
            query = str(query,
                        record.get('encoding', self.index.default_encoding),
                        'ignore')
        if not query:
            return None

        # options
        options = {}
        for k in ('parser', 'language', 'field', 'autoexpand',
                  'similarity_ratio', 'thesaurus', 'ranking',
                  'ranking_maxhits', 'search_all_fields'):
            v = getattr(record, k, marker)
            if v is not marker:
                options[k] = v

        result = self.index.search(query, **options)
        ranked_resultset = result.getRankedResults()
        if ranked_resultset:
            return ranked_resultset, self.id
        else:
            return result.getDocids(), self.id

    def __len__(self):
        return len(self.index)

    numObjects = __len__

    def manage_workspace(self, REQUEST):
        """ redirect to manage since we can not override manage_workspace
            through a Five browser view
        """
        from zope.component import getMultiAdapter
        view = getMultiAdapter((self, REQUEST), name='manageform')
        return view()
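
Seen from the ZCatalog side, `_apply_index` above reads the query for this index from the request and recognises the parameters listed in `query_options`. The sketch below shows the assumed shape of such a catalog query; the index and catalog names (`SearchableText`, `catalog`) are illustrative only and not taken from this page.

# Assumed shape of a ZCatalog query against a TextIndexNG3 index named
# 'SearchableText': a plain string works, or a mapping with 'query' plus any
# of the query_options listed above.
results = catalog.searchResults(SearchableText=u'opfer frei')
results = catalog.searchResults(
    SearchableText={'query': u'opfer frei',
                    'language': 'de',
                    'field': 'text',
                    'ranking': True,
                    'ranking_maxhits': 50})
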
Example #41
 def testEmpty(self):
     I = Index(fields=('oo', ))
     self._test(I, u'', ())
Example #42
 def testSplitter(self):
     I = Index(splitter_casefolding=False, fields=('foo', ))
     self._test(I, u'a B c', ('a', 'B', 'c'))
Example #43
 def testSetup(self):
     I = Index(fields=('text', ), languages=('de', 'fr', 'en'))
     self.setupIndex(I)
Example #44
 def testBBBCosineRanking(self):
     r = ResultSet(DocidList((2, 3)), (('foo', 5), ))
     index = Index()
     r.cosine_ranking(index)
Example #45
class TingIndex(zope.catalog.text.TextIndex, persistent.Persistent):

    zope.interface.implements(zope.index.interfaces.IInjection,
                              zope.index.interfaces.IStatistics,
                              zope.index.interfaces.IIndexSearch, ITingIndex)

    def __init__(self,
                 field_name=None,
                 interface=None,
                 field_callable=False,
                 use_stemmer=defaults['use_stemmer'],
                 dedicated_storage=defaults['dedicated_storage'],
                 ranking=defaults['ranking'],
                 use_normalizer=defaults['use_normalizer'],
                 languages=defaults['languages'],
                 use_stopwords=defaults['use_stopwords'],
                 autoexpand_limit=defaults['autoexpand_limit'],
                 splitter=defaults['splitter'],
                 index_unknown_languages=defaults['index_unknown_languages'],
                 query_parser=defaults['query_parser'],
                 lexicon=defaults['lexicon'],
                 splitter_additional_chars=defaults['splitter_add_chars'],
                 storage=defaults['storage'],
                 splitter_casefolding=defaults['splitter_casefolding'],
                 asIFSet=True):
        if ranking:
            util = createObject(storage)
            if not IStorageWithTermFrequency.providedBy(util):
                raise ValueError("This storage cannot be used for ranking")
        if isinstance(field_name, basestring):
            _fields = field_name.split(' ')
        else:
            _fields = field_name
        zope.catalog.attribute.AttributeIndex.__init__(self, _fields[0],
                                                       interface,
                                                       field_callable)
        if len(_fields) < 2:
            dedicated_storage = False
        _default_fields = [_fields[0]]
        self._index = Index(
            fields=_fields,
            languages=languages.split(' '),
            use_stemmer=use_stemmer,
            dedicated_storage=dedicated_storage,
            ranking=ranking,
            use_normalizer=use_normalizer,
            use_stopwords=use_stopwords,
            storage=storage,
            autoexpand_limit=autoexpand_limit,
            splitter=splitter,
            lexicon=lexicon,
            index_unknown_languages=index_unknown_languages,
            query_parser=query_parser,
            splitter_additional_chars=splitter_additional_chars,
            splitter_casefolding=splitter_casefolding)
        self.languages = languages
        self.use_stemmer = use_stemmer
        self.dedicated_storage = dedicated_storage
        self.ranking = ranking
        self.use_normalizer = use_normalizer
        self.use_stopwords = use_stopwords
        self.interface = interface
        self.storage = storage
        self.autoexpand_limit = autoexpand_limit
        self.default_fields = _default_fields
        self._fields = _fields
        self.splitter = splitter
        self.lexicon = lexicon
        self.index_unknown_languages = index_unknown_languages
        self.query_parser = query_parser
        self.splitter_additional_chars = splitter_additional_chars
        self.splitter_casefolding = splitter_casefolding
        self._asIFSet = asIFSet

    def clear(self):
        self._index.clear()

    def documentCount(self):
        """See interface IStatistics
        """
        return len(self._index.getStorage(self.default_fields[0]))

    def wordCount(self):
        """See interface IStatistics
        """
        return len(self._index.getLexicon())

    def index_doc(self, docid, value):
        """See interface IInjection
        """
        if value is not None:
            self._index.index_object(value, docid)

    def unindex_doc(self, docid):
        """See interface IInjection
        """
        self._index.unindex_object(docid)

    def apply(self, query):
        kw = dict()
        if isinstance(query, dict):
            kw.update(query)
            query = kw['query']
            del kw['query']
        res = self._index.search(query, **kw).getDocids()
        if self._asIFSet:
            return BTrees.IFBTree.IFSet(res)
        return res
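
For context, `TingIndex.apply` above accepts either a plain unicode query or a dict carrying a 'query' key plus extra keyword options for the underlying search. A hypothetical usage sketch follows; the `Doc` class is invented for illustration, and the sketch assumes the package's component registrations (splitter, parser, lexicon, storage factories) are loaded and that indexed objects expose the configured field and a `language` attribute, as the `Mock` objects in the tests above do.

# Hypothetical usage sketch for TingIndex (Doc is made up for illustration).
class Doc(object):
    def __init__(self, text, language='en'):
        self.text = text
        self.language = language

idx = TingIndex(field_name='text', languages='en')
idx.index_doc(1, Doc(u'The quick brown fox'))
matches = idx.apply(u'quick')                               # IFSet containing docid 1
matches = idx.apply({'query': u'quick', 'field': 'text'})   # same query, with extra options
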