Exemple #1
0
 def testStopWords(self):
     """Check which tokens come back with stopword filtering off and on."""
     english = u'the black blue fox'
     german = u'das Auto auf dem garten'
     german_all = ('das', 'auto', 'auf', 'dem', 'garten')

     # Filtering disabled: every word, including 'the', is expected back.
     unfiltered = Index(splitter_casefolding=True,
                        use_stopwords=False,
                        fields=('foo', ))
     self._test(unfiltered, english, ('the', 'black', 'blue', 'fox'))

     # Filtering enabled: the expected tokens depend on the language passed.
     filtered = Index(splitter_casefolding=True,
                      use_stopwords=True,
                      fields=('foo', ))
     expectations = (
         (english, ('black', 'blue', 'fox'), 'en'),
         # The German sentence is expected unchanged for 'en' and 'xx'.
         (german, german_all, 'en'),
         (german, german_all, 'xx'),
         (german, ('auto', 'garten'), 'de'),
     )
     for text, tokens, language in expectations:
         self._test(filtered, text, tokens, language)
Exemple #2
0
 def testSingleLanguageDependentSearches(self):
     """Run a series of queries, each restricted to a single language."""
     index = Index(fields=('text', ), languages=('de', 'fr', 'en'))
     self.setupIndex(index)

     # (query, language, expected document ids), checked in this order.
     searches = (
         (u'Gleich ihr', 'de', (1, )),
         (u'sich', 'de', (1, 2, 3)),
         (u'"an sich"', 'de', (3, )),
         (u'"an sich"', 'fr', ()),
         (u'"YXXX YYY"', 'de', ()),
         (u'emanzipation ', 'de', (2, 3)),
         (u'"denken zur emanzipation" not selbsterhaltung', 'de', ()),
         (u'"conceptualist"', 'en', (7, 9)),
         # The different spellings of negation all expect the same hit.
         (u'emanzipation -denken', 'de', (3, )),
         (u'emanzipation not denken', 'de', (3, )),
         (u'emanzipation and not denken', 'de', (3, )),
         (u'not denken and emanzipation ', 'de', (3, )),
         # The two phrases differ only in the position of "textual".
         (u'"that we have to choose between postpatriarchial conceptualist theory textual and objectivism"',
          'en', ()),
         (u'"that we have to choose between postpatriarchial conceptualist theory and textual objectivism"',
          'en', (9, )),
     )
     for query, language, docids in searches:
         self._test(index, query, language, docids)
    def __init__(self, id, extra, caller):
        """Store the id and build the wrapped Index from the ``extra`` options.

        ``id`` is stored as both ``self.id`` and ``self.title`` and is the
        only indexed field unless ``extra`` carries a non-empty 'fields'
        value.  All remaining Index options are read from ``extra`` through
        ``get`` with the fallbacks given below.  ``caller`` is not used in
        this method.
        """
        self.id = id
        self.title = id

        def option(name, *fallback):
            # Plain lookup of one setting in ``extra``.
            return get(extra, name, *fallback)

        def flag(name, *fallback):
            # Lookup of one setting, coerced to a boolean.
            return bool(get(extra, name, *fallback))

        # Explicitly configured fields take precedence over the id default.
        fields = option('fields') if option('fields', []) else [id]

        settings = dict(
            fields=fields,
            lexicon=option('lexicon', DEFAULT_LEXICON),
            storage=option('storage', DEFAULT_STORAGE),
            splitter=option('splitter', DEFAULT_SPLITTER),
            autoexpand=option('autoexpand', 'off'),
            autoexpand_limit=option('autoexpand_limit', 4),
            query_parser=option('query_parser', 'txng.parsers.en'),
            use_stemmer=option('use_stemmer', False),
            languages=option('languages', ('en', )),
            use_stopwords=flag('use_stopwords'),
            default_encoding=option('default_encoding', DEFAULT_ENCODING),
            use_normalizer=flag('use_normalizer'),
            dedicated_storage=flag('dedicated_storage'),
            splitter_casefolding=flag('splitter_casefolding', True),
            splitter_additional_chars=option('splitter_additional_chars',
                                             DEFAULT_ADDITIONAL_CHARS),
            index_unknown_languages=flag('index_unknown_languages', True),
            ranking=flag('ranking'),
            ranking_method=option('ranking_method', DEFAULT_RANKING),
        )
        self.index = Index(**settings)
Exemple #4
0
 def testIndexAndUnindex(self):
     """Index, unindex and re-index documents and check the resulting hits."""
     index = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))

     def add(raw_text, author, docid):
         # Decode the raw text as iso-8859-15 and index it as a 'de' object.
         document = Mock('de',
                         text=unicode(raw_text, 'iso-8859-15'),
                         author=author)
         index.index_object(document, docid)

     add(de1, u'Andreas Jung', 1)
     add(de2, u'Andrea Jung', 2)
     add(de3, u'der Nasbär', 3)
     self._test(index, u'andreas jung', 'de', (1, ), 'author')

     # 9999 was never indexed above; it is unindexed along with the others.
     for docid in (1, 2, 3, 9999):
         index.unindex_object(docid)
     self._test(index, u'andreas jung', 'de', (), 'author')

     # Index the first document again; only the matching field should hit.
     add(de1, u'Andreas Jung', 1)
     self._test(index, u'andreas jung', 'de', (1, ), 'author')
     self._test(index, u'andreas jung', 'de', (), 'text')
     self._test(index, u'das opfer wird', 'de', (1, ), 'text')

     # Index different content under the already used docid 1.
     add(de2, u'Andrea Jung', 1)
     self._test(index, u'andrea jung', 'de', (1, ), 'author')
Exemple #5
0
    def testWithAndWithoutStopwords(self):
        """The same queries must hit with stopword filtering off and on."""
        common = dict(fields=('text', ), languages=('de', 'fr', 'en'))

        plain = Index(use_stopwords=False, **common)
        self.setupIndex(plain)
        self._test(plain, u'das opfer wird uns frei machen', 'de', (1, ))

        filtering = Index(use_stopwords=True, **common)
        self.setupIndex(filtering)
        # The first query should give a hit since 'das' should be filtered
        # from the query.
        searches = (
            (u'das opfer wird uns frei machen', 'de', (1, )),
            (u'DaS opfer wird uns frei machen', 'de', (1, )),
            (u'sur les pantalons pour homme', 'fr', (5, )),
            (u'413 sur les pantalons pour homme', 'fr', (5, )),
        )
        for query, language, docids in searches:
            self._test(filtering, query, language, docids)
Exemple #6
0
 def testSimpleSplitterOnQuery(self):
     """Check that 'foo.bar' is stored as 'foobar' and still found.

     The simple splitter strips punctuation, so the lexicon is expected to
     hold 'foobar'; a search for 'foo.bar' must nevertheless return docid 1.
     """
     I = Index(fields=('text', ), languages=('en', ))
     I.index_object(Mock('en', text=u'foo.bar baz'), 1)
     lexicon = I.getLexicon()
     self.assertEqual(['baz', 'foobar'], lexicon.getWordsForLanguage('en'))
     self._test(I, u'foo.bar', 'en', (1, ), 'text')
Exemple #7
0
 def testDE(self):
     """An index limited to 'de' must refuse 'fr' and 'en' documents."""
     index = Index(fields=('text', ),
                   languages=('de', ),
                   index_unknown_languages=False)
     index.index_object(Mock('de', text=unicode(de1, 'iso-8859-15')), 1)

     # Each of these raises an exception because the index does not know
     # about 'fr' or 'en'.
     rejected = (('fr', fr1, 4), ('en', en1, 5))
     for language, raw_text, docid in rejected:
         document = Mock(language, text=unicode(raw_text, 'iso-8859-15'))
         self.assertRaises(ValueError, index.index_object, document, docid)
Exemple #8
0
 def testSplitterWithAdditionalChars(self):
     """Check the lexicon and queries when '.-+' are additional characters."""
     I = Index(fields=('text', ),
               languages=('en', ),
               splitter_additional_chars='.-+')
     I.index_object(Mock('en', text=u'c++x hello-world algol68'), 1)
     lexicon = I.getLexicon()
     self.assertEqual(['algol68', 'cx', 'hello-world'],
                      lexicon.getWordsForLanguage('en'))
     # The query text must resolve to the same terms as the indexed text.
     self._test(I, u'c++x OR xyz', 'en', (1, ), 'text')
     self._test(I, u'c++x', 'en', (1, ), 'text')
Exemple #9
0
 def testNormalizer(self):
     """Umlauts are expected transliterated for 'de' and kept for 'en'."""
     index = Index(splitter_casefolding=True,
                   use_stopwords=False,
                   use_normalizer=True,
                   fields=('foo', ))
     transliterated = (u'fuer', u'und', u'ueber', u'drueben', u'gehen',
                       u'wir')
     untouched = (u'für', u'und', u'über', u'drüben', u'gehen', u'wir')

     # Lower- and upper-case umlauts must give the same 'de' result.
     self._test(index, u'für und über drüben gehen Wir', transliterated,
                'de')
     self._test(index, u'fÜr und über drÜben gehen Wir', transliterated,
                'de')
     # For 'en' the umlauts are expected to stay as they are.
     self._test(index, u'für und über drüben gehen Wir', untouched, 'en')
Exemple #10
0
 def testIndexWithOneLanguage(self):
     """An 'en'-only index takes the English object and refuses the others."""
     o1 = Mock(text=u'The quick brown fox', language='en')
     o2 = Mock(text=u'der schnelle braune fuchs', language='de')
     o3 = Mock(text=u'je ne sais pas', language='fr')
     I = Index(fields=('text', ),
               dedicated_storage=True,
               languages=('en', ),
               index_unknown_languages=False)
     I.index_object(o1, 1)
     # Both foreign-language objects must be rejected: the German one and
     # the French one.
     self.assertRaises(ValueError, I.index_object, o2, 2)
     self.assertRaises(ValueError, I.index_object, o3, 3)
     # sorted() leaves the list handed out by the lexicon unmodified.
     en_words = sorted(I._lexicon.getWordsForLanguage('en'))
     self.assertEqual(en_words, ['brown', 'fox', 'quick', 'the'])
Exemple #11
0
    def testRanking(self):
        """ResultSet.ranking must call the function once and keep its result.

        The ranking function is expected to receive the index, the result
        set, the default language and the value 50.
        """
        r = ResultSet(DocidList((2, 3)), (('foo', 5), ))

        called = []
        result = object()

        def ranking_function(*args):
            # Record the call so the arguments can be inspected afterwards.
            called.append(args)
            return result

        index = Index()
        r.ranking(ranking_function, index)
        self.assertEqual(1, len(called))
        self.assertEqual((index, r, config.DEFAULT_LANGUAGE, 50), called[0])
        self.assertEqual(result, r.ranked_results)
Exemple #12
0
 def testGermanStemmer(self):
     """With stemming on, inflected forms find doc 1 and wildcards fail."""
     index = Index(fields=('text', ),
                   languages=('de', 'fr', 'en'),
                   use_stemmer=True)
     self.setupIndex(index)

     # Differently inflected spellings, all expected to hit document 1.
     inflected = (
         u'Gleich ihr',
         u'Gleiche ihren',
         u'existentiellen Eigentlichen',
         u'existentiell Eigentlich',
         u'"existentiellen Eigentlichen"',
         u'"existentiell Eigentlicher"',
         u'"existentiell Eigentliche"',
     )
     for query in inflected:
         self._test(index, query, 'de', (1, ))

     # enabled stemming -> no wildcard searches supported
     for pattern in ('existentiell*', 'existent?foo', '*a'):
         self.assertRaises(ValueError, index.search, pattern)
Exemple #13
0
 def testSearchAllFields(self):
     """Compare a default search with ``search_all_fields=True``.

     Without the flag only document 2 is expected for 'quick'; with the
     flag documents 1 and 2 are expected.  Passing both ``field`` and
     ``search_all_fields`` must raise ValueError.
     """
     o1 = Mock('en', text=u'The quick brown fox', title=u'Fox')
     o2 = Mock('en', text=u'Mary had a little lamb.', title=u'Quick Mary')
     o3 = Mock('en', text=u'Pop goes the weasel!', title=u'Weasel')
     I = Index(fields=('title', 'text'), languages=('en', ))
     I.index_object(o1, 1)
     I.index_object(o2, 2)
     I.index_object(o3, 3)
     res = I.search(u'quick')
     self.assertEqual([2], list(res.getDocids()))
     res = I.search(u'quick', search_all_fields=True)
     self.assertEqual([1, 2], list(res.getDocids()))
     self.assertRaises(ValueError,
                       I.search,
                       u'quick',
                       field='text',
                       search_all_fields=True)
Exemple #14
0
 def testSplitterOnQueryWithDefaultSplitter(self):
     """Query 'abc.de-Efgh' through the 'txng.splitters.default' splitter.

     The splitter factory is registered under that name first, then three
     documents are indexed with '-' as an additional splitter character.
     """
     from zopyx.txng3.core.splitter import SplitterFactory
     provideUtility(SplitterFactory,
                    zope.component.interfaces.IFactory,
                    name='txng.splitters.default')
     I = Index(fields=('text', ),
               languages=('en', ),
               splitter_additional_chars='-',
               splitter='txng.splitters.default')
     I.index_object(Mock('en', text=u'asdf abc.de-Efgh bla bla fasel'), 1)
     I.index_object(Mock('en', text=u'asdf abc de-Efgh bla bla fasel'), 2)
     I.index_object(Mock('en', text=u'asdf abc'), 3)
     self._test(I, u'abc.de-Efgh', 'en', (1, 2), 'text')
     # Test with a more complex query
     self._test(I, u'sth OR abc.de-Efgh', 'en', (1, 2), 'text')
     # Test with a "not" query
     self._test(I, u'asdf AND NOT abc.de-Efgh', 'en', (3, ), 'text')
Exemple #15
0
    def testUnindex2(self):
        """Randomly add or remove documents, checking storage every time."""
        index = Index(fields=('text', ),
                      languages=('en', ),
                      splitter_additional_chars='.-+')
        for docid in range(1, 200):
            self._addDoc(index, docid)

        for _ in range(100):
            docid = random.choice(range(1, 200))
            # Coin flip: 0 removes the document, 1 adds it.
            remove = random.randint(0, 1) == 0
            if remove:
                index.unindex_object(docid)
            else:
                self._addDoc(index, docid)

            self.assertEqual(self.check_storage(index), True)
Exemple #16
0
    def test_ranking_method(self):
        """The utility named by ``ranking_method`` is called once by a search.

        A recording ranking utility and a term-frequency storage factory are
        registered, then a search is run on an index configured to use both.
        """
        # Value handed back by the fake ranking utility; kept under its own
        # name so it is not confused with the search result below.
        ranked = []
        called = []

        def ranking(*args):
            called.append(args)
            return ranked

        provideUtility(ranking, IRanking, name='testranking')
        provideUtility(
            zopyx.txng3.core.storage.StorageWithTermFrequencyFactory,
            zope.component.interfaces.IFactory,
            name='freq')

        index = Index(fields=('text', ),
                      storage='freq',
                      ranking=True,
                      ranking_method='testranking')
        result = index.search(u'foo')
        self.assertEqual(1, len(called))
        self.assertEqual((index, result, DEFAULT_LANGUAGE, 50), called[0])
Exemple #17
0
 def testMultipleFieldsMultipleLanguages(self):
     """Search author names with and without naming the 'author' field."""
     index = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
     documents = (
         (de1, u'Andreas Jung', 1),
         (de2, u'Andrea Jung', 2),
         (de3, u'der Nasbär', 3),
     )
     for raw_text, author, docid in documents:
         index.index_object(
             Mock('de',
                  text=unicode(raw_text, 'iso-8859-15'),
                  author=author), docid)

     # Without a field argument no hits are expected for the author name.
     self._test(index, u'andreas jung', 'en', ())
     self._test(index, u'andreas jung', 'de', ())

     # (query, expected document ids) against the 'author' field.
     author_searches = (
         (u'andreas jung', (1, )),
         (u'jung   andreas', (1, )),
         (u'"jung   andreas"', ()),
         (u'"andreas jung"', (1, )),
         (u'andrea jung', (2, )),
         (u'andreas jung', (1, )),
         (u'na*', (3, )),
     )
     for query, docids in author_searches:
         self._test(index, query, 'de', docids, 'author')
Exemple #18
0
    def testUnindex(self):
        """Check storage consistency with random document removals."""

        index = Index(fields=('text', ),
                      languages=('en', ),
                      splitter_additional_chars='.-+')
        for i in range(1, 200):
            self._addDoc(index, i)

        # remove all indexed documents in random order
        # and check the consistency of the storage upon
        # each removal

        # list() yields a mutable list on both Python 2 and 3; a bare
        # range object has no remove() on Python 3.
        lst = list(range(1, 200))
        while lst:
            num = random.choice(lst)
            index.unindex_object(num)
            result = self.check_storage(index)
            self.assertEqual(result, True)
            lst.remove(num)
Exemple #19
0
 def test_default_ranking_is_cosine(self):
     """An Index built without arguments reports the cosine ranking method."""
     index = Index()
     self.assertEqual('txng.ranking.cosine', index.ranking_method)
Exemple #20
0
 def testSettings(self):
     """An Index given only fields reports the simple splitter."""
     index = Index(fields=('foo', ))
     expected_splitter = 'txng.splitters.simple'
     self.assertEqual(index.splitter, expected_splitter)
Exemple #21
0
 def testEmptyQuery(self):
     """Searching with an empty query string must raise ValueError."""
     index = Index(fields=('foo', ))
     search = index.search
     self.assertRaises(ValueError, search, query='')
Exemple #22
0
 def testSearchAllFieldsNotSupportWithoutDedicatedStorage(self):
     """``search_all_fields`` must raise ValueError without dedicated storage."""
     index = Index(fields=('title', 'text'), dedicated_storage=False)
     search = index.search
     self.assertRaises(ValueError, search, u'quick', search_all_fields=True)
Exemple #23
0
 def testSetup(self):
     """Populating a three-language index via setupIndex must not fail."""
     languages = ('de', 'fr', 'en')
     index = Index(fields=('text', ), languages=languages)
     self.setupIndex(index)
Exemple #24
0
 def testReindex2(self):
     """Indexing new text under an existing docid replaces the old terms."""
     index = Index(fields=('text', ), languages=('de', ))
     # The same docid is indexed twice; the second text lacks 'bar'.
     for text in (u'foo bar', u'foo'):
         index.index_object(Mock('de', text=text), 1)
     self._test(index, u'bar', 'de', (), 'text')
     self._test(index, u'foo', 'de', (1, ), 'text')
Exemple #25
0
 def testEmpty(self):
     """An empty input string is expected to produce no terms."""
     index = Index(fields=('oo', ))
     no_terms = ()
     self._test(index, u'', no_terms)
Exemple #26
0
 def testSplitter(self):
     """With case folding off, the upper-case 'B' is expected unchanged."""
     index = Index(splitter_casefolding=False, fields=('foo', ))
     expected_terms = ('a', 'B', 'c')
     self._test(index, u'a B c', expected_terms)
Exemple #27
0
 def testBBBCosineRanking(self):
     """Calling ResultSet.cosine_ranking with an index must not fail."""
     docids = DocidList((2, 3))
     words = (('foo', 5), )
     result_set = ResultSet(docids, words)
     index = Index()
     result_set.cosine_ranking(index)
Exemple #28
0
 def __init__(self,
              field_name=None,
              interface=None,
              field_callable=False,
              use_stemmer=defaults['use_stemmer'],
              dedicated_storage=defaults['dedicated_storage'],
              ranking=defaults['ranking'],
              use_normalizer=defaults['use_normalizer'],
              languages=defaults['languages'],
              use_stopwords=defaults['use_stopwords'],
              autoexpand_limit=defaults['autoexpand_limit'],
              splitter=defaults['splitter'],
              index_unknown_languages=defaults['index_unknown_languages'],
              query_parser=defaults['query_parser'],
              lexicon=defaults['lexicon'],
              splitter_additional_chars=defaults['splitter_add_chars'],
              storage=defaults['storage'],
              splitter_casefolding=defaults['splitter_casefolding'],
              asIFSet=True):
     """Set up the attribute index and the wrapped text Index.

     ``field_name`` is either a space-separated string or a sequence of
     field names; its first entry is passed to AttributeIndex together
     with ``interface`` and ``field_callable``.  ``languages`` is split on
     single spaces before being handed to Index.  All settings are also
     stored as attributes on the instance.

     Raises ValueError when ``ranking`` is set and the object created for
     ``storage`` does not provide IStorageWithTermFrequency.
     """
     # Ranking is only allowed with a storage providing term frequencies.
     if ranking and not IStorageWithTermFrequency.providedBy(
             createObject(storage)):
         raise ValueError("This storage cannot be used for ranking")

     # Accept the field names as one string or as a ready-made sequence.
     _fields = field_name
     if isinstance(field_name, basestring):
         _fields = field_name.split(' ')
     primary_field = _fields[0]
     zope.catalog.attribute.AttributeIndex.__init__(
         self, primary_field, interface, field_callable)

     # Dedicated storage is switched off when there is a single field.
     if len(_fields) < 2:
         dedicated_storage = False
     _default_fields = [primary_field]

     index_options = dict(
         fields=_fields,
         languages=languages.split(' '),
         use_stemmer=use_stemmer,
         dedicated_storage=dedicated_storage,
         ranking=ranking,
         use_normalizer=use_normalizer,
         use_stopwords=use_stopwords,
         storage=storage,
         autoexpand_limit=autoexpand_limit,
         splitter=splitter,
         lexicon=lexicon,
         index_unknown_languages=index_unknown_languages,
         query_parser=query_parser,
         splitter_additional_chars=splitter_additional_chars,
         splitter_casefolding=splitter_casefolding,
     )
     self._index = Index(**index_options)

     # Keep every setting on the instance, in a fixed order.
     remembered = (
         ('languages', languages),
         ('use_stemmer', use_stemmer),
         ('dedicated_storage', dedicated_storage),
         ('ranking', ranking),
         ('use_normalizer', use_normalizer),
         ('use_stopwords', use_stopwords),
         ('interface', interface),
         ('storage', storage),
         ('autoexpand_limit', autoexpand_limit),
         ('default_fields', _default_fields),
         ('_fields', _fields),
         ('splitter', splitter),
         ('lexicon', lexicon),
         ('index_unknown_languages', index_unknown_languages),
         ('query_parser', query_parser),
         ('splitter_additional_chars', splitter_additional_chars),
         ('splitter_casefolding', splitter_casefolding),
         ('_asIFSet', asIFSet),
     )
     for attribute, value in remembered:
         setattr(self, attribute, value)