def testStopWords(self):
    """Splitting honours the use_stopwords flag on a per-language basis."""
    # Without stopword filtering every token survives.
    index = Index(splitter_casefolding=True, use_stopwords=False,
                  fields=('foo', ))
    self._test(index, u'the black blue fox',
               ('the', 'black', 'blue', 'fox'))
    # With stopwords enabled, filtering depends on the language given.
    index = Index(splitter_casefolding=True, use_stopwords=True,
                  fields=('foo', ))
    self._test(index, u'the black blue fox',
               ('black', 'blue', 'fox'), 'en')
    # German stopwords are not filtered under 'en' ...
    self._test(index, u'das Auto auf dem garten',
               ('das', 'auto', 'auf', 'dem', 'garten'), 'en')
    # ... nor under an unknown language code ...
    self._test(index, u'das Auto auf dem garten',
               ('das', 'auto', 'auf', 'dem', 'garten'), 'xx')
    # ... but are filtered under 'de'.
    self._test(index, u'das Auto auf dem garten', ('auto', 'garten'), 'de')
def testSingleLanguageDependentSearches(self):
    """Term, phrase, boolean and negation queries within one language."""
    idx = Index(fields=('text', ), languages=('de', 'fr', 'en'))
    self.setupIndex(idx)
    self._test(idx, u'Gleich ihr', 'de', (1, ))
    self._test(idx, u'sich', 'de', (1, 2, 3))
    # Phrase searches only match documents of the queried language.
    self._test(idx, u'"an sich"', 'de', (3, ))
    self._test(idx, u'"an sich"', 'fr', ())
    self._test(idx, u'"YXXX YYY"', 'de', ())
    self._test(idx, u'emanzipation ', 'de', (2, 3))
    self._test(idx, u'"denken zur emanzipation" not selbsterhaltung',
               'de', ())
    self._test(idx, u'"conceptualist"', 'en', (7, 9, ))
    # Three equivalent spellings of term negation.
    self._test(idx, u'emanzipation -denken', 'de', (3, ))
    self._test(idx, u'emanzipation not denken', 'de', (3, ))
    self._test(idx, u'emanzipation and not denken', 'de', (3, ))
    self._test(idx, u'not denken and emanzipation ', 'de', (3, ))
    # Word order matters inside phrase queries: only the exact
    # sequence occurring in document 9 is a hit.
    self._test(
        idx,
        u'"that we have to choose between postpatriarchial conceptualist theory textual and objectivism"',
        'en', ())
    self._test(
        idx,
        u'"that we have to choose between postpatriarchial conceptualist theory and textual objectivism"',
        'en', (9, ))
def __init__(self, id, extra, caller):
    """Create the wrapped fulltext Index, reading every option from the
    'extra' record and falling back to the module-level defaults.

    'caller' is accepted for interface compatibility; it is not used
    here (NOTE(review): confirm against the callers).
    """
    self.id = id
    self.title = id
    # The indexed fields default to the id of the index itself.
    fields = get(extra, 'fields', []) or [id]
    options = dict(
        fields=fields,
        lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
        storage=get(extra, 'storage', DEFAULT_STORAGE),
        splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
        autoexpand=get(extra, 'autoexpand', 'off'),
        autoexpand_limit=get(extra, 'autoexpand_limit', 4),
        query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
        use_stemmer=get(extra, 'use_stemmer', False),
        languages=get(extra, 'languages', ('en', )),
        use_stopwords=bool(get(extra, 'use_stopwords')),
        default_encoding=get(extra, 'default_encoding', DEFAULT_ENCODING),
        use_normalizer=bool(get(extra, 'use_normalizer')),
        dedicated_storage=bool(get(extra, 'dedicated_storage')),
        splitter_casefolding=bool(get(extra, 'splitter_casefolding', True)),
        splitter_additional_chars=get(extra, 'splitter_additional_chars',
                                      DEFAULT_ADDITIONAL_CHARS),
        index_unknown_languages=bool(
            get(extra, 'index_unknown_languages', True)),
        ranking=bool(get(extra, 'ranking')),
        ranking_method=get(extra, 'ranking_method', DEFAULT_RANKING),
    )
    self.index = Index(**options)
def testIndexAndUnindex(self):
    """Documents can be removed and re-indexed; unknown ids are a no-op."""
    idx = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
    for docid, text, author in ((1, de1, u'Andreas Jung'),
                                (2, de2, u'Andrea Jung'),
                                (3, de3, u'der Nasbär')):
        idx.index_object(
            Mock('de', text=unicode(text, 'iso-8859-15'), author=author),
            docid)
    self._test(idx, u'andreas jung', 'de', (1, ), 'author')
    # Remove everything; 9999 was never indexed and must not raise.
    for docid in (1, 2, 3, 9999):
        idx.unindex_object(docid)
    self._test(idx, u'andreas jung', 'de', (), 'author')
    # Re-index document 1 and check per-field isolation.
    idx.index_object(
        Mock('de', text=unicode(de1, 'iso-8859-15'),
             author=u'Andreas Jung'), 1)
    self._test(idx, u'andreas jung', 'de', (1, ), 'author')
    self._test(idx, u'andreas jung', 'de', (), 'text')
    self._test(idx, u'das opfer wird', 'de', (1, ), 'text')
    # Re-indexing under an existing docid replaces the old content.
    idx.index_object(
        Mock('de', text=unicode(de2, 'iso-8859-15'),
             author=u'Andrea Jung'), 1)
    self._test(idx, u'andrea jung', 'de', (1, ), 'author')
def testWithAndWithoutStopwords(self):
    """Stopwords are stripped from queries when filtering is enabled."""
    idx = Index(fields=('text', ), languages=('de', 'fr', 'en'),
                use_stopwords=False)
    self.setupIndex(idx)
    self._test(idx, u'das opfer wird uns frei machen', 'de', (1, ))
    idx = Index(fields=('text', ), languages=('de', 'fr', 'en'),
                use_stopwords=True)
    self.setupIndex(idx)
    # Still a hit: stopwords such as 'das' are filtered from the
    # query, regardless of their case.
    self._test(idx, u'das opfer wird uns frei machen', 'de', (1, ))
    self._test(idx, u'DaS opfer wird uns frei machen', 'de', (1, ))
    # French stopwords are filtered likewise; leading digits survive.
    self._test(idx, u'sur les pantalons pour homme', 'fr', (5, ))
    self._test(idx, u'413 sur les pantalons pour homme', 'fr', (5, ))
def testSimpleSplitterOnQuery(self):
    """The simple splitter strips punctuation, so foo.bar becomes
    foobar both at indexing time and at query time.
    """
    index = Index(fields=('text', ), languages=('en', ))
    index.index_object(Mock('en', text=u'foo.bar baz'), 1)
    lexicon = index.getLexicon()
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(['baz', 'foobar'], lexicon.getWordsForLanguage('en'))
    # The same joining happens to the query term, so it still matches.
    self._test(index, u'foo.bar', 'en', (1, ), 'text')
def testDE(self):
    """Content in languages the index was not configured for is rejected."""
    idx = Index(fields=('text', ), languages=('de', ),
                index_unknown_languages=False)
    idx.index_object(Mock('de', text=unicode(de1, 'iso-8859-15')), 1)
    # ValueError: the index only knows about 'de', not 'fr' or 'en'.
    self.assertRaises(ValueError, idx.index_object,
                      Mock('fr', text=unicode(fr1, 'iso-8859-15')), 4)
    self.assertRaises(ValueError, idx.index_object,
                      Mock('en', text=unicode(en1, 'iso-8859-15')), 5)
def testSplitterWithAdditionalChars(self):
    """Characters listed in splitter_additional_chars are kept by the
    splitter (e.g. '-' in 'hello-world'), while others are dropped.
    """
    index = Index(fields=('text', ), languages=('en', ),
                  splitter_additional_chars='.-+')
    index.index_object(Mock('en', text=u'c++x hello-world algol68'), 1)
    lexicon = index.getLexicon()
    # assertEqual instead of the deprecated assertEquals alias.
    # NOTE(review): 'c++x' is indexed as 'cx' although '+' is in the
    # additional chars -- this pins the current observed behaviour.
    self.assertEqual(['algol68', 'cx', 'hello-world'],
                     lexicon.getWordsForLanguage('en'))
    self._test(index, u'c++x OR xyz', 'en', (1, ), 'text')
    self._test(index, u'c++x', 'en', (1, ), 'text')
def testNormalizer(self):
    """The normalizer transliterates German umlauts for 'de' only."""
    idx = Index(splitter_casefolding=True, use_stopwords=False,
                use_normalizer=True, fields=('foo', ))
    normalized = (u'fuer', u'und', u'ueber', u'drueben', u'gehen', u'wir')
    self._test(idx, u'für und über drüben gehen Wir', normalized, 'de')
    # Casefolding happens before normalization.
    self._test(idx, u'fÜr und über drÜben gehen Wir', normalized, 'de')
    # No German normalization is applied to English text.
    self._test(idx, u'für und über drüben gehen Wir',
               (u'für', u'und', u'über', u'drüben', u'gehen', u'wir'),
               'en')
def testIndexWithOneLanguage(self):
    """An index restricted to 'en' rejects objects in other languages.

    Bug fix: the second assertRaises previously re-used the German
    object o2, so the French object o3 was created but never tested.
    """
    o1 = Mock(text=u'The quick brown fox', language='en')
    o2 = Mock(text=u'der schnelle braune fuchs', language='de')
    o3 = Mock(text=u'je ne sais pas', language='fr')
    I = Index(fields=('text', ), dedicated_storage=True,
              languages=('en', ), index_unknown_languages=False)
    I.index_object(o1, 1)
    self.assertRaises(ValueError, I.index_object, o2, 2)
    self.assertRaises(ValueError, I.index_object, o3, 3)
    # Only the English document's words made it into the lexicon.
    en_words = I._lexicon.getWordsForLanguage('en')
    en_words.sort()
    self.assertEqual(en_words, ['brown', 'fox', 'quick', 'the'])
def testRanking(self):
    """ResultSet.ranking() calls the ranking function exactly once with
    (index, resultset, default language, default count) and stores the
    returned value on ranked_results.
    """
    r = ResultSet(DocidList((2, 3)), (('foo', 5), ))
    called = []
    result = object()

    def ranking_function(*args):
        called.append(args)
        return result

    index = Index()
    r.ranking(ranking_function, index)
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(1, len(called))
    self.assertEqual((index, r, config.DEFAULT_LANGUAGE, 50), called[0])
    self.assertEqual(result, r.ranked_results)
def testGermanStemmer(self):
    """Stemming folds inflected forms together; wildcards become illegal."""
    idx = Index(fields=('text', ), languages=('de', 'fr', 'en'),
                use_stemmer=True)
    self.setupIndex(idx)
    # All inflection variants (also inside phrases) hit document 1.
    for query in (u'Gleich ihr',
                  u'Gleiche ihren',
                  u'existentiellen Eigentlichen',
                  u'existentiell Eigentlich',
                  u'"existentiellen Eigentlichen"',
                  u'"existentiell Eigentlicher"',
                  u'"existentiell Eigentliche"'):
        self._test(idx, query, 'de', (1, ))
    # With stemming enabled, wildcard searches are not supported.
    for query in ('existentiell*', 'existent?foo', '*a'):
        self.assertRaises(ValueError, idx.search, query)
def testSearchAllFields(self):
    """search_all_fields=True merges hits from every dedicated field."""
    o1 = Mock('en', text=u'The quick brown fox', title=u'Fox')
    o2 = Mock('en', text=u'Mary had a little lamb.', title=u'Quick Mary')
    o3 = Mock('en', text=u'Pop goes the weasel!', title=u'Weasel')
    I = Index(fields=('title', 'text'), languages=('en', ))
    I.index_object(o1, 1)
    I.index_object(o2, 2)
    I.index_object(o3, 3)
    # Default: only the primary field is searched.
    res = I.search(u'quick')
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual([2], list(res.getDocids()))
    res = I.search(u'quick', search_all_fields=True)
    self.assertEqual([1, 2], list(res.getDocids()))
    # An explicit field combined with search_all_fields is contradictory.
    self.assertRaises(ValueError, I.search, u'quick', field='text',
                      search_all_fields=True)
def testSplitterOnQueryWithDefaultSplitter(self):
    """Queries are split with the same splitter used at indexing time.

    Fix: removed the unused local 'lexicon' (the original fetched it
    and never read it).
    """
    from zopyx.txng3.core.splitter import SplitterFactory
    provideUtility(SplitterFactory, zope.component.interfaces.IFactory,
                   name='txng.splitters.default')
    I = Index(fields=('text', ), languages=('en', ),
              splitter_additional_chars='-',
              splitter='txng.splitters.default')
    I.index_object(Mock('en', text=u'asdf abc.de-Efgh bla bla fasel'), 1)
    I.index_object(Mock('en', text=u'asdf abc de-Efgh bla bla fasel'), 2)
    I.index_object(Mock('en', text=u'asdf abc'), 3)
    self._test(I, u'abc.de-Efgh', 'en', (1, 2), 'text')
    # Test with a more complex query
    self._test(I, u'sth OR abc.de-Efgh', 'en', (1, 2), 'text')
    # Test with a "not" query
    self._test(I, u'asdf AND NOT abc.de-Efgh', 'en', (3, ), 'text')
def testUnindex2(self):
    """Storage stays consistent under random add/remove operations."""
    index = Index(fields=('text', ), languages=('en', ),
                  splitter_additional_chars='.-+')
    for docid in range(1, 200):
        self._addDoc(index, docid)
    # 100 random operations: either drop a random document or
    # (re-)index it, checking storage consistency after each step.
    for _ in range(100):
        docid = random.choice(range(1, 200))
        if random.randint(0, 1) == 0:
            index.unindex_object(docid)
        else:
            self._addDoc(index, docid)
        self.assertEqual(self.check_storage(index), True)
def test_ranking_method(self):
    """search() invokes the configured ranking utility exactly once.

    Fix: the original bound both the ranking function's return value
    and the search result to the same name 'result', relying on subtle
    closure rebinding order; the ranking return value now has its own
    name. assertEqual replaces the deprecated assertEquals alias.
    """
    ranked = []
    called = []

    def ranking(*args):
        called.append(args)
        return ranked

    provideUtility(ranking, IRanking, name='testranking')
    provideUtility(
        zopyx.txng3.core.storage.StorageWithTermFrequencyFactory,
        zope.component.interfaces.IFactory, name='freq')
    index = Index(fields=('text', ), storage='freq', ranking=True,
                  ranking_method='testranking')
    result = index.search(u'foo')
    self.assertEqual(1, len(called))
    # The utility received the index, the result set, the default
    # language and the default result count.
    self.assertEqual((index, result, DEFAULT_LANGUAGE, 50), called[0])
def testMultipleFieldsMultipleLanguages(self):
    """Per-field searches in a multi-field, multi-language index."""
    idx = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
    for docid, text, author in ((1, de1, u'Andreas Jung'),
                                (2, de2, u'Andrea Jung'),
                                (3, de3, u'der Nasbär')):
        idx.index_object(
            Mock('de', text=unicode(text, 'iso-8859-15'), author=author),
            docid)
    # The default field ('text') contains no author names.
    self._test(idx, u'andreas jung', 'en', ())
    self._test(idx, u'andreas jung', 'de', ())
    # Plain multi-term searches match in either order ...
    self._test(idx, u'andreas jung', 'de', (1, ), 'author')
    self._test(idx, u'jung andreas', 'de', (1, ), 'author')
    # ... but phrase searches are order-sensitive.
    self._test(idx, u'"jung andreas"', 'de', (), 'author')
    self._test(idx, u'"andreas jung"', 'de', (1, ), 'author')
    self._test(idx, u'andrea jung', 'de', (2, ), 'author')
    self._test(idx, u'andreas jung', 'de', (1, ), 'author')
    # Right-truncation wildcard search.
    self._test(idx, u'na*', 'de', (3, ), 'author')
def testUnindex(self):
    """Check storage consistency with random document removals."""
    index = Index(fields=('text', ), languages=('en', ),
                  splitter_additional_chars='.-+')
    for docid in range(1, 200):
        self._addDoc(index, docid)
    # Remove all indexed documents in random order and verify the
    # consistency of the storage upon each removal.
    remaining = list(range(1, 200))
    while remaining:
        docid = random.choice(remaining)
        index.unindex_object(docid)
        self.assertEqual(self.check_storage(index), True)
        remaining.remove(docid)
def test_default_ranking_is_cosine(self):
    """A freshly created Index defaults to cosine ranking."""
    index = Index()
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual('txng.ranking.cosine', index.ranking_method)
def testSettings(self):
    """The simple splitter is the factory default."""
    index = Index(fields=('foo', ))
    self.assertEqual('txng.splitters.simple', index.splitter)
def testEmptyQuery(self):
    """An empty query string is rejected with ValueError."""
    index = Index(fields=('foo', ))
    self.assertRaises(ValueError, index.search, query='')
def testSearchAllFieldsNotSupportWithoutDedicatedStorage(self):
    """search_all_fields requires dedicated per-field storage."""
    index = Index(fields=('title', 'text'), dedicated_storage=False)
    self.assertRaises(ValueError, index.search, u'quick',
                      search_all_fields=True)
def testSetup(self):
    """Smoke test: the shared fixture index can be built without errors."""
    index = Index(fields=('text', ), languages=('de', 'fr', 'en'))
    self.setupIndex(index)
def testReindex2(self):
    """Re-indexing a docid replaces its previous content entirely."""
    idx = Index(fields=('text', ), languages=('de', ))
    idx.index_object(Mock('de', text=u'foo bar'), 1)
    idx.index_object(Mock('de', text=u'foo'), 1)
    # 'bar' was dropped by the second indexing pass.
    self._test(idx, u'bar', 'de', (), 'text')
    self._test(idx, u'foo', 'de', (1, ), 'text')
def testEmpty(self):
    """An empty unicode string splits into no words at all.

    Fix: the field name read 'oo' -- almost certainly a typo for the
    'foo' used by the sibling tests; the name itself plays no role in
    the assertion.
    """
    index = Index(fields=('foo', ))
    self._test(index, u'', ())
def testSplitter(self):
    """Without casefolding the splitter preserves the original case."""
    index = Index(splitter_casefolding=False, fields=('foo', ))
    self._test(index, u'a B c', ('a', 'B', 'c'))
def testBBBCosineRanking(self):
    """Backward compatibility: cosine_ranking() must still be callable."""
    results = ResultSet(DocidList((2, 3)), (('foo', 5), ))
    index = Index()
    # Smoke test only -- no assertion on the ranked output.
    results.cosine_ranking(index)
def __init__(self, field_name=None, interface=None, field_callable=False,
             use_stemmer=defaults['use_stemmer'],
             dedicated_storage=defaults['dedicated_storage'],
             ranking=defaults['ranking'],
             use_normalizer=defaults['use_normalizer'],
             languages=defaults['languages'],
             use_stopwords=defaults['use_stopwords'],
             autoexpand_limit=defaults['autoexpand_limit'],
             splitter=defaults['splitter'],
             index_unknown_languages=defaults['index_unknown_languages'],
             query_parser=defaults['query_parser'],
             lexicon=defaults['lexicon'],
             splitter_additional_chars=defaults['splitter_add_chars'],
             storage=defaults['storage'],
             splitter_casefolding=defaults['splitter_casefolding'],
             asIFSet=True):
    """Create a zope.catalog attribute index wrapping a txng Index.

    'field_name' may be a whitespace-separated string naming several
    fields; the first one becomes the attribute handled by
    AttributeIndex.  'languages' is likewise a space-separated string.

    Raises ValueError when ranking is requested but the chosen storage
    does not provide term frequencies.
    """
    # Ranking needs per-term frequency data from the storage utility.
    if ranking:
        util = createObject(storage)
        if not IStorageWithTermFrequency.providedBy(util):
            raise ValueError("This storage cannot be used for ranking")
    # Python 2: basestring covers both str and unicode field names.
    if isinstance(field_name, basestring):
        _fields = field_name.split(' ')
    else:
        _fields = field_name
    # The first field is the attribute the catalog machinery extracts.
    zope.catalog.attribute.AttributeIndex.__init__(self, _fields[0],
                                                   interface,
                                                   field_callable)
    # Dedicated per-field storage only makes sense for two or more
    # fields.
    if len(_fields) < 2:
        dedicated_storage = False
    _default_fields = [_fields[0]]
    self._index = Index(fields=_fields,
                        languages=languages.split(' '),
                        use_stemmer=use_stemmer,
                        dedicated_storage=dedicated_storage,
                        ranking=ranking,
                        use_normalizer=use_normalizer,
                        use_stopwords=use_stopwords,
                        storage=storage,
                        autoexpand_limit=autoexpand_limit,
                        splitter=splitter,
                        lexicon=lexicon,
                        index_unknown_languages=index_unknown_languages,
                        query_parser=query_parser,
                        splitter_additional_chars=splitter_additional_chars,
                        splitter_casefolding=splitter_casefolding)
    # Keep the raw configuration values around for introspection.
    # NOTE: 'languages' is stored as the raw space-separated string,
    # not as the split list passed to Index.
    self.languages = languages
    self.use_stemmer = use_stemmer
    self.dedicated_storage = dedicated_storage
    self.ranking = ranking
    self.use_normalizer = use_normalizer
    self.use_stopwords = use_stopwords
    self.interface = interface
    self.storage = storage
    self.autoexpand_limit = autoexpand_limit
    self.default_fields = _default_fields
    self._fields = _fields
    self.splitter = splitter
    self.lexicon = lexicon
    self.index_unknown_languages = index_unknown_languages
    self.query_parser = query_parser
    self.splitter_additional_chars = splitter_additional_chars
    self.splitter_casefolding = splitter_casefolding
    # Controls the result-set flavour returned by searches --
    # presumably IFSet vs. plain set; TODO confirm against apply().
    self._asIFSet = asIFSet