def __init__(self, id, extra, caller):
    """Create the pluggable index.

    :param id: index id; also used as title and as the default
        indexed field
    :param extra: record/mapping holding the TextIndexNG3 configuration
        options, read through ``get`` with per-option defaults
    :param caller: unused here; part of the pluggable-index
        constructor signature
    """
    self.id = id
    self.title = id

    # fields
    fields = [id]  # default
    if get(extra, 'fields', []):
        fields = get(extra, 'fields')

    # Build the underlying txng3 Index; every option falls back to a
    # module-level default when absent from 'extra'.
    self.index = Index(
        fields=fields,
        lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
        storage=get(extra, 'storage', DEFAULT_STORAGE),
        splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
        autoexpand=get(extra, 'autoexpand', 'off'),
        autoexpand_limit=get(extra, 'autoexpand_limit', 4),
        query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
        use_stemmer=get(extra, 'use_stemmer', False),
        languages=get(extra, 'languages', ('en', )),
        use_stopwords=bool(get(extra, 'use_stopwords')),
        default_encoding=get(extra, 'default_encoding', DEFAULT_ENCODING),
        use_normalizer=bool(get(extra, 'use_normalizer')),
        dedicated_storage=bool(get(extra, 'dedicated_storage')),
        splitter_casefolding=bool(get(extra, 'splitter_casefolding', True)),
        splitter_additional_chars=get(extra, 'splitter_additional_chars',
                                      DEFAULT_ADDITIONAL_CHARS),
        index_unknown_languages=bool(
            get(extra, 'index_unknown_languages', True)),
        ranking=bool(get(extra, 'ranking')),
        ranking_method=(get(extra, 'ranking_method', DEFAULT_RANKING)),
    )
def testSimpleSplitterOnQuery(self):
    """The simple splitter strips punctuation, so 'foo.bar' indexes as
    the single term 'foobar'; the query must be folded the same way."""
    index = Index(fields=('text',), languages=('en',))
    index.index_object(Mock('en', text=u'foo.bar baz'), 1)
    lexicon = index.getLexicon()
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(['baz', 'foobar'], lexicon.getWordsForLanguage('en'))
    self._test(index, u'foo.bar', 'en', (1,), 'text')
def testSimpleSplitterOnQuery(self):
    """The simple splitter strips punctuation, so 'foo.bar' indexes as
    the single term 'foobar'; the query must be folded the same way."""
    index = Index(fields=('text',), languages=('en',))
    index.index_object(Mock('en', text=u'foo.bar baz'), 1)
    lexicon = index.getLexicon()
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(['baz', 'foobar'], lexicon.getWordsForLanguage('en'))
    self._test(index, u'foo.bar', 'en', (1,), 'text')
def testSplitterWithAdditionalChars(self):
    """Splitting with '.-+' declared as additional characters; the
    lexicon content and wildcard-free queries must agree with it."""
    index = Index(fields=('text',), languages=('en',),
                  splitter_additional_chars='.-+')
    index.index_object(Mock('en', text=u'c++x hello-world algol68'), 1)
    lexicon = index.getLexicon()
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(['algol68', 'cx', 'hello-world'],
                     lexicon.getWordsForLanguage('en'))
    self._test(index, u'c++x OR xyz', 'en', (1,), 'text')
    self._test(index, u'c++x', 'en', (1,), 'text')
def testSplitterWithAdditionalChars(self):
    """Splitting with '.-+' declared as additional characters; the
    lexicon content and wildcard-free queries must agree with it."""
    index = Index(fields=('text',), languages=('en',),
                  splitter_additional_chars='.-+')
    index.index_object(Mock('en', text=u'c++x hello-world algol68'), 1)
    lexicon = index.getLexicon()
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(['algol68', 'cx', 'hello-world'],
                     lexicon.getWordsForLanguage('en'))
    self._test(index, u'c++x OR xyz', 'en', (1,), 'text')
    self._test(index, u'c++x', 'en', (1,), 'text')
def testDE(self):
    # With index_unknown_languages=False the index only accepts its
    # configured language ('de') and raises for everything else.
    index = Index(fields=('text', ),
                  languages=('de', ),
                  index_unknown_languages=False)
    index.index_object(Mock('de', text=unicode(de1, 'iso-8859-15')), 1)
    # this raises an exception because the index does not know about 'fr' or 'en'
    for language, raw, docid in (('fr', fr1, 4), ('en', en1, 5)):
        self.assertRaises(ValueError, index.index_object,
                          Mock(language, text=unicode(raw, 'iso-8859-15')),
                          docid)
def testIndexWithOneLanguage(self):
    """Only 'en' is configured: English content indexes fine, documents
    in any other language raise ValueError."""
    o1 = Mock(text=u'The quick brown fox', language='en')
    o2 = Mock(text=u'der schnelle braune fuchs', language='de')
    o3 = Mock(text=u'je ne sais pas', language='fr')
    index = Index(fields=('text',), dedicated_storage=True,
                  languages=('en',), index_unknown_languages=False)
    index.index_object(o1, 1)
    self.assertRaises(ValueError, index.index_object, o2, 2)
    # Fixed: the original asserted o2 twice and never used o3 -- cover
    # the French document too (both non-'en' docs must be rejected).
    self.assertRaises(ValueError, index.index_object, o3, 3)
    en_words = index._lexicon.getWordsForLanguage('en')
    en_words.sort()
    self.assertEqual(en_words, ['brown', 'fox', 'quick', 'the'])
def testStopWords(self):
    # Stopword filtering disabled: every token survives.
    index = Index(splitter_casefolding=True, use_stopwords=False,
                  fields=('foo', ))
    self._test(index, u'the black blue fox',
               ('the', 'black', 'blue', 'fox'))
    # Stopword filtering enabled: terms are dropped per language only.
    index = Index(splitter_casefolding=True, use_stopwords=True,
                  fields=('foo', ))
    self._test(index, u'the black blue fox',
               ('black', 'blue', 'fox'), 'en')
    self._test(index, u'das Auto auf dem garten',
               ('das', 'auto', 'auf', 'dem', 'garten'), 'en')
    self._test(index, u'das Auto auf dem garten',
               ('das', 'auto', 'auf', 'dem', 'garten'), 'xx')
    self._test(index, u'das Auto auf dem garten',
               ('auto', 'garten'), 'de')
def testIndexWithOneLanguage(self):
    """Only 'en' is configured: English content indexes fine, documents
    in any other language raise ValueError."""
    o1 = Mock(text=u'The quick brown fox', language='en')
    o2 = Mock(text=u'der schnelle braune fuchs', language='de')
    o3 = Mock(text=u'je ne sais pas', language='fr')
    index = Index(fields=('text',), dedicated_storage=True,
                  languages=('en',), index_unknown_languages=False)
    index.index_object(o1, 1)
    self.assertRaises(ValueError, index.index_object, o2, 2)
    # Fixed: the original asserted o2 twice and never used o3 -- cover
    # the French document too (both non-'en' docs must be rejected).
    self.assertRaises(ValueError, index.index_object, o3, 3)
    en_words = index._lexicon.getWordsForLanguage('en')
    en_words.sort()
    self.assertEqual(en_words, ['brown', 'fox', 'quick', 'the'])
def testSingleLanguageDependentSearches(self):
    # Language-scoped term, phrase and negation queries against the
    # shared fixture built by setupIndex().
    index = Index(fields=('text', ), languages=('de', 'fr', 'en'))
    self.setupIndex(index)
    self._test(index, u'Gleich ihr', 'de', (1, ))
    self._test(index, u'sich', 'de', (1, 2, 3))
    self._test(index, u'"an sich"', 'de', (3, ))
    self._test(index, u'"an sich"', 'fr', ())
    self._test(index, u'"YXXX YYY"', 'de', ())
    self._test(index, u'emanzipation ', 'de', (2, 3))
    self._test(index, u'"denken zur emanzipation" not selbsterhaltung',
               'de', ())
    self._test(index, u'"conceptualist"', 'en', (7, 9))
    # All spellings of "and not" must behave identically.
    self._test(index, u'emanzipation -denken', 'de', (3, ))
    self._test(index, u'emanzipation not denken', 'de', (3, ))
    self._test(index, u'emanzipation and not denken', 'de', (3, ))
    self._test(index, u'not denken and emanzipation ', 'de', (3, ))
    # Long phrase queries: word order matters.
    self._test(
        index,
        u'"that we have to choose between postpatriarchial conceptualist theory textual and objectivism"',
        'en', ())
    self._test(
        index,
        u'"that we have to choose between postpatriarchial conceptualist theory and textual objectivism"',
        'en', (9, ))
def testWithAndWithoutStopwords(self):
    # Without stopword filtering the full phrase must match as-is.
    index = Index(fields=('text', ), languages=('de', 'fr', 'en'),
                  use_stopwords=False)
    self.setupIndex(index)
    self._test(index, u'das opfer wird uns frei machen', 'de', (1, ))

    # Same queries with stopword filtering enabled.
    index = Index(fields=('text', ), languages=('de', 'fr', 'en'),
                  use_stopwords=True)
    self.setupIndex(index)
    # This should give a hit since 'das' should be filtered from the query
    self._test(index, u'das opfer wird uns frei machen', 'de', (1, ))
    self._test(index, u'DaS opfer wird uns frei machen', 'de', (1, ))
    self._test(index, u'sur les pantalons pour homme', 'fr', (5, ))
    self._test(index, u'413 sur les pantalons pour homme', 'fr', (5, ))
def __init__(self, id, extra, caller):
    """Create the pluggable index.

    :param id: index id; also used as title and as the default
        indexed field
    :param extra: record/mapping holding the TextIndexNG3 configuration
        options, read through ``get`` with per-option defaults
    :param caller: unused here; part of the pluggable-index
        constructor signature
    """
    self.id = id
    self.title = id

    # fields
    fields = [id]  # default
    if get(extra, 'fields', []):
        fields = get(extra, 'fields')

    # Build the underlying txng3 Index; every option falls back to a
    # module-level default when absent from 'extra'.
    self.index = Index(
        fields=fields,
        lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
        storage=get(extra, 'storage', DEFAULT_STORAGE),
        splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
        autoexpand=get(extra, 'autoexpand', 'off'),
        autoexpand_limit=get(extra, 'autoexpand_limit', 4),
        query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
        use_stemmer=get(extra, 'use_stemmer', False),
        languages=get(extra, 'languages', ('en',)),
        use_stopwords=bool(get(extra, 'use_stopwords')),
        default_encoding=get(extra, 'default_encoding', DEFAULT_ENCODING),
        use_normalizer=bool(get(extra, 'use_normalizer')),
        dedicated_storage=bool(get(extra, 'dedicated_storage')),
        splitter_casefolding=bool(get(extra, 'splitter_casefolding', True)),
        splitter_additional_chars=get(extra, 'splitter_additional_chars',
                                      DEFAULT_ADDITIONAL_CHARS),
        index_unknown_languages=bool(
            get(extra, 'index_unknown_languages', True)),
        ranking=bool(get(extra, 'ranking')),
        ranking_method=(get(extra, 'ranking_method', DEFAULT_RANKING)),
    )
def testUnindex2(self):
    """ now with random adding removal """
    index = Index(fields=('text', ), languages=('en',),
                  splitter_additional_chars='.-+')
    # Fill the index with documents 1..199.
    for i in range(1, 200):
        self._addDoc(index, i)
    # Randomly add or remove documents, checking storage consistency
    # after every operation.
    for iterations in range(100):
        num = random.choice(range(1, 200))
        if random.randint(0, 1) == 0:
            index.unindex_object(num)
        else:
            self._addDoc(index, num)
        result = self.check_storage(index)
        self.assertEqual(result, True)
def testUnindex2(self):
    """ now with random adding removal """
    index = Index(fields=('text', ), languages=('en', ),
                  splitter_additional_chars='.-+')
    # Fill the index with documents 1..199.
    for i in range(1, 200):
        self._addDoc(index, i)
    # Randomly add or remove documents, checking storage consistency
    # after every operation.
    for iterations in range(100):
        num = random.choice(range(1, 200))
        if random.randint(0, 1) == 0:
            index.unindex_object(num)
        else:
            self._addDoc(index, num)
        result = self.check_storage(index)
        self.assertEqual(result, True)
def testUnindex(self):
    """ check storage consistency with random document removals """
    index = Index(fields=('text', ), languages=('en',),
                  splitter_additional_chars='.-+')
    for i in range(1, 200):
        self._addDoc(index, i)
    # remove all indexed documents in random order
    # and check the consistency of the storage upon
    # each removal
    # list(): on Python 3 range() has no .remove(); identical on Py2.
    lst = list(range(1, 200))
    while lst:
        num = random.choice(lst)
        index.unindex_object(num)
        result = self.check_storage(index)
        self.assertEqual(result, True)
        lst.remove(num)
def testUnindex(self):
    """ check storage consistency with random document removals """
    index = Index(fields=('text', ), languages=('en', ),
                  splitter_additional_chars='.-+')
    for i in range(1, 200):
        self._addDoc(index, i)
    # remove all indexed documents in random order
    # and check the consistency of the storage upon
    # each removal
    # list(): on Python 3 range() has no .remove(); identical on Py2.
    lst = list(range(1, 200))
    while lst:
        num = random.choice(lst)
        index.unindex_object(num)
        result = self.check_storage(index)
        self.assertEqual(result, True)
        lst.remove(num)
def testNormalizer(self):
    # With the normalizer enabled, umlauts are transliterated for 'de'
    # ('ü' -> 'ue'); for 'en' the words pass through unchanged.
    index = Index(splitter_casefolding=True, use_stopwords=False,
                  use_normalizer=True, fields=('foo', ))
    normalized = (u'fuer', u'und', u'ueber', u'drueben', u'gehen', u'wir')
    self._test(index, u'für und über drüben gehen Wir', normalized, 'de')
    self._test(index, u'fÜr und über drÜben gehen Wir', normalized, 'de')
    self._test(index, u'für und über drüben gehen Wir',
               (u'für', u'und', u'über', u'drüben', u'gehen', u'wir'),
               'en')
def testRanking(self):
    """ranking() must call the ranking function exactly once with
    (index, resultset, default language, 50) and store its return
    value on ranked_results."""
    r = ResultSet(DocidList((2, 3)), (('foo', 5), ))
    calls = []
    sentinel = object()

    def ranking_function(*args):
        calls.append(args)
        return sentinel

    index = Index()
    r.ranking(ranking_function, index)
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(1, len(calls))
    self.assertEqual((index, r, config.DEFAULT_LANGUAGE, 50), calls[0])
    self.assertEqual(sentinel, r.ranked_results)
def testSearchAllFields(self):
    """search_all_fields=True searches every field; combining it with
    an explicit 'field' argument is an error."""
    o1 = Mock('en', text=u'The quick brown fox', title=u'Fox')
    o2 = Mock('en', text=u'Mary had a little lamb.', title=u'Quick Mary')
    o3 = Mock('en', text=u'Pop goes the weasel!', title=u'Weasel')
    index = Index(fields=('title', 'text'), languages=('en', ))
    index.index_object(o1, 1)
    index.index_object(o2, 2)
    index.index_object(o3, 3)
    # Plain search finds only doc 2 ...
    res = index.search(u'quick')
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual([2], list(res.getDocids()))
    # ... searching all fields also matches 'quick' in doc 1's text.
    res = index.search(u'quick', search_all_fields=True)
    self.assertEqual([1, 2], list(res.getDocids()))
    self.assertRaises(ValueError, index.search, u'quick', field='text',
                      search_all_fields=True)
def testMultipleFieldsMultipleLanguages(self):
    index = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
    # Three German documents with distinct authors.
    documents = ((de1, u'Andreas Jung', 1),
                 (de2, u'Andrea Jung', 2),
                 (de3, u'der Nasbär', 3))
    for raw, author, docid in documents:
        index.index_object(
            Mock('de', text=unicode(raw, 'iso-8859-15'), author=author),
            docid)
    # Without an explicit field the author names are not found.
    self._test(index, u'andreas jung', 'en', ())
    self._test(index, u'andreas jung', 'de', ())
    # Searches restricted to the 'author' field.
    self._test(index, u'andreas jung', 'de', (1,), 'author')
    self._test(index, u'jung andreas', 'de', (1,), 'author')
    self._test(index, u'"jung andreas"', 'de', (), 'author')
    self._test(index, u'"andreas jung"', 'de', (1,), 'author')
    self._test(index, u'andrea jung', 'de', (2,), 'author')
    self._test(index, u'andreas jung', 'de', (1,), 'author')
    self._test(index, u'na*', 'de', (3,), 'author')
def testGermanStemmer(self):
    # With stemming enabled, inflected and base word forms must hit
    # the same document.
    index = Index(fields=('text', ), languages=('de', 'fr', 'en'),
                  use_stemmer=True)
    self.setupIndex(index)
    queries = (u'Gleich ihr',
               u'Gleiche ihren',
               u'existentiellen Eigentlichen',
               u'existentiell Eigentlich',
               u'"existentiellen Eigentlichen"',
               u'"existentiell Eigentlicher"',
               u'"existentiell Eigentliche"')
    for query in queries:
        self._test(index, query, 'de', (1, ))
    # enabled stemming -> no wildcard searches supported
    self.assertRaises(ValueError, index.search, 'existentiell*')
    self.assertRaises(ValueError, index.search, 'existent?foo')
    self.assertRaises(ValueError, index.search, '*a')
def testSearchAllFields(self):
    """search_all_fields=True searches every field; combining it with
    an explicit 'field' argument is an error."""
    o1 = Mock('en', text=u'The quick brown fox', title=u'Fox')
    o2 = Mock('en', text=u'Mary had a little lamb.', title=u'Quick Mary')
    o3 = Mock('en', text=u'Pop goes the weasel!', title=u'Weasel')
    index = Index(fields=('title', 'text'), languages=('en',))
    index.index_object(o1, 1)
    index.index_object(o2, 2)
    index.index_object(o3, 3)
    # Plain search finds only doc 2 ...
    res = index.search(u'quick')
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual([2], list(res.getDocids()))
    # ... searching all fields also matches 'quick' in doc 1's text.
    res = index.search(u'quick', search_all_fields=True)
    self.assertEqual([1, 2], list(res.getDocids()))
    self.assertRaises(ValueError, index.search, u'quick', field='text',
                      search_all_fields=True)
def testMultipleFieldsMultipleLanguages(self):
    index = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
    # Three German documents with distinct authors.
    documents = ((de1, u'Andreas Jung', 1),
                 (de2, u'Andrea Jung', 2),
                 (de3, u'der Nasbär', 3))
    for raw, author, docid in documents:
        index.index_object(
            Mock('de', text=unicode(raw, 'iso-8859-15'), author=author),
            docid)
    # Without an explicit field the author names are not found.
    self._test(index, u'andreas jung', 'en', ())
    self._test(index, u'andreas jung', 'de', ())
    # Searches restricted to the 'author' field.
    self._test(index, u'andreas jung', 'de', (1, ), 'author')
    self._test(index, u'jung andreas', 'de', (1, ), 'author')
    self._test(index, u'"jung andreas"', 'de', (), 'author')
    self._test(index, u'"andreas jung"', 'de', (1, ), 'author')
    self._test(index, u'andrea jung', 'de', (2, ), 'author')
    self._test(index, u'andreas jung', 'de', (1, ), 'author')
    self._test(index, u'na*', 'de', (3, ), 'author')
def testSplitterOnQueryWithDefaultSplitter(self):
    """Queries must be split with the same (default) splitter used at
    indexing time so 'abc.de-Efgh' matches consistently."""
    from zopyx.txng3.core.splitter import SplitterFactory
    provideUtility(SplitterFactory, zope.component.interfaces.IFactory,
                   name='txng.splitters.default')
    index = Index(fields=('text', ), languages=('en', ),
                  splitter_additional_chars='-',
                  splitter='txng.splitters.default')
    index.index_object(Mock('en', text=u'asdf abc.de-Efgh bla bla fasel'), 1)
    index.index_object(Mock('en', text=u'asdf abc de-Efgh bla bla fasel'), 2)
    index.index_object(Mock('en', text=u'asdf abc'), 3)
    # (removed: unused local 'lexicon = I.getLexicon()')
    self._test(index, u'abc.de-Efgh', 'en', (1, 2), 'text')
    # Test with a more complex query
    self._test(index, u'sth OR abc.de-Efgh', 'en', (1, 2), 'text')
    # Test with a "not" query
    self._test(index, u'asdf AND NOT abc.de-Efgh', 'en', (3, ), 'text')
def test_ranking_method(self):
    """search() must dispatch to the ranking utility registered under
    the name given by ranking_method."""
    ranked = []   # value the fake ranking utility returns
    called = []   # records the arguments of each ranking call

    def ranking(*args):
        called.append(args)
        return ranked

    provideUtility(ranking, IRanking, name='testranking')
    provideUtility(
        zopyx.txng3.core.storage.StorageWithTermFrequencyFactory,
        zope.component.interfaces.IFactory, name='freq')
    index = Index(fields=('text', ), storage='freq', ranking=True,
                  ranking_method='testranking')
    # NOTE: the original bound both the fake's return value and the
    # search result to the same name 'result'; renamed to 'ranked' to
    # remove the confusing shadowing (behavior unchanged).
    result = index.search(u'foo')
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(1, len(called))
    self.assertEqual((index, result, DEFAULT_LANGUAGE, 50), called[0])
def testSplitterOnQueryWithDefaultSplitter(self):
    """Queries must be split with the same (default) splitter used at
    indexing time so 'abc.de-Efgh' matches consistently."""
    from zopyx.txng3.core.splitter import SplitterFactory
    provideUtility(SplitterFactory, zope.component.interfaces.IFactory,
                   name='txng.splitters.default')
    index = Index(fields=('text',), languages=('en',),
                  splitter_additional_chars='-',
                  splitter='txng.splitters.default')
    index.index_object(Mock('en', text=u'asdf abc.de-Efgh bla bla fasel'), 1)
    index.index_object(Mock('en', text=u'asdf abc de-Efgh bla bla fasel'), 2)
    index.index_object(Mock('en', text=u'asdf abc'), 3)
    # (removed: unused local 'lexicon = I.getLexicon()')
    self._test(index, u'abc.de-Efgh', 'en', (1, 2), 'text')
    # Test with a more complex query
    self._test(index, u'sth OR abc.de-Efgh', 'en', (1, 2), 'text')
    # Test with a "not" query
    self._test(index, u'asdf AND NOT abc.de-Efgh', 'en', (3,), 'text')
def __init__(self, field_name=None, interface=None, field_callable=False,
             use_stemmer=defaults['use_stemmer'],
             dedicated_storage=defaults['dedicated_storage'],
             ranking=defaults['ranking'],
             use_normalizer=defaults['use_normalizer'],
             languages=defaults['languages'],
             use_stopwords=defaults['use_stopwords'],
             autoexpand_limit=defaults['autoexpand_limit'],
             splitter=defaults['splitter'],
             index_unknown_languages=defaults['index_unknown_languages'],
             query_parser=defaults['query_parser'],
             lexicon=defaults['lexicon'],
             splitter_additional_chars=defaults['splitter_add_chars'],
             storage=defaults['storage'],
             splitter_casefolding=defaults['splitter_casefolding'],
             asIFSet=True):
    """Create the index.

    ``field_name`` may be a single attribute name or a space-separated
    string of names; ``languages`` is likewise a space-separated
    string.  Raises ValueError when ranking is requested but the
    configured storage does not provide term frequencies.
    """
    if ranking:
        # Ranking needs term frequencies -- verify the storage early.
        util = createObject(storage)
        if not IStorageWithTermFrequency.providedBy(util):
            raise ValueError("This storage cannot be used for ranking")

    # Accept a sequence of field names or a space-separated string.
    if isinstance(field_name, basestring):
        _fields = field_name.split(' ')
    else:
        _fields = field_name
    zope.catalog.attribute.AttributeIndex.__init__(
        self, _fields[0], interface, field_callable)
    if len(_fields) < 2:
        # A single field never needs dedicated per-field storages.
        dedicated_storage = False
    _default_fields = [_fields[0]]

    # The actual full-text machinery.
    self._index = Index(
        fields=_fields,
        languages=languages.split(' '),
        use_stemmer=use_stemmer,
        dedicated_storage=dedicated_storage,
        ranking=ranking,
        use_normalizer=use_normalizer,
        use_stopwords=use_stopwords,
        storage=storage,
        autoexpand_limit=autoexpand_limit,
        splitter=splitter,
        lexicon=lexicon,
        index_unknown_languages=index_unknown_languages,
        query_parser=query_parser,
        splitter_additional_chars=splitter_additional_chars,
        splitter_casefolding=splitter_casefolding
    )
    # Remember the configuration for later introspection.
    self.languages = languages
    self.use_stemmer = use_stemmer
    self.dedicated_storage = dedicated_storage
    self.ranking = ranking
    self.use_normalizer = use_normalizer
    self.use_stopwords = use_stopwords
    self.interface = interface
    self.storage = storage
    self.autoexpand_limit = autoexpand_limit
    self.default_fields = _default_fields
    self._fields = _fields
    self.splitter = splitter
    self.lexicon = lexicon
    self.index_unknown_languages = index_unknown_languages
    self.query_parser = query_parser
    self.splitter_additional_chars = splitter_additional_chars
    self.splitter_casefolding = splitter_casefolding
    # Whether apply() wraps its result in a BTrees IFSet.
    self._asIFSet = asIFSet
class TingIndex(zope.catalog.text.TextIndex, persistent.Persistent):
    """zope.catalog text index backed by a zopyx.txng3 ``Index``."""

    zope.interface.implements(
        zope.index.interfaces.IInjection,
        zope.index.interfaces.IStatistics,
        zope.index.interfaces.IIndexSearch,
        ITingIndex)

    def __init__(self, field_name=None, interface=None, field_callable=False,
                 use_stemmer=defaults['use_stemmer'],
                 dedicated_storage=defaults['dedicated_storage'],
                 ranking=defaults['ranking'],
                 use_normalizer=defaults['use_normalizer'],
                 languages=defaults['languages'],
                 use_stopwords=defaults['use_stopwords'],
                 autoexpand_limit=defaults['autoexpand_limit'],
                 splitter=defaults['splitter'],
                 index_unknown_languages=defaults['index_unknown_languages'],
                 query_parser=defaults['query_parser'],
                 lexicon=defaults['lexicon'],
                 splitter_additional_chars=defaults['splitter_add_chars'],
                 storage=defaults['storage'],
                 splitter_casefolding=defaults['splitter_casefolding'],
                 asIFSet=True):
        """Create the index.

        ``field_name`` may be a single attribute name or a
        space-separated string of names; ``languages`` is likewise a
        space-separated string.  Raises ValueError when ranking is
        requested but the configured storage does not provide term
        frequencies.
        """
        if ranking:
            # Ranking needs term frequencies -- verify the storage early.
            util = createObject(storage)
            if not IStorageWithTermFrequency.providedBy(util):
                raise ValueError("This storage cannot be used for ranking")

        # Accept a sequence of field names or a space-separated string.
        if isinstance(field_name, basestring):
            _fields = field_name.split(' ')
        else:
            _fields = field_name
        zope.catalog.attribute.AttributeIndex.__init__(
            self, _fields[0], interface, field_callable)
        if len(_fields) < 2:
            # A single field never needs dedicated per-field storages.
            dedicated_storage = False
        _default_fields = [_fields[0]]

        # The actual full-text machinery.
        self._index = Index(
            fields=_fields,
            languages=languages.split(' '),
            use_stemmer=use_stemmer,
            dedicated_storage=dedicated_storage,
            ranking=ranking,
            use_normalizer=use_normalizer,
            use_stopwords=use_stopwords,
            storage=storage,
            autoexpand_limit=autoexpand_limit,
            splitter=splitter,
            lexicon=lexicon,
            index_unknown_languages=index_unknown_languages,
            query_parser=query_parser,
            splitter_additional_chars=splitter_additional_chars,
            splitter_casefolding=splitter_casefolding
        )
        # Remember the configuration for later introspection.
        self.languages = languages
        self.use_stemmer = use_stemmer
        self.dedicated_storage = dedicated_storage
        self.ranking = ranking
        self.use_normalizer = use_normalizer
        self.use_stopwords = use_stopwords
        self.interface = interface
        self.storage = storage
        self.autoexpand_limit = autoexpand_limit
        self.default_fields = _default_fields
        self._fields = _fields
        self.splitter = splitter
        self.lexicon = lexicon
        self.index_unknown_languages = index_unknown_languages
        self.query_parser = query_parser
        self.splitter_additional_chars = splitter_additional_chars
        self.splitter_casefolding = splitter_casefolding
        # Whether apply() wraps its result in a BTrees IFSet.
        self._asIFSet = asIFSet

    def clear(self):
        # Drop all indexed content.
        self._index.clear()

    def documentCount(self):
        """See interface IStatistics """
        return len(self._index.getStorage(self.default_fields[0]))

    def wordCount(self):
        """See interface IStatistics """
        return len(self._index.getLexicon())

    def index_doc(self, docid, value):
        """See interface IInjection """
        if value is not None:
            self._index.index_object(value, docid)

    def unindex_doc(self, docid):
        """See interface IInjection """
        self._index.unindex_object(docid)

    def apply(self, query):
        # 'query' is either a plain query string or a dict carrying the
        # query under 'query' plus additional search keyword options.
        kw = dict()
        if isinstance(query, dict):
            kw.update(query)
            query = kw['query']
            del kw['query']
        res = self._index.search(query, **kw).getDocids()
        if self._asIFSet:
            # zope.catalog expects an IFSet for set operations.
            return BTrees.IFBTree.IFSet(res)
        return res
def testIndexAndUnindex(self):
    index = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
    index.index_object(
        Mock('de', text=unicode(de1, 'iso-8859-15'),
             author=u'Andreas Jung'), 1)
    index.index_object(
        Mock('de', text=unicode(de2, 'iso-8859-15'),
             author=u'Andrea Jung'), 2)
    index.index_object(
        Mock('de', text=unicode(de3, 'iso-8859-15'),
             author=u'der Nasbär'), 3)
    self._test(index, u'andreas jung', 'de', (1,), 'author')

    # Unindex everything (9999 was never indexed) -> empty result.
    for docid in (1, 2, 3, 9999):
        index.unindex_object(docid)
    self._test(index, u'andreas jung', 'de', (), 'author')

    # Re-adding doc 1 restores the author hit but not a text hit.
    index.index_object(
        Mock('de', text=unicode(de1, 'iso-8859-15'),
             author=u'Andreas Jung'), 1)
    self._test(index, u'andreas jung', 'de', (1,), 'author')
    self._test(index, u'andreas jung', 'de', (), 'text')
    self._test(index, u'das opfer wird', 'de', (1,), 'text')

    # Reindexing docid 1 with new content replaces its terms.
    index.index_object(
        Mock('de', text=unicode(de2, 'iso-8859-15'),
             author=u'Andrea Jung'), 1)
    self._test(index, u'andrea jung', 'de', (1,), 'author')
def testReindex2(self):
    # Reindexing the same docid replaces the previously indexed terms.
    index = Index(fields=('text',), languages=('de',))
    index.index_object(Mock('de', text=u'foo bar'), 1)
    index.index_object(Mock('de', text=u'foo'), 1)
    self._test(index, u'bar', 'de', (), 'text')
    self._test(index, u'foo', 'de', (1,), 'text')
def testDE(self):
    # With index_unknown_languages=False the index only accepts its
    # configured language ('de') and raises for everything else.
    index = Index(fields=('text',), languages=('de', ),
                  index_unknown_languages=False)
    index.index_object(Mock('de', text=unicode(de1, 'iso-8859-15')), 1)
    # this raises an exception because the index does not know about 'fr' or 'en'
    self.assertRaises(ValueError, index.index_object,
                      Mock('fr', text=unicode(fr1, 'iso-8859-15')), 4)
    self.assertRaises(ValueError, index.index_object,
                      Mock('en', text=unicode(en1, 'iso-8859-15')), 5)
def testIndexAndUnindex(self):
    index = Index(fields=('text', 'author'), languages=('de', 'fr', 'en'))
    index.index_object(
        Mock('de', text=unicode(de1, 'iso-8859-15'),
             author=u'Andreas Jung'), 1)
    index.index_object(
        Mock('de', text=unicode(de2, 'iso-8859-15'),
             author=u'Andrea Jung'), 2)
    index.index_object(
        Mock('de', text=unicode(de3, 'iso-8859-15'),
             author=u'der Nasbär'), 3)
    self._test(index, u'andreas jung', 'de', (1, ), 'author')

    # Unindex everything (9999 was never indexed) -> empty result.
    for docid in (1, 2, 3, 9999):
        index.unindex_object(docid)
    self._test(index, u'andreas jung', 'de', (), 'author')

    # Re-adding doc 1 restores the author hit but not a text hit.
    index.index_object(
        Mock('de', text=unicode(de1, 'iso-8859-15'),
             author=u'Andreas Jung'), 1)
    self._test(index, u'andreas jung', 'de', (1, ), 'author')
    self._test(index, u'andreas jung', 'de', (), 'text')
    self._test(index, u'das opfer wird', 'de', (1, ), 'text')

    # Reindexing docid 1 with new content replaces its terms.
    index.index_object(
        Mock('de', text=unicode(de2, 'iso-8859-15'),
             author=u'Andrea Jung'), 1)
    self._test(index, u'andrea jung', 'de', (1, ), 'author')
def testEmptyQuery(self):
    # An empty query string is rejected with ValueError.
    index = Index(fields=('foo',))
    self.assertRaises(ValueError, index.search, query='')
def testReindex2(self):
    # Reindexing the same docid replaces the previously indexed terms.
    index = Index(fields=('text', ), languages=('de', ))
    index.index_object(Mock('de', text=u'foo bar'), 1)
    index.index_object(Mock('de', text=u'foo'), 1)
    self._test(index, u'bar', 'de', (), 'text')
    self._test(index, u'foo', 'de', (1, ), 'text')
def __init__(self, field_name=None, interface=None, field_callable=False,
             use_stemmer=defaults['use_stemmer'],
             dedicated_storage=defaults['dedicated_storage'],
             ranking=defaults['ranking'],
             use_normalizer=defaults['use_normalizer'],
             languages=defaults['languages'],
             use_stopwords=defaults['use_stopwords'],
             autoexpand_limit=defaults['autoexpand_limit'],
             splitter=defaults['splitter'],
             index_unknown_languages=defaults['index_unknown_languages'],
             query_parser=defaults['query_parser'],
             lexicon=defaults['lexicon'],
             splitter_additional_chars=defaults['splitter_add_chars'],
             storage=defaults['storage'],
             splitter_casefolding=defaults['splitter_casefolding'],
             asIFSet=True):
    """Create the index.

    ``field_name`` may be a single attribute name or a space-separated
    string of names; ``languages`` is likewise a space-separated
    string.  Raises ValueError when ranking is requested but the
    configured storage does not provide term frequencies.
    """
    if ranking:
        # Ranking needs term frequencies -- verify the storage early.
        util = createObject(storage)
        if not IStorageWithTermFrequency.providedBy(util):
            raise ValueError("This storage cannot be used for ranking")

    # Accept a sequence of field names or a space-separated string.
    if isinstance(field_name, basestring):
        _fields = field_name.split(' ')
    else:
        _fields = field_name
    zope.catalog.attribute.AttributeIndex.__init__(self, _fields[0],
                                                   interface,
                                                   field_callable)
    if len(_fields) < 2:
        # A single field never needs dedicated per-field storages.
        dedicated_storage = False
    _default_fields = [_fields[0]]

    # The actual full-text machinery.
    self._index = Index(
        fields=_fields,
        languages=languages.split(' '),
        use_stemmer=use_stemmer,
        dedicated_storage=dedicated_storage,
        ranking=ranking,
        use_normalizer=use_normalizer,
        use_stopwords=use_stopwords,
        storage=storage,
        autoexpand_limit=autoexpand_limit,
        splitter=splitter,
        lexicon=lexicon,
        index_unknown_languages=index_unknown_languages,
        query_parser=query_parser,
        splitter_additional_chars=splitter_additional_chars,
        splitter_casefolding=splitter_casefolding)
    # Remember the configuration for later introspection.
    self.languages = languages
    self.use_stemmer = use_stemmer
    self.dedicated_storage = dedicated_storage
    self.ranking = ranking
    self.use_normalizer = use_normalizer
    self.use_stopwords = use_stopwords
    self.interface = interface
    self.storage = storage
    self.autoexpand_limit = autoexpand_limit
    self.default_fields = _default_fields
    self._fields = _fields
    self.splitter = splitter
    self.lexicon = lexicon
    self.index_unknown_languages = index_unknown_languages
    self.query_parser = query_parser
    self.splitter_additional_chars = splitter_additional_chars
    self.splitter_casefolding = splitter_casefolding
    # Whether apply() wraps its result in a BTrees IFSet.
    self._asIFSet = asIFSet
class TextIndexNG3(SimpleItem, PropertyManager):
    """Zope 2 pluggable full-text index based on the txng3 ``Index``."""

    implements(ITextIndexNG3, IPluggableIndex)

    meta_type = 'TextIndexNG3'

    default_encoding = 'iso-8859-15'  # I think we don't need this anymore

    management_page_charset = 'utf-8'  # needed for several ZMI methods

    manage_options = (
        {'label': 'Index', 'action': 'manage_workspace'},
        {'label': 'Vocabulary', 'action': 'vocabularyform'},
        {'label': 'Test', 'action': 'queryform'},
        {'label': 'Converters', 'action': 'converters'},
        {'label': 'Thesaurus', 'action': 'thesaurus'},
        {'label': 'Adapters', 'action': 'adapters'},
    ) + SimpleItem.manage_options + PropertyManager.manage_options

    # Options understood inside a catalog query record for this index.
    query_options = ('query', 'encoding', 'parser', 'language', 'field',
                     'autoexpand', 'similarity_ratio', 'ranking',
                     'ranking_maxhits', 'thesaurus', 'search_all_fields')

    def __init__(self, id, extra, caller):
        """Create the pluggable index.

        :param id: index id; also used as title and as the default
            indexed field
        :param extra: record/mapping with the configuration options,
            read through ``get`` with per-option defaults
        :param caller: unused here; part of the pluggable-index
            constructor signature
        """
        self.id = id
        self.title = id

        # fields
        fields = [id]  # default
        if get(extra, 'fields', []):
            fields = get(extra, 'fields')

        # Build the underlying txng3 Index; every option falls back to
        # a module-level default when absent from 'extra'.
        self.index = Index(
            fields=fields,
            lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
            storage=get(extra, 'storage', DEFAULT_STORAGE),
            splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
            autoexpand=get(extra, 'autoexpand', 'off'),
            autoexpand_limit=get(extra, 'autoexpand_limit', 4),
            query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
            use_stemmer=get(extra, 'use_stemmer', False),
            languages=get(extra, 'languages', ('en',)),
            use_stopwords=bool(get(extra, 'use_stopwords')),
            default_encoding=get(extra, 'default_encoding',
                                 DEFAULT_ENCODING),
            use_normalizer=bool(get(extra, 'use_normalizer')),
            dedicated_storage=bool(get(extra, 'dedicated_storage')),
            splitter_casefolding=bool(
                get(extra, 'splitter_casefolding', True)),
            splitter_additional_chars=get(extra,
                                          'splitter_additional_chars',
                                          DEFAULT_ADDITIONAL_CHARS),
            index_unknown_languages=bool(
                get(extra, 'index_unknown_languages', True)),
            ranking=bool(get(extra, 'ranking')),
            ranking_method=(get(extra, 'ranking_method', DEFAULT_RANKING)),
        )

    def clear(self):
        """ clear the index """
        self.index.clear()

    def index_object(self, docid, obj, threshold=None):
        # Delegate to the txng3 index (note the swapped argument order)
        # and report success as 0/1 as the ZCatalog API expects.
        result = self.index.index_object(obj, docid)
        return int(result)

    def unindex_object(self, docid):
        self.index.unindex_object(docid)
        return 1

    def getIndexSourceNames(self):
        """ return indexed fields """
        return self.index.fields

    def indexSize(self):
        # Number of distinct words known to the lexicon.
        return len(self.index.getLexicon())

    def getEntryForObject(self, docid, default=None):
        """Get all information contained for 'docid'.

        Returns a string representing a mapping field -> list of
        indexed words for dedicated storages or a list of indexed
        words for shared storage.
        """
        getWord = self.index.getLexicon().getWord
        d = {}
        for field in self.index.fields:
            try:
                wids = self.index.getStorage(field).getWordIdsForDocId(docid)
            except StorageException:
                # Field has no entry for this docid.
                wids = ()
            words = [getWord(wid) for wid in wids]
            d[field] = words
        if not self.index.dedicated_storage:
            # Shared storage: all fields hold the same data -- report
            # just the first field's words.
            return repr(d[self.index.fields[0]])
        return repr(d)

    def _apply_index(self, request, cid=''):
        # parse the query options
        record = parseIndexRequest(request, self.getId(), self.query_options)
        if record.keys is None:
            return None

        # prepare query (must be unicode string)
        query = record.keys[0]
        if not isinstance(query, unicode):
            query = unicode(query,
                            record.get('encoding',
                                       self.index.default_encoding),
                            'ignore')
        if not query:
            return None

        # options: forward only those present on the request record.
        options = {}
        for k in ('parser', 'language', 'field', 'autoexpand',
                  'similarity_ratio', 'thesaurus', 'ranking',
                  'ranking_maxhits', 'search_all_fields'):
            v = getattr(record, k, marker)
            if v is not marker:
                options[k] = v

        result = self.index.search(query, **options)
        # Prefer ranked results when the search produced them.
        ranked_resultset = result.getRankedResults()
        if ranked_resultset:
            return ranked_resultset, self.id
        else:
            return result.getDocids(), self.id

    def __len__(self):
        return len(self.index)

    numObjects = __len__

    def manage_workspace(self, REQUEST):
        """ redirect to manage since we can not override
            manage_workspace through a Five browser view
        """
        from zope.component import getMultiAdapter
        view = getMultiAdapter((self, REQUEST), name='manageform')
        return view()
def testSettings(self):
    # The simple splitter is the configured default.
    index = Index(fields=('foo',))
    self.assertEqual('txng.splitters.simple', index.splitter)
def test_default_ranking_is_cosine(self):
    """Cosine ranking is the default ranking method."""
    index = Index()
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual('txng.ranking.cosine', index.ranking_method)
def testSearchAllFieldsNotSupportWithoutDedicatedStorage(self):
    # search_all_fields requires dedicated per-field storages.
    index = Index(fields=('title', 'text'), dedicated_storage=False)
    self.assertRaises(ValueError, index.search, u'quick',
                      search_all_fields=True)
class TextIndexNG3(SimpleItem, PropertyManager):
    """Zope 2 pluggable text index.

    All indexing and searching work is delegated to the wrapped
    TextIndexNG3 ``Index`` instance stored as ``self.index``; this class
    only adapts the pluggable-index API (and some ZMI plumbing) to it.
    """

    implements(ITextIndexNG3, IPluggableIndex)

    meta_type = 'TextIndexNG3'

    default_encoding = 'iso-8859-15'   # I think we don't need this anymore
    management_page_charset = 'utf-8'  # needed for several ZMI methods

    manage_options = (
        {'label': 'Index', 'action': 'manage_workspace'},
        {'label': 'Vocabulary', 'action': 'vocabularyform'},
        {'label': 'Test', 'action': 'queryform'},
        {'label': 'Converters', 'action': 'converters'},
        {'label': 'Thesaurus', 'action': 'thesaurus'},
        {'label': 'Adapters', 'action': 'adapters'},
    ) + SimpleItem.manage_options + PropertyManager.manage_options

    # keys recognized inside a catalog query record for this index
    query_options = ('query', 'encoding', 'parser', 'language', 'field',
                     'autoexpand', 'similarity_ratio', 'ranking',
                     'ranking_maxhits', 'thesaurus', 'search_all_fields')

    def __init__(self, id, extra, caller):
        """Create the index.

        ``id``     -- index id; also used as the default indexed field.
        ``extra``  -- mapping/record with configuration options, read
                      through the ``get()`` helper with DEFAULT_* fallbacks.
        ``caller`` -- part of the pluggable-index constructor signature;
                      unused here.
        """
        self.id = id
        self.title = id

        # fields
        fields = [id]  # default
        if get(extra, 'fields', []):
            fields = get(extra, 'fields')

        self.index = Index(
            fields=fields,
            lexicon=get(extra, 'lexicon', DEFAULT_LEXICON),
            storage=get(extra, 'storage', DEFAULT_STORAGE),
            splitter=get(extra, 'splitter', DEFAULT_SPLITTER),
            autoexpand=get(extra, 'autoexpand', 'off'),
            autoexpand_limit=get(extra, 'autoexpand_limit', 4),
            query_parser=get(extra, 'query_parser', 'txng.parsers.en'),
            use_stemmer=get(extra, 'use_stemmer', False),
            languages=get(extra, 'languages', ('en', )),
            use_stopwords=bool(get(extra, 'use_stopwords')),
            default_encoding=get(extra, 'default_encoding', DEFAULT_ENCODING),
            use_normalizer=bool(get(extra, 'use_normalizer')),
            dedicated_storage=bool(get(extra, 'dedicated_storage')),
            splitter_casefolding=bool(get(extra, 'splitter_casefolding', True)),
            splitter_additional_chars=get(extra, 'splitter_additional_chars',
                                          DEFAULT_ADDITIONAL_CHARS),
            index_unknown_languages=bool(
                get(extra, 'index_unknown_languages', True)),
            ranking=bool(get(extra, 'ranking')),
            ranking_method=(get(extra, 'ranking_method', DEFAULT_RANKING)),
        )

    def clear(self):
        """ clear the index """
        self.index.clear()

    def index_object(self, docid, obj, threshold=None):
        # Delegate to the wrapped Index; return 1 on success, 0 otherwise.
        # ``threshold`` is part of the pluggable-index signature but unused.
        result = self.index.index_object(obj, docid)
        return int(result)

    def unindex_object(self, docid):
        # Remove ``docid`` from the wrapped index.  Always returns 1.
        self.index.unindex_object(docid)
        return 1

    def getIndexSourceNames(self):
        """ return indexed fields """
        return self.index.fields

    def getIndexQueryNames(self):
        """ Return queryable parameters """
        return [self.id]

    def indexSize(self):
        # Size is measured as the number of words in the lexicon,
        # not the number of indexed documents.
        return len(self.index.getLexicon())

    def getEntryForObject(self, docid, default=None):
        """Get all information contained for 'docid'.

        Returns a string representing a mapping field -> list of indexed
        words for dedicated storages or a list of indexed words for
        shared storage.
        """
        getWord = self.index.getLexicon().getWord
        d = {}
        for field in self.index.fields:
            try:
                wids = self.index.getStorage(field).getWordIdsForDocId(docid)
            except StorageException:
                # docid unknown to this field's storage -> no indexed words
                wids = ()
            words = [getWord(wid) for wid in wids]
            d[field] = words
        if not self.index.dedicated_storage:
            # shared storage: all fields share one storage; return its words
            return repr(d[self.index.fields[0]])
        return repr(d)

    def _apply_index(self, request, cid=''):
        # parse the query options
        record = parseIndexRequest(request, self.getId(), self.query_options)
        if record.keys is None:
            return None

        # prepare query (must be unicode string)
        query = record.keys[0]
        if not isinstance(query, str):
            # NOTE(review): assumes a non-str query is bytes-like —
            # str(bytes, encoding, 'ignore') decodes it; other types
            # would raise TypeError.  Confirm against callers.
            query = str(query, record.get('encoding', self.index.default_encoding), 'ignore')
        if not query:
            return None

        # options
        options = {}
        for k in ('parser', 'language', 'field', 'autoexpand',
                  'similarity_ratio', 'thesaurus', 'ranking',
                  'ranking_maxhits', 'search_all_fields'):
            v = getattr(record, k, marker)
            if v is not marker:
                options[k] = v

        result = self.index.search(query, **options)
        # prefer the ranked result set when ranking produced one
        ranked_resultset = result.getRankedResults()
        if ranked_resultset:
            return ranked_resultset, self.id
        else:
            return result.getDocids(), self.id

    def __len__(self):
        return len(self.index)

    # NOTE(review): presumably the ZCatalog-facing alias for __len__ — confirm
    numObjects = __len__

    def manage_workspace(self, REQUEST):
        """ redirect to manage since we can not override manage_workspace
            through a Five browser view
        """
        from zope.component import getMultiAdapter
        view = getMultiAdapter((self, REQUEST), name='manageform')
        return view()
def testEmpty(self):
    # An empty query against a fresh index returns no hits.
    index = Index(fields=('oo',))
    self._test(index, u'', ())
def testSplitter(self):
    # With casefolding disabled the splitter preserves the original case.
    index = Index(splitter_casefolding=False, fields=('foo',))
    self._test(index, u'a B c', ('a', 'B', 'c'))
def testSetup(self):
    # Constructing and wiring up a multi-language index must succeed.
    index = Index(fields=('text',), languages=('de', 'fr', 'en'))
    self.setupIndex(index)
def testBBBCosineRanking(self):
    # Backwards-compatibility: cosine_ranking() must still be callable.
    resultset = ResultSet(DocidList((2, 3)), (('foo', 5),))
    index = Index()
    resultset.cosine_ranking(index)
class TingIndex(zope.catalog.text.TextIndex, persistent.Persistent):
    """zope.catalog text index backed by a TextIndexNG3 ``Index``.

    Adapts the zope.index IInjection/IStatistics/IIndexSearch contracts
    to the wrapped ``self._index``.
    """

    zope.interface.implements(zope.index.interfaces.IInjection,
                              zope.index.interfaces.IStatistics,
                              zope.index.interfaces.IIndexSearch,
                              ITingIndex)

    def __init__(self, field_name=None, interface=None, field_callable=False,
                 use_stemmer=defaults['use_stemmer'],
                 dedicated_storage=defaults['dedicated_storage'],
                 ranking=defaults['ranking'],
                 use_normalizer=defaults['use_normalizer'],
                 languages=defaults['languages'],
                 use_stopwords=defaults['use_stopwords'],
                 autoexpand_limit=defaults['autoexpand_limit'],
                 splitter=defaults['splitter'],
                 index_unknown_languages=defaults['index_unknown_languages'],
                 query_parser=defaults['query_parser'],
                 lexicon=defaults['lexicon'],
                 splitter_additional_chars=defaults['splitter_add_chars'],
                 storage=defaults['storage'],
                 splitter_casefolding=defaults['splitter_casefolding'],
                 asIFSet=True):
        """Create the index.

        ``field_name`` may be a single attribute name or a space-separated
        list of names (one indexed field each).  ``asIFSet`` controls
        whether ``apply()`` returns a BTrees IFSet or the raw docid list.
        Raises ValueError when ranking is requested but the configured
        storage utility does not provide term frequencies.
        """
        if ranking:
            # ranking needs per-term frequencies; verify the storage
            # utility actually supports them before committing to it
            util = createObject(storage)
            if not IStorageWithTermFrequency.providedBy(util):
                raise ValueError("This storage cannot be used for ranking")

        # accept either 'field' or 'field1 field2 ...' as field_name
        if isinstance(field_name, basestring):
            _fields = field_name.split(' ')
        else:
            _fields = field_name

        # the attribute index is anchored on the first field only
        zope.catalog.attribute.AttributeIndex.__init__(self, _fields[0],
                                                       interface,
                                                       field_callable)

        if len(_fields) < 2:
            # dedicated per-field storage makes no sense for one field
            dedicated_storage = False
        _default_fields = [_fields[0]]

        self._index = Index(
            fields=_fields,
            languages=languages.split(' '),
            use_stemmer=use_stemmer,
            dedicated_storage=dedicated_storage,
            ranking=ranking,
            use_normalizer=use_normalizer,
            use_stopwords=use_stopwords,
            storage=storage,
            autoexpand_limit=autoexpand_limit,
            splitter=splitter,
            lexicon=lexicon,
            index_unknown_languages=index_unknown_languages,
            query_parser=query_parser,
            splitter_additional_chars=splitter_additional_chars,
            splitter_casefolding=splitter_casefolding)

        # mirror the configuration on the (persistent) instance
        self.languages = languages
        self.use_stemmer = use_stemmer
        self.dedicated_storage = dedicated_storage
        self.ranking = ranking
        self.use_normalizer = use_normalizer
        self.use_stopwords = use_stopwords
        self.interface = interface
        self.storage = storage
        self.autoexpand_limit = autoexpand_limit
        self.default_fields = _default_fields
        self._fields = _fields
        self.splitter = splitter
        self.lexicon = lexicon
        self.index_unknown_languages = index_unknown_languages
        self.query_parser = query_parser
        self.splitter_additional_chars = splitter_additional_chars
        self.splitter_casefolding = splitter_casefolding
        self._asIFSet = asIFSet

    def clear(self):
        # drop all indexed content from the wrapped index
        self._index.clear()

    def documentCount(self):
        """See interface IStatistics """
        # counted via the storage of the default (first) field
        return len(self._index.getStorage(self.default_fields[0]))

    def wordCount(self):
        """See interface IStatistics """
        return len(self._index.getLexicon())

    def index_doc(self, docid, value):
        """See interface IInjection """
        if value is not None:
            self._index.index_object(value, docid)

    def unindex_doc(self, docid):
        """See interface IInjection """
        self._index.unindex_object(docid)

    def apply(self, query):
        # ``query`` is either a plain query string or a dict whose 'query'
        # key holds the string and whose remaining keys become search
        # keyword options.
        kw = dict()
        if isinstance(query, dict):
            kw.update(query)
            query = kw['query']
            del kw['query']
        res = self._index.search(query, **kw).getDocids()
        if self._asIFSet:
            # zope.catalog expects IFSets for set intersection
            return BTrees.IFBTree.IFSet(res)
        return res