def testPairs(self):
    t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
    t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
    allkeys = [1, 3, 5, 7, 9]
    b1 = IIBucket(t1)
    b2 = IIBucket(t2)
    for x in t1, t2, b1, b2:
        for key in x.keys():
            self.assertEqual(key in allkeys, 1)
        for y in t1, t2, b1, b2:
            for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
                # Test the union.
                expected = []
                for key in allkeys:
                    if x.has_key(key) or y.has_key(key):
                        result = x.get(key, 0) * w1 + y.get(key, 0) * w2
                        expected.append((key, result))
                expected.sort()
                got = mass_weightedUnion([(x, w1), (y, w2)])
                self.assertEqual(expected, list(got.items()))
                got = mass_weightedUnion([(y, w2), (x, w1)])
                self.assertEqual(expected, list(got.items()))
                # Test the intersection.
                expected = []
                for key in allkeys:
                    if x.has_key(key) and y.has_key(key):
                        result = x[key] * w1 + y[key] * w2
                        expected.append((key, result))
                expected.sort()
                got = mass_weightedIntersection([(x, w1), (y, w2)])
                self.assertEqual(expected, list(got.items()))
                got = mass_weightedIntersection([(y, w2), (x, w1)])
                self.assertEqual(expected, list(got.items()))
def _trivial(l_):
    # l_ is empty or has only one (mapping, weight) pair.  If there is a
    # pair, we may still need to multiply the mapping by its weight.
    assert len(l_) <= 1
    if len(l_) == 0:
        return IIBucket()
    [(result, weight)] = l_
    if weight != 1:
        dummy, result = weightedUnion(IIBucket(), result, 0, weight)
    return result
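# Hedged illustration (not from the original source): _trivial above relies on
# weightedUnion(IIBucket(), result, 0, weight) to multiply every value in
# `result` by `weight`.  The sketch below, assuming only the BTrees package,
# shows that behaviour on a tiny bucket; the demo function name is made up.
def _weighted_scaling_demo():
    from BTrees.IIBTree import IIBucket, weightedUnion
    scores = IIBucket([(1, 10), (2, 20)])
    # Weights (0, 3): the empty bucket contributes nothing, scores are tripled.
    dummy, scaled = weightedUnion(IIBucket(), scores, 0, 3)
    assert list(scaled.items()) == [(1, 30), (2, 60)]
    return scaled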
def histogram(self, type=type, TupleType=type(())):
    """Return a mapping which provides a histogram of the number of
    elements found at each point in the index.
    """
    histogram = IIBucket()
    for (key, value) in self._index.items():
        if type(value) is TupleType:
            entry = 1
        else:
            entry = len(value)
        histogram[entry] = histogram.get(entry, 0) + 1
    return histogram
def _apply_index(self, index, value):
    """ Default portal_catalog index _apply_index """
    index_id = index.getId()
    apply_index = getattr(index, '_apply_index', None)
    if not apply_index:
        return IIBucket(), (index_id, )
    rset = apply_index({index_id: value})
    if not rset:
        return IIBucket(), (index_id, )
    return rset
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    #K1 = self.K1
    #B = self.B
    #K1_plus1 = K1 + 1.0
    #B_from1 = 1.0 - B

    #                           f(D, t) * (k1 + 1)
    #   TF(D, t) =  -------------------------------------------
    #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        score(result, d2f.items(), docid2len, idf, meandoclen)
        L.append((result, 1))
    return L
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    K1 = self.K1
    B = self.B
    K1_plus1 = K1 + 1.0
    B_from1 = 1.0 - B

    #                           f(D, t) * (k1 + 1)
    #   TF(D, t) =  -------------------------------------------
    #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        for docid, f in d2f.items():
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            tf = f * K1_plus1 / (f + K1 * lenweight)
            result[docid] = scaled_int(tf * idf)
        L.append((result, 1))
    return L
def near(self, x):
    result = IIBucket()
    dict = self._dict
    xdict = x._dict
    xhas = xdict.has_key
    positions = self._index.positions
    for id, score in dict.items():
        if not xhas(id):
            continue
        p = (map(lambda i: (i, 0), positions(id, self._words)) +
             map(lambda i: (i, 1), positions(id, x._words)))
        p.sort()
        d = lp = 9999
        li = None
        lsrc = None
        for i, src in p:
            if i is not li and src is not lsrc and li is not None:
                d = min(d, i - li)
            li = i
            lsrc = src
        if d == lp:
            score = min(score, xdict[id])  # synonyms
        else:
            score = (score + xdict[id]) / d
        result[id] = score
    return self.__class__(result, union(self._words, x._words), self._index)
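# Hedged sketch (not part of the original class): the distance logic inside
# near() above, isolated as a plain function.  It tags each position with its
# source word, sorts the merged list, and keeps the smallest gap between
# adjacent positions that came from different words.  The 9999 sentinel
# mirrors near(); the function name is made up for illustration.
def _min_word_gap(positions_a, positions_b):
    p = [(i, 0) for i in positions_a] + [(i, 1) for i in positions_b]
    p.sort()
    d = 9999          # sentinel meaning "no mixed-word gap found"
    li = lsrc = None  # previous position and its source tag
    for i, src in p:
        if li is not None and src != lsrc and i != li:
            d = min(d, i - li)
        li, lsrc = i, src
    return d

# Example: words at positions [2, 10] and [5] -> closest pair is 5 - 2 = 3.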
def _apply_index(self, index, value):
    """ Default portal_catalog index _apply_index """
    index_id = index.getId()
    apply_index = getattr(index, '_apply_index', None)
    if not apply_index:
        return IIBucket(), (index_id,)

    if isinstance(value, unicode):
        value = value.encode('utf-8', 'replace')

    rset = apply_index({index_id: value})
    if not rset:
        return IIBucket(), (index_id,)
    return rset
def apply_index(self, index, value):
    """ Custom catalog apply_index method """
    ctool = getToolByName(self, 'portal_catalog')
    catalog = queryMultiAdapter((self, ctool), IFacetedCatalog)
    if not catalog:
        return IIBucket(), (index.getId(), )
    return catalog.apply_index(index, value)
def testIdentity(self):
    t = IIBTree([(1, 2)])
    b = IIBucket([(1, 2)])
    for x in t, b:
        for func in mass_weightedUnion, mass_weightedIntersection:
            result = func([(x, 1)])
            self.assertEqual(len(result), 1)
            self.assertEqual(list(result.items()), list(x.items()))
def search(self, term):
    b = IIBucket()
    if term == "foo":
        b[1] = b[3] = 1
    elif term == "bar":
        b[1] = b[2] = 1
    elif term == "ham":
        b[1] = b[2] = b[3] = b[4] = 1
    return b
def __init__(self, d, words, index, TupleType=type(())):
    self._index = index

    if type(words) is not OOSet:
        words = OOSet(words)
    self._words = words

    if (type(d) is TupleType):
        d = IIBucket((d, ))
    elif type(d) is not IIBucket:
        d = IIBucket(d)
    self._dict = d
    self.__getitem__ = d.__getitem__
    try:
        self.__nonzero__ = d.__nonzero__
    except:
        pass
    self.get = d.get
def items(self):
    d = IIBucket()
    if self.ranked_results:
        max = self.ranked_results[0][1]
        for k, v in self.ranked_results:
            if max == 0:
                d[k] = 0
            else:
                d[k] = int(v / max * 1024.0)
    return d
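# Illustration (not from the original class): items() above normalises the
# ranked scores against the best hit and maps them onto the 0..1024 integer
# range that an IIBucket can store.  A self-contained version of that scaling,
# with a hypothetical helper name:
def _scale_to_1024(ranked_results):
    # ranked_results: list of (docid, float_score), best score first
    d = {}
    if ranked_results:
        best = ranked_results[0][1]
        for k, v in ranked_results:
            d[k] = 0 if best == 0 else int(v / best * 1024.0)
    return d

# _scale_to_1024([(7, 0.5), (3, 0.25)]) -> {7: 1024, 3: 512}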
def _apply_index(self, request, cid=''):
    """ Apply the index to query parameters given in the argument, request

    The argument should be a mapping object.

    If the request does not contain the needed parameters, then None is
    returned.

    Otherwise two objects are returned.  The first object is a ResultSet
    containing the record numbers of the matching records.  The second
    object is a tuple containing the names of all data fields used.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    # Changed for 2.4
    # We use the default operator that can be managed via the ZMI
    qop = record.get('operator', self.useOperator)

    # We keep this for pre-2.4 compatibility
    # This stinking code should go away somewhere. A global
    # textindex_operator makes no sense when using multiple
    # text indexes inside a catalog. An index operator should
    # be specified on a per-index base
    if request.has_key('textindex_operator'):
        qop = request['textindex_operator']
        warnings.warn("The usage of the 'textindex_operator' "
                      "is no longer recommended.\n"
                      "Please use a mapping object and the "
                      "'operator' key to specify the operator.")

    query_operator = operator_dict.get(qop)
    if query_operator is None:
        raise exceptions.RuntimeError("Invalid operator '%s' "
                                      "for a TextIndex" % escape(qop))
    r = None

    for key in record.keys:
        key = key.strip()
        if not key:
            continue

        b = self.query(key, query_operator).bucket()
        w, r = weightedIntersection(r, b)

    if r is not None:
        return r, (self.id, )

    return (IIBucket(), (self.id, ))
def testScalarMultiply(self):
    t = IIBTree([(1, 2), (2, 3), (3, 4)])
    allkeys = [1, 2, 3]
    b = IIBucket(t)
    for x in t, b:
        self.assertEqual(list(x.keys()), allkeys)
        for func in mass_weightedUnion, mass_weightedIntersection:
            for factor in 0, 1, 5, 10:
                result = func([(x, factor)])
                self.assertEqual(allkeys, list(result.keys()))
                for key in x.keys():
                    self.assertEqual(x[key] * factor, result[key])
def _search_wids(self, wids):
    if not wids:
        return []
    N = float(self.document_count())
    L = []
    DictType = type({})
    for wid in wids:
        assert wid in self._wordinfo  # caller responsible for OOV
        d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
        idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
        if isinstance(d2w, DictType):
            d2w = IIBucket(d2w)
        L.append((d2w, scaled_int(idf)))
    return L
def _search_wids(self, wids):
    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids.  The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.
    # As currently written, the weights are always 1, and the IIBucket maps
    # D to TF(D,t)*IDF(t) directly, where the product is computed
    # as a float but stored as a scaled_int.
    # Cautions:  _search_wids hardcodes the scaled_int function.
    if not wids:
        return []
    N = float(self.document_count())  # total # of docs
    try:
        doclen = self._totaldoclen()
    except TypeError:
        # _totaldoclen has not yet been upgraded
        doclen = self._totaldoclen
    meandoclen = doclen / N
    K1 = self.K1
    B = self.B
    K1_plus1 = K1 + 1.0
    B_from1 = 1.0 - B

    #                           f(D, t) * (k1 + 1)
    #   TF(D, t) =  -------------------------------------------
    #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

    L = []
    docid2len = self._docweight
    for t in wids:
        d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
        idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
        result = IIBucket()
        # inner score loop, was implemented in C before
        idf *= 1024.0  # float out part of the scaled_int computation
        for docid, f in d2f.items():
            lenweight = B_from1 + B * docid2len[docid] / meandoclen
            tf = f * K1_plus1 / (f + K1 * lenweight)
            result[docid] = int(tf * idf + 0.5)
        L.append((result, 1))
    return L
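# Hedged worked example (not from the original module): the Okapi TF formula
# from the comment above, evaluated once with made-up numbers so the scaling
# is easy to follow.  K1=1.2 and B=0.75 are the usual Okapi BM25 defaults; the
# document is assumed to be 80 words long against a mean length of 100.
def _okapi_tf_example():
    K1, B = 1.2, 0.75
    f = 3                                       # term occurs 3 times in the document
    lenweight = (1.0 - B) + B * 80.0 / 100.0    # (1-b) + b*len(D)/E(len(D)) = 0.85
    tf = f * (K1 + 1.0) / (f + K1 * lenweight)  # 6.6 / 4.02, about 1.64
    return tf
# In _search_wids this TF is multiplied by the (pre-scaled) IDF and rounded to
# an int before being stored in the per-term IIBucket.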
def apply_index(self, index, value):
    """ Apply index according to the portal type mapping """
    index_id = index.getId()
    if index_id != 'portal_type':
        return self._apply_index(index, value)

    if value not in self.context.objectIds():
        return self._apply_index(index, value)

    facet = self.context._getOb(value)
    rset = IIBucket()

    ptype = getattr(facet, 'search_type', None)
    if ptype:
        rset = self._apply_index(index, ptype)
        if rset:
            rset = IISet(rset[0])

    index = self.catalog._catalog.getIndex('object_provides')
    if not index:
        return rset, (index_id, )

    interface = getattr(facet, 'search_interface', None)
    if not interface:
        return rset, (index_id, )

    oset = self._apply_index(index, interface)
    if not oset:
        return rset, (index_id, )

    oset = IISet(oset[0])
    if not rset:
        return oset, (index_id, )

    rset = weightedIntersection(rset, oset)[1]
    return rset, (index_id, )
def index_object(self, documentId, obj, threshold=None):
    """ Index an object:

    'documentId' is the integer id of the document

    'obj' is the object to be indexed

    'threshold' is the number of words to process between committing
    subtransactions.  If 'None', subtransactions are disabled.
    """
    # sniff the object for our 'id', the 'document source' of the
    # index is this attribute.  If it smells callable, call it.
    try:
        source = getattr(obj, self.id)
        if safe_callable(source):
            source = source()
        if not isinstance(source, UnicodeType):
            source = str(source)
    except (AttributeError, TypeError):
        return 0

    # sniff the object for 'id'+'_encoding'
    try:
        encoding = getattr(obj, self.id + '_encoding')
        if safe_callable(encoding):
            encoding = str(encoding())
        else:
            encoding = str(encoding)
    except (AttributeError, TypeError):
        encoding = 'latin1'

    lexicon = self.getLexicon()
    splitter = lexicon.Splitter

    wordScores = OIBTree()
    last = None

    # Run through the words and score them
    for word in list(splitter(source, encoding=encoding)):
        if word[0] == '\"':
            last = self._subindex(word[1:-1], wordScores, last, splitter)
        else:
            if word == last:
                continue
            last = word
            wordScores[word] = wordScores.get(word, 0) + 1

    # Convert scores to use wids:
    widScores = IIBucket()
    getWid = lexicon.getWordId
    for word, score in wordScores.items():
        widScores[getWid(word)] = score
    del wordScores

    currentWids = IISet(self._unindex.get(documentId, []))

    # Get rid of document words that are no longer indexed
    self.unindex_objectWids(documentId, difference(currentWids, widScores))

    # Now index the words. Note that the new xIBTrees are clever
    # enough to do nothing when there isn't a change. Woo hoo.
    insert = self.insertForwardIndexEntry
    for wid, score in widScores.items():
        insert(wid, documentId, score)

    # Save the unindexing info if it's changed:
    wids = widScores.keys()
    if wids != currentWids.keys():
        self._unindex[documentId] = wids

    return len(wids)
def setUp(self):
    self.t = IIBucket()