Example #1
    def testPairs(self):
        t1 = IIBTree([(1, 10), (3, 30), (7, 70)])
        t2 = IIBTree([(3, 30), (5, 50), (7, 7), (9, 90)])
        allkeys = [1, 3, 5, 7, 9]
        b1 = IIBucket(t1)
        b2 = IIBucket(t2)
        for x in t1, t2, b1, b2:
            for key in x.keys():
                self.assertEqual(key in allkeys, 1)
            for y in t1, t2, b1, b2:
                for w1, w2 in (0, 0), (1, 10), (10, 1), (2, 3):
                    # Test the union.
                    expected = []
                    for key in allkeys:
                        if x.has_key(key) or y.has_key(key):
                            result = x.get(key, 0) * w1 + y.get(key, 0) * w2
                            expected.append((key, result))
                    expected.sort()
                    got = mass_weightedUnion([(x, w1), (y, w2)])
                    self.assertEqual(expected, list(got.items()))
                    got = mass_weightedUnion([(y, w2), (x, w1)])
                    self.assertEqual(expected, list(got.items()))

                    # Test the intersection.
                    expected = []
                    for key in allkeys:
                        if x.has_key(key) and y.has_key(key):
                            result = x[key] * w1 + y[key] * w2
                            expected.append((key, result))
                    expected.sort()
                    got = mass_weightedIntersection([(x, w1), (y, w2)])
                    self.assertEqual(expected, list(got.items()))
                    got = mass_weightedIntersection([(y, w2), (x, w1)])
                    self.assertEqual(expected, list(got.items()))
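A minimal sketch (with hypothetical values) of what the test above verifies: mass_weightedUnion takes a list of (mapping, weight) pairs and returns a mapping over the union of all keys, where each value is the weighted sum of the inputs' values. The import path assumes the classic Products.ZCTextIndex location.

from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.SetOps import mass_weightedUnion  # assumed import path

x = IIBucket({1: 10, 3: 30})
y = IIBucket({3: 30, 5: 50})
# key 1: 10*2 = 20; key 3: 30*2 + 30*3 = 150; key 5: 50*3 = 150
got = mass_weightedUnion([(x, 2), (y, 3)])
print(list(got.items()))  # [(1, 20), (3, 150), (5, 150)]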
Example #2
def _trivial(l_):
    # l_ is empty or has only one (mapping, weight) pair. If there is a
    # pair, we may still need to multiply the mapping by its weight.
    assert len(l_) <= 1
    if len(l_) == 0:
        return IIBucket()
    [(result, weight)] = l_
    if weight != 1:
        dummy, result = weightedUnion(IIBucket(), result, 0, weight)
    return result
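A small sketch of the weighting trick _trivial relies on: weightedUnion with an empty IIBucket returns the other mapping with every value multiplied by its weight (the first element of the returned pair is a combined weight and is ignored here).

from BTrees.IIBTree import IIBucket, weightedUnion

bucket = IIBucket({1: 10, 2: 20})
# weightedUnion returns a (weight, result) pair; the weight is discarded.
dummy, scaled = weightedUnion(IIBucket(), bucket, 0, 3)
print(list(scaled.items()))  # [(1, 30), (2, 60)]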
Example #3
    def histogram(self, type=type, TupleType=type(())):
        """Return a mapping which provides a histogram of the number of
        elements found at each point in the index."""

        histogram = IIBucket()
        for (key, value) in self._index.items():
            if type(value) is TupleType: entry = 1
            else: entry = len(value)
            histogram[entry] = histogram.get(entry, 0) + 1

        return histogram
Example #4
    def histogram(self, type=type, TupleType=type(())):
        """Return a mapping which provides a histogram of the number of
        elements found at each point in the index."""

        histogram = IIBucket()
        for (key, value) in self._index.items():
            if type(value) is TupleType: entry = 1
            else: entry = len(value)
            histogram[entry] = histogram.get(entry, 0) + 1

        return histogram
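A hedged sketch of what the histogram reports, using an illustrative stand-in for self._index (a mapping from key to either a tuple, counted as one element, or a sequence of document ids): the result maps "number of elements at a point" to "how many points have that count".

from BTrees.IIBTree import IIBucket

index = {'a': (1, 2), 'b': [1, 2, 3], 'c': [4]}   # hypothetical index contents
histogram = IIBucket()
for value in index.values():
    entry = 1 if isinstance(value, tuple) else len(value)
    histogram[entry] = histogram.get(entry, 0) + 1
print(list(histogram.items()))  # [(1, 2), (3, 1)]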
Example #5
    def _apply_index(self, index, value):
        """ Default portal_catalog index _apply_index
        """
        index_id = index.getId()

        apply_index = getattr(index, '_apply_index', None)
        if not apply_index:
            return IIBucket(), (index_id, )

        rset = apply_index({index_id: value})
        if not rset:
            return IIBucket(), (index_id, )

        return rset
Example #6
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        #K1 = self.K1
        #B = self.B
        #K1_plus1 = K1 + 1.0
        #B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            score(result, d2f.items(), docid2len, idf, meandoclen)
            L.append((result, 1))
        return L
Example #7
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L
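A hedged arithmetic check of the Okapi TF formula shown in the comment above, using illustrative values (K1=1.2 and B=0.75 are common BM25 defaults, not necessarily this index's settings):

K1, B = 1.2, 0.75                                 # assumed example parameters
f, doclen, meandoclen = 3.0, 120.0, 100.0         # f(D, t), len(D), E(len(D))
lenweight = (1.0 - B) + B * doclen / meandoclen   # 0.25 + 0.9 = 1.15
tf = f * (K1 + 1.0) / (f + K1 * lenweight)        # 6.6 / 4.38
print(round(tf, 3))                               # 1.507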
Example #8
    def near(self, x):
        result = IIBucket()
        dict = self._dict
        xdict = x._dict
        xhas = xdict.has_key
        positions = self._index.positions
        for id, score in dict.items():
            if not xhas(id): continue
            p = (map(lambda i: (i, 0), positions(id, self._words)) +
                 map(lambda i: (i, 1), positions(id, x._words)))
            p.sort()
            d = lp = 9999
            li = None
            lsrc = None
            for i, src in p:
                if i is not li and src is not lsrc and li is not None:
                    d = min(d, i - li)
                li = i
                lsrc = src
            if d == lp: score = min(score, xdict[id])  # synonyms
            else: score = (score + xdict[id]) / d
            result[id] = score

        return self.__class__(result, union(self._words, x._words),
                              self._index)
Example #9
    def _apply_index(self, index, value):
        """ Default portal_catalog index _apply_index
        """
        index_id = index.getId()

        apply_index = getattr(index, '_apply_index', None)
        if not apply_index:
            return IIBucket(), (index_id,)

        if isinstance(value, unicode):
            value = value.encode('utf-8', 'replace')
        rset = apply_index({index_id: value})

        if not rset:
            return IIBucket(), (index_id,)

        return rset
Example #10
 def apply_index(self, index, value):
     """ Custom catalog apply_index method
     """
     ctool = getToolByName(self, 'portal_catalog')
     catalog = queryMultiAdapter((self, ctool), IFacetedCatalog)
     if not catalog:
         return IIBucket(), (index.getId(), )
     return catalog.apply_index(index, value)
Example #11
 def testIdentity(self):
     t = IIBTree([(1, 2)])
     b = IIBucket([(1, 2)])
     for x in t, b:
         for func in mass_weightedUnion, mass_weightedIntersection:
             result = func([(x, 1)])
             self.assertEqual(len(result), 1)
             self.assertEqual(list(result.items()), list(x.items()))
Example #12
 def search(self, term):
     b = IIBucket()
     if term == "foo":
         b[1] = b[3] = 1
     elif term == "bar":
         b[1] = b[2] = 1
     elif term == "ham":
         b[1] = b[2] = b[3] = b[4] = 1
     return b
Example #13
    def __init__(self, d, words, index, TupleType=type(())):
        self._index = index

        if type(words) is not OOSet: words = OOSet(words)
        self._words = words

        if (type(d) is TupleType):
            d = IIBucket((d, ))
        elif type(d) is not IIBucket:
            d = IIBucket(d)

        self._dict = d
        self.__getitem__ = d.__getitem__
        try:
            self.__nonzero__ = d.__nonzero__
        except:
            pass
        self.get = d.get
Example #14
 def items(self):
     d = IIBucket()
     if self.ranked_results:
         max = self.ranked_results[0][1]
         for k, v in self.ranked_results:
             if max == 0:
                 d[k] = 0
             else:
                 d[k] = int(v / max * 1024.0)
     return d
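A hedged sketch of the same normalization: scores are mapped into the 0..1024 integer range suitable for IIBucket values, with the top-ranked item (assumed to be first in ranked_results, as the method above relies on) pinned to 1024.

ranked_results = [(7, 50.0), (3, 25.0), (9, 10.0)]   # hypothetical (docid, score) pairs
top = ranked_results[0][1]
scaled = {k: int(v / top * 1024.0) for k, v in ranked_results}
print(scaled)  # {7: 1024, 3: 512, 9: 204}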
Example #15
    def _apply_index(self, request, cid=''):
        """ Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """

        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None: return None

        # Changed for 2.4
        # We use the default operator that can be managed via the ZMI

        qop = record.get('operator', self.useOperator)

        # We keep this for pre-2.4 compatibility
        # This stinking code should go away somewhere. A global
        # textindex_operator makes no sense when using multiple
        # text indexes inside a catalog. An index operator should
        # be specified on a per-index basis.

        if request.has_key('textindex_operator'):
            qop = request['textindex_operator']
            warnings.warn("The usage of the 'textindex_operator' "
                          "is no longer recommended.\n"
                          "Please use a mapping object and the "
                          "'operator' key to specify the operator.")

        query_operator = operator_dict.get(qop)
        if query_operator is None:
            raise exceptions.RuntimeError, ("Invalid operator '%s' "
                                            "for a TextIndex" % escape(qop))
        r = None

        for key in record.keys:
            key = key.strip()
            if not key:
                continue

            b = self.query(key, query_operator).bucket()
            w, r = weightedIntersection(r, b)

        if r is not None:
            return r, (self.id, )

        return (IIBucket(), (self.id, ))
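A minimal sketch of the intersection loop above: weightedIntersection accepts None as its first argument, so the running result can start as None and is narrowed one term at a time, with the values of shared keys summed.

from BTrees.IIBTree import IIBucket, weightedIntersection

buckets = [IIBucket({1: 2, 2: 3, 3: 4}), IIBucket({2: 5, 3: 1})]
r = None
for b in buckets:
    w, r = weightedIntersection(r, b)
print(list(r.items()))  # [(2, 8), (3, 5)]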
Example #16
 def testScalarMultiply(self):
     t = IIBTree([(1, 2), (2, 3), (3, 4)])
     allkeys = [1, 2, 3]
     b = IIBucket(t)
     for x in t, b:
         self.assertEqual(list(x.keys()), allkeys)
         for func in mass_weightedUnion, mass_weightedIntersection:
             for factor in 0, 1, 5, 10:
                 result = func([(x, factor)])
                 self.assertEqual(allkeys, list(result.keys()))
                 for key in x.keys():
                     self.assertEqual(x[key] * factor, result[key])
Example #17
 def _search_wids(self, wids):
     if not wids:
         return []
     N = float(self.document_count())
     L = []
     DictType = type({})
     for wid in wids:
         assert wid in self._wordinfo  # caller responsible for OOV
         d2w = self._wordinfo[wid]  # maps docid to w(docid, wid)
         idf = inverse_doc_frequency(len(d2w), N)  # an unscaled float
         if isinstance(d2w, DictType):
             d2w = IIBucket(d2w)
         L.append((d2w, scaled_int(idf)))
     return L
Example #18
    def _search_wids(self, wids):
        # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
        # for each wid t in wids. The IIBucket, times the weight, maps D to
        # TF(D,t) * IDF(t) for every docid D containing t.
        # As currently written, the weights are always 1, and the IIBucket maps
        # D to TF(D,t)*IDF(t) directly, where the product is computed
        # as a float but stored as a scaled_int.
        # Cautions: _search_wids hardcodes the scaled_int function.

        if not wids:
            return []
        N = float(self.document_count())  # total # of docs
        try:
            doclen = self._totaldoclen()
        except TypeError:
            # _totaldoclen has not yet been upgraded
            doclen = self._totaldoclen
        meandoclen = doclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                           f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._docweight
        for t in wids:
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()

            # inner score loop, was implemented in C before
            idf *= 1024.0  # float out part of the scaled_int computation
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = int(tf * idf + 0.5)

            L.append((result, 1))
        return L
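A hedged note on the scaling shortcut in the inner loop above: assuming scaled_int(x) is int(x * 1024 + 0.5) (the scale of 1024 is inferred from the "idf *= 1024.0" line, not taken from the source), folding the factor into idf ahead of the loop leaves only int(tf * idf + 0.5) per document.

def scaled_int_sketch(f, scale=1024.0):   # assumed definition, for illustration only
    return int(f * scale + 0.5)

tf, idf = 1.507, 2.3
# Pre-scaling idf by 1024.0 gives the same integer as scaling the product.
print(scaled_int_sketch(tf * idf), int(tf * (idf * 1024.0) + 0.5))  # both 3549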
Example #19
    def apply_index(self, index, value):
        """ Apply index according with portal type mapping
        """
        index_id = index.getId()
        if index_id != 'portal_type':
            return self._apply_index(index, value)

        if value not in self.context.objectIds():
            return self._apply_index(index, value)

        facet = self.context._getOb(value)

        rset = IIBucket()
        ptype = getattr(facet, 'search_type', None)
        if ptype:
            rset = self._apply_index(index, ptype)
            if rset:
                rset = IISet(rset[0])

        index = self.catalog._catalog.getIndex('object_provides')
        if not index:
            return rset, (index_id, )

        interface = getattr(facet, 'search_interface', None)
        if not interface:
            return rset, (index_id, )

        oset = self._apply_index(index, interface)
        if not oset:
            return rset, (index_id, )

        oset = IISet(oset[0])

        if not rset:
            return oset, (index_id, )

        rset = weightedIntersection(rset, oset)[1]

        return rset, (index_id, )
Example #20
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:
        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        committing subtransactions.  If 'None' subtransactions are
        disabled. """

        # sniff the object for our 'id', the 'document source' of the
        # index is this attribute.  If it smells callable, call it.
        try:
            source = getattr(obj, self.id)
            if safe_callable(source):
                source = source()

            if not isinstance(source, UnicodeType):
                source = str(source)

        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id'+'_encoding'

        try:
            encoding = getattr(obj, self.id+'_encoding')
            if safe_callable(encoding):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'

        lexicon = self.getLexicon()

        splitter = lexicon.Splitter

        wordScores = OIBTree()
        last = None

        # Run through the words and score them

        for word in list(splitter(source,encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else:
                if word==last: continue
                last=word
                wordScores[word]=wordScores.get(word,0)+1

        # Convert scores to use wids:
        widScores=IIBucket()
        getWid=lexicon.getWordId
        for word, score in wordScores.items():
            widScores[getWid(word)]=score

        del wordScores

        currentWids=IISet(self._unindex.get(documentId, []))

        # Get rid of document words that are no longer indexed
        self.unindex_objectWids(documentId, difference(currentWids, widScores))

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change. Woo hoo.
        insert=self.insertForwardIndexEntry
        for wid, score in widScores.items():
            insert(wid, documentId, score)

        # Save the unindexing info if it's changed:
        wids=widScores.keys()
        if wids != currentWids.keys():
            self._unindex[documentId]=wids

        return len(wids)
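A small sketch of the unindexing step above: difference() between the previously indexed wids (an IISet) and the new widScores (an IIBucket) operates on keys, yielding the wids that should be removed from the forward index.

from BTrees.IIBTree import IIBucket, IISet, difference

currentWids = IISet([10, 11, 12])
widScores = IIBucket({11: 2, 13: 1})
stale = difference(currentWids, widScores)   # keys of currentWids not in widScores
print(list(stale))  # [10, 12]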
Example #21
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:
        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        committing subtransactions.  If 'None' subtransactions are
        disabled. """

        # sniff the object for our 'id', the 'document source' of the
        # index is this attribute.  If it smells callable, call it.
        try:
            source = getattr(obj, self.id)
            if safe_callable(source):
                source = source()

            if not isinstance(source, UnicodeType):
                source = str(source)

        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id'+'_encoding'

        try:
            encoding = getattr(obj, self.id + '_encoding')
            if safe_callable(encoding):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'

        lexicon = self.getLexicon()

        splitter = lexicon.Splitter

        wordScores = OIBTree()
        last = None

        # Run through the words and score them

        for word in list(splitter(source, encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else:
                if word == last: continue
                last = word
                wordScores[word] = wordScores.get(word, 0) + 1

        # Convert scores to use wids:
        widScores = IIBucket()
        getWid = lexicon.getWordId
        for word, score in wordScores.items():
            widScores[getWid(word)] = score

        del wordScores

        currentWids = IISet(self._unindex.get(documentId, []))

        # Get rid of document words that are no longer indexed
        self.unindex_objectWids(documentId, difference(currentWids, widScores))

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change. Woo hoo.
        insert = self.insertForwardIndexEntry
        for wid, score in widScores.items():
            insert(wid, documentId, score)

        # Save the unindexing info if it's changed:
        wids = widScores.keys()
        if wids != currentWids.keys():
            self._unindex[documentId] = wids

        return len(wids)
Example #22
 def setUp(self):
     self.t = IIBucket()