コード例 #1
0
    def testMany(self):
        # Exercise NBest with ascending, descending and shuffled input and
        # check that, for several capacities, exactly the highest-scoring
        # pairs are retained.
        import random

        # Each pair is (item, score) == (-i, i): a larger i is a better score.
        ascending = [(-i, i) for i in range(50)]
        descending = ascending[::-1]

        # Capacities exercised: 1, 6, 11, ..., 46.
        for n in range(1, len(ascending) + 1, 5):
            # The n best pairs, listed best first.
            expected = ascending[-n:][::-1]

            shuffled = list(ascending)
            random.shuffle(shuffled)

            def check(nbest):
                # Size, capacity and contents must all reflect n.
                self.assertEqual(len(nbest), n)
                self.assertEqual(nbest.capacity(), n)
                self.assertEqual(nbest.getbest(), expected)

            for source in (ascending, descending, shuffled):
                # Feed the pairs one by one.
                nb = NBest(n)
                for item, score in source:
                    nb.add(item, score)
                check(nb)

                # Feed the whole sequence in a single call.
                nb = NBest(n)
                nb.addmany(source)
                check(nb)

                # Draining yields the pairs from last to first of `expected`,
                # and popping from the emptied collection must raise.
                for wanted in reversed(expected):
                    self.assertEqual(nb.pop_smallest(), wanted)
                self.assertRaises(IndexError, nb.pop_smallest)
コード例 #2
0
ファイル: testNBest.py プロジェクト: wpjunior/proled
    def testMany(self):
        # Verify that NBest keeps the top-scoring pairs regardless of the
        # order in which they arrive, for a range of capacities.
        import random

        # (item, score) pairs of the form (-i, i), in increasing score order.
        pairs = [(-i, i) for i in range(50)]
        backwards = list(reversed(pairs))

        # range(1, 51, 5) gives the capacities 1, 6, 11, ..., 46.
        for n in range(1, len(pairs) + 1, 5):
            # Highest score first: the last n pairs, flipped.
            expected = list(reversed(pairs[-n:]))

            scrambled = pairs[:]
            random.shuffle(scrambled)

            for source in (pairs, backwards, scrambled):
                # Incremental insertion.
                one_by_one = NBest(n)
                for item, score in source:
                    one_by_one.add(item, score)
                self.assertEqual(len(one_by_one), n)
                self.assertEqual(one_by_one.capacity(), n)
                self.assertEqual(one_by_one.getbest(), expected)

                # Bulk insertion.
                bulk = NBest(n)
                bulk.addmany(source)
                self.assertEqual(len(bulk), n)
                self.assertEqual(bulk.capacity(), n)
                self.assertEqual(bulk.getbest(), expected)

                # Pop everything: results come back from the tail of
                # `expected` towards its head; one more pop must fail.
                for wanted in expected[::-1]:
                    self.assertEqual(bulk.pop_smallest(), wanted)
                self.assertRaises(IndexError, bulk.pop_smallest)
コード例 #3
0
ファイル: SetOps.py プロジェクト: OS2World/APP-SERVER-Zope
def mass_weightedUnion(L):
    """Combine a list of (mapping, weight) pairs into one weighted union.

    Fewer than two pairs are delegated to ``_trivial``.  Otherwise the two
    shortest mappings (by ``len``) are repeatedly merged with
    ``weightedUnion`` and the merged mapping is queued again with weight 1,
    until a single mapping remains; that mapping is returned.
    """
    if len(L) < 2:
        return _trivial(L)

    # Queue every input keyed by its size so the smallest come out first.
    queue = NBest(len(L))
    for mapping, weight in L:
        queue.add((mapping, weight), len(mapping))

    while len(queue) > 1:
        # Take the two smallest entries still pending.
        (left, left_weight), _size = queue.pop_smallest()
        (right, right_weight), _size = queue.pop_smallest()
        _unused, combined = weightedUnion(left, right,
                                          left_weight, right_weight)
        # Re-queue the merged mapping, keyed by its new size.
        queue.add((combined, 1), len(combined))

    (result, _weight), _size = queue.pop_smallest()
    return result
コード例 #4
0
ファイル: SetOps.py プロジェクト: icemac/Products.ZCatalog
def mass_weightedUnion(l_):
    """Return the weighted union of a list of (mapping, weight) pairs.

    Lists with fewer than two pairs are handed to ``_trivial``.  For longer
    lists the mappings are merged pairwise with ``weightedUnion``, always
    picking the two with the smallest ``len`` first; each intermediate
    result goes back into the queue with weight 1.  The last remaining
    mapping is returned.
    """
    if len(l_) < 2:
        return _trivial(l_)

    # Size-ordered work queue holding every (mapping, weight) pair.
    pending = NBest(len(l_))
    for mapping, weight in l_:
        pending.add((mapping, weight), len(mapping))

    while len(pending) > 1:
        # Pop the two smallest entries and merge them.
        (first, first_weight), _ignored = pending.pop_smallest()
        (second, second_weight), _ignored = pending.pop_smallest()
        _ignored, merged = weightedUnion(first, second,
                                         first_weight, second_weight)
        # The merged mapping competes again, ranked by its own size.
        pending.add((merged, 1), len(merged))

    (result, _weight), _ignored = pending.pop_smallest()
    return result
コード例 #5
0
    def testOne(self):
        # A capacity-1 NBest must always hold only the best-scoring pair.
        nb = NBest(1)

        def check_holds(best):
            # Exactly one entry, capacity unchanged, and it is `best`.
            self.assertEqual(len(nb), 1)
            self.assertEqual(nb.capacity(), 1)
            self.assertEqual(nb.getbest(), [best])

        # The first pair is accepted unconditionally.
        nb.add('a', 0)
        self.assertEqual(nb.getbest(), [('a', 0)])

        # A higher score replaces the current entry.
        nb.add('b', 1)
        check_holds(('b', 1))

        # A lower score leaves the current entry in place.
        nb.add('c', -1)
        check_holds(('b', 1))

        # Bulk insertion keeps only the highest score of the batch.
        nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        check_holds(('f', 5))
コード例 #6
0
ファイル: testNBest.py プロジェクト: wpjunior/proled
    def testOne(self):
        # With room for a single entry, NBest must track the top score only.
        nb = NBest(1)
        nb.add('a', 0)
        self.assertEqual(nb.getbest(), [('a', 0)])

        # Each step: (pair to add, pair expected to be held afterwards).
        # 'b' beats 'a'; 'c' scores lower and must not displace 'b'.
        steps = (
            (('b', 1), ('b', 1)),
            (('c', -1), ('b', 1)),
        )
        for (item, score), winner in steps:
            nb.add(item, score)
            self.assertEqual(len(nb), 1)
            self.assertEqual(nb.capacity(), 1)
            self.assertEqual(nb.getbest(), [winner])

        # Adding several pairs at once keeps the best of them.
        batch = [('d', 3), ('e', -6), ('f', 5), ('g', 4)]
        nb.addmany(batch)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('f', 5)])
コード例 #7
0
ファイル: ResultSet.py プロジェクト: eaudeweb/naaya
    def cosine_ranking(self, index, hits=250):
        """Score the found documents against the query terms; keep the best.

        For every query term an inverse document frequency is computed as
        ``log(1 + N / tf)``, where ``N`` is ``len(index)`` and ``tf`` is the
        number of document ids stored for that term.  Every document from
        ``self.docIds()`` is then scored by summing, over the query terms it
        contains, the product of the query term weight and the document term
        weight.  The sum is divided by the number of query terms and scaled
        to an integer.

        Args:
            index: object supporting ``len()``, ``getLexicon()`` and
                ``getStorage()``.
            hits: capacity handed to ``NBest`` for collecting the scored
                documents.

        Side effects:
            Replaces ``self._result`` with a new ``IIBTree`` mapping
            docid -> integer score, filled from ``nbest.getbest()``.
        """

        idf = {}                # mapping term -> inverse document frequency
        cache = {}              # mapping term -> found docids
        wid_cache = {}          # mapping term -> wid
        N = len(index)          # length of collection
        nbest = NBest(hits)

        # Loop-invariant lookups, fetched once instead of per iteration.
        words = self.words()
        terms = list(words.keys())
        num_terms = len(terms)
        storage = index.getStorage()
        get_word_id = index.getLexicon().getWordId
        get_docids = storage.getDocumentIdsForWordId
        get_frequency = storage.getWordFrequency

        for term in terms:
            wid_cache[term] = wid = get_word_id(term)
            docids = get_docids(wid)
            cache[term] = docids

            # term frequency = number of documents a term appears in
            tf = len(docids)

            # calc and store the inverse document frequency given as
            # log(1+N/TF)
            # NOTE(review): under Python 2, `N / tf` floors when both operands
            # are ints -- confirm whether true division is intended.
            if tf == 0:
                idf[term] = 0
            else:
                idf[term] = log(1.0 + N / tf)

        # query term frequency and query max frequency are set
        # to 1 by default
        qtf = qmf = 1

        for docid in self.docIds():   # iterate over all found documents

            rank = 0.0                # ranking
            total_dwt = 0.0           # document weight

            for term in terms:
                if docid not in cache[term]:
                    continue

                # document term frequency = the number of times a term
                # appears within a particular document
                try:
                    dtf = get_frequency(docid, wid_cache[term])
                except KeyError:
                    continue

                # document term weight = the weight of a term within a
                # document
                dtw = (1.0 + log(dtf)) * idf[term]

                # query term weight = the weight given to each term in the
                # query
                qtw = (0.5 + (0.5 * qtf / qmf)) * idf[term] * words[term]

                # add this stuff to the ranking
                rank += (qtw * dtw)
                total_dwt += (dtw * dtw)

            total_dwt = sqrt(total_dwt)
            if total_dwt == 0:
                rank = 0
            else:
                # Normalizing by the document weight (rank / total_dwt) is
                # disabled; the rank is averaged over the query terms
                # instead.
                rank = rank / num_terms
                rank = int(rank * 1000 + 0.5)   # scale rank to be an integer

            nbest.add(docid, rank)

        self._result = IIBTree()
        for docid, score in nbest.getbest():
            self._result[docid] = score