コード例 #1
0
 def testCori(self):
     """Test combining with CORI relevance ranking.

     Each fixture ResultSet (``self.a`` then ``self.b``) is combined on its
     own under a clause that requests the CORI algorithm.  The weights on
     the combined ResultSet are then compared, in order, with scores
     recomputed here from the fake Database and RecordStore.
     """
     # combine() is driven by a parsed clause carrying the relevance modifier
     query = cqlparse('my.index all/rel.algorithm=cori "foo bar"')
     query.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
     # Supplies totalItems and meanWordCount for the score calculation
     database = FakeDatabase(self.session, None)
     # Supplies the per-record 'wordCount' metadata for the score calculation
     store = FakeRecordStore(self.session, None)

     def expectedWeight(item, I):
         # Recompute the score for one item; I is the ResultSet-wide factor
         size = store.fetch_recordMetadata(self.session,
                                           item.id,
                                           'wordCount')
         T = (item.occurences /
              (item.occurences + 50.0 +
               ((150.0 * size) / database.meanWordCount))
              )
         return 0.4 + (0.6 * T * I)

     # (ResultSet to combine, items expected in the result, in order)
     scenarios = (
         (self.a, (self.rsi1, self.rsi3)),
         (self.b, (self.rsi2, self.rsi4)),
     )
     for source, expectedItems in scenarios:
         # Start from a fresh, empty ResultSet each time
         combined = SimpleResultSet(self.session)
         combined = combined.combine(self.session, [source], query, database)
         self.assertEqual(len(combined), 2)
         for item in combined:
             # Every item must carry a score ...
             self.assertTrue(hasattr(item, 'weight'))
             # ... and its scaled score must not exceed 1
             self.assertLessEqual(item.scaledWeight, 1.0)
         # Scores must be correct and in the expected order
         matches = len(source)
         I = (math.log((database.totalItems + 0.5) / matches) /
              math.log(database.totalItems + 1.0))
         expected = [expectedWeight(item, I) for item in expectedItems]
         self.assertListEqual([item.weight for item in combined], expected)
コード例 #2
0
 def testCombineSumWeights(self):
     """Test combining ResultSet scores by summation.

     ``self.a`` and ``self.b`` are combined with ``all`` under a TF-IDF
     clause that asks for the scores to be summed.  The single surviving
     item must carry the sum of the two per-ResultSet scores.
     """
     # A clause / boolean is required to combine ResultSets
     # Use TF-IDF because it's most simple to calculate
     clause = cqlparse('my.index '
                       'all/rel.algorithm=tfidf/rel.combine=sum '
                       '"foo bar"')

     clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
     # A Database is required for relevance ranking
     db = FakeDatabase(self.session, None, parent=None)
     # Create a new ResultSet to combine into
     rs = SimpleResultSet(self.session)
     rs = rs.combine(self.session, [self.a, self.b], clause, db)
     # Check return value is a Resultset
     self.assertIsInstance(rs, SimpleResultSet)
     # Check merged ResultSet has 1 item
     self.assertEqual(len(rs), 1)
     # Check that merged ResultSet contains the correct item
     self.assertIn(self.rsi1, rs)
     for rsi in rs:
         # Check that each ResultSetItem has a score (weight)
         self.assertTrue(hasattr(rsi, 'weight'))
         # Check that each ResultSetItem has a scaled score less than 1
         self.assertLessEqual(rsi.scaledWeight, 1.0)
     # Check combined scores correct.
     # Each term is scored against the match count of the ResultSet it
     # came from, so the self.a term uses len(self.a) and the self.b term
     # uses len(self.b) (previously both used len(self.b)).
     # NOTE(review): 5 and 3 are presumably the occurrence counts of the
     # shared item in self.a and self.b - confirm against the fixtures.
     self.assertEqual(rs[0].weight,
                      sum([5 * math.log(db.totalItems / len(self.a)),
                           3 * math.log(db.totalItems / len(self.b))
                           ]
                          )
                      )
コード例 #3
0
ファイル: database.py プロジェクト: Cheshire-Grampa/cheshire3
    def _search(self, session, query):
        """Recursively evaluate a parsed query and return a ResultSet.

        A ``query`` without a ``leftOperand`` attribute is handled as a
        leaf: either a reference to a stored ResultSet (when
        ``query.getResultSetId()`` is truthy) or a clause that is resolved
        to an index through the protocol map and searched.  Any other
        ``query`` is handled as a boolean node: both operands are evaluated
        recursively and the two ResultSets are combined.

        Args:
            session: passed through to every object this method calls.
            query: parsed query node (leaf clause or boolean triple).

        Returns:
            The ResultSet for ``query``.

        Raises:
            ObjectDoesNotExistException: the clause's index cannot be
                resolved by the protocol map.
            QueryException: a ``prox`` boolean has a BitmapResultSet as its
                left operand.
        """
        if not hasattr(query, 'leftOperand'):
            # Leaf node: stored ResultSet reference or a single clause
            rsid = query.getResultSetId()
            if rsid:
                # Fetch an existing ResultSet; an id of the form
                # "<storeId>/<resultSetId>" names the store explicitly
                if '/' in rsid:
                    (rssid, rsid) = rsid.split('/', 1)
                    rss = self.get_object(session, rssid)
                else:
                    rss = self.get_object(session, "defaultResultSetStore")
                rset = rss.fetch_resultSet(session, rsid)
                rset.fromStore = 1
                return rset

            pm = self.get_path(session, 'protocolMap')
            if not pm:
                # No protocol map configured on the path: fall back to the
                # cached SRW map and remember it for subsequent calls
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            idx = pm.resolveIndex(session, query)
            if idx is None:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
            # config is only attached for the duration of the index search
            query.config = pm
            rs = idx.search(session, query, self)
            query.config = None
            rs.query = query
            return rs

        # Boolean node: evaluate both sides, then combine
        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [], recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right, but rset assumes list[0] is
            # same type
            new = right.__class__(session, [], recordStore=right.recordStore)
            if query.boolean.value == 'prox':
                # bitmaps can't do prox, so just raise
                raise QueryException("Cannot use Prox with %s" % left.index.toCQL(), 18)
            elif query.boolean.value == 'not':
                # can't reorder without changing query
                return new.combine(session, [left, right], query, self)
            else:
                return new.combine(session, [right, left], query, self)
            # NOTE(review): unlike the common path below, these early
            # returns do not set .query on the result - confirm intended.
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [], recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        # Record the query that produced the combined set, built from the
        # queries attached to the operand ResultSets
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs
コード例 #4
0
 def testTfidf(self):
     """Test combining with TF-IDF relevance ranking.

     Each fixture ResultSet (``self.a`` then ``self.b``) is combined on its
     own under a TF-IDF clause.  The resulting weights must equal, in
     order, the given factors multiplied by
     ``log(totalItems / matches)``.
     """
     # combine() is driven by a parsed clause carrying the relevance modifier
     query = cqlparse('my.index all/rel.algorithm=tfidf "foo bar"')
     query.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
     # Supplies totalItems for the score calculation
     database = FakeDatabase(self.session, None, parent=None)
     # (ResultSet to combine, expected per-item factors, in result order)
     scenarios = (
         (self.a, (5, 1)),
         (self.b, (3, 2)),
     )
     for source, factors in scenarios:
         # Start from a fresh, empty ResultSet each time
         combined = SimpleResultSet(self.session)
         combined = combined.combine(self.session, [source], query, database)
         self.assertEqual(len(combined), 2)
         for item in combined:
             # Every item must carry a score ...
             self.assertTrue(hasattr(item, 'weight'))
             # ... and its scaled score must not exceed 1
             self.assertLessEqual(item.scaledWeight, 1.0)
         # Scores must be correct and in the expected order
         matches = len(source)
         expected = [factor * math.log(database.totalItems / matches)
                     for factor in factors]
         self.assertListEqual([item.weight for item in combined], expected)
コード例 #5
0
 def testCombineAll(self):
     """Test combining ResultSets with 'all'

     Combining ``self.a`` and ``self.b`` under an ``all`` clause must give
     a SimpleResultSet holding exactly one item, ``self.rsi1``.
     """
     # combine() needs a parsed clause to tell it how to merge
     query = cqlparse('my.index all "foo"')
     # Merge both fixtures into a fresh, empty ResultSet
     target = SimpleResultSet(self.session)
     merged = target.combine(self.session, [self.a, self.b], query)
     # The result must be a ResultSet of the expected type ...
     self.assertIsInstance(merged, SimpleResultSet)
     # ... with a single member ...
     self.assertEqual(len(merged), 1)
     # ... which is the expected item
     self.assertIn(self.rsi1, merged)
コード例 #6
0
 def testCombineAny(self):
     """Test combining ResultSets with 'any'

     Combining ``self.a`` and ``self.b`` under an ``any`` clause must give
     a SimpleResultSet in which all four fixture items are found, but
     whose length is 3.
     """
     # combine() needs a parsed clause to tell it how to merge
     query = cqlparse('my.index any "foo"')
     # Merge both fixtures into a fresh, empty ResultSet
     target = SimpleResultSet(self.session)
     merged = target.combine(self.session, [self.a, self.b], query)
     # The result must be a ResultSet of the expected type
     self.assertIsInstance(merged, SimpleResultSet)
     # Every fixture item must be found in the merged ResultSet
     for item in (self.rsi1, self.rsi2, self.rsi3, self.rsi4):
         self.assertIn(item, merged)
     # Only 3 members, because rsi1 and rsi2 are identical
     self.assertEqual(len(merged), 3)
コード例 #7
0
    def testOkapi(self):
        """Test combining with OKAPI BM-25 relevance ranking.

        Each fixture ResultSet (``self.a`` then ``self.b``) is combined on
        its own under a clause requesting the OKAPI algorithm with explicit
        constants.  The resulting weights are compared, in order, with
        scores recomputed here from the fake Database and RecordStore.
        """
        # Constants passed to the algorithm as rel.const0..2
        b, k1, k3 = [0.75, 1.5, 1.5]
        # A clause / boolean is required to combine ResultSets
        clause = cqlparse('my.index all/rel.algorithm=okapi/'
                          'rel.const0={0}/'
                          'rel.const1={1}/'
                          'rel.const2={2}'
                          ' "foo bar"'.format(b, k1, k3))
        clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
        # A Database is required for relevance ranking
        db = FakeDatabase(self.session, None)
        # A RecordStore is required for OKAPI BM-25 score calculation
        # (it supplies the per-record 'wordCount')
        recStore = FakeRecordStore(self.session, None)
        # Query term weight; the literal 1 matches the queryFreq set below
        qtw = ((k3 + 1) * 1) / (k3 + 1)
        # (ResultSet to combine, items expected in the result, in order)
        scenarios = (
            (self.a, (self.rsi1, self.rsi3)),
            (self.b, (self.rsi2, self.rsi4)),
        )
        for source, expectedItems in scenarios:
            # Create a new ResultSet to combine into
            rs = SimpleResultSet(self.session)
            # Set ResultSet queryFrequency - required for OKAPI BM-25
            source.queryFreq = 1
            rs = rs.combine(self.session, [source], clause, db)
            self.assertEqual(len(rs), 2)
            for rsi in rs:
                # Check that each ResultSetItem has a score (weight)
                self.assertTrue(hasattr(rsi, 'weight'))
                # Check that each ResultSetItem has a scaled score less
                # than 1
                self.assertLessEqual(rsi.scaledWeight, 1.0)
            # Check scores are correct and in order.
            # matches must come from the ResultSet actually combined; the
            # self.b half previously used len(self.a) by mistake.
            matches = len(source)
            idf = math.log(db.totalItems / matches)
            expectedScores = []
            for rsi in expectedItems:
                size = recStore.fetch_recordMetadata(self.session,
                                                     rsi.id,
                                                     'wordCount')
                T = (((k1 + 1) * rsi.occurences) /
                     ((k1 * ((1 - b) + b *
                             (size / db.meanWordCount)
                             )
                       ) +
                      rsi.occurences)
                     )
                expectedScores.append(idf * T * qtw)
            self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
コード例 #8
0
ファイル: oaipmh.py プロジェクト: bloomonkey/archiveshub
    def _listResults(self, metadataPrefix, set_=None, from_=None, until=None):
        """Yield (datestamp, resultSet) tuples.

        Suitable for use by:
            - listIdentifiers
            - listRecords

        This is a generator, so nothing below (including argument
        validation) runs until iteration starts.

        Args:
            metadataPrefix: accepted for interface compatibility; not used
                in this method.
            set_: optional set specifier.  Only values starting with
                ``contributor:`` are recognised; any other non-empty value
                produces no results.
            from_: optional lower datestamp bound; defaults to
                ``self.earliestDatestamp``.
            until: optional upper datestamp bound; defaults to the current
                time.

        Yields:
            ``(datetime, resultSet)`` for each term returned by the
            modification-date index within the requested range.  When
            ``set_`` is given, the resultSet is the date resultSet combined
            with the results of a ``vdb.identifier`` search for that set.

        Raises:
            BadArgumentError: ``until`` is earlier than
                ``self.earliestDatestamp`` or earlier than ``from_``.
        """
        session = self.session
        # Check set value
        if set_:
            if not set_.startswith('contributor:'):
                # Unrecognised set: end the generator with no results.
                # A plain return is required here; raising StopIteration
                # inside a generator is turned into RuntimeError (PEP 479).
                return
            set_ = set_.split(':', 1)[-1]

        if until and until < self.earliestDatestamp:
            raise BadArgumentError('until argument value is earlier than '
                                   'earliestDatestamp.')
        if not from_:
            from_ = self.earliestDatestamp
        if not until:
            until = datetime.datetime.now()
        if (until < from_):
            raise BadArgumentError('until argument value is earlier than from '
                                   'argument value.')
        q = cqlparse('rec.lastModificationDate > "%s" and '
                     'rec.lastModificationDate < "%s"' % (from_, until)
                     )
        # Actually need datestamp values as well as results - interact with
        # indexes directly for efficiency
        # Get CQL ProtocolMap
        pm = self.db.get_path(session, 'protocolMap')
        idx = pm.resolveIndex(session, q.leftOperand)
        q.config = pm
        # Run both bounds through the index's own data sources so they are
        # in the same form as the stored terms
        res = {}
        for src in idx.sources[u'data']:
            res.update(src[1].process(session, [[str(from_)]]))
            res.update(src[1].process(session, [[str(until)]]))
        from_ = min(res)
        until = max(res)
        # Tweak until value to make it inclusive
        until = until[:-1] + chr(ord(until[-1]) + 1)
        termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
        # Generate sequence of datestamp, resultSet tuples
        for t in termList:
            # Terms are tried as 'T'-separated first, then space-separated
            try:
                datetime_obj = datetime.datetime.strptime(
                    t[0],
                    u'%Y-%m-%dT%H:%M:%S'
                )
            except ValueError:
                datetime_obj = datetime.datetime.strptime(
                    t[0],
                    u'%Y-%m-%d %H:%M:%S'
                )
            datetime_rs = idx.construct_resultSet(session, t[1])
            if not set_:
                yield (datetime_obj, datetime_rs)
            else:
                # Filter by set
                set_q = cqlparse('vdb.identifier = {0}'.format(set_))
                set_rs = self.db.search(session, set_q)
                full_rs = SimpleResultSet(session)
                full_q = cqlparse('{0} and {1}'
                                  ''.format(q.toCQL(), set_q.toCQL()))
                yield (datetime_obj, full_rs.combine(session,
                                                     [datetime_rs, set_rs],
                                                     full_q
                                                     )
                       )