def testCori(self):
    "Test combining with CORI relevance ranking."
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all/rel.algorithm=cori "foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None)
    # A RecordStore is required for CORI score calculation
    recStore = FakeRecordStore(self.session, None)
    # Exercise self.a then self.b; each pairs a source set with its
    # ResultSetItems in expected rank order
    cases = [(self.a, (self.rsi1, self.rsi3)),
             (self.b, (self.rsi2, self.rsi4))]
    for source, rankedItems in cases:
        # Create a new ResultSet to combine into
        combined = SimpleResultSet(self.session)
        combined = combined.combine(self.session, [source], clause, db)
        self.assertEqual(len(combined), 2)
        for rsi in combined:
            # Each ResultSetItem must carry a score (weight)...
            self.assertTrue(hasattr(rsi, 'weight'))
            # ...and a scaled score no greater than 1
            self.assertLessEqual(rsi.scaledWeight, 1.0)
        # Check scores are correct and in order
        matches = len(source)
        # I is the per-query component of every item's CORI score
        I = (math.log((db.totalItems + 0.5) / matches) /
             math.log(db.totalItems + 1.0))
        expectedScores = []
        for rsi in rankedItems:
            size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                                 'wordCount')
            # T is the per-document component (term frequency vs length)
            T = (rsi.occurences /
                 (rsi.occurences + 50.0 +
                  ((150.0 * size) / db.meanWordCount)))
            expectedScores.append(0.4 + (0.6 * T * I))
        self.assertListEqual([rsi.weight for rsi in combined],
                             expectedScores)
def testCombineSumWeights(self):
    "Test combining ResultSet scores by summation."
    # A clause / boolean is required to combine ResultSets.
    # Use TF-IDF because it's most simple to calculate.
    clause = cqlparse('my.index '
                      'all/rel.algorithm=tfidf/rel.combine=sum '
                      '"foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None, parent=None)
    # Create a new ResultSet to combine into
    merged = SimpleResultSet(self.session)
    merged = merged.combine(self.session, [self.a, self.b], clause, db)
    # Return value must be a ResultSet
    self.assertIsInstance(merged, SimpleResultSet)
    # 'all' semantics leave exactly one (shared) item
    self.assertEqual(len(merged), 1)
    self.assertIn(self.rsi1, merged)
    for item in merged:
        # Each ResultSetItem must carry a score (weight)...
        self.assertTrue(hasattr(item, 'weight'))
        # ...and a scaled score no greater than 1
        self.assertLessEqual(item.scaledWeight, 1.0)
    # Combined weight is the sum of the two TF-IDF scores
    matches = len(self.b)
    self.assertEqual(merged[0].weight,
                     sum([5 * math.log(db.totalItems / matches),
                          3 * math.log(db.totalItems / matches)]))
def _search(self, session, query):
    """Resolve a CQL query (clause or boolean) to a ResultSet.

    Leaf clauses either reference a stored result set or are searched
    via the index resolved from the protocol map.  Boolean nodes are
    searched recursively and the two sides combined, taking care with
    BitmapResultSet operands, which must be first in the combine list.
    """
    if not hasattr(query, 'leftOperand'):
        # Leaf clause: check for a result set reference first
        rsid = query.getResultSetId()
        if rsid:
            # Get existing result set; "store/id" names an explicit store
            if rsid.find('/') > -1:
                (rssid, rsid) = rsid.split('/', 1)
                rss = self.get_object(session, rssid)
            else:
                rss = self.get_object(session, "defaultResultSetStore")
            rset = rss.fetch_resultSet(session, rsid)
            rset.fromStore = 1
            return rset
        else:
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                # Lazily cache and select the SRW protocol map
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            idx = pm.resolveIndex(session, query)
            # FIX: compare to None with 'is not' rather than '!=' (PEP 8)
            if idx is not None:
                query.config = pm
                rs = idx.search(session, query, self)
                query.config = None
                rs.query = query
                return rs
            else:
                # Unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
    else:
        # Boolean node: search both operands, then combine
        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [],
                                 recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right, but the result set assumes
            # list[0] is of its own type
            new = right.__class__(session, [],
                                  recordStore=right.recordStore)
            if query.boolean.value == 'prox':
                # Bitmaps can't do prox, so just raise
                raise QueryException("Cannot use Prox with %s" %
                                     left.index.toCQL(), 18)
            elif query.boolean.value == 'not':
                # Can't reorder 'not' without changing query semantics
                return new.combine(session, [left, right], query, self)
            else:
                return new.combine(session, [right, left], query, self)
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [],
                                 recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        # Record the effective query on the combined result set
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs
def testTfidf(self):
    "Test combining with TF-IDF relevance ranking."
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all/rel.algorithm=tfidf "foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None, parent=None)
    # Exercise self.a then self.b; pair each source set with the
    # occurrence counts of its items in expected rank order
    for source, occurrences in [(self.a, (5, 1)), (self.b, (3, 2))]:
        # Create a new ResultSet to combine into
        combined = SimpleResultSet(self.session)
        combined = combined.combine(self.session, [source], clause, db)
        self.assertEqual(len(combined), 2)
        for item in combined:
            # Each ResultSetItem must carry a score (weight)...
            self.assertTrue(hasattr(item, 'weight'))
            # ...and a scaled score no greater than 1
            self.assertLessEqual(item.scaledWeight, 1.0)
        # Check scores are correct and in order: tf * idf
        idf = math.log(db.totalItems / len(source))
        self.assertListEqual([item.weight for item in combined],
                             [tf * idf for tf in occurrences])
def testCombineAll(self):
    "Test combining ResultSets with 'all'"
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all "foo"')
    # Create a new ResultSet to combine into
    merged = SimpleResultSet(self.session)
    merged = merged.combine(self.session, [self.a, self.b], clause)
    # Return value must be a ResultSet
    self.assertIsInstance(merged, SimpleResultSet)
    # 'all' keeps only the single item common to both sets
    self.assertEqual(len(merged), 1)
    self.assertIn(self.rsi1, merged)
def testCombineAny(self):
    "Test combining ResultSets with 'any'"
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index any "foo"')
    # Create a new ResultSet to combine into
    merged = SimpleResultSet(self.session)
    merged = merged.combine(self.session, [self.a, self.b], clause)
    # Return value must be a ResultSet
    self.assertIsInstance(merged, SimpleResultSet)
    # 'any' is a union: every original item must appear
    for item in (self.rsi1, self.rsi2, self.rsi3, self.rsi4):
        self.assertIn(item, merged)
    # rsi1 and rsi2 are identical, so the union holds 3 items
    self.assertEqual(len(merged), 3)
def testOkapi(self):
    "Test combining with OKAPI BM-25 relevance ranking."
    # A clause / boolean is required to combine ResultSets
    b, k1, k3 = [0.75, 1.5, 1.5]
    clause = cqlparse('my.index all/rel.algorithm=okapi/'
                      'rel.const0={0}/'
                      'rel.const1={1}/'
                      'rel.const2={2}'
                      ' "foo bar"'.format(b, k1, k3))
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None)
    # A RecordStore is required for score calculation
    recStore = FakeRecordStore(self.session, None)
    # Exercise self.a then self.b; each pairs a source set with its
    # ResultSetItems in expected rank order.
    # FIX: the original duplicated code computed matches = len(self.a)
    # in the self.b section (copy-paste bug); using len(source) per
    # iteration matches the pattern of testCori/testTfidf.
    cases = [(self.a, (self.rsi1, self.rsi3)),
             (self.b, (self.rsi2, self.rsi4))]
    for source, rankedItems in cases:
        # Create a new ResultSet to combine into
        rs = SimpleResultSet(self.session)
        # Set ResultSet queryFrequency - required for OKAPI BM-25
        source.queryFreq = 1
        rs = rs.combine(self.session, [source], clause, db)
        self.assertEqual(len(rs), 2)
        for rsi in rs:
            # Each ResultSetItem must carry a score (weight)...
            self.assertTrue(hasattr(rsi, 'weight'))
            # ...and a scaled score no greater than 1
            self.assertLessEqual(rsi.scaledWeight, 1.0)
        # Check scores are correct and in order
        matches = len(source)
        idf = math.log(db.totalItems / matches)
        # Query-term weight; query frequency is 1 here
        qtw = ((k3 + 1) * 1) / (k3 + 1)
        expectedScores = []
        for rsi in rankedItems:
            size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                                 'wordCount')
            # BM-25 term-frequency component with length normalization
            T = (((k1 + 1) * rsi.occurences) /
                 ((k1 * ((1 - b) +
                         b * (size / db.meanWordCount))) + rsi.occurences))
            expectedScores.append(idf * T * qtw)
        self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
def _listResults(self, metadataPrefix, set_=None, from_=None, until=None):
    """Generate (datestamp, resultSet) tuples.

    Suitable for use by:
        - listIdentifiers
        - listRecords
    """
    session = self.session
    # Check set value; only 'contributor:' sets are supported
    if set_ and not set_.startswith('contributor:'):
        # FIX: was 'raise StopIteration' - inside a generator that
        # becomes RuntimeError under PEP 479 (Python >= 3.7); a bare
        # return ends iteration cleanly on all versions
        return
    elif set_:
        set_ = set_.split(':', 1)[-1]
    if until and until < self.earliestDatestamp:
        raise BadArgumentError('until argument value is earlier than '
                               'earliestDatestamp.')
    if not from_:
        from_ = self.earliestDatestamp
    if not until:
        until = datetime.datetime.now()
    if until < from_:
        raise BadArgumentError('until argument value is earlier than from '
                               'argument value.')
    q = cqlparse('rec.lastModificationDate > "%s" and '
                 'rec.lastModificationDate < "%s"' % (from_, until)
                 )
    # Actually need datestamp values as well as results - interact with
    # indexes directly for efficiency
    # Get CQL ProtocolMap
    pm = self.db.get_path(session, 'protocolMap')
    idx = pm.resolveIndex(session, q.leftOperand)
    q.config = pm
    # Normalize the from_/until values through the index's data sources
    res = {}
    for src in idx.sources[u'data']:
        res.update(src[1].process(session, [[str(from_)]]))
        res.update(src[1].process(session, [[str(until)]]))
    from_ = min(res.keys())
    until = max(res.keys())
    # Tweak until value to make it inclusive
    until = until[:-1] + chr(ord(until[-1]) + 1)
    termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
    # Generate sequence of datestamp, resultSet tuples
    for t in termList:
        try:
            datetime_obj = datetime.datetime.strptime(
                t[0],
                u'%Y-%m-%dT%H:%M:%S'
            )
        except ValueError:
            # Some stored terms use a space separator instead of 'T'
            datetime_obj = datetime.datetime.strptime(
                t[0],
                u'%Y-%m-%d %H:%M:%S'
            )
        datetime_rs = idx.construct_resultSet(session, t[1])
        if not set_:
            yield (datetime_obj, datetime_rs)
        else:
            # Filter by set
            set_q = cqlparse('vdb.identifier = {0}'.format(set_))
            set_rs = self.db.search(session, set_q)
            full_rs = SimpleResultSet(session)
            full_q = cqlparse('{0} and {1}'
                              ''.format(q.toCQL(), set_q.toCQL()))
            yield (datetime_obj,
                   full_rs.combine(session,
                                   [datetime_rs, set_rs],
                                   full_q
                                   )
                   )