def testAppend(self):
    "Test appending a single item to a ResultSet"
    resultSet = SimpleResultSet(self.session)
    # A fresh ResultSet starts out empty
    self.assertEqual(len(resultSet), 0)
    resultSet.append(self.rsi1)
    # Exactly one item now, and it is the last (and only) element
    self.assertEqual(len(resultSet), 1)
    self.assertEqual(resultSet[-1], self.rsi1)
def testCombineSumWeights(self): "Test combining ResultSet scores by summation." # A clause / boolean is required to combine ResultSets # Use TF-IDF because it's most simple to calculate clause = cqlparse('my.index ' 'all/rel.algorithm=tfidf/rel.combine=sum ' '"foo bar"') clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2") # A Database is required for relevance ranking db = FakeDatabase(self.session, None, parent=None) # Create a new ResultSet to combine into rs = SimpleResultSet(self.session) rs = rs.combine(self.session, [self.a, self.b], clause, db) # Check return value is a Resultset self.assertIsInstance(rs, SimpleResultSet) # Check merged ResultSet has 1 item self.assertEqual(len(rs), 1) # Check that merged ResultSet contains the correct item self.assertIn(self.rsi1, rs) for rsi in rs: # Check that each ResultSetItem has a score (weight) self.assertTrue(hasattr(rsi, 'weight')) # Check that each ResultSetItem has a scaled score less than 1 self.assertLessEqual(rsi.scaledWeight, 1.0) # Check combined scores correct matches = len(self.b) self.assertEqual(rs[0].weight, sum([5 * math.log(db.totalItems / matches), 3 * math.log(db.totalItems / matches) ] ) )
def _search(self, session, query):
    # Recursively evaluate a CQL query tree (single clause or boolean
    # triple) into a ResultSet.
    if not hasattr(query, 'leftOperand'):
        # Leaf node: a single search clause.
        # Check resultset
        rsid = query.getResultSetId()
        if (rsid):
            # Get existing result set
            if rsid.find('/') > -1:
                # Id is qualified with its store: "<storeId>/<setId>"
                (rssid, rsid) = rsid.split('/', 1)
                rss = self.get_object(session, rssid)
            else:
                rss = self.get_object(session, "defaultResultSetStore")
            rset = rss.fetch_resultSet(session, rsid)
            # Mark that this set came from a store rather than a search
            rset.fromStore = 1
            return rset
        else:
            # Resolve the index for this clause via the protocol map,
            # caching the map on first use.
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            idx = pm.resolveIndex(session, query)
            if (idx != None):
                query.config = pm
                rs = idx.search(session, query, self)
                query.config = None
                rs.query = query
                return rs
            else:
                # unsupported index
                raise ObjectDoesNotExistException(query.index.toCQL())
    else:
        # Boolean triple: evaluate both operands, then combine.
        # get the indexStore
        left = self._search(session, query.leftOperand)
        right = self._search(session, query.rightOperand)
        if left.__class__ == right.__class__:
            new = left.__class__(session, [], recordStore=left.recordStore)
        elif left.__class__ == BitmapResultSet:
            # Want to switch the left/right, but rset assumes list[0] is same type
            new = right.__class__(session, [],
                                  recordStore=right.recordStore)
            if query.boolean.value == 'prox':
                # bitmaps can't do prox, so just raise
                raise QueryException("Cannot use Prox with %s"
                                     % left.index.toCQL(), 18)
            elif query.boolean.value == 'not':
                # can't reorder without changing query
                return new.combine(session, [left, right], query, self)
            else:
                return new.combine(session, [right, left], query, self)
        elif right.__class__ == BitmapResultSet:
            new = left.__class__(session, [], recordStore=left.recordStore)
        else:
            new = SimpleResultSet(session, [])
        rs = new.combine(session, [left, right], query, self)
        # Attach a reconstructed query triple to the merged set so the
        # effective query can be reported later.
        trip = cql.Triple()
        trip.leftOperand = left.query
        trip.rightOperand = right.query
        trip.boolean = query.boolean
        rs.query = trip
        return rs
def testExtend(self):
    "Test appending multiple item to a ResultSet"
    resultSet = SimpleResultSet(self.session)
    self.assertEqual(len(resultSet), 0)
    # Add two items in a single call
    resultSet.extend([self.rsi1, self.rsi2])
    self.assertEqual(len(resultSet), 2)
    # The order of the extension must be preserved
    self.assertEqual(resultSet[0], self.rsi1)
    self.assertEqual(resultSet[1], self.rsi2)
def testFromList(self):
    "Test population of SimpleResultSet using fromList method."
    resultSet = SimpleResultSet(self.session)
    self.assertIsInstance(resultSet, SimpleResultSet)
    self.assertEqual(len(resultSet), 0)
    resultSet.fromList([self.rsi1, self.rsi3])
    # Populated set should line up item-for-item with self.a
    for expected, actual in zip(self.a, resultSet):
        self.assertEqual(expected, actual)
def _deserialise(self, session, data, size, id_):
    """Rebuild a SimpleResultSet from its escaped, packed representation.

    `data` is a string-escaped buffer of `size` little-endian longs,
    one per record id, in stored order.
    """
    raw = data.decode('string_escape')
    # One little-endian long per item; a bitfield would lose the ordering
    recordIds = struct.unpack('<' + 'l' * size, raw)
    rset = SimpleResultSet(session)
    rset.fromList([SimpleResultSetItem(session, recId, resultSet=rset)
                   for recId in recordIds])
    rset.id = id_
    return rset
def _deserialise(self, session, data, size, id):
    """Rebuild a SimpleResultSet from its escaped, packed representation.

    `data` is a string-escaped buffer of `size` longs, one per record
    id, in stored order.
    """
    # NOTE(review): unlike the '<'-prefixed variant elsewhere in this
    # file, this format string uses native byte order and alignment —
    # confirm it matches the corresponding serialiser.
    raw = data.decode('string_escape')
    # One long per item; a bitfield would lose the ordering
    recordIds = struct.unpack('l' * size, raw)
    rset = SimpleResultSet(session)
    rset.fromList([SimpleResultSetItem(session, recId, resultSet=rset)
                   for recId in recordIds])
    rset.id = id
    return rset
def testCombineAll(self):
    "Test combining ResultSets with 'all'"
    # Combining requires a boolean clause; 'all' intersects the sets
    clause = cqlparse('my.index all "foo"')
    merged = SimpleResultSet(self.session)
    merged = merged.combine(self.session, [self.a, self.b], clause)
    self.assertIsInstance(merged, SimpleResultSet)
    # Only the item common to both input sets should survive
    self.assertEqual(len(merged), 1)
    self.assertIn(self.rsi1, merged)
def search(self, session, query, db):
    """Translate a CQL query to Lucene, run it, and wrap hits in a ResultSet."""
    # Protocol map is needed for the CQL -> Lucene translation
    pm = db.get_path(session, 'protocolMap')
    if not pm:
        db._cacheProtocolMaps(session)
        pm = db.protocolMaps.get('http://www.loc.gov/zing/srw/')
    query.config = pm
    luceneQuery = self.parser.parse(cqlToLucene(session, query, pm))
    hits = self.searcher.search(luceneQuery, lucene.Sort.RELEVANCE)
    # Map each Lucene hit back onto a ResultSetItem
    items = []
    for hitIdx in range(len(hits)):
        hit = hits[hitIdx]
        weight = hits.score(hitIdx)
        # Stored id field is "<recordStore>/<recordId>"
        (recStore, recId) = hit.getField('id').stringValue().split('/')
        if recId.isdigit():
            recId = int(recId)
        items.append(SimpleResultSetItem(session, recId, recStore,
                                         weight=weight))
    return SimpleResultSet(session, items)
def testInit(self):
    "Check initialization of ResultSet"
    resultSet = SimpleResultSet(self.session)
    # Correct type, and empty on construction
    self.assertIsInstance(resultSet, SimpleResultSet)
    self.assertEqual(len(resultSet), 0)
def _get_test_resultSets(self):
    """Yield 5 ResultSets of 5 items each with decreasing occurrences.

    Set number x (0-4) holds items with ids 0-4, each carrying
    5 - x occurrences.
    """
    for setNum in range(5):
        resultSet = SimpleResultSet(self.session)
        # Occurrence count is constant within a set
        occurrences = 5 - setNum
        for itemId in range(5):
            item = SimpleResultSetItem(self.session,
                                       id=itemId,
                                       recStore="recordStore",
                                       occs=occurrences,
                                       database="",
                                       diagnostic=None,
                                       weight=0.5,
                                       resultSet=None,
                                       numeric=None)
            resultSet.append(item)
        yield resultSet
def _get_test_resultSets(self):
    """Yield five 5-item ResultSets; occurrence counts step down 5..1."""
    # Counting 5,4,3,2,1 is equivalent to 5 - x for x in 0..4
    for occurrences in range(5, 0, -1):
        rs = SimpleResultSet(self.session)
        for recId in range(5):
            rs.append(SimpleResultSetItem(self.session, id=recId,
                                          recStore="recordStore",
                                          occs=occurrences, database="",
                                          diagnostic=None, weight=0.5,
                                          resultSet=None, numeric=None))
        yield rs
def testCombineAny(self):
    "Test combining ResultSets with 'any'"
    # Combining requires a boolean clause; 'any' unions the sets
    clause = cqlparse('my.index any "foo"')
    merged = SimpleResultSet(self.session)
    merged = merged.combine(self.session, [self.a, self.b], clause)
    self.assertIsInstance(merged, SimpleResultSet)
    # Every item from either input set must be present in the union
    for item in (self.rsi1, self.rsi2, self.rsi3, self.rsi4):
        self.assertIn(item, merged)
    # rsi1 == rsi2, so the union contains only 3 distinct items
    self.assertEqual(len(merged), 3)
def setUp(self):
    """Setup some ResultsetItems and put them into ResultSets to evaluate.

    N.B. a == b, other pairs should not evaluate as equal
    """
    self.session = session = Session()
    # (id, recStore, occs) for each of the 4 items; remaining keyword
    # arguments are identical across all of them.
    specs = [(0, "recordStore", 5),
             (0, "recordStore", 3),
             (1, "recordStore", 1),
             (0, "recordStore2", 2)]
    items = [SimpleResultSetItem(session, id=id_, recStore=store,
                                 occs=occs, database="", diagnostic=None,
                                 weight=0.5, resultSet=None, numeric=None)
             for (id_, store, occs) in specs]
    self.rsi1, self.rsi2, self.rsi3, self.rsi4 = items
    # rsi1 and rsi2 compare equal; keep them in separate ResultSets
    self.a = SimpleResultSet(session, [self.rsi1, self.rsi3], id="a")
    self.b = SimpleResultSet(session, [self.rsi2, self.rsi4], id="b")
def search(self, session, query, db):
    """Translate a CQL query to SQL, run it, and wrap rows in a ResultSet.

    Returns a SimpleResultSet of SimpleResultSetItems, one per matching
    row; each row's 'recordid' column is "<recordStore>/<recordId>".
    """
    # Kludgey optimisation
    pm = db.get_path(session, 'protocolMap')
    if not pm:
        db._cacheProtocolMaps(session)
        pm = db.protocolMaps.get('http://www.loc.gov/zing/srw/')
        db.paths['protocolMap'] = pm
    query = self._cql_to_sql(session, query, pm)
    res = self._query(query)
    dr = res.dictresult()
    # BUGFIX: session was previously omitted (SimpleResultSet([]) passed
    # the empty list as the session argument); pass it explicitly as the
    # sibling implementations in this file do.
    rset = SimpleResultSet(session, [])
    rsilist = []
    for t in dr:
        (store, recId) = t['recordid'].split('/', 1)
        item = SimpleResultSetItem(session, recId, store, 1, resultSet=rset)
        rsilist.append(item)
    rset.fromList(rsilist)
    return rset
def fetch_resultSet(self, session, id):
    """Fetch and deserialise the stored ResultSet identified by `id`.

    Returns an empty SimpleResultSet when nothing is stored under `id`.
    """
    data = self.fetch_data(session, id)
    if not data:
        # Nothing stored under this id: hand back an empty set
        return SimpleResultSet(session, [], id)
    # Stored format is "<className>||<serialisedPayload>"
    cl, srlz = data.split('||', 1)
    rset = dynamic.buildObject(session, cl, [])
    rset.deserialise(session, srlz)
    return rset
def construct_resultSet(self, session, terms, queryHash=None):
    """Build a SimpleResultSet from index term data.

    `terms` is a dict (from res.dictresult()) with 'records',
    'totalRecs' and 'totalOccs' keys; each record carries a
    'recordid' of the form "<recordStore>/<recordId>" and an
    'occurences' count. `queryHash`, when given, supplies the query
    term text and frequency to attach to the set.
    """
    # FIX: avoid a mutable default argument ({}); None sentinel keeps
    # the original truthiness-based behaviour.
    if queryHash is None:
        queryHash = {}
    s = SimpleResultSet(session, [])
    rsilist = []
    for t in terms['records']:
        (store, recId) = t['recordid'].split('/', 1)
        # NB: 'occurences' spelling is the stored key - do not "fix" it
        occs = t['occurences']
        rsilist.append(SimpleResultSetItem(session, recId, store, occs,
                                           resultSet=s))
    s.fromList(rsilist)
    s.index = self
    if queryHash:
        s.queryTerm = queryHash['text']
        s.queryFreq = queryHash['occurences']
    s.totalRecs = terms['totalRecs']
    s.totalOccs = terms['totalOccs']
    return s
def testCori(self):
    "Test combining with CORI relevance ranking."
    # A clause / boolean is required to combine ResultSets
    clause = cqlparse('my.index all/rel.algorithm=cori "foo bar"')
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None)
    # A RecordStore is required for CORI score calculation
    recStore = FakeRecordStore(self.session, None)
    # Test self.a
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.a], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.a)
    # I is used in calculating score for each item
    I = (math.log((db.totalItems + 0.5) / matches) /
         math.log(db.totalItems + 1.0))
    expectedScores = []
    for rsi in [self.rsi1, self.rsi3]:
        # Record word count feeds the CORI term-frequency component
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (rsi.occurences /
             (rsi.occurences + 50.0 + ((150.0 * size) / db.meanWordCount))
             )
        expectedScores.append(0.4 + (0.6 * T * I))
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
    # Test self.b
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    rs = rs.combine(self.session, [self.b], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.b)
    # I is used in calculating score for each item
    I = (math.log((db.totalItems + 0.5) / matches) /
         math.log(db.totalItems + 1.0))
    expectedScores = []
    for rsi in [self.rsi2, self.rsi4]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (rsi.occurences /
             (rsi.occurences + 50.0 + ((150.0 * size) / db.meanWordCount))
             )
        expectedScores.append(0.4 + (0.6 * T * I))
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
def testTfidf(self): "Test combining with TF-IDF relevance ranking." # A clause / boolean is required to combine ResultSets clause = cqlparse('my.index all/rel.algorithm=tfidf "foo bar"') clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2") # A Database is required for relevance ranking db = FakeDatabase(self.session, None, parent=None) # Test self.a # Create a new ResultSet to combine into rs = SimpleResultSet(self.session) rs = rs.combine(self.session, [self.a], clause, db) self.assertEqual(len(rs), 2) for rsi in rs: # Check that each ResultSetItem has a score (weight) self.assertTrue(hasattr(rsi, 'weight')) # Check that each ResultSetItem has a scaled score less than 1 self.assertLessEqual(rsi.scaledWeight, 1.0) # Check scores are correct and in order matches = len(self.a) self.assertListEqual([rsi.weight for rsi in rs], [5 * math.log(db.totalItems / matches), 1 * math.log(db.totalItems / matches)] ) # Test self.b # Create a new ResultSet to combine into rs = SimpleResultSet(self.session) rs = rs.combine(self.session, [self.b], clause, db) self.assertEqual(len(rs), 2) for rsi in rs: # Check that each ResultSetItem has a score (weight) self.assertTrue(hasattr(rsi, 'weight')) # Check that each ResultSetItem has a scaled score less than 1 self.assertLessEqual(rsi.scaledWeight, 1.0) # Check scores are correct and in order matches = len(self.b) self.assertListEqual([rsi.weight for rsi in rs], [3 * math.log(db.totalItems / matches), 2 * math.log(db.totalItems / matches)] )
def search(self, session, query):
    # Top-level search entry point: delegate to an indexStore-specific
    # search when one exists, otherwise evaluate the query tree via
    # self._search and apply top-level post-processing (relevance
    # scaling / CQL 1.2 sort keys).
    # Check for optimized indexStore based search (eg SQL translation)
    storeList = self.get_path(session, 'indexStoreList')
    if not storeList:
        indexStore = self.get_path(session, 'indexStore')
        if not indexStore:
            msg = ("No indexStore/indexStoreList associated with "
                   "database: %s" % self.id)
            raise ConfigFileException(msg)
        storeList = [indexStore.id]
    else:
        storeList = storeList.split(' ')
    # FIXME: Should respect multiple index stores somehow?
    idxStore = self.get_object(session, storeList[0])
    # Check if there's an indexStore specific search function
    if hasattr(idxStore, 'search'):
        return idxStore.search(session, query, self)
    else:
        if ((not hasattr(query, 'leftOperand')) and
                query.relation.value == "any"):
            # Don't try to rewrite, futile.
            pass
        else:
            n = self._rewriteQuery(session, query)
            if n:
                query = n
        if not hasattr(query, 'leftOperand'):
            # Single term or any in single clause
            query.resultCount = 1
            rs = self._search(session, query)
        else:
            # Triples... walk and look for ANDs that have a 0 length rs
            # Attach resultsets with counts
            self._attachResultCount(session, query)
            if query.resultCount == 0:
                # no matches
                # NOTE(review): session is not passed here
                # (SimpleResultSet([])) — the empty list lands in the
                # session parameter; confirm against the constructor.
                return SimpleResultSet([])
            else:
                rs = self._search(session, query)
        # now do top level stuff, like sort
        if rs.relevancy:
            rs.scale_weights()
            rs.order(session, "weight")
        elif query.sortKeys:
            # CQL 1.2 sort definition
            # URI: info:srw/cql-context-set/1/sort-v1.0
            sk = query.sortKeys
            sk.reverse()  # stable sort = keys in reverse order
            pm = self.get_path(session, 'protocolMap')
            if not pm:
                self._cacheProtocolMaps(session)
                pm = self.protocolMaps.get('http://www.loc.gov/zing/srw/')
                self.paths['protocolMap'] = pm
            for idx in sk:
                # resolve index
                index = pm.resolveIndex(session, query)
                # and find params from modifiers
                if idx['ascending']:
                    ascending = True
                elif idx['descending']:
                    ascending = False
                elif hasattr(pm, 'defaultSortDirection'):
                    ascending = pm.defaultSortDirection[:3].lower() == 'asc'
                else:
                    ascending = True
                # Missing-value policy: omit / high / low / fail /
                # explicit value / protocolMap default / direction-based
                if idx['missingomit']:
                    miss = 0
                elif idx['missinghigh']:
                    miss = 1
                elif idx['missinglow']:
                    miss = -1
                elif idx['missingfail']:
                    miss = cql.Diagnostic()
                elif idx['missingvalue']:
                    miss = idx['missingvalue'].value
                elif hasattr(pm, 'defaultSortMissing'):
                    m = pm.defaultSortMissing
                    vals = ['low', 'omit', 'high']
                    if m in vals:
                        miss = int(vals.index(m)) - 1
                    elif m == 'fail':
                        miss = cql.Diagnostic()
                    else:
                        miss = m
                else:
                    miss = [-1, 1][int(ascending)]
                if idx['respectcase']:
                    case = 1
                elif idx['ignorecase']:
                    case = 0
                # NOTE(review): the visible source ends here — the sort
                # parameters (ascending/miss/case) are computed but never
                # applied, and no value is returned on this path. The
                # remainder of this function appears to be truncated in
                # this chunk; confirm against the full file.
def _search(self, session, query):
    """Evaluate `query`, short-circuiting subtrees known to be empty.

    When the pre-computed resultCount for this subtree is 0, return an
    empty ResultSet without touching the indexes; otherwise defer to
    the base-class implementation.
    """
    if query.resultCount == 0:
        # No matches in this full subtree
        # BUGFIX: pass session explicitly; previously the empty list was
        # passed as the session argument (SimpleResultSet([])).
        return SimpleResultSet(session, [])
    else:
        return SimpleDatabase._search(self, session, query)
def _listResults(self, metadataPrefix, set_=None, from_=None, until=None):
    """Return a list of (datestamp, resultSet) tuples.

    Suitable for use by:
        - listIdentifiers
        - listRecords
    """
    session = self.session
    # Check set value
    # Only 'contributor:' sets are supported; anything else ends the
    # generator immediately (raise StopIteration in a generator == stop).
    if set_ and not set_.startswith('contributor:'):
        raise StopIteration
    elif set_:
        set_ = set_.split(':', 1)[-1]
    if until and until < self.earliestDatestamp:
        raise BadArgumentError('until argument value is earlier than '
                               'earliestDatestamp.')
    if not from_:
        from_ = self.earliestDatestamp
    if not until:
        until = datetime.datetime.now()
        #(from_ < self.earliestDatestamp)
    if (until < from_):
        raise BadArgumentError('until argument value is earlier than from '
                               'argument value.')
    q = cqlparse('rec.lastModificationDate > "%s" and '
                 'rec.lastModificationDate < "%s"' % (from_, until)
                 )
    # Actually need datestamp values as well as results - interact with
    # indexes directly for efficiency
    # Get CQL ProtocolMap
    pm = self.db.get_path(session, 'protocolMap')
    idx = pm.resolveIndex(session, q.leftOperand)
    q.config = pm
    # Normalise from_/until through the index's data sources so they
    # match the stored term format
    res = {}
    for src in idx.sources[u'data']:
        res.update(src[1].process(session, [[str(from_)]]))
        res.update(src[1].process(session, [[str(until)]]))
    from_ = min(res.keys())
    until = max(res.keys())
    # Tweak until value to make it inclusive
    until = until[:-1] + chr(ord(until[-1]) + 1)
    termList = idx.fetch_termList(session, from_, 0, '>=', end=until)
    # Generate sequence of datestamp, resultSet tuples
    for t in termList:
        # Stored datestamps may use either 'T' or space as the separator
        try:
            datetime_obj = datetime.datetime.strptime(
                t[0], u'%Y-%m-%dT%H:%M:%S'
            )
        except ValueError:
            datetime_obj = datetime.datetime.strptime(
                t[0], u'%Y-%m-%d %H:%M:%S'
            )
        datetime_rs = idx.construct_resultSet(session, t[1])
        if not set_:
            yield (datetime_obj, datetime_rs)
        else:
            # Filter by set
            set_q = cqlparse('vdb.identifier = {0}'.format(set_))
            set_rs = self.db.search(session, set_q)
            full_rs = SimpleResultSet(session)
            full_q = cqlparse('{0} and {1}'
                              ''.format(q.toCQL(), set_q.toCQL()))
            yield (datetime_obj,
                   full_rs.combine(session,
                                   [datetime_rs, set_rs],
                                   full_q
                                   )
                   )
def testOkapi(self):
    "Test combining with OKAPI BM-25 relevance ranking."
    # A clause / boolean is required to combine ResultSets
    # BM-25 tuning constants: b (length normalisation), k1 (term
    # frequency saturation), k3 (query term frequency saturation)
    b, k1, k3 = [0.75, 1.5, 1.5]
    clause = cqlparse('my.index all/rel.algorithm=okapi/'
                      'rel.const0={0}/'
                      'rel.const1={1}/'
                      'rel.const2={2}'
                      ' "foo bar"'.format(b, k1, k3))
    clause.addPrefix('rel', "info:srw/cql-context-set/2/relevance-1.2")
    # A Database is required for relevance ranking
    db = FakeDatabase(self.session, None)
    # A RecordStore is required for CORI score calculation
    recStore = FakeRecordStore(self.session, None)
    # Test self.a
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    # Set ResultSet queryFrequency - required for OKAPI BM-25
    self.a.queryFreq = 1
    rs = rs.combine(self.session, [self.a], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # self.assertTrue(rsi.weight)
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    matches = len(self.a)
    idf = math.log(db.totalItems / matches)
    qtw = ((k3 + 1) * 1) / (k3 + 1)
    expectedScores = []
    for rsi in [self.rsi1, self.rsi3]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (((k1 + 1) * rsi.occurences) /
             ((k1 * ((1 - b) + b * (size / db.meanWordCount))) +
              rsi.occurences)
             )
        expectedScores.append(idf * T * qtw)
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)
    # Test self.b
    # Create a new ResultSet to combine into
    rs = SimpleResultSet(self.session)
    # Set ResultSet queryFrequency - required for OKAPI BM-25
    self.b.queryFreq = 1
    rs = rs.combine(self.session, [self.b], clause, db)
    self.assertEqual(len(rs), 2)
    for rsi in rs:
        # Check that each ResultSetItem has a score (weight)
        self.assertTrue(hasattr(rsi, 'weight'))
        # self.assertTrue(rsi.weight)
        # Check that each ResultSetItem has a scaled score less than 1
        self.assertLessEqual(rsi.scaledWeight, 1.0)
    # Check scores are correct and in order
    # BUGFIX: this half tests self.b, but the original computed
    # matches = len(self.a) (copy-paste slip; numerically identical only
    # because both sets happen to hold 2 items). Use len(self.b), as
    # testTfidf does for its self.b half.
    matches = len(self.b)
    idf = math.log(db.totalItems / matches)
    qtw = ((k3 + 1) * 1) / (k3 + 1)
    expectedScores = []
    for rsi in [self.rsi2, self.rsi4]:
        size = recStore.fetch_recordMetadata(self.session, rsi.id,
                                             'wordCount')
        T = (((k1 + 1) * rsi.occurences) /
             ((k1 * ((1 - b) + b * (size / db.meanWordCount))) +
              rsi.occurences)
             )
        expectedScores.append(idf * T * qtw)
    self.assertListEqual([rsi.weight for rsi in rs], expectedScores)