def add_page(self, user, coll, pagedata):
    """Add a page entry to a collection's page set in redis.

    ``pagedata`` is serialized with ``json.dumps`` and added to the set
    stored under the collection's ``PAGE_KEY``. When ``pagedata`` has no
    ``'ts'`` entry, the timestamp of the last CDX entry found for the
    page url is written into ``pagedata`` first (the dict is mutated in
    place).

    :param user: user name, passed to ``can_write_coll`` and ``make_key``
    :param coll: collection name, passed to ``can_write_coll`` and
        ``make_key``
    :param pagedata: mapping describing the page; ``'url'`` is required,
        ``'ts'`` is optional
    :return: ``False`` if the collection is not writable, if no search
        range can be computed for the url, or if no CDX entry exists for
        it; ``None`` after the page has been added
    :raises KeyError: if ``pagedata`` is a dict without a ``'url'`` entry
    """
    if not self.can_write_coll(user, coll):
        print('Cannot Write')
        return False

    url = pagedata['url']

    # The range is computed even when 'ts' is already supplied, so a url
    # that calc_search_range rejects is refused before anything is stored.
    try:
        start_key, end_key = calc_search_range(url, 'exact')
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt
        # and SystemExit still propagate to the caller.
        print('Cannot Canon')
        return False

    if 'ts' not in pagedata:
        cdx_key = self.make_key(user, coll, self.CDX_KEY)

        # '[' marks an inclusive lower bound, '(' an exclusive upper bound.
        result = self.redis.zrangebylex(cdx_key,
                                        '[' + start_key,
                                        '(' + end_key)
        if not result:
            print('NO CDX')
            return False

        # Use the last entry of the returned range.
        last_cdx = CDXObject(result[-1])
        pagedata['ts'] = last_cdx['timestamp']

    pagedata_json = json.dumps(pagedata)
    page_key = self.make_key(user, coll, self.PAGE_KEY)
    self.redis.sadd(page_key, pagedata_json)
def __init__(self, params):
    """Normalize query params and precompute the search key range.

    The url to search is ``params['alt_url']`` when that is set and
    truthy, otherwise ``self.url``. If no ``'matchType'`` is given it is
    derived from the url: a leading ``'*.'`` selects ``'domain'``, a
    trailing ``'*'`` selects ``'prefix'`` (the wildcard is stripped in
    both cases), anything else is ``'exact'``.

    ``params`` is kept by reference and updated in place with
    ``'matchType'``, possibly ``'url'`` / ``'alt_url'``, and the
    utf-8 encoded ``'key'`` and ``'end_key'`` range bounds.

    :param params: dict-like query parameters
    """
    self.params = params
    opts = self.params

    alt_url = opts.get('alt_url')
    url = alt_url if alt_url else self.url
    surt_ordered = opts.get('surt_ordered')

    if not opts.get('matchType'):
        if url.startswith('*.'):
            match_type, url = 'domain', url[2:]
        elif url.endswith('*'):
            match_type, url = 'prefix', url[:-1]
        else:
            match_type = 'exact'

        # 'url' is only rewritten when a wildcard was actually stripped.
        if match_type != 'exact':
            opts['url'] = url
        opts['matchType'] = match_type

        # Keep a supplied alt_url in sync with the stripped form.
        if alt_url:
            opts['alt_url'] = url

    start, end = calc_search_range(
        url=url,
        surt_ordered=surt_ordered,
        match_type=opts['matchType'],
        url_canon=opts.get('_url_canon'),
    )

    opts['key'] = start.encode('utf-8')
    opts['end_key'] = end.encode('utf-8')
def lookup(self, digest, url, timestamp):
    """Find a previously indexed entry for ``url`` with the same digest.

    Entries stored in the sorted set at ``self.key`` that fall inside the
    exact-match range for ``url`` are scanned in the order redis returns
    them; the first one whose ``'digest'`` equals ``digest`` wins.

    :param digest: digest value to match against each entry
    :param url: url whose exact-match range is searched
    :param timestamp: not used by this implementation
    :return: ``('revisit', entry_url, entry_datetime)`` for the first
        matching entry, or ``None`` if none matches
    """
    range_start, range_end = calc_search_range(url, 'exact')

    # '[' marks an inclusive lower bound, '(' an exclusive upper bound.
    lower = '[' + range_start
    upper = '(' + range_end

    for raw in self.redis.zrangebylex(self.key, lower, upper):
        entry = CDXObject(raw)
        if digest != entry.get('digest'):
            continue

        original_url = entry['url']
        captured_at = timestamp_to_datetime(entry['timestamp'])
        return ('revisit', original_url, captured_at)

    return None
def _get_url_ts(self, user, coll, rec, url):
    """Return the timestamp of the last CDX entry stored for ``url``.

    The CDX sorted set is located by formatting ``self.cdx_key`` with
    ``user``, ``coll`` and ``rec``, and queried for the exact-match range
    of ``url``.

    :param user: value for the ``user`` field of the key template
    :param coll: value for the ``coll`` field of the key template
    :param rec: value for the ``rec`` field of the key template
    :param url: url to look up
    :return: the ``'timestamp'`` of the last entry in the range, or
        ``None`` if no search range can be computed for ``url`` or the
        range is empty
    """
    try:
        start_key, end_key = calc_search_range(url, 'exact')
    except Exception:
        # Deliberately not a bare ``except:`` so that KeyboardInterrupt
        # and SystemExit still propagate to the caller.
        return None

    cdx_key = self.cdx_key.format(user=user, coll=coll, rec=rec)

    # '[' marks an inclusive lower bound, '(' an exclusive upper bound.
    result = self.redis.zrangebylex(cdx_key,
                                    '[' + start_key,
                                    '(' + end_key)
    if not result:
        return None

    # Use the last entry of the returned range; CDXObject is handed the
    # utf-8 encoded form of that entry.
    last_cdx = CDXObject(result[-1].encode('utf-8'))
    return last_cdx['timestamp']
def __init__(self, params):
    """Normalize query params and precompute the search key range.

    The url to search is ``params['alt_url']`` when that is set and
    truthy, otherwise ``self.url``. If no ``'matchType'`` is given it is
    derived from the url: a leading ``'*.'`` selects ``'domain'``, a
    trailing ``'*'`` selects ``'prefix'`` (the wildcard is stripped and
    the result stored back under ``'url'``), anything else is ``'exact'``.

    ``params`` is kept by reference and updated in place with
    ``'matchType'``, possibly ``'url'``, and the utf-8 encoded ``'key'``
    and ``'end_key'`` range bounds.

    :param params: dict-like query parameters
    """
    self.params = params

    # ``.get('alt_url', default)`` would hand back a stored None or ''
    # unchanged, and the wildcard checks below cannot work on that, so
    # fall back to the regular url whenever alt_url is missing or empty.
    url = self.params.get('alt_url') or self.url

    if not self.params.get('matchType'):
        if url.startswith('*.'):
            url = self.params['url'] = url[2:]
            self.params['matchType'] = 'domain'
        elif url.endswith('*'):
            url = self.params['url'] = url[:-1]
            self.params['matchType'] = 'prefix'
        else:
            self.params['matchType'] = 'exact'

    start, end = calc_search_range(url=url,
                                   match_type=self.params['matchType'],
                                   url_canon=self.params.get('_url_canon'))

    self.params['key'] = start.encode('utf-8')
    self.params['end_key'] = end.encode('utf-8')
def _calc_search_keys(self, query):
    """Compute the search range for ``query``.

    Delegates to ``calc_search_range`` using the query's ``url`` and
    ``match_type`` together with this object's ``url_canon``.

    :param query: object exposing ``url`` and ``match_type`` attributes
    :return: whatever ``calc_search_range`` returns for those arguments
    """
    range_args = dict(
        url=query.url,
        match_type=query.match_type,
        url_canon=self.url_canon,
    )
    return calc_search_range(**range_args)