Example #1
0
    def add_page(self, user, coll, pagedata):
        """Add a page entry to the page set of a user's collection.

        ``pagedata`` must contain a ``'url'`` key. When it has no ``'ts'``
        key, the timestamp of the last CDX entry returned for that url is
        written into ``pagedata`` (the caller's dict is modified in place).
        The resulting dict is stored JSON-serialized in a redis set.

        Returns ``False`` when the collection is not writable, when no search
        range can be computed for the url, or when a timestamp is needed but
        no CDX entry is found. Returns ``None`` once the page has been added.
        """
        if not self.can_write_coll(user, coll):
            print('Cannot Write')
            return False

        url = pagedata['url']

        try:
            start_key, end_key = calc_search_range(url, 'exact')
        except Exception:
            # Only ordinary errors are treated as "url cannot be used";
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            print('Cannot Canon')
            return False

        if 'ts' not in pagedata:
            cdx_key = self.make_key(user, coll, self.CDX_KEY)
            # Lexicographic range: start inclusive ('['), end exclusive ('(')
            result = self.redis.zrangebylex(cdx_key,
                                            '[' + start_key,
                                            '(' + end_key)
            if not result:
                print('NO CDX')
                return False

            last_cdx = CDXObject(result[-1])

            pagedata['ts'] = last_cdx['timestamp']

        pagedata_json = json.dumps(pagedata)

        page_key = self.make_key(user, coll, self.PAGE_KEY)

        self.redis.sadd(page_key, pagedata_json)
Example #2
0
    def __init__(self, params):
        """Normalize the query ``params`` and compute the search key range.

        The url used is the ``alt_url`` param when it is truthy, otherwise
        ``self.url``. When no ``matchType`` is supplied it is derived from
        the url: a leading ``*.`` selects ``domain`` and a trailing ``*``
        selects ``prefix`` (in both cases the wildcard is stripped and the
        result is stored under ``params['url']``); anything else is
        ``exact``.

        The computed range is stored utf-8 encoded under ``params['key']``
        and ``params['end_key']``.
        """
        self.params = params
        opts = self.params

        alt_url = opts.get('alt_url')
        url = alt_url or self.url
        surt_ordered = opts.get('surt_ordered')

        if not opts.get('matchType'):
            if url.startswith('*.'):
                url = url[2:]
                opts['url'] = url
                match_type = 'domain'
            elif url.endswith('*'):
                url = url[:-1]
                opts['url'] = url
                match_type = 'prefix'
            else:
                match_type = 'exact'
            opts['matchType'] = match_type

        if alt_url:
            # Store back the url actually used (wildcard possibly stripped)
            opts['alt_url'] = url

        range_start, range_end = calc_search_range(
            url=url,
            surt_ordered=surt_ordered,
            match_type=opts['matchType'],
            url_canon=opts.get('_url_canon'))

        opts['key'] = range_start.encode('utf-8')
        opts['end_key'] = range_end.encode('utf-8')
Example #3
0
    def add_page(self, user, coll, pagedata):
        """Store ``pagedata`` as a page of the given user's collection.

        ``pagedata`` is a dict with at least a ``'url'`` key. If ``'ts'`` is
        absent, it is filled in (mutating the passed dict) from the
        ``'timestamp'`` field of the last CDX entry returned by the range
        query for that url. The dict is then JSON-encoded and added to a
        redis set.

        Returns ``False`` if the collection cannot be written, if computing
        the search range for the url fails, or if a timestamp lookup finds
        no CDX entry. Returns ``None`` after the page has been added.
        """
        if not self.can_write_coll(user, coll):
            print('Cannot Write')
            return False

        url = pagedata['url']

        try:
            range_start, range_end = calc_search_range(url, 'exact')
        except Exception:
            # Catch regular errors only, so that KeyboardInterrupt and
            # SystemExit are not silently converted into a False return.
            print('Cannot Canon')
            return False

        if 'ts' not in pagedata:
            cdx_key = self.make_key(user, coll, self.CDX_KEY)
            # '[' marks an inclusive bound, '(' an exclusive one
            result = self.redis.zrangebylex(cdx_key,
                                            '[' + range_start,
                                            '(' + range_end)
            if not result:
                print('NO CDX')
                return False

            last_cdx = CDXObject(result[-1])

            pagedata['ts'] = last_cdx['timestamp']

        pagedata_json = json.dumps(pagedata)

        page_set_key = self.make_key(user, coll, self.PAGE_KEY)

        self.redis.sadd(page_set_key, pagedata_json)
Example #4
0
    def lookup(self, digest, url, timestamp):
        """Find a stored CDX entry for ``url`` whose digest equals ``digest``.

        Entries in the exact-match key range for ``url`` are read from the
        sorted set at ``self.key`` and checked in order. For the first one
        with a matching ``'digest'`` field, returns the tuple
        ``('revisit', <entry url>, <entry timestamp as datetime>)``.
        Returns ``None`` when nothing matches.

        ``timestamp`` is accepted but not used by this method.
        """
        range_start, range_end = calc_search_range(url, 'exact')
        raw_entries = self.redis.zrangebylex(self.key,
                                             '[' + range_start,
                                             '(' + range_end)

        # Generators keep parsing lazy: entries after the first match are
        # never turned into CDXObject instances.
        parsed = (CDXObject(raw) for raw in raw_entries)
        match = next((cdx for cdx in parsed if digest == cdx.get('digest')),
                     None)

        if match is None:
            return None

        return ('revisit',
                match['url'],
                timestamp_to_datetime(match['timestamp']))
Example #5
0
    def _get_url_ts(self, user, coll, rec, url):
        """Return the timestamp of the last CDX entry found for ``url``.

        The CDX sorted set is located by formatting ``self.cdx_key`` with
        ``user``, ``coll`` and ``rec``, and queried for the exact-match key
        range of ``url``.

        Returns ``None`` if no search range can be computed for ``url`` or
        if the range query yields no entries.
        """
        try:
            start_key, end_key = calc_search_range(url, 'exact')
        except Exception:
            # A url that cannot be processed simply has no timestamp;
            # KeyboardInterrupt / SystemExit still propagate.
            return None

        cdx_key = self.cdx_key.format(user=user, coll=coll, rec=rec)

        # Start bound inclusive ('['), end bound exclusive ('(')
        result = self.redis.zrangebylex(cdx_key,
                                        '[' + start_key,
                                        '(' + end_key)
        if not result:
            return None

        last_cdx = CDXObject(result[-1].encode('utf-8'))

        return last_cdx['timestamp']
Example #6
0
    def __init__(self, params):
        """Normalize the query ``params`` and compute the search key range.

        The url used is the ``alt_url`` param when present and truthy,
        otherwise ``self.url``. When no ``matchType`` is supplied it is
        derived from the url: a leading ``*.`` selects ``domain`` and a
        trailing ``*`` selects ``prefix`` (the wildcard is stripped and the
        result stored under ``params['url']``); anything else is ``exact``.

        The computed range is stored utf-8 encoded under ``params['key']``
        and ``params['end_key']``.
        """
        self.params = params
        url = self.url

        # A present-but-empty alt_url (None or '') must not replace the
        # primary url: None would break the string checks below.
        alt_url = self.params.get('alt_url')
        if alt_url:
            url = alt_url

        if not self.params.get('matchType'):
            if url.startswith('*.'):
                url = self.params['url'] = url[2:]
                self.params['matchType'] = 'domain'
            elif url.endswith('*'):
                url = self.params['url'] = url[:-1]
                self.params['matchType'] = 'prefix'
            else:
                self.params['matchType'] = 'exact'

        start, end = calc_search_range(url=url,
                                       match_type=self.params['matchType'],
                                       url_canon=self.params.get('_url_canon'))

        self.params['key'] = start.encode('utf-8')
        self.params['end_key'] = end.encode('utf-8')
Example #7
0
 def _calc_search_keys(self, query):
     """Return the search range for ``query``.

     Passes the query's ``url`` and ``match_type`` together with this
     object's ``url_canon`` to ``calc_search_range`` and returns its
     result unchanged.
     """
     search_range = calc_search_range(
         url=query.url,
         match_type=query.match_type,
         url_canon=self.url_canon)
     return search_range
Example #8
0
 def _calc_search_keys(self, query):
     """Compute the search keys for ``query`` via ``calc_search_range``.

     The range is built from ``query.url`` and ``query.match_type`` using
     ``self.url_canon``; whatever ``calc_search_range`` returns is handed
     back to the caller as-is.
     """
     range_args = {
         'url': query.url,
         'match_type': query.match_type,
         'url_canon': self.url_canon,
     }
     return calc_search_range(**range_args)