Esempio n. 1
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Index a WARC stream, mirror CDX lines into the replay key (if it
        exists), and update size/timestamp bookkeeping for all info keys.

        Returns the list of CDXJ lines produced by the base indexer.
        """
        import time

        upload_key = params.get('param.upid')
        if upload_key:
            # track upload progress in redis while the stream is consumed
            stream = SizeTrackingReader(stream, length, self.redis, upload_key)

        params['writer_cls'] = CDXJIndexer

        cdx_list = (super(WebRecRedisIndexer,
                          self).add_urls_to_index(stream, params, filename,
                                                  length))

        # if replay key exists, add to it as well!
        coll_cdxj_key = res_template(self.coll_cdxj_key, params)
        if self.redis.exists(coll_cdxj_key):
            for cdx in cdx_list:
                if cdx:
                    self.redis.zadd(coll_cdxj_key, 0, cdx)

        # BUG FIX: datetime.utcnow().timestamp() treats the naive UTC datetime
        # as *local* time, skewing the epoch by the UTC offset; use time.time()
        ts_sec = int(time.time())

        with redis_pipeline(self.redis) as pi:
            for key_templ in self.info_keys:
                key = res_template(key_templ, params)
                pi.hincrby(key, 'size', length)
                if cdx_list:
                    pi.hset(key, 'updated_at', ts_sec)
                    if key_templ == self.rec_info_key_templ:
                        pi.hset(key, 'recorded_at', ts_sec)

        self.stats.incr_record(params, length, cdx_list)

        return cdx_list
Esempio n. 2
0
    def add_warc_file(self, full_filename, params):
        """Register a newly created WARC under both the collection file map
        and the recording's file set."""
        rel_name = self._get_rel_or_base_name(full_filename, params)
        load_path = storagepaths.add_local_store_prefix(full_filename)

        file_key = res_template(self.file_key_template, params)
        rec_key = res_template(self.rec_file_key_template, params)

        self.redis.hset(file_key, rel_name, load_path)
        self.redis.sadd(rec_key, rel_name)
Esempio n. 3
0
    def _is_write_resp(self, resp, params):
        """Return True if *resp* should be written: the recording is open and
        the user's size quota would not be exceeded by this record."""
        if not params['recording'].is_open():
            logger.debug(
                'Record Writer: Writing skipped, recording not open for write')
            return False

        user_key = res_template(self.user_key, params)
        size, max_size = self.redis.hmget(user_key, ['size', 'max_size'])

        size = int(size or 0)
        max_size = int(max_size or 0)

        length = resp.length or resp.rec_headers.get_header('Content-Length')
        if length is None:
            # no length known: compute full digest to measure the payload
            self.ensure_digest(resp, block=True, payload=True)
            resp.length = resp.payload_length
            length = resp.length

        # BUG FIX: a header-derived Content-Length is a string; coerce before
        # the arithmetic below, which would otherwise raise TypeError
        length = int(length)

        if size + length > max_size:
            logger.error(
                'Record Writer: New Record for {0} exceeds max size, not recording!'
                .format(params['url']))
            return False

        return True
Esempio n. 4
0
    def add_warc_file(self, full_filename, params):
        """Record the mapping from a WARC's relative name to its prefixed
        load path in the per-collection file hash."""
        name = self._get_rel_or_base_name(full_filename, params)
        path = self.full_warc_prefix + full_filename

        file_key = res_template(self.file_key_template, params)
        self.redis.hset(file_key, name, path)
Esempio n. 5
0
    def load_index(self, params):
        """Synthesize a single live-web CDX entry for *params*.

        Live resources have no stored index, so build a CDX object pointing
        at the proxy load url. Raises NotFoundException for fuzzy-match
        requests, which are unsupported for live resources.
        """
        # no fuzzy match for live resources
        if params.get('is_fuzzy'):
            raise NotFoundException(params['url'] + '*')

        cdx = CDXObject()
        cdx['urlkey'] = params.get('key').decode('utf-8')
        cdx['timestamp'] = timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = res_template(self.proxy_url, params)
        cdx['is_live'] = 'true'

        mime = params.get('content_type', '')

        if params.get('filter') and not mime:
            try:
                # best-effort HEAD to discover status + mime; 405 means the
                # server rejects HEAD, so its status is not meaningful
                res = self.sesh.head(cdx['url'])
                if res.status_code != 405:
                    cdx['status'] = str(res.status_code)

                content_type = res.headers.get('Content-Type')
                if content_type:
                    mime = content_type.split(';')[0]

            except Exception:
                # deliberately best-effort: a failed HEAD just leaves
                # status/mime unset (was `except Exception as e` with unused e)
                pass

        cdx['mime'] = mime

        return iter([cdx])
    def _write_to_file(self, params, write_callback):
        """Write one record via *write_callback* into a (possibly cached)
        WARC file handle, then index the newly written byte range.

        Returns True on success, False if opening a new file was disallowed
        or the write failed. The finally block handles rollover when the file
        exceeds max_size, and locking/caching of newly opened files.
        """
        full_dir = res_template(self.dir_template, params)
        dir_key = self.get_dir_key(params)

        # cached open handle for this directory key, if any
        result = self.fh_cache.get(dir_key)

        close_file = False

        new_size = start = 0

        if result:
            out, filename = result
            is_new = False
        else:
            filename = self.get_new_filename(full_dir, params)

            if not self.allow_new_file(filename, params):
                return False

            out = self._open_file(filename, params)

            is_new = True

        try:
            start = out.tell()

            write_callback(out, filename)

            out.flush()

            new_size = out.tell()

            # rewind so the indexer can re-read exactly what was just written
            out.seek(start)

            if self.dedup_index:
                self.dedup_index.add_urls_to_index(out, params,
                                                   filename,
                                                   new_size - start)

            return True

        except Exception as e:
            traceback.print_exc()
            close_file = True
            return False

        finally:
            # check for rollover
            if self.max_size and new_size > self.max_size:
                close_file = True

            if close_file:
                self._close_file(out)
                if not is_new:
                    self.fh_cache.pop(dir_key, None)

            elif is_new:
                # lock and cache the fresh handle for reuse
                # (no file locking on Windows)
                if os.name != 'nt':
                    portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB)
                self.fh_cache[dir_key] = (out, filename)
Esempio n. 7
0
    def add_warc_file(self, full_filename, params):
        """Store base-name -> prefixed load path for a finished WARC file."""
        file_key = res_template(self.file_key_template, params)
        base_name = self._get_rel_or_base_name(full_filename, params)

        self.redis.hset(file_key, base_name,
                        self.full_warc_prefix + full_filename)
Esempio n. 8
0
    def _write_to_file(self, params, write_callback):
        """Write one record via *write_callback* into a (possibly cached)
        WARC file handle, then index the newly written byte range.

        Returns True on success, False if opening a new file was disallowed
        or the write failed. The finally block handles rollover when the file
        exceeds max_size, and locking/caching of newly opened files.
        """
        full_dir = res_template(self.dir_template, params)
        dir_key = self.get_dir_key(params)

        # cached open handle for this directory key, if any
        result = self.fh_cache.get(dir_key)

        close_file = False

        new_size = start = 0

        if result:
            out, filename = result
            is_new = False
        else:
            filename = self.get_new_filename(full_dir, params)

            if not self.allow_new_file(filename, params):
                return False

            out = self._open_file(filename, params)

            is_new = True

        try:
            start = out.tell()

            write_callback(out, filename)

            out.flush()

            new_size = out.tell()

            # rewind so the indexer can re-read exactly what was just written
            out.seek(start)

            if self.dedup_index:
                self.dedup_index.add_urls_to_index(out, params,
                                                   filename,
                                                   new_size - start)

            return True

        except Exception as e:
            traceback.print_exc()
            close_file = True
            return False

        finally:
            # check for rollover
            if self.max_size and new_size > self.max_size:
                close_file = True

            if close_file:
                self._close_file(out)
                if not is_new:
                    self.fh_cache.pop(dir_key, None)

            elif is_new:
                # lock and cache the fresh handle for reuse
                # (no file locking on Windows)
                if os.name != 'nt':
                    portalocker.lock(out, portalocker.LOCK_EX | portalocker.LOCK_NB)
                self.fh_cache[dir_key] = (out, filename)
Esempio n. 9
0
    def _is_write_resp(self, resp, params):
        """Return True if *resp* should be recorded: the recording is open
        and the user's size quota (with a hard-coded floor) is not exceeded.

        Debug print() calls (including one dumping a redis hash) removed.
        """
        if not params['recording'].is_open():
            logger.debug('Record Writer: Writing skipped, recording not open for write')
            return False

        user_key = res_template(self.user_key, params)
        size, max_size = self.redis.hmget(user_key, ['size', 'max_size'])

        size = int(size or 0)
        max_size = int(max_size or 0)

        # TODO: hard-coded minimum max_size (5GB), make configurable
        if max_size < 5000000000:
            max_size = 5000000000

        length = resp.length or resp.rec_headers.get_header('Content-Length')
        if length is None:
            # no length known: compute full digest to measure the payload
            self.ensure_digest(resp, block=True, payload=True)
            resp.length = resp.payload_length
            length = resp.length

        # BUG FIX: a header-derived Content-Length is a string; coerce before
        # the size comparison, which would otherwise raise TypeError
        length = int(length)

        # compare size to record vs max_size
        if size + length > max_size:
            logger.error('Record Writer: New Record for {0} exceeds max size, not recording!'.format(params['url']))
            return False

        return True
Esempio n. 10
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Index *stream* via the base CDXJ indexer, mirror lines into the
        replay key if present, and update size/usage/rate-limit accounting.

        Returns the list of CDXJ lines from the base indexer.
        """
        upload_key = params.get('param.upid')
        if upload_key:
            # track upload progress in redis while the stream is consumed
            stream = SizeTrackingReader(stream, length, self.redis, upload_key)

        params['writer_cls'] = CDXJIndexer

        cdx_list = (super(WebRecRedisIndexer, self).
                      add_urls_to_index(stream, params, filename, length))


        # if replay key exists, add to it as well!
        coll_cdxj_key = res_template(self.coll_cdxj_key, params)
        if self.redis.exists(coll_cdxj_key):
            for cdx in cdx_list:
                if cdx:
                    self.redis.zadd(coll_cdxj_key, 0, cdx)

        # per-day usage bucket (local date) and updated-at epoch seconds
        # NOTE(review): datetime.now() is local time — confirm a local-date
        # bucket is intended rather than UTC
        ts = datetime.now().date().isoformat()
        ts_sec = str(int(time.time()))

        with redis_pipeline(self.redis) as pi:
            for key_templ in self.size_keys:
                key = res_template(key_templ, params)
                pi.hincrby(key, 'size', length)

                if key_templ == self.rec_info_key_templ and cdx_list:
                    pi.hset(key, 'updated_at', ts_sec)

            # write size to usage hashes
            if 'param.user' in params:
                if params['param.user'].startswith(self.temp_prefix):
                    key = self.temp_usage_key

                    # rate limiting
                    rate_limit_key = self.get_rate_limit_key(params)
                    if rate_limit_key:
                        pi.incrby(rate_limit_key, length)
                        pi.expire(rate_limit_key, self.rate_limit_ttl)

                else:
                    key = self.user_usage_key

                if key:
                    pi.hincrby(key, ts, length)

        return cdx_list
Esempio n. 11
0
    def __init__(self, max_size, redis_url, params, name, timeout=30):
        """Temp buffer that tracks pending writes via a redis counter whose
        expiry is refreshed to *timeout* seconds."""
        resolved_url = res_template(redis_url, params)
        super(RedisPendingCounterTempBuffer, self).__init__(max_size=max_size)

        self.redis, self.key = RedisIndexSource.parse_redis_url(resolved_url)
        self.timeout = timeout

        # bump the pending counter and refresh its TTL
        self.redis.incrby(self.key, 1)
        self.redis.expire(self.key, self.timeout)
Esempio n. 12
0
    def _iter_sources(self, params):
        """Return the list of index sources found under the templated dir,
        raising NotFoundException when the directory is unreadable."""
        path = os.path.join(self.base_prefix,
                            res_template(self.base_dir, params))

        try:
            return list(self._load_files(path))
        except Exception:
            raise NotFoundException(path)
Esempio n. 13
0
    def _get_api_url(self, params):
        """Build the remote CDX API url, appending a result limit (when a
        closest timestamp is requested) and any matchType parameter."""
        url = res_template(self.api_url, params)

        if self.closest_limit and 'closest' in params:
            url += '&limit=' + str(self.closest_limit)

        if 'matchType' in params:
            url += '&matchType=' + params.get('matchType')

        return url
Esempio n. 14
0
    def _iter_sources(self, params):
        """List file-based sources below base_prefix/base_dir for *params*;
        any failure to read the directory maps to NotFoundException."""
        target = res_template(self.base_dir, params)
        target = os.path.join(self.base_prefix, target)

        try:
            found = self._load_files(target)
            return list(found)
        except Exception:
            raise NotFoundException(target)
Esempio n. 15
0
    def _set_load_url(self, cdx, params):
        """Fill cdx's url_field with the templated replay url, including the
        source collection when a named source provides one."""
        name = params.get('_name')
        if name:
            src_coll = params.get('param.' + name + '.src_coll', '')
        else:
            src_coll = ''

        templ_params = dict(url=cdx['url'],
                            timestamp=cdx['timestamp'],
                            src_coll=src_coll)

        cdx[self.url_field] = res_template(self.replay_url, templ_params)
Esempio n. 16
0
    def allow_new_file(self, filename, params):
        """Permit creating *filename* only when its recording info key exists.

        Ensures the recording exists before anything is written; if not, the
        new warc file is not opened.
        """
        info_key = res_template(self.info_key, params)
        if self.redis.exists(info_key):
            return True

        print('Writing skipped, recording does not exist for ' + filename)
        return False
Esempio n. 17
0
    def load_index(self, params):
        """Build a single proxy CDX entry (timestamp defaults to now when no
        'closest' is requested) and delegate loading to _do_load."""
        cdx = CDXObject()
        cdx['urlkey'] = params.get('key').decode('utf-8')

        cdx['timestamp'] = params.get('closest') or timestamp_now()
        cdx['url'] = params['url']
        cdx['load_url'] = res_template(self.proxy_url, params)
        cdx['memento_url'] = cdx['load_url']

        return self._do_load(cdx, params)
Esempio n. 18
0
    def _is_write_req(self, req, params):
        """Decide whether a request record should be written; a skip key set
        to '1' in redis suppresses writing for this url."""
        if req and req.rec_headers and self.skip_key_template:
            skip_key = res_template(self.skip_key_template, params)
            if self.redis.get(skip_key) != '1':
                return True

            print('SKIPPING REQ', params.get('url'))

        return False
Esempio n. 19
0
    def _is_write_req(self, req, params):
        """Return True when the request record should be written; a skip key
        set to '1' in redis suppresses it."""
        if not (req and req.rec_headers and self.skip_key_template):
            return False

        skip_key = res_template(self.skip_key_template, params)
        if self.redis.get(skip_key) == '1':
            logger.debug('Record Writer: Skipping Request for: ' + params.get('url'))
            return False

        return True
Esempio n. 20
0
    def get_new_filename(self, dir_, params):
        """Generate a unique warc filename under *dir_* from the filename
        template (hostname, 20-digit timestamp, random base32 suffix)."""
        rand = base64.b32encode(os.urandom(5)).decode('utf-8')

        return dir_ + res_template(self.filename_template, params,
                                   hostname=self.hostname,
                                   timestamp=timestamp20_now(),
                                   random=rand)
Esempio n. 21
0
    def is_rec_open(self, params):
        """Refresh the open-recording key's TTL; a failed EXPIRE means the
        key is gone, so the recording is no longer open for writing."""
        open_key = res_template(self.open_rec_key, params)

        if self.redis.expire(open_key, self.open_rec_ttl):
            return True

        # expire returned falsy -> key missing -> recording closed
        logging.debug('Writing skipped, recording not open for write: ' + open_key)
        return False
Esempio n. 22
0
    def load_index(self, params):
        """Open the templated index file and return a generator over its
        parsed entries; the file is closed when the generator finishes."""
        path = res_template(self.filename_template, params)
        fh = self._do_open(path)

        def iter_entries():
            with fh:
                for entry in self._do_iter(fh, params):
                    yield entry

        return iter_entries()
    def get_new_filename(self, dir_, params):
        """Build a new unique filename: dir_ plus the templated name filled
        with hostname, current 20-digit timestamp and base32 randomness."""
        suffix = base64.b32encode(os.urandom(5)).decode('utf-8')
        ts = timestamp20_now()

        name = res_template(self.filename_template, params,
                            hostname=self.hostname,
                            timestamp=ts,
                            random=suffix)

        return dir_ + name
Esempio n. 24
0
    def load_key_index(self, key_template, params):
        """Lexical range-scan of the sorted-set index between key and
        end_key, lazily yielding CDXObject entries."""
        z_key = res_template(key_template, params)
        members = self.redis.zrangebylex(z_key,
                                         b'[' + params['key'],
                                         b'(' + params['end_key'])

        return (CDXObject(member) for member in members)
Esempio n. 25
0
    def load_key_index(self, key_template, params):
        """Lexical range query over the cdxj sorted set, yielding CDXObjects.

        Handles str results (e.g. a decode_responses redis client) by
        re-encoding to bytes before parsing.
        """
        z_key = res_template(key_template, params)
        raw = self.redis.zrangebylex(z_key, b'[' + params['key'],
                                     b'(' + params['end_key'])

        def gen(entries):
            for entry in entries:
                if isinstance(entry, str):
                    entry = entry.encode('utf-8')
                yield CDXObject(entry)

        return gen(raw)
Esempio n. 26
0
    def _iter_sources(self, params):
        """Yield (key, source) pairs for every redis key matching the
        templated pattern (scanning only when a wildcard is present)."""
        pattern = res_template(self.redis_key_template, params)

        if '*' in pattern:
            keys = self.scan_keys(pattern, params)
        else:
            keys = [pattern]

        for k in keys:
            source = self._get_source_for_key(k)
            if source:
                yield k, source
Esempio n. 27
0
    def _iter_sources(self, params):
        """Iterate matching redis keys, yielding each key together with its
        source when one can be built for it."""
        key_pattern = res_template(self.redis_key_template, params)

        candidates = ([key_pattern] if '*' not in key_pattern
                      else self.scan_keys(key_pattern, params))

        for candidate in candidates:
            src = self._get_source_for_key(candidate)
            if src:
                yield candidate, src
Esempio n. 28
0
    def _get_rel_or_base_name(self, filename, params):
        """Return *filename* relative to the templated rel path, refusing
        relative names that escape it ('..'); fall back to the basename."""
        rel_path = res_template(self.rel_path_template, params)
        try:
            base_name = os.path.relpath(filename, rel_path)
            # BUG FIX: was `assert '..' not in base_name`, which is stripped
            # under `python -O` and would silently allow path traversal
            if '..' in base_name:
                base_name = None
        except Exception:
            base_name = None

        if not base_name:
            base_name = os.path.basename(filename)

        return base_name
Esempio n. 29
0
    def _get_rel_or_base_name(self, filename, params):
        """Return *filename* relative to the templated rel path, refusing
        relative names that escape it ('..'); fall back to the basename."""
        rel_path = res_template(self.rel_path_template, params)
        try:
            base_name = os.path.relpath(filename, rel_path)
            # BUG FIX: was `assert '..' not in base_name`, which is stripped
            # under `python -O` and would silently allow path traversal
            if '..' in base_name:
                base_name = None
        except Exception:
            base_name = None

        if not base_name:
            base_name = os.path.basename(filename)

        return base_name
Esempio n. 30
0
    def load_index(self, params):
        """Open the templated cdx file and return a generator of CDXObjects
        for lines between key and end_key; NotFoundException if missing."""
        path = res_template(self.filename_template, params)

        try:
            fh = open(path, 'rb')
        except IOError:
            raise NotFoundException(path)

        def reader(source):
            with source:
                for raw in iter_range(source, params['key'], params['end_key']):
                    yield CDXObject(raw)

        return reader(fh)
Esempio n. 31
0
    def _is_write_resp(self, resp, params):
        """True when the recording is open and writing this record keeps the
        user under their size quota."""
        if not self.is_rec_open(params):
            return False

        user_key = res_template(self.user_key, params)
        used, quota = self.redis.hmget(user_key, ['size', 'max_size'])

        used = int(used or 0)
        quota = int(quota or 0)
        rec_len = int(resp.length or resp.rec_headers.get_header('Content-Length') or 0)

        if used + rec_len > quota:
            print('New Record for {0} exceeds max size, not recording!'.format(params['url']))
            return False

        return True
Esempio n. 32
0
    def handle_timemap(self, params):
        """Fetch the remote timemap for *params* and convert the link-format
        body to CDX objects.

        Raises NotFoundException on any fetch failure or empty response.
        """
        url = res_template(self.timemap_url, params)
        headers = self._get_headers(params)
        try:
            res = self.sesh.get(url,
                                headers=headers,
                                timeout=params.get('_timeout'))

            res.raise_for_status()

        except Exception as e:
            self.logger.debug('FAILED: ' + str(e))
            raise NotFoundException(url)

        # BUG FIX: was `assert(res.text)`, which is stripped under `-O`;
        # an empty body is still a not-found condition
        if not res.text:
            self.logger.debug('FAILED: empty timemap for ' + url)
            raise NotFoundException(url)

        links = res.text
        return self.links_to_cdxobject(links, 'timemap')
Esempio n. 33
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Run the cdxj indexer over *stream* and push every resulting line
        into the templated sorted-set key; returns the raw cdx lines."""
        rel_name = self._get_rel_or_base_name(filename, params)

        buf = BytesIO()
        write_cdx_index(buf, stream, rel_name,
                        cdxj=True, append_post=True,
                        writer_cls=params.get('writer_cls'))

        lines = buf.getvalue().rstrip().split(b'\n')

        z_key = res_template(self.redis_key_template, params)
        for line in lines:
            if line:
                self.redis.zadd(z_key, 0, line)

        return lines
Esempio n. 34
0
    def get_timegate_links(self, params, timestamp):
        """HEAD the timegate with an Accept-Datetime for *timestamp* and
        return the Link header; NotFoundException on error or no links."""
        url = res_template(self.timegate_url, params)

        try:
            headers = self._get_headers(params)
            headers['Accept-Datetime'] = timestamp_to_http_date(timestamp)
            res = self.sesh.head(url, headers=headers)
            res.raise_for_status()
        except Exception as e:
            self.logger.debug('FAILED: ' + str(e))
            raise NotFoundException(url)

        links = res.headers.get('Link')
        if links:
            return links

        raise NotFoundException(url)
Esempio n. 35
0
    def scan_keys(self, match_templ, params, member_key=None):
        """Expand *match_templ* into concrete redis keys.

        Without a member-key template this is a plain SCAN; otherwise the
        members of the templated set replace the '*' wildcard, with the
        member set cached in *params* to avoid repeat redis calls.
        """
        member_key = member_key or self.member_key_template
        if not member_key:
            return self.redis.scan_iter(match=match_templ)

        key = res_template(member_key, params)

        # check if already have keys to avoid extra redis call
        cache_key = 'scan:' + key
        members = params.get(cache_key)
        if not members:
            members = self._load_key_set(key)
            params[cache_key] = members

        return [match_templ.replace('*', member) for member in members]
Esempio n. 36
0
    def add_urls_to_index(self, stream, params, filename, length):
        """Index *stream* as cdxj and mirror each non-empty line into the
        templated redis sorted set; returns the raw cdxj byte lines."""
        out = BytesIO()
        write_cdx_index(out,
                        stream,
                        self._get_rel_or_base_name(filename, params),
                        cdxj=True,
                        append_post=True,
                        writer_cls=params.get('writer_cls'))

        key = res_template(self.redis_key_template, params)
        entries = out.getvalue().rstrip().split(b'\n')

        for entry in entries:
            if entry:
                self.redis.zadd(key, 0, entry)

        return entries
Esempio n. 37
0
 def get_dir_key(self, params):
     """Resolve the file-handle cache key for *params* via the key template."""
     return res_template(self.key_template, params)
Esempio n. 38
0
    def _get_api_url(self, params):
        """Expand the api url template, adding a result limit when a closest
        timestamp is requested and a limit is configured."""
        url = res_template(self.api_url, params)

        if self.closest_limit and 'closest' in params:
            url += '&limit=' + str(self.closest_limit)

        return url