def add_urls_to_index(self, stream, params, filename, length):
    upload_key = params.get('param.upid')
    if upload_key:
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    params['writer_cls'] = CDXJIndexer

    cdx_list = (super(WebRecRedisIndexer, self).
                add_urls_to_index(stream, params, filename, length))

    # if replay key exists, add to it as well!
    coll_cdxj_key = res_template(self.coll_cdxj_key, params)
    if self.redis.exists(coll_cdxj_key):
        for cdx in cdx_list:
            if cdx:
                self.redis.zadd(coll_cdxj_key, 0, cdx)

    # epoch seconds for 'updated_at' / 'recorded_at' (time.time() is already
    # UTC-based, unlike timestamp() on a naive utcnow() datetime)
    ts_sec = int(time.time())

    with redis_pipeline(self.redis) as pi:
        for key_templ in self.info_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            if cdx_list:
                pi.hset(key, 'updated_at', ts_sec)

                if key_templ == self.rec_info_key_templ:
                    pi.hset(key, 'recorded_at', ts_sec)

    self.stats.incr_record(params, length, cdx_list)

    return cdx_list
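# The indexer variants in this listing and the upload helpers below all wrap
# their input in SizeTrackingReader, which is defined elsewhere in the project.
# The class below is only a minimal sketch of what such a wrapper could look
# like, not the actual implementation; it illustrates the 'size' progress
# reporting that the callers rely on.

class SizeTrackingReader(object):
    """Sketch: wrap a readable stream and mirror bytes consumed into the
    'size' field of the Redis upload-status hash named by upload_key."""

    def __init__(self, stream, length, redis, upload_key):
        self.stream = stream
        self.length = length
        self.redis = redis
        self.upload_key = upload_key

    def read(self, size=-1):
        buff = self.stream.read(size)
        if buff and self.upload_key:
            # report progress so pollers of the upload hash see 'size' grow
            self.redis.hincrby(self.upload_key, 'size', len(buff))
        return buff

    def tell(self):
        return self.stream.tell()

    def close(self):
        return self.stream.close()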
def add_urls_to_index(self, stream, params, filename, length):
    upload_key = params.get('param.upid')
    if upload_key:
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    cdx_list = (super(WebRecRedisIndexer, self).
                add_urls_to_index(stream, params, filename, length))

    with redis_pipeline(self.redis) as pi:
        for key_templ in self.size_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            if key_templ == self.rec_info_key_templ and cdx_list:
                pi.hset(key, 'updated_at', str(int(time.time())))

        # write size to usage hashes
        ts = datetime.now().date().isoformat()

        if 'param.user' in params:
            if params['param.user'].startswith(self.temp_prefix):
                key = self.temp_usage_key

                rate_limit_key = self.get_rate_limit_key(params)
                if rate_limit_key:
                    pi.incrby(rate_limit_key, length)
                    pi.expire(rate_limit_key, self.rate_limit_ttl)
            else:
                key = self.user_usage_key

            if key:
                pi.hincrby(key, ts, length)

    return cdx_list
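# Both variants above batch their hash updates through redis_pipeline, which is
# not shown in this listing. A minimal sketch, assuming a redis-py client: a
# context manager that queues commands and flushes them in one round trip.

from contextlib import contextmanager


@contextmanager
def redis_pipeline(redis_obj):
    """Sketch: yield a pipeline and execute everything queued on it at exit."""
    pi = redis_obj.pipeline(transaction=False)
    try:
        yield pi
    finally:
        pi.execute()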
def multifile_upload(self, user, files):
    total_size = 0

    for filename in files:
        total_size += os.path.getsize(filename)

    upload_id = self._get_upload_id()

    upload_key = self.upload_key.format(user=user, upid=upload_id)

    with redis_pipeline(self.manager.redis) as pi:
        pi.hset(upload_key, 'size', 0)
        pi.hset(upload_key, 'total_size', total_size * 2)
        pi.hset(upload_key, 'total_files', len(files))
        pi.hset(upload_key, 'files', len(files))
        pi.expire(upload_key, 120)

    gevent.sleep(0)

    for filename in files:
        size = 0
        fh = None
        try:
            size = os.path.getsize(filename)
            fh = open(filename, 'rb')

            self.manager.redis.hset(upload_key, 'filename', filename)

            stream = SizeTrackingReader(fh, size, self.manager.redis, upload_key)

            if filename.endswith('.har'):
                # convert HAR input to a WARC stream before parsing
                stream, expected_size = self.har2warc(filename, stream)
                fh.close()
                fh = stream
                # pass the name as an argument so each temp file is removed,
                # instead of a lambda that would late-bind to the last stream
                atexit.register(os.remove, stream.name)

            infos = self.parse_uploaded(stream, size)

            res = self.handle_upload(fh, upload_id, upload_key, infos, filename,
                                     user, False, size)

            assert('error_message' not in res)
        except Exception as e:
            traceback.print_exc()
            print('ERROR PARSING: ' + filename)
            print(e)

            if fh:
                # count unread bytes toward 'size' so progress can still complete,
                # and drop the failed file from the remaining-files counter
                rem = size - fh.tell()
                if rem > 0:
                    self.manager.redis.hincrby(upload_key, 'size', rem)

                self.manager.redis.hincrby(upload_key, 'files', -1)
                fh.close()
def multifile_upload(self, user, files):
    """Import multiple files.

    :param User user: user
    :param list files: list of filenames
    """
    total_size = 0

    for filename in files:
        total_size += os.path.getsize(filename)

    upload_id, upload_key = self._init_upload_status(user, total_size,
                                                     num_files=len(files),
                                                     expire=self.upload_exp)

    gevent.sleep(0)

    for filename in files:
        size = 0
        fh = None
        try:
            size = os.path.getsize(filename)
            fh = open(filename, 'rb')

            self.redis.hset(upload_key, 'filename', filename)

            stream = SizeTrackingReader(fh, size, self.redis, upload_key)

            if filename.endswith('.har'):
                stream, expected_size = self.har2warc(filename, stream)

                fh.close()
                fh = stream

            infos = self.parse_uploaded(stream, size)

            res = self.handle_upload(fh, upload_id, upload_key, infos,
                                     filename, user, False, size)

            assert('error' not in res)
        except Exception as e:
            traceback.print_exc()
            print('ERROR PARSING: ' + filename)
            print(e)

            if fh:
                rem = size - fh.tell()
                if rem > 0:
                    self.redis.hincrby(upload_key, 'size', rem)

                self.redis.hincrby(upload_key, 'files', -1)
                fh.close()
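# The newer variant above delegates status-hash setup to _init_upload_status,
# which is not shown here. Based on the inline setup in the older variant, a
# plausible sketch follows; the id scheme and the optional 'filename' argument
# are assumptions, and 'total_size' is doubled to match the inline version
# (presumably because each file is read twice: once to parse, once to upload).
# (Assumes 'import base64' and 'import os' at module level.)

def _init_upload_status(self, user, total_size, num_files, expire=None, filename=None):
    """Sketch: create the upload-status hash polled during multifile_upload."""
    upload_id = base64.b32encode(os.urandom(5)).decode('utf-8')  # assumed id scheme
    upload_key = self.upload_key.format(user=user, upid=upload_id)

    with redis_pipeline(self.redis) as pi:
        if filename:
            pi.hset(upload_key, 'filename', filename)

        pi.hset(upload_key, 'size', 0)
        pi.hset(upload_key, 'total_size', total_size * 2)
        pi.hset(upload_key, 'total_files', num_files)
        pi.hset(upload_key, 'files', num_files)

        if expire:
            pi.expire(upload_key, expire)

    return upload_id, upload_key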
def add_urls_to_index(self, stream, params, filename, length):
    upload_key = params.get('param.upid')
    if upload_key:
        stream = SizeTrackingReader(stream, length, self.redis, upload_key)

    params['writer_cls'] = CDXJIndexer

    cdx_list = (super(WebRecRedisIndexer, self).
                add_urls_to_index(stream, params, filename, length))

    # if replay key exists, add to it as well!
    coll_cdxj_key = res_template(self.coll_cdxj_key, params)
    if self.redis.exists(coll_cdxj_key):
        for cdx in cdx_list:
            if cdx:
                self.redis.zadd(coll_cdxj_key, 0, cdx)

    ts = datetime.now().date().isoformat()
    ts_sec = str(int(time.time()))

    with redis_pipeline(self.redis) as pi:
        for key_templ in self.size_keys:
            key = res_template(key_templ, params)
            pi.hincrby(key, 'size', length)

            if key_templ == self.rec_info_key_templ and cdx_list:
                pi.hset(key, 'updated_at', ts_sec)

        # write size to usage hashes
        if 'param.user' in params:
            if params['param.user'].startswith(self.temp_prefix):
                key = self.temp_usage_key

                # rate limiting
                rate_limit_key = self.get_rate_limit_key(params)
                if rate_limit_key:
                    pi.incrby(rate_limit_key, length)
                    pi.expire(rate_limit_key, self.rate_limit_ttl)
            else:
                key = self.user_usage_key

            if key:
                pi.hincrby(key, ts, length)

    return cdx_list
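# The two usage-tracking variants call self.get_rate_limit_key(params) and only
# incrby/expire the result when it is truthy. The method itself is not part of
# this listing; the sketch below is hypothetical and only illustrates the shape
# of a per-user, per-time-window counter key those calls could operate on (the
# 'rate_limit_key' template and 'rate_limit_hours' attribute are assumptions).

def get_rate_limit_key(self, params):
    """Hypothetical sketch: build the key for the current rate-limit window."""
    if not getattr(self, 'rate_limit_key', None) or not getattr(self, 'rate_limit_hours', 0):
        return None

    # bucket the current UTC hour so each window gets its own counter
    window = datetime.utcnow().hour // self.rate_limit_hours
    return self.rate_limit_key.format(user=params.get('param.user', ''), rate=window)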