Example #1
    def add_urls_to_index(self, stream, params, filename, length):
        upload_key = params.get('param.upid')
        if upload_key:
            stream = SizeTrackingReader(stream, length, self.redis, upload_key)

        params['writer_cls'] = CDXJIndexer

        cdx_list = (super(WebRecRedisIndexer,
                          self).add_urls_to_index(stream, params, filename,
                                                  length))

        # if replay key exists, add to it as well!
        coll_cdxj_key = res_template(self.coll_cdxj_key, params)
        if self.redis.exists(coll_cdxj_key):
            for cdx in cdx_list:
                if cdx:
                    # redis-py >= 3.0 takes a mapping of member -> score
                    self.redis.zadd(coll_cdxj_key, {cdx: 0})

        # utcnow() returns a naive datetime, so .timestamp() would interpret
        # it as local time; use an aware UTC datetime (datetime.timezone.utc)
        # for a correct epoch value
        ts_sec = int(datetime.now(timezone.utc).timestamp())

        with redis_pipeline(self.redis) as pi:
            for key_templ in self.info_keys:
                key = res_template(key_templ, params)
                pi.hincrby(key, 'size', length)
                if cdx_list:
                    pi.hset(key, 'updated_at', ts_sec)
                    if key_templ == self.rec_info_key_templ:
                        pi.hset(key, 'recorded_at', ts_sec)

        self.stats.incr_record(params, length, cdx_list)

        return cdx_list
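
This and the following examples funnel their Redis writes through a redis_pipeline context manager that is not part of the listing. As a minimal sketch only, assuming the helper simply batches commands and flushes them when the block exits (transaction=False is an assumption, not confirmed by the source):

    from contextlib import contextmanager

    @contextmanager
    def redis_pipeline(redis):
        # Buffer commands client-side and send them in one round trip.
        pi = redis.pipeline(transaction=False)
        yield pi
        # Only reached if the with-block finished without raising.
        pi.execute()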
Example #2
    def add_urls_to_index(self, stream, params, filename, length):
        upload_key = params.get('param.upid')
        if upload_key:
            stream = SizeTrackingReader(stream, length, self.redis, upload_key)

        cdx_list = (super(WebRecRedisIndexer,
                          self).add_urls_to_index(stream, params, filename,
                                                  length))

        with redis_pipeline(self.redis) as pi:
            for key_templ in self.size_keys:
                key = res_template(key_templ, params)
                pi.hincrby(key, 'size', length)

                if key_templ == self.rec_info_key_templ and cdx_list:
                    pi.hset(key, 'updated_at', str(int(time.time())))

            # write size to usage hashes
            ts = datetime.now().date().isoformat()

            if 'param.user' in params:
                if params['param.user'].startswith(self.temp_prefix):
                    key = self.temp_usage_key
                    rate_limit_key = self.get_rate_limit_key(params)
                    if rate_limit_key:
                        pi.incrby(rate_limit_key, length)
                        pi.expire(rate_limit_key, self.rate_limit_ttl)

                else:
                    key = self.user_usage_key

                if key:
                    pi.hincrby(key, ts, length)

        return cdx_list
Example #3
    def multifile_upload(self, user, files):
        total_size = 0

        for filename in files:
            total_size += os.path.getsize(filename)

        upload_id = self._get_upload_id()

        upload_key = self.upload_key.format(user=user, upid=upload_id)

        with redis_pipeline(self.manager.redis) as pi:
            pi.hset(upload_key, 'size', 0)
            # total_size is doubled: each file's bytes are presumably counted
            # twice, once while parsing and once while uploading (assumption)
            pi.hset(upload_key, 'total_size', total_size * 2)
            pi.hset(upload_key, 'total_files', len(files))
            pi.hset(upload_key, 'files', len(files))
            pi.expire(upload_key, 120)

        gevent.sleep(0)

        for filename in files:
            size = 0
            fh = None
            try:
                size = os.path.getsize(filename)
                fh = open(filename, 'rb')

                self.manager.redis.hset(upload_key, 'filename', filename)

                stream = SizeTrackingReader(fh, size, self.manager.redis,
                                            upload_key)

                if filename.endswith('.har'):
                    stream, expected_size = self.har2warc(filename, stream)
                    fh.close()
                    fh = stream
                    # bind the path now; a lambda would capture the loop
                    # variable and delete only the last file at exit
                    atexit.register(os.remove, stream.name)

                infos = self.parse_uploaded(stream, size)

                res = self.handle_upload(fh, upload_id, upload_key, infos,
                                         filename, user, False, size)

                assert 'error_message' not in res
            except Exception as e:
                traceback.print_exc()
                print('ERROR PARSING: ' + filename)
                print(e)
                if fh:
                    rem = size - fh.tell()
                    if rem > 0:
                        self.manager.redis.hincrby(upload_key, 'size', rem)
                    self.manager.redis.hincrby(upload_key, 'files', -1)
                    fh.close()
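
Both multifile_upload variants wrap the open file in a SizeTrackingReader so the 'size' field of the upload hash advances as bytes are read; the error handlers then top up the remainder with hincrby. The class itself is not shown in the listing; a minimal sketch of the idea, with all internals assumed:

    class SizeTrackingReader(object):
        # Sketch: proxy reads and mirror consumed bytes into the Redis hash.
        def __init__(self, stream, length, redis, upload_key):
            self.stream = stream
            self.length = length
            self.redis = redis
            self.upload_key = upload_key

        def read(self, size=-1):
            buff = self.stream.read(size)
            if buff:
                self.redis.hincrby(self.upload_key, 'size', len(buff))
            return buff

        def __getattr__(self, name):
            # Delegate tell(), close(), etc. to the wrapped stream.
            return getattr(self.stream, name)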
Example #4
    def multifile_upload(self, user, files):
        """Import multiple files.

        :param User user: user
        :param list files: list of filenames
        """
        total_size = 0

        for filename in files:
            total_size += os.path.getsize(filename)

        upload_id, upload_key = self._init_upload_status(user, total_size,
                                                         num_files=len(files),
                                                         expire=self.upload_exp)

        gevent.sleep(0)

        for filename in files:
            size = 0
            fh = None
            try:
                size = os.path.getsize(filename)
                fh = open(filename, 'rb')

                self.redis.hset(upload_key, 'filename', filename)

                stream = SizeTrackingReader(fh, size, self.redis, upload_key)

                if filename.endswith('.har'):
                    stream, expected_size = self.har2warc(filename, stream)
                    fh.close()
                    fh = stream

                infos = self.parse_uploaded(stream, size)

                res = self.handle_upload(fh, upload_id, upload_key, infos, filename,
                                         user, False, size)

                assert 'error' not in res
            except Exception as e:
                traceback.print_exc()
                print('ERROR PARSING: ' + filename)
                print(e)
                if fh:
                    rem = size - fh.tell()
                    if rem > 0:
                        self.redis.hincrby(upload_key, 'size', rem)
                    self.redis.hincrby(upload_key, 'files', -1)
                    fh.close()
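
The indexer examples resolve key templates such as self.coll_cdxj_key with res_template, a pywb helper not reproduced here. As a rough illustration only, assuming '{user}'-style placeholders filled from the 'param.'-prefixed entries of the params dict:

    def res_template(template, params):
        # Illustration: strip the 'param.' prefix and substitute the values
        # into '{user}'/'{coll}'-style placeholders.
        fields = {key.split('.', 1)[1]: value
                  for key, value in params.items()
                  if key.startswith('param.')}
        return template.format(**fields)

    # res_template('c:{user}:{coll}:cdxj',
    #              {'param.user': 'alice', 'param.coll': 'web'})
    # -> 'c:alice:web:cdxj'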
Example #5
    def add_urls_to_index(self, stream, params, filename, length):
        upload_key = params.get('param.upid')
        if upload_key:
            stream = SizeTrackingReader(stream, length, self.redis, upload_key)

        params['writer_cls'] = CDXJIndexer

        cdx_list = (super(WebRecRedisIndexer,
                          self).add_urls_to_index(stream, params, filename,
                                                  length))

        # if replay key exists, add to it as well!
        coll_cdxj_key = res_template(self.coll_cdxj_key, params)
        if self.redis.exists(coll_cdxj_key):
            for cdx in cdx_list:
                if cdx:
                    # redis-py >= 3.0 takes a mapping of member -> score
                    self.redis.zadd(coll_cdxj_key, {cdx: 0})

        ts = datetime.now().date().isoformat()
        ts_sec = str(int(time.time()))

        with redis_pipeline(self.redis) as pi:
            for key_templ in self.size_keys:
                key = res_template(key_templ, params)
                pi.hincrby(key, 'size', length)

                if key_templ == self.rec_info_key_templ and cdx_list:
                    pi.hset(key, 'updated_at', ts_sec)

            # write size to usage hashes
            if 'param.user' in params:
                if params['param.user'].startswith(self.temp_prefix):
                    key = self.temp_usage_key

                    # rate limiting
                    rate_limit_key = self.get_rate_limit_key(params)
                    if rate_limit_key:
                        pi.incrby(rate_limit_key, length)
                        pi.expire(rate_limit_key, self.rate_limit_ttl)

                else:
                    key = self.user_usage_key

                if key:
                    pi.hincrby(key, ts, length)

        return cdx_list
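
Examples #2 and #5 rate-limit temporary users by incrementing a counter key and refreshing its TTL on every write. Isolated from the indexer, the pattern looks like the sketch below; the key layout, limit, and TTL are illustrative values, not taken from the source:

    import redis

    r = redis.Redis()

    def record_and_check(user_id, length, limit=1000000000, ttl=3600):
        key = 'rate_limit:' + user_id      # hypothetical key layout
        with r.pipeline(transaction=False) as pi:
            pi.incrby(key, length)
            pi.expire(key, ttl)            # TTL refreshed on every write
            total, _ = pi.execute()
        return total <= limit

Because the TTL is refreshed on each write, this is a sliding window; a fixed window would set the expiry only when the key is first created.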