Example #1
    def _init_upload_status(self, user, total_size, num_files, filename=None, expire=None):
        """Initialize upload status.

        :param User user: user
        :param int total_size: size of WARC archive
        :param int num_files: number of files
        :param filename: WARC archive filename
        :type filename: str or None
        :param expire: upload TTL
        :type expire: int or None

        :returns: upload ID and upload Redis key
        :rtype: str and str
        """
        upload_id = self._get_upload_id()

        upload_key = self.UPLOAD_KEY.format(user=user.name, upid=upload_id)

        with redis_pipeline(self.redis) as pi:
            pi.hset(upload_key, 'size', 0)
            pi.hset(upload_key, 'total_size', total_size * 2)
            pi.hset(upload_key, 'total_files', num_files)
            pi.hset(upload_key, 'files', num_files)

            if filename:
                pi.hset(upload_key, 'filename', filename)

            if expire:
                pi.expire(upload_key, expire)

        return upload_id, upload_key
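Every example in this listing funnels its Redis writes through the same redis_pipeline context manager from the webrecorder codebase. A minimal sketch of what such a helper presumably looks like, assuming redis-py (the real implementation may differ in details such as error handling):

    from contextlib import contextmanager

    @contextmanager
    def redis_pipeline(redis_obj):
        # open a transactional pipeline, let the caller queue commands on it,
        # then flush everything to Redis in one round trip when the block exits
        pi = redis_obj.pipeline(transaction=True)
        yield pi
        pi.execute()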
Example #2
    def update_dyn_stats(self, url, params, referrer, source, ra_recording):
        if referrer.endswith('.css'):
            css_res = self._res_url_templ(self.dyn_ref_templ, params, referrer)
            orig_referrer = self.redis.get(css_res)
            if orig_referrer:
                referrer = orig_referrer

        dyn_stats_key = self._res_url_templ(self.dyn_stats_key_templ,
                                             params, referrer)

        curr_url_key = self._res_url_templ(self.dyn_stats_key_templ,
                                           params, url)

        with redis_pipeline(self.redis) as pi:
            pi.delete(curr_url_key)

            pi.hincrby(dyn_stats_key, source, 1)
            pi.expire(dyn_stats_key, self.dyn_stats_secs)

            if url.endswith('.css'):
                css_res = self._res_url_templ(self.dyn_ref_templ, params, url)
                pi.setex(css_res, self.dyn_stats_secs, referrer)

            if ra_recording:
                ra_recording.track_remote_archive(pi, source)
Example #3
    def incr_record(self, params, size, cdx_list):
        username = params.get('param.user')
        if not username:
            return

        today = today_str()

        with redis_pipeline(self.redis) as pi:
            # rate limiting
            rate_limit_key = self.get_rate_limit_key(params)
            if rate_limit_key:
                pi.incrby(rate_limit_key, size)
                pi.expire(rate_limit_key, self.RATE_LIMIT_TTL)

            # write size to usage hashes
            if username.startswith(self.TEMP_PREFIX):
                key = self.ALL_CAPTURE_TEMP_KEY
            else:
                key = self.ALL_CAPTURE_USER_KEY

            if key:
                pi.hincrby(key, today, size)

        is_extract = params.get('sources') is not None
        is_patch = params.get('param.recorder.rec') is not None

        if is_extract or is_patch:
            with redis_pipeline(self.redis) as pi:
                for cdx in cdx_list:
                    try:
                        cdx = CDXObject(cdx)
                        source_id = cdx['orig_source_id']
                        size = int(cdx['length'])
                        if source_id and size:
                            pi.hincrby(self.SOURCES_KEY.format(source_id), today, size)
                    except Exception:
                        # skip malformed CDX lines
                        pass

                if is_patch:
                    if username.startswith(self.TEMP_PREFIX):
                        key = self.PATCH_TEMP_KEY
                    else:
                        key = self.PATCH_USER_KEY

                    pi.hincrby(key, today, size)
Example #4
    def move_temp_to_user_usage(self, collection):
        today = today_str()
        date_str = collection.get_created_iso_date()
        size = collection.size
        with redis_pipeline(self.redis) as pi:
            pi.hincrby(self.TEMP_MOVE_COUNT_KEY, today, 1)
            pi.hincrby(self.TEMP_MOVE_SIZE_KEY, today, size)
            pi.hincrby(self.ALL_CAPTURE_USER_KEY, date_str, size)
            pi.hincrby(self.ALL_CAPTURE_TEMP_KEY, date_str, -size)
Example #5
    def inc_pending_count(self):
        """Increase outstanding CDX index lines."""
        if not self.is_open(extend=False):
            return

        pending_count = self.PENDING_COUNT_KEY.format(rec=self.my_id)

        with redis_pipeline(self.redis) as pi:
            pi.incrby(pending_count, 1)
            pi.expire(pending_count, self.PENDING_TTL)
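Grouping the INCRBY and EXPIRE calls in one pipeline lets the counter update and its TTL refresh reach Redis together in a single round trip instead of two. A usage sketch against a local Redis, reusing the redis_pipeline helper sketched after Example #1 (the key name and TTL here are illustrative, not webrecorder's actual values):

    from redis import StrictRedis

    redis = StrictRedis(host='localhost', port=6379)

    pending_count = 'r:rec123:_pc'    # hypothetical pending-count key

    with redis_pipeline(redis) as pi:
        pi.incrby(pending_count, 1)   # queue the increment
        pi.expire(pending_count, 90)  # queue the TTL refresh
    # both commands are sent together when the block exits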
Example #6
    def __setitem__(self, name, obj):
        if not isinstance(obj, dict):
            raise Exception('Must assign a dict')

        user = self.make_user(name)
        user.access.assert_is_curr_user(user)
        user.data.update(obj)

        with redis_pipeline(self.redis) as pi:
            user.commit(pi)
            pi.sadd(self.users_key, name)
Example #7
    def multifile_upload(self, user, files):
        total_size = 0

        for filename in files:
            total_size += os.path.getsize(filename)

        upload_id = self._get_upload_id()

        upload_key = self.upload_key.format(user=user, upid=upload_id)

        with redis_pipeline(self.manager.redis) as pi:
            pi.hset(upload_key, 'size', 0)
            pi.hset(upload_key, 'total_size', total_size * 2)
            pi.hset(upload_key, 'total_files', len(files))
            pi.hset(upload_key, 'files', len(files))
            pi.expire(upload_key, 120)

        gevent.sleep(0)

        for filename in files:
            size = 0
            fh = None
            try:
                size = os.path.getsize(filename)
                fh = open(filename, 'rb')

                self.manager.redis.hset(upload_key, 'filename', filename)

                stream = SizeTrackingReader(fh, size, self.manager.redis,
                                            upload_key)

                if filename.endswith('.har'):
                    stream, expected_size = self.har2warc(filename, stream)
                    fh.close()
                    fh = stream
                    atexit.register(lambda: os.remove(stream.name))

                infos = self.parse_uploaded(stream, size)

                res = self.handle_upload(fh, upload_id, upload_key, infos,
                                         filename, user, False, size)

                assert ('error_message' not in res)
            except Exception as e:
                traceback.print_exc()
                print('ERROR PARSING: ' + filename)
                print(e)
                if fh:
                    rem = size - fh.tell()
                    if rem > 0:
                        self.manager.redis.hincrby(upload_key, 'size', rem)
                    self.manager.redis.hincrby(upload_key, 'files', -1)
                    fh.close()
Example #8
    def inc_pending_size(self, size):
        """Increase outstanding size.

        :param int size: size
        """
        if not self.is_open(extend=False):
            return

        pending_size = self.PENDING_SIZE_KEY.format(rec=self.my_id)
        with redis_pipeline(self.redis) as pi:
            pi.incrby(pending_size, size)
            pi.expire(pending_size, self.PENDING_TTL)
Example #9
    def inc_pending_size(self, size):
        """Increase outstanding size.

        :param int size: size
        """
        if not self.is_open(extend=False):
            return

        pending_size = self.PENDING_SIZE_KEY.format(rec=self.my_id)
        with redis_pipeline(self.redis) as pi:
            pi.incrby(pending_size, size)
            pi.expire(pending_size, self.PENDING_TTL)
Example #10
    def handle_upload(self, stream, upload_id, upload_key, infos, filename,
                      user, force_coll_name, total_size):
        """Operate WARC archive upload.

        :param stream: file object
        :param str upload_id: upload ID
        :param str upload_key: upload Redis key
        :param list infos: list of recordings
        :param str filename: WARC archive filename
        :param User user: user
        :param str force_coll_name: name of collection to upload into
        :param int total_size: size of WARC archive

        :returns: upload information
        :rtype: dict
        """

        logger.debug('Begin handle_upload() from: ' + filename + ' force_coll_name: ' + str(force_coll_name))

        num_recs = 0
        num_recs = len(infos)
        # first info is for collection, not recording
        if num_recs >= 2:
            num_recs -= 1

        logger.debug('Parsed {0} recordings, Buffer Size {1}'.format(num_recs, total_size))

        first_coll, rec_infos = self.process_upload(user, force_coll_name, infos, stream,
                                                    filename, total_size, num_recs)

        if not rec_infos:
            print('NO ARCHIVES!')
            #stream.close()
            return {'error': 'no_archive_data'}

        with redis_pipeline(self.redis) as pi:
            pi.hset(upload_key, 'coll', first_coll.name)
            pi.hset(upload_key, 'coll_title', first_coll.get_prop('title'))
            pi.hset(upload_key, 'filename', filename)
            pi.expire(upload_key, self.upload_exp)

        self.launch_upload(self.run_upload,
                           upload_key,
                           filename,
                           stream,
                           user,
                           rec_infos,
                           total_size,
                           first_coll)

        return {'upload_id': upload_id,
                'user': user.name
               }
Example #11
    def _create_anon_user(self, user):
        max_size = self.redis.hget('h:defaults', 'max_anon_size')
        if not max_size:
            max_size = self.default_max_anon_size

        key = self.user_key.format(user=user)
        now = int(time.time())

        with redis_pipeline(self.redis) as pi:
            pi.hset(key, 'max_size', max_size)
            pi.hset(key, 'max_coll', 1)
            pi.hset(key, 'created_at', now)
            pi.hsetnx(key, 'size', '0')
Example #12
    def _update_redis_and_cookie(self, set_cookie, session, headers):
        duration = self.durations[session.dura_type]['total']

        if session.should_save:
            with redis_pipeline(self.redis) as pi:
                data = base64.b64encode(pickle.dumps(session._sesh))

                ttl = session.ttl
                # PERMA CUSTOMIZATION: changed from < to <=
                # https://github.com/webrecorder/webrecorder/pull/721
                if ttl <= 0:
                    ttl = duration

                pi.setex(session.key, ttl, data)

                if set_cookie:
                    self.track_long_term(session, pi)

                # set redis duration
                if session.curr_role != 'anon':
                    pi.expire(session.key, duration)

        elif set_cookie and session.curr_role != 'anon':
            # extend redis duration if extending cookie!
            self.redis.expire(session.key, duration)

        if not set_cookie:
            return

        expires = datetime.utcnow() + timedelta(seconds=duration)

        # set cookie
        sesh_cookie = session.get_cookie()

        value = '{0}={1}; Path=/; HttpOnly'

        # add max-age only if:
        # - long duration session
        # - anonymous session (not restricted)
        # don't set for restricted session, as cookie only valid as long as top session exists
        if session.dura_type == 'long' or session.curr_role == 'anon':
            value += ';  max-age={3}'

        value = value.format(self.sesh_key, sesh_cookie,
                             datetime_to_http_date(expires), duration)

        scheme = session.environ.get('wsgi.url_scheme', '')
        if scheme.lower() == 'https':
            value += '; Secure'

        headers.append(('Set-Cookie', value))
Example #13
    def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size):
        try:
            count = 0
            num_recs = len(rec_infos)
            last_end = 0

            for info in rec_infos:
                count += 1
                logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(upload_key, count, num_recs))

                if info['length'] > 0:
                    self.do_upload(upload_key,
                                   filename,
                                   stream,
                                   user,
                                   info['coll'],
                                   info['rec'],
                                   info['offset'],
                                   info['length'])
                else:
                    logger.debug('SKIP upload for zero-length recording')


                pages = info.get('pages')
                if pages is None:
                    pages = self.detect_pages(user, info['coll'], info['rec'])

                if pages:
                    self.manager.import_pages(user, info['coll'], info['rec'], pages)

                diff = info['offset'] - last_end
                last_end = info['offset'] + info['length']
                if diff > 0:
                    self._add_split_padding(diff, upload_key)

        except:
            import traceback
            traceback.print_exc()

        finally:
            # add remainder of file, assumed consumed/skipped, if any
            last_end = stream.tell()
            stream.close()

            if last_end < total_size:
                diff = total_size - last_end
                self._add_split_padding(diff, upload_key)

            with redis_pipeline(self.manager.redis) as pi:
                pi.hincrby(upload_key, 'files', -1)
                pi.hset(upload_key, 'done', 1)
Example #14
    def handle_upload(self, stream, upload_id, upload_key, infos, filename,
                      user, force_coll_name, total_size):
        """Operate WARC archive upload.

        :param stream: file object
        :param str upload_id: upload ID
        :param str upload_key: upload Redis key
        :param list infos: list of recordings
        :param str filename: WARC archive filename
        :param User user: user
        :param str force_coll_name: name of collection to upload into
        :param int total_size: size of WARC archive

        :returns: upload information
        :rtype: dict
        """

        logger.debug('Begin handle_upload() from: ' + filename +
                     ' force_coll_name: ' + str(force_coll_name))

        num_recs = 0
        num_recs = len(infos)
        # first info is for collection, not recording
        if num_recs >= 2:
            num_recs -= 1

        logger.debug('Parsed {0} recordings, Buffer Size {1}'.format(
            num_recs, total_size))

        first_coll, rec_infos = self.process_upload(user, force_coll_name,
                                                    infos, stream, filename,
                                                    total_size, num_recs)

        if not rec_infos:
            print('NO ARCHIVES!')
            #stream.close()
            return {'error': 'no_archive_data'}

        with redis_pipeline(self.redis) as pi:
            pi.hset(upload_key, 'coll', first_coll.name)
            pi.hset(upload_key, 'coll_title', first_coll.get_prop('title'))
            pi.hset(upload_key, 'filename', filename)
            pi.expire(upload_key, self.upload_exp)

        self.launch_upload(self.run_upload, upload_key, filename, stream, user,
                           rec_infos, total_size, first_coll)

        return {'upload_id': upload_id, 'user': user.name}
Example #15
    def create_recording(self, user, coll, rec, rec_title, coll_title='',
                         no_dupe=False, rec_type=None, ra_list=None):

        self.assert_can_write(user, coll)

        orig_rec = rec
        orig_rec_title = rec_title
        count = 1

        rec_list_key = self.rec_list_key.format(user=user, coll=coll)

        while True:
            key = self.rec_info_key.format(user=user, coll=coll, rec=rec)

            if self.redis.hsetnx(key, 'id', rec) == 1:
                break

            # don't create a duplicate, just use the specified recording
            if no_dupe:
                return self.get_recording(user, coll, rec)

            count += 1
            rec_title = orig_rec_title + ' ' + str(count)
            rec = orig_rec + '-' + str(count)

        now = int(time.time())

        if ra_list:
            ra_key = self.ra_key.format(user=user,
                                        coll=coll,
                                        rec=rec)

        with redis_pipeline(self.redis) as pi:
            pi.hset(key, 'title', rec_title)
            pi.hset(key, 'created_at', now)
            pi.hset(key, 'updated_at', now)
            pi.hsetnx(key, 'size', '0')
            if rec_type:
                pi.hset(key, 'rec_type', rec_type)
            pi.sadd(rec_list_key, rec)
            if ra_list:
                pi.sadd(ra_key, *ra_list)

        if not self._has_collection_no_access_check(user, coll):
            coll_title = coll_title or coll
            self.create_collection(user, coll, coll_title)

        return self.get_recording(user, coll, rec)
Example #16
    def add_urls_to_index(self, stream, params, filename, length):
        upload_key = params.get('param.upid')
        if upload_key:
            stream = SizeTrackingReader(stream, length, self.redis, upload_key)

        params['writer_cls'] = CDXJIndexer

        cdx_list = (super(WebRecRedisIndexer, self).
                      add_urls_to_index(stream, params, filename, length))


        # if replay key exists, add to it as well!
        coll_cdxj_key = res_template(self.coll_cdxj_key, params)
        if self.redis.exists(coll_cdxj_key):
            for cdx in cdx_list:
                if cdx:
                    self.redis.zadd(coll_cdxj_key, 0, cdx)

        ts = datetime.now().date().isoformat()
        ts_sec = str(int(time.time()))

        with redis_pipeline(self.redis) as pi:
            for key_templ in self.size_keys:
                key = res_template(key_templ, params)
                pi.hincrby(key, 'size', length)

                if key_templ == self.rec_info_key_templ and cdx_list:
                    pi.hset(key, 'updated_at', ts_sec)

            # write size to usage hashes
            if 'param.user' in params:
                if params['param.user'].startswith(self.temp_prefix):
                    key = self.temp_usage_key

                    # rate limiting
                    rate_limit_key = self.get_rate_limit_key(params)
                    if rate_limit_key:
                        pi.incrby(rate_limit_key, length)
                        pi.expire(rate_limit_key, self.rate_limit_ttl)

                else:
                    key = self.user_usage_key

                if key:
                    pi.hincrby(key, ts, length)

        return cdx_list
Example #17
    def add_mount(self, user, coll, rec, rec_title,
                  mount_type, mount_desc, mount_config):
        rec_info = self.create_recording(user, coll, rec, rec_title)
        rec = rec_info['id']

        mount_key = self.mount_key.format(user=user, coll=coll, rec=rec)

        rec_key = self.rec_info_key.format(user=user, coll=coll, rec=rec)

        with redis_pipeline(self.redis) as pi:
            pi.set(mount_key, mount_config)

            pi.hset(rec_key, 'mount_type', mount_type)
            if mount_desc:
                pi.hset(rec_key, 'mount_desc', mount_desc)

        return rec_info
Example #18
    def set_recording_timestamps(self, user, coll, rec, created_at,
                                 updated_at):

        self.assert_can_write(user, coll)

        key = self.rec_info_key.format(user=user, coll=coll, rec=rec)

        # check existence with a direct call first: commands issued on a
        # pipeline are only queued and return the pipeline object itself,
        # so testing pi.exists(key) here would never be falsy
        if not self.redis.exists(key):
            return False

        with redis_pipeline(self.redis) as pi:
            if created_at:
                pi.hset(key, 'created_at', created_at)

            if updated_at:
                pi.hset(key, 'updated_at', updated_at)

        return True
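The existence check in the example above has to run on the client before the pipeline is opened: redis-py only queues commands called on a pipeline and hands back the pipeline object itself, so the real reply does not exist until execute() runs. A small self-contained sketch of that behaviour (assuming redis-py and a local Redis; the key name is illustrative):

    from redis import StrictRedis

    redis = StrictRedis(host='localhost', port=6379)

    pi = redis.pipeline(transaction=True)
    queued = pi.exists('some:key')

    print(queued is pi)      # True: the command was queued, nothing was sent yet
    # `if not queued` would therefore never trigger, whatever the key holds

    replies = pi.execute()   # e.g. [0] or [1]; the real answer only arrives here
    print(replies)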
Example #19
    def dec_pending_count_and_size(self, size):
        """Decrease outstanding CDX index lines and size.

        :param int size: size
        """
        # return if rec no longer exists (deleted while transfer is pending)
        if not self.redis.exists(self.info_key):
            return

        pending_count = self.PENDING_COUNT_KEY.format(rec=self.my_id)

        pending_size = self.PENDING_SIZE_KEY.format(rec=self.my_id)

        with redis_pipeline(self.redis) as pi:
            pi.incrby(pending_count, -1)
            pi.incrby(pending_size, -size)
            pi.expire(pending_count, self.PENDING_TTL)
            pi.expire(pending_size, self.PENDING_TTL)
Example #20
    def dec_pending_count_and_size(self, size):
        """Decrease outstanding CDX index lines and size.

        :param int size: size
        """
        # return if rec no longer exists (deleted while transfer is pending)
        if not self.redis.exists(self.info_key):
            return

        pending_count = self.PENDING_COUNT_KEY.format(rec=self.my_id)

        pending_size = self.PENDING_SIZE_KEY.format(rec=self.my_id)

        with redis_pipeline(self.redis) as pi:
            pi.incrby(pending_count, -1)
            pi.incrby(pending_size, -size)
            pi.expire(pending_count, self.PENDING_TTL)
            pi.expire(pending_size, self.PENDING_TTL)
Example #21
    def _delete_redis_keys(self, type, user, coll, rec):
        key_templ = self.del_templ.get(type)
        if not key_templ:
            print('Unknown delete type ' + str(type))
            return

        key_pattern = key_templ.format(user=user, coll=coll, rec=rec)
        keys_to_del = list(self.redis.scan_iter(match=key_pattern))

        if type != 'user':
            del_info = self.info_keys[type].format(user=user,
                                                   coll=coll,
                                                   rec=rec)

            try:
                length = int(self.redis.hget(del_info, 'size'))
            except:
                print('Error decreasing size')
                return
        else:
            length = 0

        with redis_pipeline(self.redis) as pi:
            if type == 'coll':
                coll_list_key = self.coll_list_key_templ.format(user=user)
                pi.srem(coll_list_key, coll)

            elif type == 'rec':
                rec_list_key = self.rec_list_key_templ.format(user=user,
                                                              coll=coll)
                pi.srem(rec_list_key, rec)

            if length > 0:
                user_key = self.info_keys['user'].format(user=user)
                pi.hincrby(user_key, 'size', -length)

                if type == 'rec':
                    coll_key = self.info_keys['coll'].format(user=user,
                                                             coll=coll)
                    pi.hincrby(coll_key, 'size', -length)

            for key in keys_to_del:
                pi.delete(key)
Example #22
    def prepare_response(self, environ, headers):
        super(RedisSessionMiddleware, self).prepare_response(environ, headers)

        session = environ['webrec.session']

        if session.should_delete:
            self._delete_cookie(headers, self.sesh_key)
            self.redis.delete(session.key)
        else:
            if session.should_renew:
                self.redis.delete(session.key)
                sesh_id, session.key = self.make_id()
                session['id'] = sesh_id

            set_cookie = self.should_set_cookie(session)

            if set_cookie or session.should_save:
                with redis_pipeline(self.redis) as pi:
                    self._update_redis_and_cookie(pi, set_cookie, session,
                                                  headers)
Example #23
    def handle_upload(self, stream, upload_id, upload_key, infos, filename,
                      user, force_coll, total_size):

        logger.debug('Begin handle_upload() from: ' + filename + ' force_coll: ' + str(force_coll))

        num_recs = 0
        num_recs = len(infos)
        # first info is for collection, not recording
        if num_recs >= 2:
            num_recs -= 1

        logger.debug('Parsed {0} recordings, Buffer Size {1}'.format(num_recs, total_size))

        first_coll, rec_infos = self.process_upload(user, force_coll, infos, stream,
                                                    filename, total_size, num_recs)

        if not rec_infos:
            print('NO ARCHIVES!')
            #stream.close()
            return {'error_message': 'No Archive Data Found'}

        with redis_pipeline(self.manager.redis) as pi:
            pi.hset(upload_key, 'coll', first_coll['id'])
            pi.hset(upload_key, 'coll_title', first_coll['title'])
            pi.hset(upload_key, 'filename', filename)
            pi.expire(upload_key, self.upload_exp)

        self.launch_upload(self.run_upload,
                           upload_key,
                           filename,
                           stream,
                           user,
                           rec_infos,
                           total_size)

        return {'upload_id': upload_id,
                'user': user
               }
Example #24
    def _init_upload_status(self,
                            user,
                            total_size,
                            num_files,
                            filename=None,
                            expire=None):
        upload_id = self._get_upload_id()

        upload_key = self.UPLOAD_KEY.format(user=user.name, upid=upload_id)

        with redis_pipeline(self.redis) as pi:
            pi.hset(upload_key, 'size', 0)
            pi.hset(upload_key, 'total_size', total_size * 2)
            pi.hset(upload_key, 'total_files', num_files)
            pi.hset(upload_key, 'files', num_files)

            if filename:
                pi.hset(upload_key, 'filename', filename)

            if expire:
                pi.expire(upload_key, expire)

        return upload_id, upload_key
Example #25
    def _update_redis_and_cookie(self, set_cookie, session, headers):
        duration = self.durations[session.dura_type]['total']

        if session.should_save:
            with redis_pipeline(self.redis) as pi:
                data = base64.b64encode(pickle.dumps(session._sesh))

                ttl = session.ttl
                if ttl < 0:
                    ttl = duration

                pi.setex(session.key, ttl, data)

                if set_cookie:
                    self.track_long_term(session, pi)

                # set redis duration
                pi.expire(session.key, duration)

        if not set_cookie:
            return

        expires = datetime.utcnow() + timedelta(seconds=duration)

        # set cookie
        sesh_cookie = self.id_to_signed_cookie(session['id'],
                                               session.is_restricted)

        value = '{0}={1}; Path=/; HttpOnly; max-age={3}'
        value = value.format(self.sesh_key, sesh_cookie,
                             datetime_to_http_date(expires), duration)

        scheme = session.environ.get('wsgi.url_scheme', '')
        if scheme.lower() == 'https':
            value += '; Secure'

        headers.append(('Set-Cookie', value))
Example #26
    def _init_upload_status(self,
                            user,
                            total_size,
                            num_files,
                            filename=None,
                            expire=None):
        """Initialize upload status.

        :param User user: user
        :param int total_size: size of WARC archive
        :param int num_files: number of files
        :param filename: WARC archive filename
        :type filename: str or None
        :param expire: upload TTL
        :type expire: int or None

        :returns: upload ID and upload Redis key
        :rtype: str and str
        """
        upload_id = self._get_upload_id()

        upload_key = self.UPLOAD_KEY.format(user=user.name, upid=upload_id)

        with redis_pipeline(self.redis) as pi:
            pi.hset(upload_key, 'size', 0)
            pi.hset(upload_key, 'total_size', total_size * 2)
            pi.hset(upload_key, 'total_files', num_files)
            pi.hset(upload_key, 'files', num_files)

            if filename:
                pi.hset(upload_key, 'filename', filename)

            if expire:
                pi.expire(upload_key, expire)

        return upload_id, upload_key
Example #27
    def init_new(self, title='', desc='', rec_type=None, ra_list=None):
        rec = self._create_new_id()

        open_rec_key = self.OPEN_REC_KEY.format(rec=rec)

        self.data = {
            'title': title,
            'desc': desc,
            'size': 0,
        }

        if rec_type:
            self.data['rec_type'] = rec_type

        with redis_pipeline(self.redis) as pi:
            self._init_new(pi)

            if ra_list:
                ra_key = self.RA_KEY.format(rec=self.my_id)
                pi.sadd(ra_key, *ra_list)

            pi.setex(open_rec_key, self.OPEN_REC_TTL, 1)

        return rec
Example #28
    def init_new(self, title='', desc='', rec_type=None, ra_list=None):
        """Initialize new recording Redis building block.

        :param str title: title
        :param str desc: description
        :param rec_type: type of recording
        :type: str or None
        :param ra_list: remote archives
        :type: list or None

        :returns: component ID
        :rtype: str
        """
        rec = self._create_new_id()

        open_rec_key = self.OPEN_REC_KEY.format(rec=rec)

        self.data = {
                     'title': title,
                     'desc': desc,
                     'size': 0,
                    }

        if rec_type:
            self.data['rec_type'] = rec_type

        with redis_pipeline(self.redis) as pi:
            self._init_new(pi)

            if ra_list:
                ra_key = self.RA_KEY.format(rec=self.my_id)
                pi.sadd(ra_key, *ra_list)

            pi.setex(open_rec_key, self.OPEN_REC_TTL, 1)

        return rec
Example #29
    def init_new(self, title='', desc='', rec_type=None, ra_list=None):
        """Initialize new recording Redis building block.

        :param str title: title
        :param str desc: description
        :param rec_type: type of recording
        :type: str or None
        :param ra_list: remote archives
        :type: list or None

        :returns: component ID
        :rtype: str
        """
        rec = self._create_new_id()

        open_rec_key = self.OPEN_REC_KEY.format(rec=rec)

        self.data = {
            'title': title,
            'desc': desc,
            'size': 0,
        }

        if rec_type:
            self.data['rec_type'] = rec_type

        with redis_pipeline(self.redis) as pi:
            self._init_new(pi)

            if ra_list:
                ra_key = self.RA_KEY.format(rec=self.my_id)
                pi.sadd(ra_key, *ra_list)

            pi.setex(open_rec_key, self.OPEN_REC_TTL, 1)

        return rec
Example #30
    def run_upload(self, upload_key, filename, stream, user, rec_infos,
                   total_size, first_coll):
        """Upload WARC archive.

        :param str upload_key: upload Redis key
        :param str filename: WARC archive filename
        :param stream: file object
        :param User user: user
        :param list rec_infos: list of recordings
        :param int total_size: size of WARC archive
        :param Collection first_coll: collection
        """
        try:
            count = 0
            num_recs = len(rec_infos)
            last_end = 0
            page_id_map = {}

            for info in rec_infos:
                count += 1
                logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(
                    upload_key, count, num_recs))

                if info['length'] > 0:
                    self.do_upload(upload_key, filename, stream, user.name,
                                   info['coll'], info['rec'], info['offset'],
                                   info['length'])
                else:
                    logger.debug('SKIP upload for zero-length recording')

                # BEGIN PERMA CUSTOMIZATION
                # investigating https://github.com/harvard-lil/perma/issues/2602
                try:
                    self.process_pages(info, page_id_map)
                except Exception as e:
                    raise Exception("Exception processing pages for {}".format(
                        first_coll.name)) from e
                # END PERMA CUSTOMIZATION

                diff = info['offset'] - last_end
                last_end = info['offset'] + info['length']
                if diff > 0:
                    self._add_split_padding(diff, upload_key)

                recording = info['recording']
                recording.set_date_prop('created_at', info)
                recording.set_date_prop('recorded_at', info)
                recording.set_date_prop('updated_at', info)

            self.import_lists(first_coll, page_id_map)

            self.postprocess_coll(first_coll)

            first_coll.set_date_prop('created_at', first_coll.data,
                                     '_created_at')
            first_coll.set_date_prop('updated_at', first_coll.data,
                                     '_updated_at')

        except:
            traceback.print_exc()

        finally:
            # add remainder of file, assumed consumed/skipped, if any
            last_end = stream.tell()
            stream.close()

            if last_end < total_size:
                diff = total_size - last_end
                self._add_split_padding(diff, upload_key)

            with redis_pipeline(self.redis) as pi:
                pi.hincrby(upload_key, 'files', -1)
                pi.hset(upload_key, 'done', 1)

            if first_coll.is_external():
                first_coll.sync_coll_index(exists=False, do_async=False)
                first_coll.set_external_remove_on_expire()
Example #31
    def run_upload(self, upload_key, filename, stream, user, rec_infos, total_size, first_coll):
        """Upload WARC archive.

        :param str upload_key: upload Redis key
        :param str filename: WARC archive filename
        :param stream: file object
        :param User user: user
        :param list rec_infos: list of recordings
        :param int total_size: size of WARC archive
        :param Collection first_coll: collection
        """
        try:
            count = 0
            num_recs = len(rec_infos)
            last_end = 0
            page_id_map = {}

            for info in rec_infos:
                count += 1
                logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(upload_key, count, num_recs))

                if info['length'] > 0:
                    self.do_upload(upload_key,
                                   filename,
                                   stream,
                                   user.name,
                                   info['coll'],
                                   info['rec'],
                                   info['offset'],
                                   info['length'])
                else:
                    logger.debug('SKIP upload for zero-length recording')


                self.process_pages(info, page_id_map)

                diff = info['offset'] - last_end
                last_end = info['offset'] + info['length']
                if diff > 0:
                    self._add_split_padding(diff, upload_key)

                recording = info['recording']
                recording.set_date_prop('created_at', info)
                recording.set_date_prop('recorded_at', info)
                recording.set_date_prop('updated_at', info)

            self.import_lists(first_coll, page_id_map)

            self.postprocess_coll(first_coll)

            first_coll.set_date_prop('created_at', first_coll.data, '_created_at')
            first_coll.set_date_prop('updated_at', first_coll.data, '_updated_at')

        except:
            traceback.print_exc()

        finally:
            # add remainder of file, assumed consumed/skipped, if any
            last_end = stream.tell()
            stream.close()

            if last_end < total_size:
                diff = total_size - last_end
                self._add_split_padding(diff, upload_key)

            with redis_pipeline(self.redis) as pi:
                pi.hincrby(upload_key, 'files', -1)
                pi.hset(upload_key, 'done', 1)

            if first_coll.is_external():
                first_coll.sync_coll_index(exists=False, do_async=False)
                first_coll.set_external_remove_on_expire()
Example #32
    def run_upload(self, upload_key, filename, stream, user, rec_infos,
                   total_size, first_coll):
        """Upload WARC archive.

        :param str upload_key: upload Redis key
        :param str filename: WARC archive filename
        :param stream: file object
        :param User user: user
        :param list rec_infos: list of recordings
        :param int total_size: size of WARC archive
        :param Collection first_coll: collection
        """
        try:
            count = 0
            num_recs = len(rec_infos)
            last_end = 0
            page_id_map = {}

            for info in rec_infos:
                count += 1
                logger.debug('Id: {0}, Uploading Rec {1} of {2}'.format(
                    upload_key, count, num_recs))

                if info['length'] > 0:
                    self.do_upload(upload_key, filename, stream, user.name,
                                   info['coll'], info['rec'], info['offset'],
                                   info['length'])
                else:
                    logger.debug('SKIP upload for zero-length recording')

                self.process_pages(info, page_id_map)

                diff = info['offset'] - last_end
                last_end = info['offset'] + info['length']
                if diff > 0:
                    self._add_split_padding(diff, upload_key)

                recording = info['recording']
                recording.set_date_prop('created_at', info)
                recording.set_date_prop('recorded_at', info)
                recording.set_date_prop('updated_at', info)

            self.import_lists(first_coll, page_id_map)

            self.postprocess_coll(first_coll)

            first_coll.set_date_prop('created_at', first_coll.data,
                                     '_created_at')
            first_coll.set_date_prop('updated_at', first_coll.data,
                                     '_updated_at')

        except:
            traceback.print_exc()

        finally:
            # add remainder of file, assumed consumed/skipped, if any
            last_end = stream.tell()
            stream.close()

            if last_end < total_size:
                diff = total_size - last_end
                self._add_split_padding(diff, upload_key)

            with redis_pipeline(self.redis) as pi:
                pi.hincrby(upload_key, 'files', -1)
                pi.hset(upload_key, 'done', 1)
Example #33
def create_user(m,
                email=None,
                username=None,
                passwd=None,
                role=None,
                name=None):
    """Create a new user with command line arguments or series of prompts,
       preforming basic validation
    """
    users = m.get_users()

    if not email:
        print('let\'s create a new user..')
        email = input('email: ').strip()

    # validate email
    if not re.match(r'[\w.-/+]+@[\w.-]+.\w+', email):
        print('valid email required!')
        return

    if email in [data['email_addr'] for u, data in users.items()]:
        print('A user already exists with {0} email!'.format(email))
        return

    username = username or input('username: ').strip()

    if not username:
        print('please enter a username!')
        return

    if not m.USER_RX.match(username) or username in m.RESTRICTED_NAMES:
        print('Invalid username..')
        return

    if username in users:
        print('Username already exists..')
        return

    name = name if name is not None else input('name (optional): ').strip()

    role = role if role in [r[0]
                            for r in m.cork.list_roles()] else choose_role(m)

    if passwd is not None:
        passwd2 = passwd
    else:
        passwd = getpass('password: ')
        passwd2 = getpass('repeat password: ')

    # reconstructed check; PASS_RX assumed analogous to USER_RX used above
    if passwd != passwd2 or not m.PASS_RX.match(passwd):
        print('Passwords must match and be at least 8 characters long '
              'with lowercase, uppercase, and either digits or symbols.')
        return

    print('Creating user {username} with the email {email} and the role: '
          '\'{role}\''.format(username=username, email=email, role=role))

    # add user to cork
    m.cork._store.users[username] = {
        'role': role,
        'hash': m.cork._hash(username, passwd).decode('ascii'),
        'email_addr': email,
        'desc': '{{"name":"{name}"}}'.format(name=name),
        'creation_date': str(datetime.utcnow()),
        'last_login': str(datetime.utcnow()),
    }
    m.cork._store.save_users()

    # add user account defaults
    key = m.user_key.format(user=username)
    now = int(time.time())

    max_size, max_coll = m.redis.hmget('h:defaults', ['max_size', 'max_coll'])
    if not max_size:
        max_size = m.default_max_size

    if not max_coll:
        max_coll = m.default_max_coll

    with redis_pipeline(m.redis) as pi:
        pi.hset(key, 'max_size', max_size)
        pi.hset(key, 'max_coll', max_coll)
        pi.hset(key, 'created_at', now)
        pi.hset(key, 'name', name)
        pi.hsetnx(key, 'size', '0')

    if m.default_coll:
        # create initial collection
        m.create_collection(username,
                            coll=m.default_coll['id'],
                            coll_title=m.default_coll['title'],
                            desc=m.default_coll['desc'].format(username),
                            public=False)

    # email subscription set up?
    if m.mailing_list:
        m.add_to_mailing_list(username, email, name)

    print('All done!')
Example #34
    def upload_file(self):
        stream = None
        temp_file = None
        logger.debug('Upload Begin')

        expected_size = int(request.headers['Content-Length'])

        logger.debug('Expected Size: ' + str(expected_size))

        if not expected_size:
            return {'error_message': 'No File Specified'}

        curr_user = self.manager.get_curr_user()

        if not curr_user:
            #user = self.manager.get_anon_user()
            #force_coll = 'temp'
            #is_anon = True

            return {
                'error_message':
                'Sorry, uploads only available for logged-in users'
            }

        user = curr_user
        force_coll = request.query.getunicode('force-coll', '')
        is_anon = False

        size_rem = self.manager.get_size_remaining(user)

        logger.debug('User Size Rem: ' + str(size_rem))

        if size_rem < expected_size:
            return {
                'error_message': 'Sorry, not enough space to upload this file'
            }

        if force_coll and not self.manager.has_collection(user, force_coll):
            if is_anon:
                self.manager.create_collection(user, force_coll,
                                               'Temporary Collection')

            else:
                status = 'Collection {0} not found'.format(force_coll)
                return {'error_message': status}

        temp_file = SpooledTemporaryFile(max_size=BLOCK_SIZE)

        filename = request.query.getunicode('filename')

        stream = request.environ['wsgi.input']
        stream = CacheingLimitReader(stream, expected_size, temp_file)

        if filename.endswith('.har'):
            stream, expected_size = self.har2warc(filename, stream)
            temp_file.close()
            temp_file = stream

        infos = self.parse_uploaded(stream, expected_size)

        total_size = temp_file.tell()
        if total_size != expected_size:
            return {
                'error_message':
                'size mismatch: expected {0}, got {1}'.format(
                    expected_size, total_size)
            }

        upload_id = self._get_upload_id()

        upload_key = self.upload_key.format(user=user, upid=upload_id)

        with redis_pipeline(self.manager.redis) as pi:
            pi.hset(upload_key, 'size', 0)
            pi.hset(upload_key, 'total_size', total_size * 2)
            pi.hset(upload_key, 'filename', filename)
            pi.hset(upload_key, 'total_files', 1)
            pi.hset(upload_key, 'files', 1)

        return self.handle_upload(temp_file, upload_id, upload_key, infos,
                                  filename, user, force_coll, total_size)
Example #35
    def rename(self):
        from_user = request.query.getunicode('from_user', '')
        from_coll = request.query.getunicode('from_coll', '')
        from_rec = request.query.getunicode('from_rec', '*')

        to_user = request.query.getunicode('to_user', '')
        to_coll = request.query.getunicode('to_coll', '')
        to_rec = request.query.getunicode('to_rec', '*')

        to_title = request.query.getunicode('to_title', '')

        if not from_user or not from_coll or not to_user or not to_coll:
            return {'error_message': 'user or coll params missing'}

        if (from_rec == '*' or to_rec == '*') and (from_rec != to_rec):
            return {
                'error_message':
                'must specify rec name or "*" if moving entire coll'
            }

        # Move the redis keys, this performs the move as far as user is concerned
        match_pattern = ':' + from_user + ':' + from_coll + ':'
        replace_pattern = ':' + to_user + ':' + to_coll + ':'

        if to_rec != '*':
            match_pattern += from_rec + ':'
            replace_pattern += to_rec + ':'

        moves = {}

        for key in self.redis.scan_iter(match='*' + match_pattern + '*'):
            key = key.decode('utf-8')
            moves[key] = key.replace(match_pattern, replace_pattern)

        # Get Info Keys
        to_user_key = self.info_keys['user'].format(user=to_user)
        from_user_key = self.info_keys['user'].format(user=from_user)

        if to_rec != '*':
            to_coll_key = self.info_keys['coll'].format(user=to_user,
                                                        coll=to_coll)
            from_coll_key = self.info_keys['coll'].format(user=from_user,
                                                          coll=from_coll)

            to_coll_list_key = self.rec_list_key_templ.format(user=to_user,
                                                              coll=to_coll)
            from_coll_list_key = self.rec_list_key_templ.format(user=from_user,
                                                                coll=from_coll)

            info_key = self.info_keys['rec'].format(user=from_user,
                                                    coll=from_coll,
                                                    rec=from_rec)

            to_id = to_rec
        else:
            info_key = self.info_keys['coll'].format(user=from_user,
                                                     coll=from_coll)

            to_id = to_coll

        the_size = int(self.redis.hget(info_key, 'size'))

        with redis_pipeline(self.redis) as pi:
            # Fix Id
            pi.hset(info_key, 'id', to_id)

            # Change title, if provided
            if to_title:
                pi.hset(info_key, 'title', to_title)

            # actual rename
            for from_key, to_key in iteritems(moves):
                pi.rename(from_key, to_key)

        with redis_pipeline(self.redis) as pi:
            # change user size, if different users
            if to_user_key != from_user_key:
                pi.hincrby(from_user_key, 'size', -the_size)
                pi.hincrby(to_user_key, 'size', the_size)

            # change coll size if moving rec and different colls
            if to_rec != '*' and to_coll_key != from_coll_key:
                pi.hincrby(from_coll_key, 'size', -the_size)
                pi.hincrby(to_coll_key, 'size', the_size)

            if to_rec != '*':
                pi.srem(from_coll_list_key, from_rec)
                pi.sadd(to_coll_list_key, to_rec)

        # rename WARCs (only if switching users)
        replace_list = []

        for key, name, url in self._iter_all_warcs(to_user, to_coll, to_rec):
            if not url.startswith(self.full_warc_prefix):
                continue

            filename = url[len(self.full_warc_prefix):]

            new_filename = filename.replace(from_user + '/', to_user + '/')

            repl = dict(key=key, name=name, old_v=filename, new_v=new_filename)

            replace_list.append(repl)

        if replace_list:
            if not self.queue_message('rename',
                                      {'replace_list': replace_list}):
                return {'error_message': 'no local clients'}

        #if self.storage_committer:
        #    storage = self.storage_committer.get_storage(to_user, to_coll, to_rec)
        #    if storage and not storage.rename(from_user, from_coll, from_rec,
        #                                      to_user, to_coll, to_rec):
        #        return {'error_message': 'remote rename failed'}

        return {'success': to_user + ':' + to_coll + ':' + to_rec}
Example #36
    def process_upload(self, user, force_coll_name, infos, stream, filename, total_size, num_recs, upload_key):
        """Process WARC archive.

        :param User user: user
        :param str force_coll_name: name of collection to upload into
        :param list infos: list of recordings (indices)
        :param stream: file object
        :param str filename: WARC archive filename
        :param int total_size: WARC archive size
        :param int num_recs: number of recordings
        :param str upload_key: upload Redis key

        :returns: collection and recordings
        :rtype: Collection and list
        """
        stream.seek(0)

        count = 0

        first_coll = None

        collection = None
        recording = None

        if force_coll_name:
            collection = user.get_collection_by_name(force_coll_name)

        rec_infos = []

        lists = None
        i = 0  # set the collection URL from the first recording only

        for info in infos:
            type = info.get('type')

            if type == 'collection':
                if not collection:
                    collection = self.make_collection(user, filename, info)
                lists = info.get('lists')

            if type == 'recording':
                if not collection:
                    collection = self.make_collection(user, filename, self.upload_coll_info, info)

                desc = info.get('desc', '')

                # guard against recordings that carry no page list
                pages = info.get('pages') or []
                url = pages[0].get('url') if pages else None
                if url and i == 0:
                    i += 1
                    collection = user.get_collection_by_name(force_coll_name)
                    collection['url'] = url
                    with redis_pipeline(self.redis) as pi:
                        pi.hset(upload_key, 'url', url)
                    print(collection['url'])
                    print(collection['title'])
                    collection.mark_updated()
                # if title was auto-generated for compatibility on export,
                # set title to blank
                if info.get('auto_title'):
                    title = ''
                else:
                    title = info.get('title', '')

                recording = collection.create_recording(title=title,
                                                        desc=desc,
                                                        rec_type=info.get('rec_type'),
                                                        ra_list=info.get('ra'))

                info['id'] = recording.my_id

                count += 1
                #yield collection, recording

                logger.debug('Processing Upload Rec {0} of {1}'.format(count, num_recs))

                rec_infos.append({'coll': collection.my_id,
                                  'rec': recording.my_id,
                                  'offset': info['offset'],
                                  'length': info['length'],
                                  'pages': info.get('pages', None),
                                  'collection': collection,
                                  'recording': recording,
                                  'created_at': info.get('created_at'),
                                  'updated_at': info.get('updated_at'),
                                  'recorded_at': info.get('recorded_at', info.get('updated_at')),
                                 })

            if not first_coll:
                first_coll = collection


        if lists:
            collection.data['_lists'] = lists

        return first_coll, rec_infos