Esempio n. 1
0
    def delete_expired_external(self):
        """Delete expired external collections owned by non-anonymous users.

        Scans ``self.data_redis`` for every key matching
        ``Collection.EXTERNAL_KEY`` and, for each one, removes the collection
        from its owner when the collection no longer has a CDXJ index.
        Collections with no owner, or whose owner is anonymous, are skipped.

        Any error raised while handling a single key is printed as a
        traceback and does not stop the scan.
        """
        key_pattern = Collection.EXTERNAL_KEY.format(coll='*')

        for key in self.data_redis.scan_iter(key_pattern):
            try:
                # the collection id is the middle of the three ':' fields
                _prefix, coll_id, _suffix = key.split(':', 2)

                coll_obj = Collection(my_id=coll_id,
                                      redis=self.data_redis,
                                      access=BaseAccess())

                owner = coll_obj.get_owner()

                # only act on owned, non-anonymous collections lacking a cdxj
                if owner and not owner.is_anon() and not coll_obj.has_cdxj():
                    logger.debug(
                        'TempChecker: Delete Expired External Coll: ' +
                        coll_obj.name)
                    owner.remove_collection(coll_obj, delete=True)
            except Exception:
                # best-effort: report the failure and move on to the next key
                import traceback
                traceback.print_exc()
Esempio n. 2
0
    def process_cdxj_key(self, cdxj_key):
        """Commit the recording referenced by ``cdxj_key`` if it is not open.

        ``cdxj_key`` must split into exactly three ':'-separated fields; the
        middle field is used as the recording id.
        """
        _prefix, rec_id, _suffix = cdxj_key.split(':', 2)

        rec_obj = Recording(my_id=rec_id,
                            redis=self.redis,
                            access=BaseAccess())

        # still being written to: leave it alone (and don't extend it)
        if rec_obj.is_open(extend=False):
            return

        rec_obj.commit_to_storage()
Esempio n. 3
0
    def create_write_buffer(self, params, name):
        """Build a ``TempWriteBuffer`` for the recording named in ``params``.

        The recording id is read from ``params['param.recorder.rec']``,
        falling back to ``params['param.rec']`` when the former is missing or
        empty. The created ``Recording`` is also stored back into ``params``
        under the ``'recording'`` key.

        :param params: dict-like request params; must contain ``'url'``
        :param name: passed through to ``TempWriteBuffer``
        :return: a new ``TempWriteBuffer``
        """
        rec_id = params.get('param.recorder.rec')
        if not rec_id:
            rec_id = params.get('param.rec')

        rec_obj = Recording(my_id=rec_id,
                            redis=self.redis,
                            access=BaseAccess())

        # expose the recording object to later consumers of params
        params['recording'] = rec_obj

        target_url = params['url']
        return TempWriteBuffer(rec_obj, name, target_url)
Esempio n. 4
0
    def delete_if_expired(self, temp_user, temp_dir):
        """Clean up the session, user data or directory of a temp user.

        Looks up the session value stored under ``'t:' + temp_user`` in
        ``self.sesh_redis`` and then takes one of three paths:

        * session value is ``'commit-wait'``: remove the session key if
          ``temp_dir`` is already gone or can be removed with ``os.rmdir``;
          any exception is treated as "still waiting" and returns False.
        * the user's info key exists in ``self.data_redis``: return False if
          the user's session is still present, otherwise proceed to delete
          the user.
        * otherwise: remove ``temp_dir`` and its contents.

        NOTE(review): the source of this method is damaged at the line that
        logs 'Deleting expired user' -- the rest of the user-deletion code
        and the start of the final ``try`` block are missing, so the second
        and third paths above are only partially visible here.

        :param temp_user: temp user id, used to build the redis keys
        :param temp_dir: path of the temp user's directory
        :return: False when cleanup must be retried later (or failed),
                 True otherwise
        """
        temp_key = 't:' + temp_user
        sesh = self.sesh_redis.get(temp_key)

        if sesh == 'commit-wait':
            try:
                # dir already gone: only the session marker is left to remove
                if not os.path.isdir(temp_dir):
                    logger.debug(
                        'TempChecker: Remove Session For Already Deleted Dir: '
                        + temp_dir)
                    self.sesh_redis.delete(temp_key)
                    return True

                # os.rmdir only succeeds on an empty directory, so a failure
                # here lands in the except branch below
                logger.debug('TempChecker: Removing if empty: ' + temp_dir)
                os.rmdir(temp_dir)
                #shutil.rmtree(temp_dir)
                logger.debug('TempChecker: Deleted empty dir: ' + temp_dir)

                self.sesh_redis.delete(temp_key)

            # NOTE(review): catches every Exception, not just "dir not empty",
            # and always reports it as waiting for commit
            except Exception as e:
                logger.debug('TempChecker: Waiting for commit')
                return False

        # temp user key exists
        elif self.data_redis.exists(User.INFO_KEY.format(user=temp_user)):
            # if user still active, don't remove
            if self.sesh_redis.get(self.sesh_key_template.format(sesh)):
                #print('Skipping active temp ' + temp)
                return False

            # delete user
            # NOTE(review): the next line is garbled in the source; code is
            # missing between the two string literals
            logger.debug('TempChecker: Deleting expired user: '******'TempChecker: Deleted expired temp dir: ' +
                             temp_dir)
                shutil.rmtree(temp_dir)
            except Exception as e:
                logger.warn(str(e))
                return False

        return True
Esempio n. 5
0
    def process_cdxj_key(self, cdxj_key):
        """Delete or commit the recording referenced by ``cdxj_key``.

        The middle of the three ':'-separated fields of ``cdxj_key`` is the
        recording id. A recording without an owner is deleted; otherwise it
        is committed to storage unless it is still open.
        """
        _prefix, rec_id, _suffix = cdxj_key.split(':', 2)

        rec_obj = Recording(my_id=rec_id,
                            redis=self.redis,
                            access=BaseAccess())

        if not rec_obj.get_owner():
            # orphaned recording: nothing to commit it to, so drop it
            logger.debug('Deleting Invalid Rec: ' + rec_obj.my_id)
            rec_obj.delete_object()
        elif not rec_obj.is_open(extend=False):
            rec_obj.commit_to_storage()
Esempio n. 6
0
    def __init__(self, redis_url=None):
        """Set up the manager with its own redis connection and cork instance.

        :param redis_url: redis URL to connect to; when not given (or empty)
                          the ``REDIS_BASE_URL`` environment variable is used
        :raises KeyError: if no url is given and ``REDIS_BASE_URL`` is unset
        """
        config = load_wr_config()

        self.base_access = BaseAccess()

        # Init Redis (explicit url wins over the environment)
        url = redis_url or os.environ['REDIS_BASE_URL']
        redis_conn = redis.StrictRedis.from_url(url, decode_responses=True)

        # Init Cork
        cork = WebRecCork.create_cork(redis_conn, config)

        super(CLIUserManager, self).__init__(redis=redis_conn,
                                             cork=cork,
                                             config=config)
Esempio n. 7
0
    def test_sync_avoid_double_load(self):
        """Two async index syncs in a row must trigger only one load.

        Starts with neither COLL_CDXJ nor REC_CDXJ present, requests an async
        sync twice (first with ``exists=False``, then ``exists=True``) and
        checks REC_CDXJ_T is present shortly after each request. It then
        waits for REC_CDXJ_T to go away and asserts ``load_counter`` is 1.
        """
        for index_key in (COLL_CDXJ, REC_CDXJ):
            self.assert_exists(index_key, False)()

        anon = User(redis=self.redis,
                    my_id=self.anon_user,
                    access=BaseAccess())
        temp_coll = anon.get_collection_by_name('temp')

        # same request/wait/check sequence for both sync calls
        for already_exists in (False, True):
            temp_coll.sync_coll_index(exists=already_exists, do_async=True)

            time.sleep(0.1)

            self.assert_exists(REC_CDXJ_T, True)()

        gone_check = self.assert_exists(REC_CDXJ_T, False)
        self.sleep_try(0.1, 0.5, gone_check)

        assert load_counter == 1
Esempio n. 8
0
    def delete_if_expired(self, temp_user, temp_dir):
        """Clean up the session, user data or directory of a temp user.

        Looks up the session value stored under ``'t:' + temp_user`` in
        ``self.sesh_redis`` and then takes one of three paths:

        * session value is ``'commit-wait'``: try to remove ``temp_dir`` with
          ``os.rmdir`` (if it still exists), then delete the session key.
        * the user's info key exists in ``self.data_redis``: if the session
          is gone, close any open recordings (returning False so a later
          pass can finish), otherwise delete the user and the session key.
        * otherwise: remove ``temp_dir`` and everything in it.

        NOTE(review): the source of this method is damaged at the line that
        logs 'Deleting expired user' -- whatever code followed it (including
        the definition of ``user``) is missing from this view.

        :param temp_user: temp user id, used to build the redis keys
        :param temp_dir: path of the temp user's directory
        :return: False when cleanup must be retried later (or failed),
                 True otherwise
        """
        temp_key = 't:' + temp_user
        sesh = self.sesh_redis.get(temp_key)

        if sesh == 'commit-wait':
            # This temporary user has signed up for a permanent account and
            # their collections will be migrated to storage.
            # Clean up if that migration is complete (i.e. the dir is empty).
            # Otherwise, wait.
            if os.path.isdir(temp_dir):
                try:
                    logger.debug('TempChecker: Removing if empty: ' + temp_dir)
                    os.rmdir(temp_dir)
                    logger.debug('TempChecker: Deleted empty dir: ' + temp_dir)
                except OSError as e:
                    # ENOTEMPTY: still waiting; ENOENT: ignored silently;
                    # anything else is logged as an error
                    if e.errno == errno.ENOTEMPTY:
                        logger.debug('TempChecker: Waiting for commit')
                    elif e.errno != errno.ENOENT:
                        logger.error(str(e))
                    # NOTE(review): returns False for every OSError, ENOENT
                    # included, so the session key is kept for another pass
                    return False
            else:
                logger.debug(
                    'TempChecker: Removing Session For Already Deleted Dir: ' +
                    temp_dir)

            self.sesh_redis.delete(temp_key)
            return True

        # temp user key exists
        elif self.data_redis.exists(User.INFO_KEY.format(user=temp_user)):

            # if user still active, don't remove
            if self.sesh_redis.get(self.sesh_key_template.format(sesh)):
                return False

            # NOTE(review): the next line is garbled in the source; the code
            # that creates `user` is missing
            logger.debug('TempChecker: Deleting expired user: '******'s open recordings "closed";
            # return (if necessary) to give time for closing logic to complete
            wait_to_delete = False
            for collection in user.get_collections(load=False):
                for recording in collection.get_recordings(load=False):
                    if recording.is_open(extend=False):
                        recording.set_closed()
                        logger.debug('TempChecker: Closing temp recording: ' +
                                     recording.my_id)
                        wait_to_delete = True
            if wait_to_delete:
                return False

            # delete the user; signal that the user's collections should be deleted.
            # the temp dir containing those collections will be deleted on next pass.
            user.delete_me()

            # delete the session
            self.sesh_redis.delete(temp_key)

            return True

        # no user session, remove temp dir and everything in it
        else:
            try:
                # NOTE(review): this "Deleted" message is logged before rmtree
                # runs, so it appears even when the removal then fails
                logger.debug('TempChecker: Deleted expired temp dir: ' +
                             temp_dir)
                shutil.rmtree(temp_dir)
            except OSError as e:
                # a dir that is already gone (ENOENT) is not logged as an error
                if e.errno != errno.ENOENT:
                    logger.error(str(e))
                return False

        return True
Esempio n. 9
0
 def __init__(self, redis):
     """Bind the redis-backed user, role and pending-registration tables.

     :param redis: redis connection shared by all three tables
     """
     self.redis = redis
     self.access = BaseAccess()

     # users are resolved through the access getter; the other two tables
     # are plain redis tables keyed by 'h:roles' and 'h:register'
     self.users = UserTable(redis, self.get_access)
     self.roles = RedisTable(redis, 'h:roles')
     self.pending_registrations = RedisTable(redis, 'h:register')
Esempio n. 10
0
    def process_new_pages(self):
        """Drain the new-pages queue and submit one crawl per recording.

        Pops JSON entries from ``Collection.NEW_PAGES_Q`` until the queue is
        empty, groups the pages by their ``'rec'`` value, and for each group
        builds a crawl definition from ``SEARCH_CRAWL_DEF`` and POSTs it as
        JSON to ``self.browsertrix_url``.

        NOTE(review): the source of this method is damaged at the
        'Invalid User' print -- the code between that message and the
        collection check (including where ``collection`` is assigned) is
        missing from this view.
        """
        # rec id -> {'user', 'coll', 'coll_name', 'pages', optional 'derivs_rec'}
        crawl_groups = {}

        while True:
            data = self.redis.rpop(Collection.NEW_PAGES_Q)
            # queue is empty
            if not data:
                break

            page_data = json.loads(data)

            rec = page_data['rec']

            # first page seen for this recording: start a new group
            if rec not in crawl_groups:
                crawl_groups[rec] = {
                    'user': page_data['user'],
                    'coll': page_data['coll'],
                    'coll_name': page_data['coll_name'],
                    'pages': []
                }

            crawl_groups[rec]['pages'].append({
                'pid':
                page_data['pid'],
                'url':
                page_data['url'],
                'timestamp':
                page_data['timestamp'],
                'title':
                page_data.get('title'),
            })

            # the last page that carries a derivs_rec value wins for the group
            if page_data.get('derivs_rec'):
                crawl_groups[rec]['derivs_rec'] = page_data.get('derivs_rec')

        for rec, data in crawl_groups.items():
            user = User(my_id=data['user'],
                        redis=self.redis,
                        access=BaseAccess())

            if not user:
                # NOTE(review): the next line is garbled in the source; code
                # is missing after the 'Invalid User: ' literal
                print('Invalid User: '******'coll_name'])

            if not collection:
                print('Invalid Collection: ' + data['coll_name'])
                continue

            recording = collection.get_recording(rec)

            # if a specific derivatives recording is provided, use that
            derivs_rec = data.get('derivs_rec')

            # otherwise create a derivatives recording if none exists
            if not derivs_rec:
                derivs_recording = recording.get_derivs_recording()
                if not derivs_recording:
                    title = 'Derivatives for: Session from ' + recording.to_iso_date(
                        recording['created_at'], no_T=True)
                    derivs_recording = collection.create_recording(
                        title=title, rec_type='derivs')

                    recording.set_derivs_recording(derivs_recording)

                derivs_rec = derivs_recording.my_id

            # work on a copy so the keys set below are not written into the
            # SEARCH_CRAWL_DEF template itself
            # NOTE(review): if .copy() is shallow, nested values are still
            # shared with the template -- confirm
            crawl_def = SEARCH_CRAWL_DEF.copy()
            crawl_def['coll'] = crawl_def['screenshot_coll'] = crawl_def[
                'text_coll'] = data['coll']
            crawl_def['user_params'] = {
                'user': data['user'],
                'coll': data['coll'],
                'coll_name': data['coll_name'],
                'rec': derivs_rec,
                'type': 'replay-coll',
                # updated later
                'request_ts': '',
                'browser': BROWSER
            }
            crawl_def['name'] = 'text-' + data['user'] + '-' + data['coll']
            crawl_def['seed_urls'] = data['pages']

            print(crawl_def)

            # submit the crawl; the response body is only printed, not checked
            r = requests.post(self.browsertrix_url, json=crawl_def)
            print(r.text)
Esempio n. 11
0
 def _get_access(self):
     """Return the access object to use for the current call.

     Normally this is ``request['webrec.access']``. When
     ``self.admin_override`` is set, the flag is cleared and a fresh
     ``BaseAccess`` is returned instead, so the override applies once.
     """
     if not self.admin_override:
         return request['webrec.access']

     # consume the one-shot override before handing out the base access
     self.admin_override = False
     return BaseAccess()