Beispiel #1
0
    def create_bookmark(self, props, incr_stats=True):
        """Create a bookmark in this list from ``props`` and persist it.

        ``props`` is modified in place (any ``rec`` entry is dropped and the
        newly allocated ``id`` is set) and is the very object returned.

        :param dict props: bookmark properties
        :param bool incr_stats: whether to count the addition in ``Stats``

        :returns: the stored bookmark, or None when ``props`` names a
            ``page_id`` for which the owner's ``page_exists`` is false
        :rtype: dict|None
        """
        owner = self.get_owner()
        self.access.assert_can_write_coll(owner)

        # a recording id is never persisted with the bookmark
        props.pop('rec', '')

        # a bookmark may only point at a page the owner actually has
        linked_page = props.get('page_id')
        if linked_page and not owner.page_exists(linked_page):
            return None

        new_id = self.get_new_bookmark_id()
        props['id'] = new_id

        self.bookmark_order.insert_ordered_id(new_id, props.get('before_id'))

        content_key = self.BOOK_CONTENT_KEY.format(blist=self.my_id)
        self.redis.hset(content_key, new_id, json.dumps(props))

        if incr_stats:
            Stats(self.redis).incr_bookmark_add()

        if linked_page:
            owner.clear_page_bookmark_cache()
            self.load_pages([props])

        return props
Beispiel #2
0
    def add_bookmarks(self, bookmarks):
        """Add several bookmarks to this list in one batch.

        Every entry of ``bookmarks`` is modified in place: a ``rec`` key is
        dropped and a newly allocated ``id`` is set. All entries are appended
        to the ordering and stored with a single ``hmset``.

        :param list bookmarks: bookmark property dicts; an empty (or None)
            batch is a no-op
        """
        collection = self.get_owner()
        self.access.assert_can_write_coll(collection)

        # an empty batch must not reach redis: hmset() rejects an empty
        # mapping, and there is nothing to record or mark as updated
        if not bookmarks:
            return

        all_bookmarks = {}
        clear_cache = False

        for bookmark_data in bookmarks:
            # don't store rec id, if provided
            bookmark_data.pop('rec', '')

            # any bookmark linked to a page invalidates the page-bookmark cache
            if bookmark_data.get('page_id'):
                clear_cache = True

            bid = self.get_new_bookmark_id()
            bookmark_data['id'] = bid

            all_bookmarks[bid] = json.dumps(bookmark_data)

        self.bookmark_order.insert_ordered_ids(all_bookmarks.keys())

        self.redis.hmset(self.BOOK_CONTENT_KEY.format(blist=self.my_id),
                         all_bookmarks)

        if clear_cache:
            collection.clear_page_bookmark_cache()

        Stats(self.redis).incr_bookmark_add(len(bookmarks))

        self.mark_updated()
Beispiel #3
0
        def upload_file():
            """Accept an uploaded file from the request body.

            Query parameters read: ``force-coll`` (optional target collection
            name) and ``filename``. The body is taken from ``wsgi.input`` and
            handed to ``self.uploader.upload_file``.

            Anonymous users are rejected with ``not_logged_in`` unless
            ``force-coll`` names a collection whose ``is_external()`` is true.
            A missing, malformed or non-positive ``Content-Length`` yields
            ``no_file_specified``.

            :returns: the uploader's result, or the result of
                ``_raise_error(400, ...)`` on failure
            """
            user = self.access.session_user
            force_coll_name = request.query.getunicode('force-coll', '')

            if force_coll_name:
                collection = user.get_collection_by_name(force_coll_name)
            else:
                collection = None

            # allow uploading to external collections
            if not collection or not collection.is_external():
                if user.is_anon():
                    return self._raise_error(400, 'not_logged_in')

            # a missing or malformed Content-Length is a client error, not a
            # server fault: treat it like an empty upload
            try:
                expected_size = int(request.headers.get('Content-Length') or 0)
            except ValueError:
                expected_size = 0

            if expected_size <= 0:
                return self._raise_error(400, 'no_file_specified')

            filename = request.query.getunicode('filename')
            stream = request.environ['wsgi.input']

            res = self.uploader.upload_file(user, stream, expected_size,
                                            filename, force_coll_name)

            if 'error' in res:
                return self._raise_error(400, res['error'])

            Stats(self.redis).incr_upload(user, expected_size)
            return res
    def __init__(self, *args, **kwargs):
        """Initialise the controller from its keyword arguments.

        Besides whatever the parent constructor consumes, ``kwargs`` must
        provide ``config`` (with a ``status_update_secs`` entry),
        ``browser_mgr`` and ``content_app``.
        """
        super(WebsockController, self).__init__(*args, **kwargs)

        settings = kwargs['config']

        # interval is stored as a float regardless of how the config spells it
        self.status_update_secs = float(settings['status_update_secs'])

        self.browser_mgr = kwargs['browser_mgr']
        self.content_app = kwargs['content_app']

        # stats helpers built on self.redis
        self.dyn_stats = DynStats(self.redis, settings)
        self.stats = Stats(self.redis)
    def _add_stats(self, cdx, resp_headers, kwargs, record):
        """Update replay and dynamic per-source stats for one served capture.

        :param cdx: index entry of the capture; ``source``,
            ``orig_source_id`` and ``url`` are read via ``get()``
        :param resp_headers: response headers; ``Recorder-Skip`` and
            ``Recorder-Rec`` are consulted in the modify modes
        :param dict kwargs: request kwargs; ``type`` is required, ``user``,
            ``rec``, ``recording`` and ``patch_recording`` are read as needed
        :param record: loaded record whose ``rec_headers`` supply the
            ``Content-Length`` counted for ``replay-coll`` requests
        """
        type_ = kwargs['type']

        if type_ == 'replay-coll':
            content_len = record.rec_headers.get_header('Content-Length')
            if content_len is not None:
                Stats(self.redis).incr_replay(int(content_len), kwargs['user'])

        if type_ in ('record', 'live'):
            return

        source = cdx.get('source')
        if not source:
            return

        if source == 'local':
            source = 'replay'

        if source == 'replay' and type_ == 'patch':
            return

        orig_source = cdx.get('orig_source_id')
        if orig_source:
            source = orig_source

        ra_recording = None

        # set source in recording-key
        if type_ in self.MODIFY_MODES:
            skip = resp_headers.get('Recorder-Skip')

            if not skip and source not in ('live', 'replay'):
                ra_rec = unquote(resp_headers.get('Recorder-Rec', ''))
                ra_rec = ra_rec or kwargs['rec']

                recording = kwargs.get('recording')
                patch_recording = kwargs.get('patch_recording')

                if recording and ra_rec == recording.my_id:
                    ra_recording = recording
                elif patch_recording and ra_rec == patch_recording.my_id:
                    ra_recording = patch_recording

        url = cdx.get('url')
        referrer = request.environ.get('HTTP_REFERER')
        host = request.environ.get('HTTP_HOST')

        if not referrer:
            referrer = url
        elif ('wsgiprox.proxy_host' not in request.environ
              and host is not None and host in referrer):
            # outside proxy mode a referrer containing our own host is
            # replaced by the capture url. The Host header may be absent,
            # and ``None in referrer`` would raise TypeError.
            referrer = url

        self.dyn_stats.update_dyn_stats(url, kwargs, referrer, source,
                                        ra_recording)
Beispiel #6
0
    def delete_me(self, storage, pages=True):
        """Delete this object's files and then the object itself.

        :param storage: storage passed through to ``delete_files``
        :param bool pages: when true, also ask the owner to delete the pages
            of this object; callers deleting the whole owner pass False
            because the pages go away with it

        :returns: the ``delete_files`` result, with ``error`` set to
            ``not_found`` when ``delete_object`` fails
        """
        result = self.delete_files(storage)

        Stats(self.redis).incr_delete(self)

        if pages:
            owner = self.get_owner()
            owner.delete_rec_pages(self)

        deleted = self.delete_object()
        if not deleted:
            result['error'] = 'not_found'

        return result
    def remove_bookmark(self, bid):
        """Remove the bookmark ``bid`` from this list.

        :param bid: id of the bookmark to remove

        :returns: True when the stored bookmark content was deleted, False
            when ``bid`` was not in the ordering or no content was deleted
        :rtype: bool
        """
        owner = self.get_owner()
        self.access.assert_can_write_coll(owner)

        if not self.bookmark_order.remove_ordered_id(bid):
            return False

        # get_bookmark() may return a falsy value (update_bookmark guards for
        # that as well), so only look up page_id when there is a bookmark
        bookmark = self.get_bookmark(bid)
        page_id = bookmark.get('page_id') if bookmark else None
        if page_id:
            owner.remove_page_bookmark(page_id, bid)

        content_key = self.BOOK_CONTENT_KEY.format(blist=self.my_id)
        if self.redis.hdel(content_key, bid) == 1:
            Stats(self.redis).incr_bookmark_del()
            return True

        return False
    def update_bookmark(self, bid, props):
        """Apply the editable entries of ``props`` to the bookmark ``bid``.

        Only ``title``, ``url``, ``timestamp``, ``browser`` and ``desc`` are
        copied over; any other key in ``props`` is ignored.

        :param bid: id of the bookmark to change
        :param dict props: new property values

        :returns: the updated bookmark, or False when ``get_bookmark`` finds
            nothing for ``bid``
        """
        self.access.assert_can_write_coll(self.get_owner())

        existing = self.get_bookmark(bid)
        if not existing:
            return False

        editable = ('title', 'url', 'timestamp', 'browser', 'desc')
        existing.update((key, props[key]) for key in props if key in editable)

        content_key = self.BOOK_CONTENT_KEY.format(blist=self.my_id)
        self.redis.hset(content_key, bid, json.dumps(existing))

        Stats(self.redis).incr_bookmark_mod()

        return existing
Beispiel #9
0
    def move(self, collection, new_name, new_user):
        """Transfer ``collection`` from this user to ``new_user``.

        :param collection: collection to move
        :param str new_name: requested name under the new owner; the name
            actually used is whatever ``reserve_obj_name`` returns
        :param new_user: receiving user

        :returns: False when ``new_user`` equals this user or the collection
            could not be removed from this user, True otherwise
        :rtype: bool
        """
        if self == new_user:
            return False

        reserved_name = new_user.colls.reserve_obj_name(new_name,
                                                        allow_dupe=False)

        removed = self.colls.remove_object(collection)
        if not removed:
            return False

        new_user.colls.add_object(reserved_name, collection, owner=True)

        # shift the collection's size from the old owner to the new one
        self.incr_size(-collection.size)
        new_user.incr_size(collection.size)

        Stats(self.redis).move_temp_to_user_usage(collection)

        # closing a recording marks it for commit
        for rec in collection.get_recordings():
            rec.set_closed()

        return True
Beispiel #10
0
    def delete_me(self, storage, pages=True):
        """Remove this recording: its files, optionally its pages, and itself.

        :param BaseStorage storage: Webrecorder storage
        :param bool pages: whether the owner should also delete this
            recording's pages (skipped when the whole collection is being
            deleted, since the pages are removed with it)

        :returns: outcome of the file deletion; ``error`` is set to
            ``not_found`` when the object itself could not be deleted
        :rtype: dict
        """
        outcome = self.delete_files(storage)

        Stats(self.redis).incr_delete(self)

        if pages:
            self.get_owner().delete_rec_pages(self)

        object_deleted = self.delete_object()
        if not object_deleted:
            outcome['error'] = 'not_found'

        return outcome
Beispiel #11
0
        def upload_file():
            """Accept an uploaded file from a logged-in user.

            Query parameters read: ``force-coll`` and ``filename``. The body
            is taken from ``wsgi.input`` and handed to
            ``self.uploader.upload_file``.

            Anonymous users are rejected with ``not_logged_in``; a missing,
            malformed or non-positive ``Content-Length`` yields
            ``no_file_specified``.

            :returns: the uploader's result, or the result of
                ``_raise_error(400, ...)`` on failure
            """
            user = self.access.session_user

            if user.is_anon():
                return self._raise_error(400, 'not_logged_in')

            # a missing or malformed Content-Length is a client error, not a
            # server fault: treat it like an empty upload
            try:
                expected_size = int(request.headers.get('Content-Length') or 0)
            except ValueError:
                expected_size = 0

            if expected_size <= 0:
                return self._raise_error(400, 'no_file_specified')

            force_coll_name = request.query.getunicode('force-coll', '')
            filename = request.query.getunicode('filename')
            stream = request.environ['wsgi.input']

            res = self.uploader.upload_file(user, stream, expected_size,
                                            filename, force_coll_name)

            if 'error' in res:
                return self._raise_error(400, res['error'])

            Stats(self.redis).incr_upload(user, expected_size)
            return res
Beispiel #12
0
    def handle_download(self, user, coll_name, recs):
        """Stream a collection, or selected recordings of it, as a download.

        :param user: user identifier passed to ``user_manager.get_user_coll``
        :param str coll_name: name of the collection
        :param str recs: ``'*'`` for every recording, otherwise a
            comma-separated list of recording names

        :returns: generator yielding the collection warcinfo, then for each
            selected recording its warcinfo followed by the chunks of each of
            its files
        """
        user, collection = self.user_manager.get_user_coll(user, coll_name)

        if not collection:
            self._raise_error(404, 'no_such_collection')

        # superusers may download without write access
        if not self.access.is_superuser():
            self.access.assert_can_write_coll(collection)

        collection.load()

        Stats(self.redis).incr_download(collection)

        now = timestamp_now()

        # download name: the collection, a single recording, or
        # "<collection>-<rec list>"
        if recs == '*':
            wanted = None
            name = coll_name
        else:
            wanted = recs.split(',')
            name = recs if len(wanted) == 1 else coll_name + '-' + recs

        filename = self.download_filename.format(title=quote(name),
                                                 timestamp=now)
        loader = BlockLoader()

        coll_info = self.create_coll_warcinfo(user, collection, filename)

        def iter_infos():
            # one (recording, warcinfo, byte size) triple per selected recording
            for recording in collection.get_recordings(load=True):
                if wanted and recording.name not in wanted:
                    continue

                warcinfo = self.create_rec_warcinfo(user, collection,
                                                    recording, filename)

                yield recording, warcinfo, len(warcinfo) + recording.size

        def read_all(infos):
            yield coll_info

            for recording, warcinfo, _ in infos:
                yield warcinfo

                for _name, warc_path in recording.iter_all_files():
                    try:
                        fh = loader.load(warc_path)
                    except Exception:
                        # a file that cannot be loaded is skipped
                        print('Skipping invalid ' + warc_path)
                        continue

                    for chunk in StreamIter(fh):
                        yield chunk

        response.headers['Content-Type'] = 'application/octet-stream'
        response.headers[
            'Content-Disposition'] = "attachment; filename*=UTF-8''" + filename

        if self.download_chunk_encoded:
            # stream everything
            response.headers['Transfer-Encoding'] = 'chunked'
            return read_all(iter_infos())

        # no transfer-encoding: gather the infos up front so the total size
        # can be sent as Content-Length
        infos = list(iter_infos())
        total = len(coll_info) + sum(entry_size for _, _, entry_size in infos)

        response.headers['Content-Length'] = total
        return read_all(infos)
        def create_browser():
            """Api to launch remote browser instances.

            Query parameters read: ``browser`` (required), ``coll``, ``rec``,
            ``mode``, ``url`` and ``timestamp``.

            ``mode`` may be one of the modify modes, ``replay`` /
            ``replay-coll``, or an extract form (``extract``,
            ``extract:<source>``, ``extract_only:<source>``), which is
            normalised to ``extract`` with matching source filters.

            :returns: the response of ``browser_mgr.request_new_browser``
            """
            sesh = self.get_session()

            if sesh.is_new() and self.is_content_request():
                self._raise_error(403, 'invalid_browser_request')

            browser_id = request.query['browser']

            Stats(self.redis).incr_browser(browser_id)

            user = self.get_user(redir_check=False)

            data = request.query

            coll_name = data.getunicode('coll', '')
            rec = data.get('rec', '')

            mode = data.get('mode', '')

            url = data.getunicode('url', '')
            timestamp = data.get('timestamp', '')

            sources = ''
            inv_sources = ''
            patch_rec = ''

            collection = user.get_collection_by_name(coll_name)

            # the collection must be validated before it is used to look up
            # the recording, otherwise a missing collection surfaces as an
            # AttributeError instead of a 404
            if not collection:
                return self._raise_error(404, 'no_such_collection')

            recording = collection.get_recording(rec)

            if mode == 'extract':
                # Extract from All, Patch from None
                sources = '*'
                inv_sources = '*'
            elif mode.startswith('extract:'):
                # Extract from One, Patch from all but one
                sources = mode.split(':', 1)[1]
                inv_sources = sources
                # load patch recording also
                if recording:
                    patch_rec = recording.get_prop('patch_rec')

                mode = 'extract'
            elif mode.startswith('extract_only:'):
                # Extract from one only, no patching
                sources = mode.split(':', 1)[1]
                inv_sources = '*'
                mode = 'extract'

            if mode in self.MODIFY_MODES:
                if not recording:
                    return self._raise_error(404, 'no_such_recording')
            elif mode in ('replay', 'replay-coll'):
                rec = '*'
            else:
                return self._raise_error(400, 'invalid_mode')

            browser_can_write = '1' if self.access.can_write_coll(collection) else '0'

            remote_ip = self._get_remote_ip()

            # build kwargs
            kwargs = dict(user=user.name,
                          id=sesh.get_id(),
                          coll=collection.my_id,
                          rec=rec,
                          coll_name=quote(coll_name),

                          type=mode,
                          sources=sources,
                          inv_sources=inv_sources,
                          patch_rec=patch_rec,

                          remote_ip=remote_ip,
                          ip=remote_ip,

                          browser=browser_id,
                          url=url,
                          request_ts=timestamp,

                          browser_can_write=browser_can_write)

            data = self.browser_mgr.request_new_browser(kwargs)

            if 'error_message' in data:
                self._raise_error(400, data['error_message'])

            return data
Beispiel #14
0
    def handle_download_name(self, user, coll_name, warc_name, url):
        """Write a landing page and a gzipped WARC for a collection to disk.

        Files are written below ``$STORAGE_REPLAY``:
        ``lp/10.25354/<name>.html`` (the rendered landing page) and
        ``warc/10.25354/<name>.warc`` (records copied from the collection's
        WARC files), where ``<name>`` is ``warc_name`` with any ``10.25354/``
        removed.

        :param str user: name of the user owning the collection
        :param str coll_name: name of the collection
        :param str warc_name: name used for the output files
        :param str url: url passed through to the landing page template

        NOTE(review): the code visible here returns nothing.
        """
        user_name = user
        user = self.user_manager.get_user(user)
        collection = user.get_collection_by_name(coll_name)
        if not collection:
            self._raise_error(404, 'no_such_collection')

        self.access.assert_can_write_coll(collection)

        collection.load()

        Stats(self.redis).incr_download(collection)

        # NOTE(review): download_path, local_storage and the per-recording
        # storage selection further down are not used by the code visible
        # here; they are kept unchanged in case later code relies on them.
        download_path = self.get_origin() + "/api/v1/download/{}/".format(
            user_name)

        # NOTE(review): slashes in warc_name are not escaped, so a name that
        # still contains '/' ends up in a sub-directory -- confirm intent
        warc_name_broke = warc_name.replace("10.25354/", "")
        local_storage = LocalFileStorage(self.redis)
        landingpage = template(
            'webrecorder/templates/landingpage.html',
            title=coll_name,
            warc_file=
            'https://projects.zo.uni-heidelberg.de/webarchive/warc/10.25354/' +
            warc_name_broke + '.warc',
            url=url)

        replay_root = os.environ['STORAGE_REPLAY']

        # make sure both output directories exist, reporting the actual path
        for subdir in ('lp', 'warc'):
            target_dir = os.path.join(replay_root, subdir, '10.25354')
            try:
                os.makedirs(target_dir)
                print("Directory '% s' created" % target_dir)
            except FileExistsError:
                print("Directory '% s' already created!" % target_dir)
            except FileNotFoundError:
                print("Directory '% s' No such file or directory!" %
                      target_dir)

        landing_path = os.path.join(replay_root, 'lp', '10.25354',
                                    warc_name_broke) + ".html"
        try:
            # the context manager closes the file even if the write fails
            with open(landing_path, 'w') as landing_file:
                landing_file.write(landingpage)
        except FileExistsError:
            print(landing_path + " exists")
        except FileNotFoundError:
            print(landing_path + " doesn't exists")

        warc_out_path = os.path.join(replay_root, 'warc', '10.25354',
                                     warc_name_broke) + ".warc"
        commit_storage = collection.get_storage()
        warc_key = collection.get_warc_key()

        # NOTE(review): every recording reopens the same output file in 'wb'
        # mode, so only the last recording's records survive -- confirm intent
        for recording in collection.get_recordings():
            is_committed = recording.is_fully_committed()
            is_open = not is_committed and recording.get_pending_count() > 0
            storage = commit_storage if is_committed else local_storage
            try:
                with open(warc_out_path, 'wb') as out:
                    writer = WARCWriter(out, gzip=True)
                    for name, path in recording.iter_all_files(
                            include_index=False):
                        local_download = download_path.format(
                            user=user.name,
                            coll=collection.name,
                            filename=name)
                        warc_path = self.redis.hget(warc_key, name)

                        # only strip the internal nginx prefix from an actual
                        # value; a missing entry must reach the 404 below
                        # rather than fail on the substring test
                        if warc_path:
                            for prefix in ('http://nginx:6090',
                                           'https://nginx:6090'):
                                warc_path = warc_path.replace(prefix, '')
                        if not warc_path:
                            self._raise_error(404, 'file_not_found')

                        with open(warc_path, 'rb') as stream:
                            for record in ArchiveIterator(stream):
                                writer.write_record(record)
            except FileExistsError:
                print(warc_out_path + " exists")
            except FileNotFoundError:
                print(warc_out_path + " doesn't exists")