Example #1
0
    def write_cdxj(self, user, cdxj_key):
        #full_filename = self.redis.hget(warc_key, self.INDEX_FILE_KEY)
        full_filename = self.get_prop(self.INDEX_FILE_KEY)
        if full_filename:
            cdxj_filename = os.path.basename(strip_prefix(full_filename))
            return cdxj_filename, full_filename

        dirname = user.get_user_temp_warc_path()

        randstr = base64.b32encode(os.urandom(5)).decode('utf-8')

        timestamp = timestamp_now()

        cdxj_filename = self.INDEX_NAME_TEMPL.format(timestamp=timestamp,
                                                     random=randstr)

        os.makedirs(dirname, exist_ok=True)

        full_filename = os.path.join(dirname, cdxj_filename)

        cdxj_list = self.redis.zrange(cdxj_key, 0, -1)

        with open(full_filename, 'wt') as out:
            for cdxj in cdxj_list:
                out.write(cdxj + '\n')
            out.flush()

        full_url = add_local_store_prefix(
            full_filename.replace(os.path.sep, '/'))
        #self.redis.hset(warc_key, self.INDEX_FILE_KEY, full_url)
        self.set_prop(self.INDEX_FILE_KEY, full_url)

        return cdxj_filename, full_filename
Example #2
0
    def get_client_url(self, target_url):
        """Get client URL.

        :param str target_url: target URL

        :returns: client URL
        :rtype: str
        """
        return add_local_store_prefix(target_url.replace(os.path.sep, '/'))
Example #3
0
    def get_client_url(self, target_url):
        """Get client URL.

        :param str target_url: target URL

        :returns: client URL
        :rtype: str
        """
        return add_local_store_prefix(target_url.replace(os.path.sep, '/'))
Example #4
0
    def add_warc_file(self, full_filename, params):
        base_filename = self._get_rel_or_base_name(full_filename, params)
        file_key = res_template(self.file_key_template, params)
        rec_key = res_template(self.rec_file_key_template, params)

        full_load_path = storagepaths.add_local_store_prefix(full_filename)

        self.redis.hset(file_key, base_filename, full_load_path)
        self.redis.sadd(rec_key, base_filename)
Example #5
0
    def copy_rec_files(self, user, collection, recording, warc_files):
        if self.dry_run:
            target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
        else:
            target_dirname = user.get_user_temp_warc_path()

        os.makedirs(target_dirname, exist_ok=True)
        print('Writing to dir: ' + target_dirname)

        coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
        rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

        # Copy WARCs
        loader = BlockLoader()
        total_size = 0

        for n, url in warc_files.items():
            if not url.startswith('s3://'):
                print('FILE ERR: Skipping local file: ' + url)
                continue

            local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(
                url)
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                target_file = add_local_store_prefix(target_file)
                if n != recording.INDEX_FILE_KEY:
                    self.redis.hset(coll_warc_key, n, target_file)
                    self.redis.sadd(rec_warc_key, n)
                    total_size += size
                else:
                    recording.set_prop(n, target_file, update_ts=False)

                if self.dry_run:
                    os.remove(strip_prefix(target_file))

            except:
                import traceback
                traceback.print_exc()

        # commit from temp dir to storage
        if not self.dry_run:
            recording.commit_to_storage()

        return total_size
Example #6
0
    def copy_rec_files(self, user, collection, recording, warc_files):
        if self.dry_run:
            target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
        else:
            target_dirname = user.get_user_temp_warc_path()

        os.makedirs(target_dirname, exist_ok=True)
        print('Writing to dir: ' + target_dirname)

        coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
        rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

        # Copy WARCs
        loader = BlockLoader()
        total_size = 0

        for n, url in warc_files.items():
            if not url.startswith('s3://'):
                print('FILE ERR: Skipping local file: ' + url)
                continue

            local_filename = n if n != recording.INDEX_FILE_KEY else os.path.basename(url)
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                target_file = add_local_store_prefix(target_file)
                if n != recording.INDEX_FILE_KEY:
                    self.redis.hset(coll_warc_key, n, target_file)
                    self.redis.sadd(rec_warc_key, n)
                    total_size += size
                else:
                    recording.set_prop(n, target_file, update_ts=False)

                if self.dry_run:
                    os.remove(strip_prefix(target_file))

            except:
                import traceback
                traceback.print_exc()

        # commit from temp dir to storage
        if not self.dry_run:
            recording.commit_to_storage()

        return total_size
Example #7
0
    def write_cdxj(self, user, cdxj_key):
        """Write CDX index lines to file.

        :param RedisUniqueComponent user: user
        :param str cdxj_key: CDX index file Redis key

        :returns: CDX file filename and path
        :rtype: str and str
        """
        #full_filename = self.redis.hget(warc_key, self.INDEX_FILE_KEY)
        full_filename = self.get_prop(self.INDEX_FILE_KEY)
        if full_filename:
            cdxj_filename = os.path.basename(strip_prefix(full_filename))
            return cdxj_filename, full_filename

        dirname = user.get_user_temp_warc_path()

        randstr = base64.b32encode(os.urandom(5)).decode('utf-8')

        timestamp = timestamp_now()

        cdxj_filename = self.INDEX_NAME_TEMPL.format(timestamp=timestamp,
                                                     random=randstr)

        os.makedirs(dirname, exist_ok=True)

        full_filename = os.path.join(dirname, cdxj_filename)

        cdxj_list = self.redis.zrange(cdxj_key, 0, -1)

        with open(full_filename, 'wt') as out:
            for cdxj in cdxj_list:
                out.write(cdxj + '\n')
            out.flush()

        full_url = add_local_store_prefix(full_filename.replace(os.path.sep, '/'))
        #self.redis.hset(warc_key, self.INDEX_FILE_KEY, full_url)
        self.set_prop(self.INDEX_FILE_KEY, full_url)

        return cdxj_filename, full_filename
Example #8
0
    def copy_data_from_recording(self, source, delete_source=False):
        """Copy given recording building block entries.

        :param RedisUniqueComponent source: building block
        :param bool delete_source: whether to delete source building block

        :returns: whether successful or not
        :rtype: bool
        """
        if self == source:
            return False

        if not self.is_open():
            return False

        errored = False

        self._copy_prop(source, 'title')
        self._copy_prop(source, 'desc')
        self._copy_prop(source, 'rec_type')
        self._copy_prop(source, 'recorded_at')
        #self._copy_prop(source, 'patch_rec')

        collection = self.get_owner()
        user = collection.get_owner()

        target_dirname = user.get_user_temp_warc_path()
        target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

        # Copy WARCs
        loader = BlockLoader()

        for n, url in source.iter_all_files(include_index=True):
            local_filename = n + '.' + timestamp20_now()
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    logger.debug('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                if n != self.INDEX_FILE_KEY:
                    self.incr_size(size)
                    self.redis.hset(target_warc_key, n, add_local_store_prefix(target_file))
                else:
                    self.set_prop(n, target_file)

            except:
                traceback.print_exc()
                errored = True

        # COPY cdxj, if exists
        source_key = self.CDXJ_KEY.format(rec=source.my_id)
        target_key = self.CDXJ_KEY.format(rec=self.my_id)

        self.redis.zunionstore(target_key, [source_key])

        # recreate pages, if any, in new recording
        source_coll = source.get_owner()
        source_pages = source_coll.list_rec_pages(source)
        collection.import_pages(source_pages, self)

        # COPY remote archives, if any
        self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                               self.RA_KEY.format(rec=source.my_id))

        # COPY recording warc keys
        self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                               self.REC_WARC_KEY.format(rec=source.my_id))

        # sync collection cdxj, if exists
        collection.sync_coll_index(exists=True, do_async=True)

        if not errored and delete_source:
            collection = source.get_owner()
            collection.remove_recording(source, delete=True)

        return not errored
Example #9
0
    def copy_data_from_recording(self, source, delete_source=False):
        """Copy given recording building block entries.

        :param RedisUniqueComponent source: building block
        :param bool delete_source: whether to delete source building block

        :returns: whether successful or not
        :rtype: bool
        """
        if self == source:
            return False

        if not self.is_open():
            return False

        errored = False

        self._copy_prop(source, 'title')
        self._copy_prop(source, 'desc')
        self._copy_prop(source, 'rec_type')
        self._copy_prop(source, 'recorded_at')
        #self._copy_prop(source, 'patch_rec')

        collection = self.get_owner()
        user = collection.get_owner()

        target_dirname = user.get_user_temp_warc_path()
        target_warc_key = self.COLL_WARC_KEY.format(coll=collection.my_id)

        # Copy WARCs
        loader = BlockLoader()

        for n, url in source.iter_all_files(include_index=True):
            local_filename = n + '.' + timestamp20_now()
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                if n != self.INDEX_FILE_KEY:
                    self.incr_size(size)
                    self.redis.hset(target_warc_key, n,
                                    add_local_store_prefix(target_file))
                else:
                    self.set_prop(n, target_file)

            except:
                import traceback
                traceback.print_exc()
                errored = True

        # COPY cdxj, if exists
        source_key = self.CDXJ_KEY.format(rec=source.my_id)
        target_key = self.CDXJ_KEY.format(rec=self.my_id)

        self.redis.zunionstore(target_key, [source_key])

        # recreate pages, if any, in new recording
        source_coll = source.get_owner()
        source_pages = source_coll.list_rec_pages(source)
        collection.import_pages(source_pages, self)

        # COPY remote archives, if any
        self.redis.sunionstore(self.RA_KEY.format(rec=self.my_id),
                               self.RA_KEY.format(rec=source.my_id))

        # COPY recording warc keys
        self.redis.sunionstore(self.REC_WARC_KEY.format(rec=self.my_id),
                               self.REC_WARC_KEY.format(rec=source.my_id))

        # sync collection cdxj, if exists
        collection.sync_coll_index(exists=True, do_async=True)

        if not errored and delete_source:
            collection = source.get_owner()
            collection.remove_recording(source, delete=True)

        return not errored
Example #10
0
 def get_client_url(self, target_url):
     return add_local_store_prefix(target_url.replace(os.path.sep, '/'))