Exemple #1
0
    def write_cdxj(self, user, cdxj_key):
        """Dump the CDXJ index stored in Redis to a file, unless one exists.

        If the ``INDEX_FILE_KEY`` property is already set, nothing is
        written and the stored value is returned as-is (together with its
        basename after ``strip_prefix``). Otherwise a new file is created
        in the user's temp WARC directory, filled with every entry of the
        ``cdxj_key`` sorted set (one per line), and its prefixed location
        is saved under ``INDEX_FILE_KEY``.

        :param user: object providing ``get_user_temp_warc_path()``
        :param cdxj_key: Redis sorted-set key holding the CDXJ lines

        :returns: tuple of (index filename, full filename); the second
            item is the stored property value on the early-return path
            and the local filesystem path for a newly written file
        """
        existing = self.get_prop(self.INDEX_FILE_KEY)
        if existing:
            return os.path.basename(strip_prefix(existing)), existing

        target_dir = user.get_user_temp_warc_path()

        # random suffix + timestamp make up the index filename
        suffix = base64.b32encode(os.urandom(5)).decode('utf-8')
        index_name = self.INDEX_NAME_TEMPL.format(timestamp=timestamp_now(),
                                                  random=suffix)

        os.makedirs(target_dir, exist_ok=True)
        index_path = os.path.join(target_dir, index_name)

        # fetch all index lines before creating the file
        lines = self.redis.zrange(cdxj_key, 0, -1)

        with open(index_path, 'wt') as out:
            out.writelines(line + '\n' for line in lines)
            out.flush()

        # stored location always uses forward slashes
        stored_url = add_local_store_prefix(index_path.replace(os.path.sep, '/'))
        self.set_prop(self.INDEX_FILE_KEY, stored_url)

        return index_name, index_path
Exemple #2
0
    def handle_delete_file(self, uri):
        """Close the file referenced by ``uri`` in the writer, then delete it.

        :param uri: file URI; ``storagepaths.strip_prefix`` is applied to
            obtain the filename passed to the writer and to local storage
        """
        filename = storagepaths.strip_prefix(uri)

        # ask the recorder's writer to close the file before it is removed;
        # the result of close_file() is not used, deletion happens regardless
        self.recorder.writer.close_file(filename)

        self.local_storage.delete_file(filename)
Exemple #3
0
    def commit_file(self, filename, full_filename, obj_type,
                    update_key=None, update_prop=None, direct_delete=False):
        """Upload a local file to storage and remove the local copy.

        :param filename: name passed to the storage for upload / URL lookup
        :param full_filename: (possibly prefixed) location of the local file
        :param obj_type: object type forwarded to ``storage.upload_file``
        :param update_key: optional Redis hash key to update with the
            remote url
        :param update_prop: hash field to update; defaults to ``filename``
        :param direct_delete: if true, remove the local file with
            ``os.remove``; otherwise publish a ``handle_delete_file``
            message so a listener deletes it

        :returns: ``False`` if the upload failed or the remote url is not
            yet available; ``True`` otherwise (including when there is no
            storage, the file is not local, or the file does not exist)
        :rtype: bool
        """
        user = self.get_owner()
        storage = self.get_storage()
        if not storage:
            return True

        local_path = strip_prefix(full_filename)

        # nothing to commit: either not a local filename or no such file
        not_local = '://' in local_path and not local_path.startswith('local')
        if not_local or not os.path.isfile(local_path):
            return True

        wait_key = self.COMMIT_WAIT_KEY.format(filename=local_path)

        # the wait key is only set if absent (nx), so the upload is started
        # at most once while the key exists
        if self.redis.set(wait_key, '1', ex=self.COMMIT_WAIT_SECS, nx=True):
            uploaded = storage.upload_file(user, self, None,
                                           filename, local_path, obj_type)
            if not uploaded:
                self.redis.delete(wait_key)
                return False

        # upload was started (now or earlier): check whether it is accessible
        remote_url = storage.get_upload_url(filename)
        if not remote_url:
            print('Not yet available: {0}'.format(local_path))
            return False

        print('Committed {0} -> {1}'.format(local_path, remote_url))
        if update_key:
            self.redis.hset(update_key, update_prop or filename, remote_url)

        # remote url identical to the original location: keep the file
        if remote_url == full_filename:
            return True

        if not direct_delete:
            # for WARCs, notify the listener so the writer can close the file
            if self.redis.publish('handle_delete_file', local_path) < 1:
                print('No Delete Listener!')
            return True

        # direct delete, used for CDXJ files which are not owned by a writer
        try:
            os.remove(local_path)
        except Exception as err:
            print(err)

        return True
Exemple #4
0
    def client_url_to_target_url(self, client_url):
        """Convert a client URL into the corresponding target URL.

        :param str client_url: client URL

        :returns: the client URL after ``strip_prefix`` has been applied
        :rtype: str
        """
        target_url = strip_prefix(client_url)
        return target_url
Exemple #5
0
    def client_url_to_target_url(self, client_url):
        """Convert a client URL into the corresponding target URL.

        :param str client_url: client URL

        :returns: the client URL after ``strip_prefix`` has been applied
        :rtype: str
        """
        target_url = strip_prefix(client_url)
        return target_url
Exemple #6
0
    def copy_rec_files(self, user, collection, recording, warc_files):
        """Copy a recording's ``s3://`` files to a local dir and register them.

        Each entry of ``warc_files`` whose url starts with ``s3://`` is
        loaded through ``BlockLoader`` and written into the target
        directory (``/tmp/migrate4.0/<collection id>`` in dry-run mode,
        otherwise the user's temp WARC path). Entries other than the index
        file are recorded in the collection WARC hash and the recording
        WARC set; the index file location is stored as a recording
        property. These Redis/property updates also happen in dry-run
        mode, where the copied file is removed again afterwards.

        A failure while copying or registering one file is printed and the
        loop continues with the next file. A failure in ``loader.load``
        still propagates to the caller.

        :param user: object providing ``get_user_temp_warc_path()``
        :param collection: collection object (``my_id`` is used)
        :param recording: recording object being migrated
        :param dict warc_files: mapping of name -> source url

        :returns: total size in bytes of the copied non-index files
        :rtype: int
        """
        import traceback

        if self.dry_run:
            target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
        else:
            target_dirname = user.get_user_temp_warc_path()

        os.makedirs(target_dirname, exist_ok=True)
        print('Writing to dir: ' + target_dirname)

        coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
        rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

        # Copy WARCs
        loader = BlockLoader()
        total_size = 0

        for n, url in warc_files.items():
            if not url.startswith('s3://'):
                print('FILE ERR: Skipping local file: ' + url)
                continue

            # the index file keeps its remote basename, WARCs keep their key
            is_index = (n == recording.INDEX_FILE_KEY)
            local_filename = os.path.basename(url) if is_index else n
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                target_file = add_local_store_prefix(target_file)
                if is_index:
                    recording.set_prop(n, target_file, update_ts=False)
                else:
                    self.redis.hset(coll_warc_key, n, target_file)
                    self.redis.sadd(rec_warc_key, n)
                    total_size += size

                if self.dry_run:
                    os.remove(strip_prefix(target_file))

            except Exception:
                # best-effort per file: report and continue, but let
                # KeyboardInterrupt / SystemExit stop the migration
                traceback.print_exc()

            finally:
                # always release the source stream, even if the copy failed
                close_src = getattr(src, 'close', None)
                if close_src is not None:
                    try:
                        close_src()
                    except Exception:
                        traceback.print_exc()

        # commit from temp dir to storage
        if not self.dry_run:
            recording.commit_to_storage()

        return total_size
    def copy_rec_files(self, user, collection, recording, warc_files):
        """Copy a recording's ``s3://`` files to a local dir and register them.

        Each entry of ``warc_files`` whose url starts with ``s3://`` is
        loaded through ``BlockLoader`` and written into the target
        directory (``/tmp/migrate4.0/<collection id>`` in dry-run mode,
        otherwise the user's temp WARC path). Entries other than the index
        file are recorded in the collection WARC hash and the recording
        WARC set; the index file location is stored as a recording
        property. These Redis/property updates also happen in dry-run
        mode, where the copied file is removed again afterwards.

        A failure while copying or registering one file is printed and the
        loop continues with the next file. A failure in ``loader.load``
        still propagates to the caller.

        :param user: object providing ``get_user_temp_warc_path()``
        :param collection: collection object (``my_id`` is used)
        :param recording: recording object being migrated
        :param dict warc_files: mapping of name -> source url

        :returns: total size in bytes of the copied non-index files
        :rtype: int
        """
        import traceback

        if self.dry_run:
            target_dirname = os.path.join('/tmp/migrate4.0', collection.my_id)
        else:
            target_dirname = user.get_user_temp_warc_path()

        os.makedirs(target_dirname, exist_ok=True)
        print('Writing to dir: ' + target_dirname)

        coll_warc_key = recording.COLL_WARC_KEY.format(coll=collection.my_id)
        rec_warc_key = recording.REC_WARC_KEY.format(rec=recording.my_id)

        # Copy WARCs
        loader = BlockLoader()
        total_size = 0

        for n, url in warc_files.items():
            if not url.startswith('s3://'):
                print('FILE ERR: Skipping local file: ' + url)
                continue

            # the index file keeps its remote basename, WARCs keep their key
            is_index = (n == recording.INDEX_FILE_KEY)
            local_filename = os.path.basename(url) if is_index else n
            target_file = os.path.join(target_dirname, local_filename)

            src = loader.load(url)

            try:
                with open(target_file, 'wb') as dest:
                    print('Copying {0} -> {1}'.format(url, target_file))
                    shutil.copyfileobj(src, dest)
                    size = dest.tell()

                target_file = add_local_store_prefix(target_file)
                if is_index:
                    recording.set_prop(n, target_file, update_ts=False)
                else:
                    self.redis.hset(coll_warc_key, n, target_file)
                    self.redis.sadd(rec_warc_key, n)
                    total_size += size

                if self.dry_run:
                    os.remove(strip_prefix(target_file))

            except Exception:
                # best-effort per file: report and continue, but let
                # KeyboardInterrupt / SystemExit stop the migration
                traceback.print_exc()

            finally:
                # always release the source stream, even if the copy failed
                close_src = getattr(src, 'close', None)
                if close_src is not None:
                    try:
                        close_src()
                    except Exception:
                        traceback.print_exc()

        # commit from temp dir to storage
        if not self.dry_run:
            recording.commit_to_storage()

        return total_size
Exemple #8
0
    def write_cdxj(self, user, cdxj_key):
        """Write the CDX index lines held in Redis to a file, unless one exists.

        If the ``INDEX_FILE_KEY`` property is already set, nothing is
        written and the stored value is returned as-is. Otherwise a new
        file is created in the user's temp WARC directory, one line per
        entry of the ``cdxj_key`` sorted set, and its prefixed location is
        saved under ``INDEX_FILE_KEY``.

        :param RedisUniqueComponent user: user
        :param str cdxj_key: CDX index file Redis key

        :returns: CDX file filename and path; the path is the stored
            property value on the early-return path and the local
            filesystem path for a newly written file
        :rtype: str and str
        """
        existing = self.get_prop(self.INDEX_FILE_KEY)
        if existing:
            return os.path.basename(strip_prefix(existing)), existing

        target_dir = user.get_user_temp_warc_path()

        # random suffix + timestamp make up the index filename
        suffix = base64.b32encode(os.urandom(5)).decode('utf-8')
        index_name = self.INDEX_NAME_TEMPL.format(timestamp=timestamp_now(),
                                                  random=suffix)

        os.makedirs(target_dir, exist_ok=True)
        index_path = os.path.join(target_dir, index_name)

        # fetch all index lines before creating the file
        lines = self.redis.zrange(cdxj_key, 0, -1)

        with open(index_path, 'wt') as out:
            out.writelines(line + '\n' for line in lines)
            out.flush()

        # stored location always uses forward slashes
        stored_url = add_local_store_prefix(index_path.replace(os.path.sep, '/'))
        self.set_prop(self.INDEX_FILE_KEY, stored_url)

        return index_name, index_path
    def commit_file(self, filename, full_filename, obj_type,
                    update_key=None, update_prop=None, direct_delete=False):
        """Upload a local file to storage and remove the local copy.

        :param filename: name passed to the storage for upload / URL lookup
        :param full_filename: (possibly prefixed) location of the local file
        :param obj_type: object type forwarded to ``storage.upload_file``
        :param update_key: optional Redis hash key to update with the
            remote url
        :param update_prop: hash field to update; defaults to ``filename``
        :param direct_delete: if true, remove the local file with
            ``os.remove``; otherwise publish a ``handle_delete_file``
            message so a listener deletes it

        :returns: ``False`` if the local file is missing, the upload
            failed, or the remote url is not yet available; ``True``
            otherwise (including when there is no storage or the file is
            not local)
        :rtype: bool
        """
        user = self.get_owner()
        storage = self.get_storage()
        if not storage:
            logger.debug('Skip File Commit: No Storage')
            return True

        local_path = strip_prefix(full_filename)

        # not a local filename
        not_local = '://' in local_path and not local_path.startswith('local')
        if not_local:
            logger.debug('Skip File Commit: Not Local Filename: {0}'.format(local_path))
            return True

        if not os.path.isfile(local_path):
            logger.debug('Fail File Commit: Not Found: {0}'.format(local_path))
            return False

        wait_key = self.COMMIT_WAIT_KEY.format(filename=local_path)

        # the wait key is only set if absent (nx), so the upload is started
        # at most once while the key exists
        if self.redis.set(wait_key, '1', ex=self.COMMIT_WAIT_SECS, nx=True):
            uploaded = storage.upload_file(user, self, None,
                                           filename, local_path, obj_type)
            if not uploaded:
                self.redis.delete(wait_key)
                return False

        # upload was started (now or earlier): check whether it is accessible
        remote_url = storage.get_upload_url(filename)
        if not remote_url:
            logger.debug('File Commit: Not Yet Available: {0}'.format(local_path))
            return False

        if update_key:
            self.redis.hset(update_key, update_prop or filename, remote_url)

        # remote url identical to the original location: keep the file
        if remote_url == full_filename:
            logger.debug('File Already Committed: {0}'.format(remote_url))
            return True

        if direct_delete:
            # used for CDXJ files which are not owned by a writer
            try:
                os.remove(local_path)
            except Exception:
                traceback.print_exc()
        elif self.redis.publish('handle_delete_file', local_path) < 1:
            # for WARCs, a listener is expected to close and delete the file
            logger.debug('No Delete Listener!')

        logger.debug('File Committed {0} -> {1}'.format(local_path, remote_url))
        return True
Exemple #10
0
 def client_url_to_target_url(self, client_url):
     """Return the target URL for ``client_url`` via ``strip_prefix``."""
     target_url = strip_prefix(client_url)
     return target_url