# registered as a Celery task so remove_duplicates() below can dispatch it
# via janitor.delay()
@celery.task
def janitor():
    """Ideally this is run by a systemd service to clean up redis and the
    file system while Fractalis is running.
    """
    data_dir = os.path.join(app.config['FRACTALIS_TMP_DIR'], 'data')
    tracked_ids = [key.split(':')[1] for key in redis.scan_iter('data:*')]
    # if the data directory is gone there are no files left to check,
    # so only drop the redis entries of finished tasks
    if not os.path.exists(data_dir):
        for task_id in tracked_ids:
            async_result = celery.AsyncResult(task_id)
            if async_result.state == 'SUCCESS':
                redis.delete('data:{}'.format(task_id))
        return
    cached_files = [f for f in os.listdir(data_dir)
                    if os.path.isfile(os.path.join(data_dir, f))]
    # clean cached files that are no longer tracked in redis
    for cached_file in cached_files:
        if cached_file not in tracked_ids:
            sync.remove_file(os.path.join(data_dir, cached_file))
    # clean tracked entries whose task finished but whose file is gone
    for task_id in tracked_ids:
        path = os.path.join(data_dir, task_id)
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUCCESS' and not os.path.exists(path):
            redis.delete('data:{}'.format(task_id))
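# The docstring above suggests driving the janitor from outside (e.g. a
# systemd timer). An in-process alternative, sketched here under the
# assumption that a Celery beat worker is running, is to register it as a
# periodic task; the hook name, the hourly interval and the schedule entry
# name are illustrative assumptions, not part of the original code.
@celery.on_after_configure.connect
def schedule_janitor(sender, **kwargs):
    # run the janitor once per hour
    sender.add_periodic_task(3600.0, janitor.s(), name='hourly janitor run')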
def remove_duplicates(self, data_tasks: List[str], descriptor: dict) -> None:
    """Delete the duplicates of the given descriptor from redis and call
    the janitor afterwards to clean up orphaned files.
    :param data_tasks: Limit the duplicate search to these data tasks.
    :param descriptor: ETL descriptor. Used to identify duplicates.
    """
    task_ids = self.find_duplicates(data_tasks, descriptor)
    for task_id in task_ids:
        redis.delete('data:{}'.format(task_id))
    # only wake the janitor if something was actually deleted
    if task_ids:
        janitor.delay()
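# find_duplicates() is referenced above but not shown in this section. A
# minimal sketch, assuming every data state stored in redis carries a hash
# of the descriptor it was created from; the 'hash' field and the
# descriptor_to_hash() helper are assumptions for illustration, not
# necessarily the actual implementation.
def find_duplicates(self, data_tasks: List[str],
                    descriptor: dict) -> List[str]:
    """Return the ids of the data tasks whose descriptor matches the
    given one.
    """
    descriptor_hash = self.descriptor_to_hash(descriptor)
    task_ids = []
    for task_id in data_tasks:
        value = redis.get('data:{}'.format(task_id))
        if value is None:
            continue
        data_state = json.loads(value)
        if data_state.get('hash') == descriptor_hash:
            task_ids.append(task_id)
    return task_ids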
def remove_data(task_id: str) -> None:
    """Remove all traces of any data associated with the given id. That
    includes redis and the file system.
    :param task_id: The id associated with a data state.
    """
    key = 'data:{}'.format(task_id)
    value = redis.get(key)
    # stop the ETL task before removing its traces
    celery.control.revoke(task_id, terminate=True, signal='SIGUSR1')
    redis.delete(key)
    if value:
        data_state = json.loads(value)
        remove_file(data_state['file_path'])
    else:
        logger.warning("Can't delete file for task id '{}', because there "
                       "is no associated entry in Redis.".format(task_id))
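# remove_file() is called by both janitor() and remove_data() but is not
# shown in this section. A minimal sketch, assuming it only has to tolerate
# files that are already gone; the real helper may differ.
def remove_file(file_path: str) -> None:
    """Delete the file at the given path, ignoring files that do not exist."""
    try:
        os.remove(file_path)
    except FileNotFoundError:
        logger.warning("Could not remove file '{}' because it does not "
                       "exist.".format(file_path))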