Example 1
def janitor():
    """Ideally this is maintained by a systemd service to cleanup redis and the
    file system while Fractalis is running.
    """
    data_dir = os.path.join(app.config['FRACTALIS_TMP_DIR'], 'data')
    tracked_ids = [key.split(':')[1] for key in redis.scan_iter('data:*')]
    if not os.path.exists(data_dir):
        for task_id in tracked_ids:
            async_result = celery.AsyncResult(task_id)
            if async_result.state == 'SUCCESS':
                redis.delete('data:{}'.format(task_id))
        return

    cached_files = [f for f in os.listdir(data_dir)
                    if os.path.isfile(os.path.join(data_dir, f))]

    # clean cached files
    for cached_file in cached_files:
        if cached_file not in tracked_ids:
            sync.remove_file(os.path.join(data_dir, cached_file))

    # clean tracked files
    for task_id in tracked_ids:
        path = os.path.join(data_dir, task_id)
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUCCESS' and not os.path.exists(path):
            redis.delete('data:{}'.format(task_id))
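
The docstring suggests a systemd service should drive this function. A minimal sketch of such a driver, assuming janitor is importable from fractalis.sync (the import path and the 60-second interval are assumptions, not part of Fractalis itself):

import time

# Hypothetical import path; adjust to wherever janitor() actually lives.
from fractalis.sync import janitor

def run_janitor_loop(interval_seconds: int = 60) -> None:
    """Call janitor() periodically, standing in for the systemd
    service mentioned in the docstring."""
    while True:
        janitor()
        time.sleep(interval_seconds)

if __name__ == '__main__':
    run_janitor_loop()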
Example 2
def find_duplicate_task_id(self, data_tasks: List[str],
                           descriptor: dict) -> Union[str, None]:
    """Search for duplicates of the given descriptor and return their
    task id if the state is SUBMITTED or SUCCESS, meaning the data are
    reusable.
    :param data_tasks: Limit search to this list.
    :param descriptor: ETL descriptor. Used to identify duplicates.
    :return: TaskID if valid duplicate has been found, None otherwise.
    """
    task_ids = self.find_duplicates(data_tasks, descriptor)
    for task_id in task_ids:
        async_result = celery.AsyncResult(task_id)
        if (async_result.state == 'SUBMITTED'
                or async_result.state == 'SUCCESS'):
            return task_id
    return None
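
A sketch of how a caller might use this check to avoid re-running an ETL. The handler object and submit_new_task are hypothetical stand-ins for whatever the surrounding code uses to start a fresh task:

from typing import List

def submit_or_reuse(handler, data_tasks: List[str],
                    descriptor: dict) -> str:
    """Reuse a duplicate ETL result when one is still valid,
    otherwise fall back to submitting a new task."""
    task_id = handler.find_duplicate_task_id(data_tasks, descriptor)
    if task_id is not None:
        return task_id  # state is SUBMITTED or SUCCESS, data reusable
    return handler.submit_new_task(descriptor)  # hypothetical method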
Example 3
def cleanup_all() -> None:
    """Reset redis, celery and the filesystem. This is only useful for testing
    and should !!!NEVER!!! be used for anything else.
    """
    celery.control.purge()
    for key in redis.keys('data:*'):
        value = redis.get(key)
        try:
            data_state = json.loads(value)
        except ValueError:
            continue
        task_id = data_state.get('task_id')
        if task_id is not None:
            async_result = celery.AsyncResult(task_id)
            if async_result.state == 'SUBMITTED':
                async_result.get(propagate=False)
    redis.flushall()
    tmp_dir = app.config['FRACTALIS_TMP_DIR']
    if os.path.exists(tmp_dir):
        rmtree(tmp_dir)
    assert not os.path.exists(tmp_dir)
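
Since this is explicitly a test-only helper, one plausible use is a pytest fixture that wipes global state after each test. The fixture and the import path below are assumptions:

import pytest

# Hypothetical import path; adjust to wherever cleanup_all() actually lives.
from fractalis.sync import cleanup_all

@pytest.fixture(autouse=True)
def clean_state():
    """Run the test, then reset redis, celery and the tmp dir."""
    yield
    cleanup_all()  # test-only: never call this in production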
Example 4
def get_data_state_for_task_id(task_id: str, wait: bool) -> Union[dict, None]:
    """Return data state associated with task id.
    :param task_id: The id associated with the ETL task.
    :param wait: If true and ETL is still running wait for it.
    :return: Data state that has been stored in Redis.
    """
    async_result = celery.AsyncResult(task_id)
    if wait and async_result.state == 'SUBMITTED':
        logger.debug("'wait' was set. Waiting for tasks to finish ...")
        async_result.get(propagate=False)
    value = redis.get('data:{}'.format(task_id))
    if not value:
        return None
    data_state = json.loads(value)
    # add additional information to data_state
    result = async_result.result
    if isinstance(result, Exception):  # Exception -> str
        result = "{}: {}".format(type(result).__name__, str(result))
    data_state['etl_message'] = result
    data_state['etl_state'] = async_result.state
    return data_state
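
A sketch of a caller that blocks until the ETL finishes and summarizes the outcome; the import path and the summary format are assumptions:

# Hypothetical import path for the function above.
from fractalis.data.controller import get_data_state_for_task_id

def describe_etl(task_id: str) -> str:
    """Wait for the ETL to finish, then summarize its state."""
    data_state = get_data_state_for_task_id(task_id, wait=True)
    if data_state is None:
        return 'No data state stored for task {}'.format(task_id)
    return '{}: {}'.format(data_state['etl_state'],
                           data_state['etl_message'])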
Example 5
def get_task_details(task_id: UUID) -> Tuple[Response, int]:
    """Get task details for the given task_id.
    See doc/api/ for more information.
    :param task_id: ID returned on task creation.
    :return: Flask Response
    """
    logger.debug("Received GET request on /analytics/task_id.")
    wait = request.args.get('wait') == '1'
    task_id = str(task_id)
    if task_id not in session['analytic_tasks']:
        error = "Task ID '{}' not found in session. " \
                "Refusing access.".format(task_id)
        logger.warning(error)
        return jsonify({'error': error}), 403
    async_result = celery.AsyncResult(task_id)
    if wait and async_result.state == 'SUBMITTED':
        async_result.get(propagate=False)
    result = async_result.result
    if isinstance(result, Exception):  # Exception -> str
        result = "{}: {}".format(type(result).__name__, str(result))
    logger.debug("Task found and has access. Sending response.")
    return jsonify({'state': async_result.state, 'result': result}), 200
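
From the client side, this endpoint can be polled with a plain HTTP GET. The base URL and the use of requests.Session to carry the Flask session cookie are assumptions:

import requests

def poll_task(base_url: str, task_id: str,
              session: requests.Session) -> dict:
    """Ask the server for task details, waiting for completion."""
    response = session.get(
        '{}/analytics/{}'.format(base_url, task_id),
        params={'wait': '1'})
    response.raise_for_status()  # 403 if the task is not in the session
    return response.json()  # {'state': ..., 'result': ...}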
Example 6
def get_state_data(state_id: UUID) -> Tuple[Response, int]:
    """Check whether every ETL linked to the state_id successfully executed for
    this session. If and only if every ETL successfully completed grant access
    to the state information.
    :param state_id: ID of the state that is requested.
    :return: Previously saved state.
    """
    logger.debug("Received GET request on /state/<uuid:state_id>.")
    state_id = str(state_id)
    value = redis.get('state:{}'.format(state_id))
    if not value or state_id not in session['state_access']:
        error = "Cannot get state. Make sure to submit a POST request " \
                "to this very same URL containing credentials and server " \
                "data to launch access verification. Only after that a GET " \
                "request might or might not return you the saved state."
        logger.error(error)
        return jsonify({'error': error}), 404
    meta_state = json.loads(value)
    state = json.dumps(meta_state['state'])
    for task_id in session['state_access'][state_id]:
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUBMITTED':
            return jsonify({'message': 'ETLs are still running.'}), 202
        elif async_result.state == 'SUCCESS':
            continue
        else:
            error = "One or more ETLs failed or has unknown status. " \
                    "Assuming no access to saved state."
            logger.error(error)
            return jsonify({'error': error}), 403
    # replace task ids in state with the ids of the freshly loaded data
    for i, task_id in enumerate(meta_state['task_ids']):
        state = re.sub(pattern=task_id,
                       repl=session['state_access'][state_id][i],
                       string=state)
    return jsonify({'state': json.loads(state)}), 200
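
Because the endpoint answers 202 while ETLs are still running, a client has to retry. A minimal polling sketch, where the retry count, delay, and base URL are assumptions:

import time
import requests

def fetch_state(base_url: str, state_id: str,
                session: requests.Session, retries: int = 30) -> dict:
    """Poll /state/<state_id> until the ETLs finish."""
    for _ in range(retries):
        response = session.get('{}/state/{}'.format(base_url, state_id))
        if response.status_code != 202:  # 202: ETLs still running
            response.raise_for_status()  # 404/403 on missing access
            return response.json()['state']
        time.sleep(1)
    raise TimeoutError('ETLs did not finish after {} polls'.format(retries))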