Ejemplo n.º 1
0
 def test_save_state_discards_duplicates(self, test_client):
     """A task id referenced twice in the state is stored only once."""
     first_id = str(uuid4())
     second_id = str(uuid4())
     # The first id appears twice on purpose.
     placeholders = ['${}$'.format(task_id)
                     for task_id in (first_id, first_id, second_id)]
     payload = {
         'state': {'test': placeholders},
         'handler': 'test',
         'server': 'localfoo'
     }
     # Register one data entry per distinct id, each with its own descriptor.
     for task_id, descriptor in ((first_id, 'foo'), (second_id, 'bar')):
         redis.set(name='data:{}'.format(task_id),
                   value=json.dumps({'meta': {'descriptor': descriptor}}))
     rv = test_client.post('/state', data=flask.json.dumps(payload))
     body = flask.json.loads(rv.get_data())
     assert 201 == rv.status_code, body
     assert UUID(body['state_id'])
     stored = json.loads(redis.get('state:{}'.format(body['state_id'])))
     assert len(stored['task_ids']) == 2
     assert len(stored['descriptors']) == 2
     for task_id in (first_id, second_id):
         assert task_id in stored['task_ids']
     for descriptor in ('foo', 'bar'):
         assert descriptor in stored['descriptors']
Ejemplo n.º 2
0
def request_state_access(state_id: UUID) -> Tuple[Response, int]:
    """Submit the ETLs that verify access to a previously saved state.

    The saved state is read from Redis, an ETL handler is built from its
    stored handler/server and the 'auth' entry of the request body, and the
    stored descriptors are handed to that handler. The returned task ids are
    recorded in the session under 'data_tasks' and 'state_access'.
    :param state_id: The id associated with the saved state.
    :return: An empty JSON body with 202 once the tasks have been submitted,
    or a JSON error with 404 if nothing is stored under the given id.
    """
    logger.debug("Received POST request on /state/<uuid:state_id>.")
    wait = request.args.get('wait') == '1'
    payload = request.get_json(force=True)
    state_id = str(state_id)
    stored = redis.get('state:{}'.format(state_id))
    if not stored:
        message = "Could not find state associated with id {}".format(state_id)
        logger.error(message)
        return jsonify({'error': message}), 404
    meta_state = json.loads(stored)
    etl_handler = ETLHandler.factory(
        handler=meta_state['handler'],
        server=meta_state['server'],
        auth=payload['auth'])
    submitted = etl_handler.handle(
        descriptors=meta_state['descriptors'],
        data_tasks=session['data_tasks'],
        use_existing=True,
        wait=wait)

    # Add the new ids to the session's task list, then drop duplicates.
    session['data_tasks'].extend(submitted)
    session['data_tasks'] = list(set(session['data_tasks']))
    # If all tasks finish successfully we know that the session has access to
    # the state.
    session['state_access'][state_id] = submitted
    logger.debug("Tasks successfully submitted. Sending response.")
    return jsonify(''), 202
Ejemplo n.º 3
0
    def test_update_redis(self):
        """``update_redis`` stores the unique features of each given frame."""
        cases = (
            # one row with a 'feature' column
            (pd.DataFrame([[1, 2, 3]], columns=['id', 'feature', 'value']),
             [2]),
            # no 'feature' column at all
            (pd.DataFrame([[1, 3]], columns=['id', 'value']), []),
            # 'feature' column present but no rows
            (pd.DataFrame([], columns=['id', 'feature', 'value']), []),
        )
        redis.set('data:123', json.dumps({'meta': {}}))

        for frame, expected_features in cases:
            self.etl.update_redis(data_frame=frame)
            stored = json.loads(redis.get('data:123'))
            assert stored['meta']['features'] == expected_features
Ejemplo n.º 4
0
 def test_valid_response_before_loaded_on_meta(self, test_client, payload):
     """A meta request right after a non-waiting POST returns 200 without
     any 'features' in the meta data.
     """
     test_client.post('/data', data=payload()['serialized'])
     for key in redis.keys('data:*'):
         task_id = json.loads(redis.get(key))['task_id']
         rv = test_client.get('/data/meta/{}'.format(task_id))
         body = flask.json.loads(rv.get_data())
         assert rv.status_code == 200
         assert 'features' not in body['meta']
Ejemplo n.º 5
0
 def test_valid_state_for_failed_etl_on_delete(self, test_client, faiload):
     """Deleting the data of a failing payload leaves no Redis key, no file
     and no session entry behind.
     """
     test_client.post('/data?wait=1', data=faiload['serialized'])
     for key in redis.keys('data:*'):
         data_state = json.loads(redis.get(key))
         file_path = data_state['file_path']
         # No file is expected to exist before the delete request either.
         assert not os.path.exists(file_path)
         task_id = data_state['task_id']
         test_client.delete('/data/{}?wait=1'.format(task_id))
         assert not redis.exists(key)
         assert not os.path.exists(file_path)
         with test_client.session_transaction() as sess:
             assert task_id not in sess['data_tasks']
Ejemplo n.º 6
0
 def test_encryption_works(self, test_client, payload):
     """With FRACTALIS_ENCRYPT_CACHE enabled, the cached data files cannot
     be read as text.
     """
     app.config['FRACTALIS_ENCRYPT_CACHE'] = True
     try:
         data = payload()
         test_client.post('/data?wait=1', data=data['serialized'])
         for key in redis.keys('data:*'):
             data_state = json.loads(redis.get(key))
             file_path = data_state['file_path']
             with pytest.raises(UnicodeDecodeError):
                 # The context manager closes the handle even though the
                 # read is expected to raise.
                 with open(file_path, 'r') as cached_file:
                     cached_file.readlines()
     finally:
         # Always switch encryption off again, so a failing assertion cannot
         # leak the setting into other tests.
         app.config['FRACTALIS_ENCRYPT_CACHE'] = False
Ejemplo n.º 7
0
 def test_valid_filesystem_before_loaded_on_post(
         self, test_client, payload):
     """Right after a non-waiting POST no data file exists on disk yet."""
     data_dir = os.path.join(app.config['FRACTALIS_TMP_DIR'], 'data')
     test_client.post('/data', data=payload()['serialized'])
     # The data directory may not exist yet; if it does it must be empty.
     if os.path.exists(data_dir):
         assert not os.listdir(data_dir)
     for key in redis.keys('data:*'):
         file_path = json.loads(redis.get(key))['file_path']
         assert not os.path.exists(file_path)
Ejemplo n.º 8
0
 def test_403_if_not_authorized(self, test_client, payload):
     """Meta requests are answered with 403 'Access unauthorized.' and the
     Redis entry is kept.
     """
     test_client.post('/data', data=payload()['serialized'])
     for key in redis.keys('data:*'):
         task_id = json.loads(redis.get(key))['task_id']
         url = '/data/meta/{}?wait=1'.format(task_id)
         rv = test_client.get(url)
         body = flask.json.loads(rv.get_data())
         assert rv.status_code == 403
         assert 'Access unauthorized.' in body['error']
         assert redis.exists(key)
Ejemplo n.º 9
0
 def test_valid_redis_after_loaded_on_post(self, test_client, payload):
     """After a waiting POST there is one complete Redis entry per data
     item of the payload.
     """
     data = payload()
     test_client.post('/data?wait=1', data=data['serialized'])
     keys = redis.keys('data:*')
     assert len(keys) == data['size']
     expected_fields = ('file_path', 'label', 'data_type', 'meta')
     for key in keys:
         data_state = json.loads(redis.get(key))
         for field in expected_fields:
             assert field in data_state
Ejemplo n.º 10
0
 def test_403_if_no_auth_on_get_meta(self, test_client, payload):
     """A session without data tasks is refused meta access with 403 while
     the Redis entry and the data file stay in place.
     """
     test_client.post('/data?wait=1', data=payload()['serialized'])
     # Empty the session's task list so it no longer references the data.
     with test_client.session_transaction() as sess:
         sess['data_tasks'] = []
     for key in redis.keys('data:*'):
         data_state = json.loads(redis.get(key))
         url = '/data/meta/{}?wait=1'.format(data_state['task_id'])
         rv = test_client.get(url)
         body = flask.json.loads(rv.get_data())
         assert rv.status_code == 403
         assert 'Refusing access.' in body['error']
         assert redis.exists(key)
         assert os.path.exists(data_state['file_path'])
Ejemplo n.º 11
0
 def update_redis(self, data_frame: DataFrame) -> None:
     """Store the distinct features of the data in this task's Redis entry.

     The entry 'data:<request id>' is read, its meta 'features' list is set
     to the unique values of the frame's 'feature' column (an empty list if
     the frame has no such column), and the entry is written back with the
     configured FRACTALIS_DATA_LIFETIME.
     :param data_frame: The extracted and transformed data.
     """
     key = 'data:{}'.format(self.request.id)
     raw_state = redis.get(name=key)
     assert raw_state is not None
     data_state = json.loads(raw_state)
     has_feature_column = 'feature' in data_frame.columns
     data_state['meta']['features'] = (
         data_frame['feature'].unique().tolist()
         if has_feature_column else [])
     redis.setex(name=key,
                 value=json.dumps(data_state),
                 time=app.config['FRACTALIS_DATA_LIFETIME'])
Ejemplo n.º 12
0
def remove_data(task_id: str) -> None:
    """Delete everything stored for the given data task.

    The celery task is revoked, the Redis entry is deleted and, if an entry
    existed, the file it points to is removed. A warning is logged when no
    entry was found.
    :param task_id: The id associated with a data state
    """
    key = 'data:{}'.format(task_id)
    # Read the entry before deleting it; it holds the path of the data file.
    stored = redis.get(key)
    celery.control.revoke(task_id, terminate=True, signal='SIGUSR1')
    redis.delete(key)
    if not stored:
        logger.warning("Can't delete file for task id '{}',because there is "
                       "no associated entry in Redis.".format(task_id))
        return
    remove_file(json.loads(stored)['file_path'])
Ejemplo n.º 13
0
 def find_duplicates(self, data_tasks: List[str],
                     descriptor: dict) -> List[str]:
     """Collect the task ids whose stored hash equals the descriptor's hash.

     Task ids without an entry in Redis are skipped.
     :param data_tasks: Limit duplicate search to.
     :param descriptor: ETL descriptor. Used to identify duplicates.
     :return: The list of duplicates.
     """
     wanted_hash = self.descriptor_to_hash(descriptor)
     duplicates = []
     for candidate in data_tasks:
         raw_state = redis.get('data:{}'.format(candidate))
         if raw_state is not None and \
                 wanted_hash == json.loads(raw_state)['hash']:
             duplicates.append(candidate)
     return duplicates
Ejemplo n.º 14
0
 def test_save_state_saves_and_returns(self, test_client):
     """Posting a state stores it in Redis and returns its id with 201."""
     task_id = str(uuid4())
     placeholder = '${}$'.format(task_id)
     payload = {
         # The second entry is a malformed placeholder.
         'state': {'test': [placeholder, '${${}']},
         'handler': 'test',
         'server': 'localfoo'
     }
     redis.set(name='data:{}'.format(task_id),
               value=json.dumps({'meta': {'descriptor': 'foo'}}))
     rv = test_client.post('/state', data=flask.json.dumps(payload))
     body = flask.json.loads(rv.get_data())
     assert 201 == rv.status_code, body
     assert UUID(body['state_id'])
     stored = json.loads(redis.get('state:{}'.format(body['state_id'])))
     assert stored['task_ids'] == [task_id]
     assert stored['state']['test'][0] == placeholder
Ejemplo n.º 15
0
def cleanup_all() -> None:
    """Reset redis, celery and the filesystem. This is only useful for testing
    and should !!!NEVER!!! be used for anything else.

    Pending celery messages are purged, tasks of known data entries that are
    still in state 'SUBMITTED' are waited for, then Redis is flushed and the
    FRACTALIS_TMP_DIR directory is removed.
    """
    celery.control.purge()
    for key in redis.keys('data:*'):
        value = redis.get(key)
        if value is None:
            # The key vanished between keys() and get() (e.g. it expired);
            # json.loads(None) would raise TypeError, not ValueError.
            continue
        try:
            data_state = json.loads(value)
        except ValueError:
            continue
        if not isinstance(data_state, dict):
            # Only JSON objects can carry a 'task_id'.
            continue
        task_id = data_state.get('task_id')
        if task_id is not None:
            async_result = celery.AsyncResult(task_id)
            if async_result.state == 'SUBMITTED':
                # Let the running task finish before wiping its storage.
                async_result.get(propagate=False)
    redis.flushall()
    tmp_dir = app.config['FRACTALIS_TMP_DIR']
    if os.path.exists(tmp_dir):
        rmtree(tmp_dir)
    assert not os.path.exists(tmp_dir)
Ejemplo n.º 16
0
def get_data_state_for_task_id(task_id: str, wait: bool) -> Union[dict, None]:
    """Load the data state of a task and enrich it with the ETL outcome.

    :param task_id: The id associated with the ETL task.
    :param wait: If true and ETL is still running wait for it.
    :return: The data state stored in Redis, extended by 'etl_message' and
    'etl_state', or None if Redis holds no entry for the task.
    """
    async_result = celery.AsyncResult(task_id)
    if wait and async_result.state == 'SUBMITTED':
        logger.debug("'wait' was set. Waiting for tasks to finish ...")
        async_result.get(propagate=False)
    stored = redis.get('data:{}'.format(task_id))
    if not stored:
        return None
    data_state = json.loads(stored)
    outcome = async_result.result
    # An exception result is reported as "<type name>: <message>".
    data_state['etl_message'] = (
        "{}: {}".format(type(outcome).__name__, str(outcome))
        if isinstance(outcome, Exception) else outcome)
    data_state['etl_state'] = async_result.state
    return data_state
Ejemplo n.º 17
0
def save_state() -> Tuple[Response, int]:
    """Save given payload to redis, so it can be accessed later on.

    Every '$...$' placeholder found in the submitted state is parsed for a
    data task id. The descriptor of each referenced data task is read from
    Redis and stored together with the state, server and handler under a
    fresh UUID.
    :return: JSON with the 'state_id' of the saved state and 201, or a JSON
    error with 400 if the state references no data task id or one of the ids
    has no entry in Redis.
    """
    logger.debug("Received POST request on /state.")
    payload = request.get_json(force=True)
    state = str(payload['state'])
    # Raw string: '\$' is not a valid escape sequence in a plain literal.
    matches = re.findall(r'\$.+?\$', state)
    task_ids = [AnalyticTask.parse_value(match)[0] for match in matches]
    # Drop duplicates and placeholders that yielded no task id.
    task_ids = [task_id for task_id in set(task_ids) if task_id is not None]
    if not task_ids:
        error = "This state cannot be saved because it contains no data " \
                "task ids. These are used to verify access to the state and " \
                "its potentially sensitive data."
        logger.error(error)
        return jsonify({'error': error}), 400
    descriptors = []
    for task_id in task_ids:
        value = redis.get('data:{}'.format(task_id))
        if value is None:
            error = "Data task id is {} could not be found in redis. " \
                    "State cannot be saved".format(task_id)
            logger.error(error)
            return jsonify({'error': error}), 400
        data_state = json.loads(value)
        descriptors.append(data_state['meta']['descriptor'])
    # descriptors[i] belongs to task_ids[i].
    assert len(task_ids) == len(descriptors)
    meta_state = {
        # NOTE(review): this turns the stringified state back into an
        # object; presumably equivalent to payload['state'] -- confirm.
        'state': ast.literal_eval(state),
        'server': payload['server'],
        'handler': payload['handler'],
        'task_ids': task_ids,
        'descriptors': descriptors
    }
    uuid = uuid4()
    redis.set(name='state:{}'.format(uuid), value=json.dumps(meta_state))
    logger.debug("Successfully saved data to redis. Sending response.")
    return jsonify({'state_id': uuid}), 201
Ejemplo n.º 18
0
 def data_task_id_to_data_frame(self, data_task_id: str,
                                session_data_tasks: List[str],
                                decrypt: bool) -> DataFrame:
     """Load the data frame that belongs to the given data task id.

     :param data_task_id: The data id associated with the previously loaded
     data.
     :param session_data_tasks: A list of data tasks previously executed by
     the requesting session. This is used for permission checks.
     :param decrypt: If true the file is read via ``secure_load``, otherwise
     it is read as a gzip compressed pickle.
     :return: A pandas data frame associated with the data id.
     :raises PermissionError: The id is not among the session's data tasks.
     :raises LookupError: Redis holds no entry for the id.
     :raises ValueError: The data task is not in state 'SUCCESS'.
     """
     if data_task_id not in session_data_tasks:
         message = ("No permission to use data_task_id '{}' "
                    "for analysis").format(data_task_id)
         logger.error(message)
         raise PermissionError(message)
     entry = redis.get('data:{}'.format(data_task_id))
     if not entry:
         message = ("The key '{}' does not match any entry in Redis. "
                    "Value probably expired.").format(data_task_id)
         logger.error(message)
         raise LookupError(message)
     data_state = json.loads(entry)
     if self.AsyncResult(data_task_id).state != 'SUCCESS':
         message = ("The data task '{}' has not been loaded, yet. "
                    "Wait for it to complete before using it in an "
                    "analysis task.").format(data_task_id)
         logger.error(message)
         raise ValueError(message)
     file_path = data_state['file_path']
     if decrypt:
         return self.secure_load(file_path)
     return read_pickle(file_path, compression='gzip')
Ejemplo n.º 19
0
def get_state_data(state_id: UUID) -> Tuple[Response, int]:
    """Return a saved state once every ETL launched for it has succeeded.

    Access is only considered if the state exists in Redis and the session
    holds a 'state_access' entry for it. The saved task ids inside the state
    are replaced with the ids of the tasks recorded for this session.
    :param state_id: ID of the state that is requested.
    :return: The saved state with 200, a message with 202 while ETLs are
    still running, an error with 403 if an ETL did not succeed, or an error
    with 404 if the state is missing or no access was requested yet.
    """
    logger.debug("Received GET request on /state/<uuid:state_id>.")
    state_id = str(state_id)
    stored = redis.get('state:{}'.format(state_id))
    access_requested = bool(stored) and state_id in session['state_access']
    if not access_requested:
        message = "Cannot get state. Make sure to submit a POST request " \
                  "to this very same URL containing credentials and server " \
                  "data to launch access verification. Only after that a GET " \
                  "request might or might not return you the saved state."
        logger.error(message)
        return jsonify({'error': message}), 404
    meta_state = json.loads(stored)
    state = json.dumps(meta_state['state'])
    session_task_ids = session['state_access'][state_id]
    for task_id in session_task_ids:
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUBMITTED':
            return jsonify({'message': 'ETLs are still running.'}), 202
        if async_result.state != 'SUCCESS':
            message = "One or more ETLs failed or has unknown status. " \
                      "Assuming no access to saved state."
            logger.error(message)
            return jsonify({'error': message}), 403
    # Swap each saved task id for the id at the same position in the
    # session's task list.
    for position, saved_id in enumerate(meta_state['task_ids']):
        state = re.sub(saved_id, session_task_ids[position], state)
    return jsonify({'state': json.loads(state)}), 200