def test_save_state_discards_duplicates(self, test_client):
    """A state referencing the same data id twice stores that id only once.

    Two data entries are placed in Redis, the posted state mentions the
    first one twice and the second one once, and the saved meta state is
    expected to contain exactly two task ids and two descriptors.
    """
    first_id, second_id = str(uuid4()), str(uuid4())
    # Seed Redis with the entries that save_state looks up per task id.
    for data_id, descriptor in ((first_id, 'foo'), (second_id, 'bar')):
        redis.set(name='data:{}'.format(data_id),
                  value=json.dumps({'meta': {'descriptor': descriptor}}))
    referenced_ids = (first_id, first_id, second_id)
    payload = {
        'state': {
            'test': ['${}$'.format(data_id) for data_id in referenced_ids]
        },
        'handler': 'test',
        'server': 'localfoo'
    }

    rv = test_client.post('/state', data=flask.json.dumps(payload))
    body = flask.json.loads(rv.get_data())
    assert 201 == rv.status_code, body
    assert UUID(body['state_id'])

    stored = redis.get('state:{}'.format(body['state_id']))
    meta_state = json.loads(stored)
    for field in ('task_ids', 'descriptors'):
        assert len(meta_state[field]) == 2
    for data_id in (first_id, second_id):
        assert data_id in meta_state['task_ids']
    for descriptor in ('foo', 'bar'):
        assert descriptor in meta_state['descriptors']
def request_state_access(state_id: UUID) -> Tuple[Response, int]:
    """Launch the access verification for a previously saved state.

    The meta state stored under the given id is loaded from Redis, an ETL
    handler is built from its handler/server and the credentials found in
    the request body, and the stored descriptors are handed to it. The
    returned task ids are merged into the session's data tasks and
    remembered as the session's access candidates for this state.

    :param state_id: The id associated with the saved state.
    :return: Empty JSON body with status 202 once the tasks are submitted,
        or an error message with status 404 if the state is unknown.
    """
    logger.debug("Received POST request on /state/<uuid:state_id>.")
    wait = request.args.get('wait') == '1'
    payload = request.get_json(force=True)
    state_id = str(state_id)

    stored = redis.get('state:{}'.format(state_id))
    if not stored:
        error = "Could not find state associated with id {}".format(state_id)
        logger.error(error)
        return jsonify({'error': error}), 404

    meta_state = json.loads(stored)
    etl_handler = ETLHandler.factory(
        handler=meta_state['handler'],
        server=meta_state['server'],
        auth=payload['auth'])
    task_ids = etl_handler.handle(
        descriptors=meta_state['descriptors'],
        data_tasks=session['data_tasks'],
        use_existing=True,
        wait=wait)

    # Merge the new ids into the session without keeping duplicates.
    known_tasks = set(session['data_tasks'])
    known_tasks.update(task_ids)
    session['data_tasks'] = list(known_tasks)
    # If all of these tasks finish successfully we know that the session
    # has access to the state.
    session['state_access'][state_id] = task_ids
    logger.debug("Tasks successfully submitted. Sending response.")
    return jsonify(''), 202
def test_update_redis(self):
    """update_redis stores the unique 'feature' values of the frame.

    Checked for a frame with a feature column, a frame without one and an
    empty frame; the latter two are expected to yield an empty list.
    """
    with_feature = pd.DataFrame([[1, 2, 3]],
                                columns=['id', 'feature', 'value'])
    without_feature = pd.DataFrame([[1, 3]], columns=['id', 'value'])
    empty = pd.DataFrame([], columns=['id', 'feature', 'value'])
    redis.set('data:123', json.dumps({'meta': {}}))

    cases = (
        (with_feature, [2]),
        (without_feature, []),
        (empty, []),
    )
    for frame, expected_features in cases:
        self.etl.update_redis(data_frame=frame)
        data_state = json.loads(redis.get('data:123'))
        assert data_state['meta']['features'] == expected_features
def test_valid_response_before_loaded_on_meta(self, test_client, payload):
    """Meta requests succeed right after submission but list no features.

    Data is posted without waiting, then the meta endpoint is queried for
    every data entry found in Redis.
    """
    data = payload()
    test_client.post('/data', data=data['serialized'])
    for key in redis.keys('data:*'):
        data_state = json.loads(redis.get(key))
        meta_url = '/data/meta/{}'.format(data_state['task_id'])
        rv = test_client.get(meta_url)
        body = flask.json.loads(rv.get_data())
        assert rv.status_code == 200
        assert 'features' not in body['meta']
def test_valid_state_for_failed_etl_on_delete(self, test_client, faiload):
    """Deleting a failed ETL leaves no Redis key, file or session entry."""
    test_client.post('/data?wait=1', data=faiload['serialized'])
    for key in redis.keys('data:*'):
        data_state = json.loads(redis.get(key))
        task_id = data_state['task_id']
        file_path = data_state['file_path']
        # No file is expected for this payload even before the delete.
        assert not os.path.exists(file_path)
        test_client.delete('/data/{}?wait=1'.format(task_id))
        assert not redis.exists(key)
        assert not os.path.exists(file_path)
        with test_client.session_transaction() as sess:
            assert task_id not in sess['data_tasks']
def test_encryption_works(self, test_client, payload):
    """With cache encryption enabled the stored files are not plain text.

    Every file referenced by a data entry in Redis must raise
    UnicodeDecodeError when read in text mode.
    """
    app.config['FRACTALIS_ENCRYPT_CACHE'] = True
    try:
        data = payload()
        test_client.post('/data?wait=1', data=data['serialized'])
        for key in redis.keys('data:*'):
            data_state = json.loads(redis.get(key))
            file_path = data_state['file_path']
            with pytest.raises(UnicodeDecodeError):
                # Context manager so the handle is closed even though the
                # read is expected to raise.
                with open(file_path, 'r') as handle:
                    handle.readlines()
    finally:
        # Always restore the flag; otherwise a failure here would leave
        # encryption switched on for every test that runs afterwards.
        app.config['FRACTALIS_ENCRYPT_CACHE'] = False
def test_valid_filesystem_before_loaded_on_post(self, test_client, payload):
    """Right after a non-waiting POST no data file exists on disk yet."""
    data_dir = os.path.join(app.config['FRACTALIS_TMP_DIR'], 'data')
    data = payload()
    test_client.post('/data', data=data['serialized'])
    # The data directory may not exist yet; if it does it must be empty.
    if os.path.exists(data_dir):
        assert len(os.listdir(data_dir)) == 0
    for key in redis.keys('data:*'):
        data_state = json.loads(redis.get(key))
        assert not os.path.exists(data_state['file_path'])
def test_403_if_not_authorized(self, test_client, payload):
    """Meta requests answer 403 'Access unauthorized.' and keep the key."""
    data = payload()
    test_client.post('/data', data=data['serialized'])
    for key in redis.keys('data:*'):
        data_state = json.loads(redis.get(key))
        meta_url = '/data/meta/{}?wait=1'.format(data_state['task_id'])
        rv = test_client.get(meta_url)
        body = flask.json.loads(rv.get_data())
        assert rv.status_code == 403
        assert 'Access unauthorized.' in body['error']
        # The refused request must not remove the entry from Redis.
        assert redis.exists(key)
def test_valid_redis_after_loaded_on_post(self, test_client, payload):
    """After a waiting POST every data entry in Redis is fully populated.

    The number of data keys must match the payload size and each entry
    must carry the file path, label, data type and meta fields.
    """
    data = payload()
    test_client.post('/data?wait=1', data=data['serialized'])
    keys = redis.keys('data:*')
    assert len(keys) == data['size']
    required_fields = ('file_path', 'label', 'data_type', 'meta')
    for key in keys:
        data_state = json.loads(redis.get(key))
        for field in required_fields:
            assert field in data_state
def test_403_if_no_auth_on_get_meta(self, test_client, payload):
    """A session without data tasks is refused on the meta endpoint.

    The data is loaded first, then the session's task list is emptied.
    Each meta request must answer 403 'Refusing access.' while the Redis
    entry and the data file stay in place.
    """
    data = payload()
    test_client.post('/data?wait=1', data=data['serialized'])
    with test_client.session_transaction() as sess:
        sess['data_tasks'] = []
    for key in redis.keys('data:*'):
        data_state = json.loads(redis.get(key))
        meta_url = '/data/meta/{}?wait=1'.format(data_state['task_id'])
        rv = test_client.get(meta_url)
        body = flask.json.loads(rv.get_data())
        assert rv.status_code == 403
        assert 'Refusing access.' in body['error']
        assert redis.exists(key)
        assert os.path.exists(data_state['file_path'])
def update_redis(self, data_frame: DataFrame) -> None:
    """Store meta information about the data in the task's Redis entry.

    The unique values of the 'feature' column (or an empty list if the
    frame has no such column) are written to ``meta.features`` of the
    entry belonging to this task. The entry is saved again with the
    configured data lifetime.

    :param data_frame: The extracted and transformed data.
    """
    redis_key = 'data:{}'.format(self.request.id)
    stored = redis.get(name=redis_key)
    assert stored is not None
    data_state = json.loads(stored)

    has_feature_column = 'feature' in data_frame.columns
    data_state['meta']['features'] = (
        data_frame['feature'].unique().tolist() if has_feature_column else []
    )

    redis.setex(name=redis_key,
                value=json.dumps(data_state),
                time=app.config['FRACTALIS_DATA_LIFETIME'])
def remove_data(task_id: str) -> None:
    """Remove all traces of any data associated with the given id.

    That includes redis and the file system. The Redis entry is read
    before it is deleted because it holds the path of the file to remove.
    The celery task with the same id is revoked as well.

    :param task_id: The id associated with a data state
    """
    key = 'data:{}'.format(task_id)
    # Fetch first: the entry is the only place the file path is recorded.
    value = redis.get(key)
    celery.control.revoke(task_id, terminate=True, signal='SIGUSR1')
    redis.delete(key)
    if not value:
        logger.warning("Can't delete file for task id '{}', because there is "
                       "no associated entry in Redis.".format(task_id))
        return
    data_state = json.loads(value)
    remove_file(data_state['file_path'])
def find_duplicates(self, data_tasks: List[str],
                    descriptor: dict) -> List[str]:
    """Collect the task ids whose stored hash matches the descriptor's hash.

    :param data_tasks: The task ids the duplicate search is limited to.
    :param descriptor: ETL descriptor. Used to identify duplicates.
    :return: The task ids of all duplicates, in the order of data_tasks.
    """
    hash_value = self.descriptor_to_hash(descriptor)
    duplicates = []
    for candidate in data_tasks:
        stored = redis.get('data:{}'.format(candidate))
        # Ids without a Redis entry cannot be compared and are skipped.
        if stored is not None and hash_value == json.loads(stored)['hash']:
            duplicates.append(candidate)
    return duplicates
def test_save_state_saves_and_returns(self, test_client):
    """Posting a state stores it in Redis and returns its new id.

    The state holds one valid data id placeholder and one malformed
    placeholder; only the valid id is expected among the saved task ids.
    """
    task_id = str(uuid4())
    placeholder = '${}$'.format(task_id)
    redis.set(name='data:{}'.format(task_id),
              value=json.dumps({'meta': {'descriptor': 'foo'}}))
    payload = {
        'state': {'test': [placeholder, '${${}']},
        'handler': 'test',
        'server': 'localfoo'
    }

    rv = test_client.post('/state', data=flask.json.dumps(payload))
    body = flask.json.loads(rv.get_data())
    assert 201 == rv.status_code, body
    assert UUID(body['state_id'])

    stored = redis.get('state:{}'.format(body['state_id']))
    meta_state = json.loads(stored)
    assert meta_state['task_ids'] == [task_id]
    assert meta_state['state']['test'][0] == placeholder
def cleanup_all() -> None:
    """Reset redis, celery and the filesystem.

    This is only useful for testing and should !!!NEVER!!! be used for
    anything else.

    Pending celery messages are purged, tasks that are still in state
    'SUBMITTED' are waited for, Redis is flushed and the temporary
    directory is removed.
    """
    celery.control.purge()
    for key in redis.keys('data:*'):
        value = redis.get(key)
        if value is None:
            # The key vanished between keys() and get() (data entries are
            # stored with a lifetime); json.loads(None) would raise a
            # TypeError that the ValueError handler below does not cover.
            continue
        try:
            data_state = json.loads(value)
        except ValueError:
            # Not valid JSON, so there is no task id to wait for.
            continue
        task_id = data_state.get('task_id')
        if task_id is None:
            continue
        async_result = celery.AsyncResult(task_id)
        if async_result.state == 'SUBMITTED':
            # Wait for the task to end; its outcome is irrelevant here.
            async_result.get(propagate=False)
    redis.flushall()
    tmp_dir = app.config['FRACTALIS_TMP_DIR']
    if os.path.exists(tmp_dir):
        rmtree(tmp_dir)
    assert not os.path.exists(tmp_dir)
def get_data_state_for_task_id(task_id: str, wait: bool) -> Union[dict, None]:
    """Load the data state of a task and attach its current ETL status.

    :param task_id: The id associated with the ETL task.
    :param wait: If true and the ETL is still in state 'SUBMITTED', block
        until it has finished.
    :return: The data state stored in Redis, extended by 'etl_message' and
        'etl_state', or None if Redis has no entry for the task.
    """
    async_result = celery.AsyncResult(task_id)
    if wait and async_result.state == 'SUBMITTED':
        logger.debug("'wait' was set. Waiting for tasks to finish ...")
        async_result.get(propagate=False)

    stored = redis.get('data:{}'.format(task_id))
    if not stored:
        return None
    data_state = json.loads(stored)

    # Exceptions are reported as "<TypeName>: <message>" strings.
    etl_message = async_result.result
    if isinstance(etl_message, Exception):
        etl_message = "{}: {}".format(type(etl_message).__name__,
                                      str(etl_message))
    data_state.update(etl_message=etl_message,
                      etl_state=async_result.state)
    return data_state
def save_state() -> Tuple[Response, int]:
    """Save given payload to redis, so it can be accessed later on.

    All ``$...$`` placeholders found in the textual form of the state are
    parsed for data task ids. For every id the descriptor stored in its
    Redis entry is collected, and state, server, handler, task ids and
    descriptors are saved together under a freshly generated UUID.

    :return: UUID linked to the saved state with status 201, or an error
        message with status 400 if the state holds no data task id or one
        of the ids is unknown to Redis.
    """
    logger.debug("Received POST request on /state.")
    payload = request.get_json(force=True)
    state = str(payload['state'])
    # Raw string: '\$' is not a valid string escape sequence.
    matches = re.findall(r'\$.+?\$', state)
    parsed_ids = [AnalyticTask.parse_value(match)[0] for match in matches]
    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # stored id list does not depend on set iteration order.
    task_ids = [task_id for task_id in dict.fromkeys(parsed_ids)
                if task_id is not None]
    if not task_ids:
        error = "This state cannot be saved because it contains no data " \
                "task ids. These are used to verify access to the state and " \
                "its potentially sensitive data."
        logger.error(error)
        return jsonify({'error': error}), 400

    # descriptors[i] belongs to task_ids[i].
    descriptors = []
    for task_id in task_ids:
        value = redis.get('data:{}'.format(task_id))
        if value is None:
            error = "Data task id is {} could not be found in redis. " \
                    "State cannot be saved".format(task_id)
            logger.error(error)
            return jsonify({'error': error}), 400
        data_state = json.loads(value)
        descriptors.append(data_state['meta']['descriptor'])
    assert len(task_ids) == len(descriptors)

    meta_state = {
        # literal_eval turns the textual form created above back into data.
        'state': ast.literal_eval(state),
        'server': payload['server'],
        'handler': payload['handler'],
        'task_ids': task_ids,
        'descriptors': descriptors
    }
    uuid = uuid4()
    redis.set(name='state:{}'.format(uuid), value=json.dumps(meta_state))
    logger.debug("Successfully saved data to redis. Sending response.")
    return jsonify({'state_id': uuid}), 201
def data_task_id_to_data_frame(self, data_task_id: str,
                               session_data_tasks: List[str],
                               decrypt: bool) -> DataFrame:
    """Load the data frame that belongs to the given data task id.

    :param data_task_id: The id of the previously launched data task.
    :param session_data_tasks: The data tasks of the requesting session.
        The id must be part of this list, otherwise access is denied.
    :param decrypt: If true the file is read via ``self.secure_load``,
        otherwise it is read directly as a gzip compressed pickle.
    :return: A pandas data frame associated with the data id.
    :raises PermissionError: If the id is not among the session's tasks.
    :raises LookupError: If Redis holds no entry for the id.
    :raises ValueError: If the data task is not in state 'SUCCESS'.
    """
    if data_task_id not in session_data_tasks:
        error = "No permission to use data_task_id '{}' " \
                "for analysis".format(data_task_id)
        logger.error(error)
        raise PermissionError(error)

    stored = redis.get('data:{}'.format(data_task_id))
    if not stored:
        error = "The key '{}' does not match any entry in Redis. " \
                "Value probably expired.".format(data_task_id)
        logger.error(error)
        raise LookupError(error)
    data_state = json.loads(stored)

    task_state = self.AsyncResult(data_task_id).state
    if task_state != 'SUCCESS':
        error = "The data task '{}' has not been loaded, yet. " \
                "Wait for it to complete before using it in an " \
                "analysis task.".format(data_task_id)
        logger.error(error)
        raise ValueError(error)

    file_path = data_state['file_path']
    if decrypt:
        return self.secure_load(file_path)
    return read_pickle(file_path, compression='gzip')
def get_state_data(state_id: UUID) -> Tuple[Response, int]:
    """Return a saved state once the session has proven access to it.

    Access is granted if and only if every ETL that was launched for this
    state in the current session completed successfully. The task ids
    inside the returned state are replaced by the ids of the data freshly
    loaded for this session.

    :param state_id: ID of the state that is requested.
    :return: The saved state with status 200; a message with status 202
        while ETLs are still running; an error with status 403 if an ETL
        failed or has an unknown status; an error with status 404 if the
        state is unknown or no access verification was launched.
    """
    logger.debug("Received GET request on /state/<uuid:state_id>.")
    state_id = str(state_id)
    value = redis.get('state:{}'.format(state_id))
    if not value or state_id not in session['state_access']:
        error = "Cannot get state. Make sure to submit a POST request " \
                "to this very same URL containing credentials and server " \
                "data to launch access verification. Only after that a GET " \
                "request might or might not return you the saved state."
        logger.error(error)
        return jsonify({'error': error}), 404
    meta_state = json.loads(value)
    state = json.dumps(meta_state['state'])
    for task_id in session['state_access'][state_id]:
        # Read the state once so both checks see the same value.
        etl_state = celery.AsyncResult(task_id).state
        if etl_state == 'SUBMITTED':
            return jsonify({'message': 'ETLs are still running.'}), 202
        if etl_state != 'SUCCESS':
            error = "One or more ETLs failed or has unknown status. " \
                    "Assuming no access to saved state."
            logger.error(error)
            return jsonify({'error': error}), 403
    # replace task ids in state with the ids of the freshly loaded data
    for i, task_id in enumerate(meta_state['task_ids']):
        # Plain text replacement: the ids are literal strings, not regular
        # expressions or replacement templates.
        state = state.replace(task_id, session['state_access'][state_id][i])
    return jsonify({'state': json.loads(state)}), 200