def test_gen_metadata(monkeypatch):
    """Test that _generate_metadata() produces the correct metadata.

    The stored checksums are patched so that the checksum of the test
    file is served from a fixed mapping.
    """
    relative_path = "tests/data/test.txt"
    md5_value = "150b62e4e7d58c70503bd5fc8a26463c"
    absolute_path = os.path.abspath(relative_path)
    monkeypatch.setattr(
        db.Checksums, "get_checksums",
        lambda self: {absolute_path: md5_value}
    )

    metadata = md._generate_metadata(
        relative_path, "tests", "data", "pid:uuid:storage_id",
        db.Database().checksums.get_checksums()
    )

    # Plain file attributes
    assert len(metadata["identifier"]) == 45
    assert metadata["file_name"] == "test.txt"
    assert metadata["file_format"] == "text/plain"
    assert metadata["byte_size"] == 31
    assert metadata["file_path"] == "/test.txt"
    assert metadata["project_identifier"] == "data"

    # Timestamps only have to be present
    for timestamp_field in ("file_uploaded", "file_modified", "file_frozen"):
        assert timestamp_field in metadata

    # Checksum sub-document
    checksum = metadata["checksum"]
    assert checksum["algorithm"] == "MD5"
    assert checksum["value"] == md5_value
    assert "checked" in checksum

    assert metadata["file_storage"] == "pid:uuid:storage_id"
def extract_task(fpath, dir_path, task_id):
    """Extract an uploaded archive and record the outcome of the task.

    The MD5 digest of the archive is calculated before the extraction.
    If the extraction is rejected because of an archive member, the
    archive is removed and the task is flagged with status ``error``.
    Otherwise the checksums of the extracted files are stored, the
    archive is removed and the task is flagged with status ``done``.

    :param str fpath: file path of the archive
    :param str dir_path: directory to where the archive will be extracted
    :param str task_id: mongo identifier of the task
    """
    database = db.Database()
    database.tasks.update_message(task_id, "Extracting archive")

    md5 = gen_metadata.md5_digest(fpath)

    try:
        extract(fpath, dir_path)
    except (MemberNameError, MemberTypeError, MemberOverwriteError) as error:
        logging.error(str(error), exc_info=error)

        # Remove the archive and set task's state
        os.remove(fpath)
        database.tasks.update_status(task_id, "error")
        failure = {"message": str(error)}
        database.tasks.update_message(task_id, json.dumps(failure))
        return

    # Extraction succeeded: add checksums of the extracted files to mongo
    database.checksums.insert(_get_archive_checksums(fpath, dir_path))

    # Remove the archive and post-process the extracted files (the
    # original note says this removes all created symlinks)
    os.remove(fpath)
    _process_extracted_files(dir_path)

    success = {"message": "Archive uploaded and extracted", "md5": md5}
    database.tasks.update_message(task_id, json.dumps(success))
    database.tasks.update_status(task_id, "done")
def get_path(fpath):
    """Return information about a file or a directory of the project.

    For a file the response body contains its return path, Metax
    identifier, MD5 checksum and timestamp. For a directory the body
    contains the directory tree. Anything else results in a 404
    response.

    :param fpath: path taken from the request
    :returns: HTTP Response
    """
    username = request.authorization.username
    database = db.Database()
    project = database.user(username).get_project()
    upload_root = current_app.config.get("UPLOAD_PATH")

    parent_dir, fname = utils.get_upload_path(project, fpath, upload_root)
    target = os.path.join(parent_dir, fname)

    if os.path.isfile(target):
        body = {
            "file_path": utils.get_return_path(project, target, upload_root),
            "metax_identifier": database.files.get_identifier(target),
            "md5": database.checksums.get_checksum(os.path.abspath(target)),
            "timestamp": md.iso8601_timestamp(target)
        }
    elif os.path.isdir(target):
        body = dict(file_path=_get_dir_tree(project, target, upload_root))
    else:
        return utils.make_response(404, "File not found")

    response = jsonify(body)
    response.status_code = 200
    return response
def enqueue_background_job(task_func, queue_name, username, job_kwargs):
    """Create a task ID and enqueue a RQ job.

    A task document is created for the project of the user and its
    message is set to ``processing``. The string form of the task ID is
    used both as the RQ job ID and as the ``task_id`` keyword argument
    of the background task.

    The ``job_kwargs`` dict passed in by the caller is not modified; the
    ``task_id`` entry is added to a copy of it.

    :param str task_func: Python function to run as a string to import
                          eg. "upload_rest_api.jobs.upload.extract_archive"
    :param str queue_name: Queue used to run the job
    :param str username: Username
    :param dict job_kwargs: Keyword arguments to pass to the background
                            task
    :returns: Task ID as a string
    """
    queue = get_job_queue(queue_name)

    database = db.Database()
    project = database.user(username).get_project()
    task_id = database.tasks.create(project)
    database.tasks.update_message(task_id, "processing")

    task_id_str = str(task_id)

    # Work on a shallow copy so the caller's dict is left untouched
    task_kwargs = dict(job_kwargs)
    task_kwargs["task_id"] = task_id_str

    job_timeout = CONFIG.get("RQ_JOB_TIMEOUT", DEFAULT_JOB_TIMEOUT)

    queue.enqueue(
        task_func,
        job_id=task_id_str,
        timeout=job_timeout,  # rq 0.12.0 or older
        job_timeout=job_timeout,  # rq 0.13.0 and newer
        failure_ttl=CONFIG.get("RQ_FAILED_JOB_TTL", DEFAULT_FAILED_JOB_TTL),
        kwargs=task_kwargs
    )
    return task_id_str
def delete_files():
    """Delete all files of a user.

    The deletion itself is run as a background job. The response has
    status code 202 and carries the polling URL of the task in its body
    and the task status URL in the ``Location`` header. If the project
    directory does not exist, a 404 response is returned instead.

    :returns: HTTP Response
    """
    username = request.authorization.username
    project = db.Database().user(username).get_project()
    upload_root = current_app.config.get("UPLOAD_PATH")
    project_dir = safe_join(upload_root, secure_filename(project))

    if not os.path.exists(project_dir):
        return utils.make_response(404, "No files found")

    task_id = enqueue_background_job(
        task_func="upload_rest_api.jobs.files.delete_files",
        queue_name=FILES_QUEUE,
        username=username,
        job_kwargs=dict(fpath=project_dir, username=username)
    )

    body = {
        "file_path": "/",
        "message": "Deleting files and metadata",
        "polling_url": utils.get_polling_url(TASK_STATUS_API_V1.name, task_id),
        "status": "pending"
    }
    response = jsonify(body)
    response.headers[b'Location'] = url_for(
        TASK_STATUS_API_V1.name + ".task_status", task_id=task_id
    )
    response.status_code = 202
    return response
def upload_file(fpath):
    """Save the uploaded file at <UPLOAD_PATH>/project/fpath.

    The upload is validated first and the validation response, if any,
    is returned as such. An overwrite attempt results in a 409
    response. After a successful save the used quota of the user is
    updated.

    :param fpath: path taken from the request
    :returns: HTTP Response
    """
    username = request.authorization.username
    database = db.Database()
    project = database.user(username).get_project()

    rejection = up.validate_upload(database)
    if rejection:
        return rejection

    target_dir, file_name = utils.get_upload_path(project, fpath)

    # Create directory if it does not exist
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    target = os.path.join(target_dir, file_name)
    try:
        response = up.save_file(database, project, target)
    except up.OverwriteError as error:
        return utils.make_response(409, str(error))

    database.user(username).update_used_quota(
        current_app.config.get("UPLOAD_PATH")
    )
    return response
def test_store_identifiers(monkeypatch):
    """Test that store_identifiers writes the POSTed identifiers and
    corresponding file_paths to Mongo.
    """
    monkeypatch.setattr(
        db, "_get_abs_path", lambda path, _root_path, _project: path
    )
    monkeypatch.setattr(db.User, "get_project", lambda self: "project_path")

    identifiers = ["pid:urn:1", "pid:urn:2", "pid:urn:3"]
    file_paths = ["1", "2", "3"]

    # Same shape as the "success" list of a Metax response
    metax_response = [
        {"object": {"identifier": identifier, "file_path": file_path}}
        for identifier, file_path in zip(identifiers, file_paths)
    ]

    database = db.Database()
    database.store_identifiers(metax_response, "/tmp", "user")

    assert database.files.get_all_ids() == identifiers
def upload_archive():
    """Upload and extract the archive at <UPLOAD_PATH>/project.

    The upload is validated first and the validation response, if any,
    is returned as such. Errors raised while saving the archive are
    mapped to HTTP status codes: overwrite errors to 409, member type
    errors to 415, member name errors to 400 and quota errors to 413.

    :returns: HTTP Response
    """
    database = db.Database()

    rejection = up.validate_upload(database)
    if rejection:
        return rejection

    upload_dir = request.args.get("dir", default=None)
    tmp_dir, archive_name = utils.get_tmp_upload_path()

    # Create directory if it does not exist
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)

    archive_path = safe_join(tmp_dir, archive_name)
    try:
        return up.save_archive(database, archive_path, upload_dir)
    except (MemberOverwriteError, up.OverwriteError) as error:
        return utils.make_response(409, str(error))
    except MemberTypeError as error:
        return utils.make_response(415, str(error))
    except MemberNameError as error:
        return utils.make_response(400, str(error))
    except up.QuotaError as error:
        return utils.make_response(413, str(error))
def clean_mongo():
    """Clean old tasks and stale file identifiers from Mongo.

    Tasks older than the configured ``CLEANUP_TIMELIM`` are removed
    first. After that, the file identifiers that are found in Mongo but
    not among the identifiers returned by Metax are deleted.

    :returns: Value returned by the delete operation of the files
              collection (count of cleaned Mongo documents according to
              the original documentation)
    """
    conf = parse_conf("/etc/upload_rest_api.conf")
    metax_url = conf["METAX_URL"]
    metax_user = conf["METAX_USER"]
    metax_password = conf["METAX_PASSWORD"]
    verify_ssl = conf["METAX_SSL_VERIFICATION"]

    _clean_old_tasks(conf["CLEANUP_TIMELIM"])

    projects = _get_projects()
    metax_client = md.MetaxClient(
        metax_url, metax_user, metax_password, verify_ssl
    )
    metax_ids = metax_client.get_all_ids(projects)

    files = db.Database().files

    # Identifiers found in Mongo but not in Metax
    stale_ids = [
        identifier for identifier in files.get_all_ids()
        if identifier not in metax_ids
    ]

    # Remove identifiers from mongo
    return files.delete(stale_ids)
def test_delete_user(mock_mongo):
    """Test that the ``delete`` command removes an existing user."""
    db.Database().user("test").create("test_project")

    cli_args = ['upload-rest-api', 'delete', 'test']
    with mock.patch.object(sys, 'argv', cli_args):
        upload_rest_api.__main__.main()

    remaining = mock_mongo.upload.users.count({"_id": "test"})
    assert remaining == 0
def files_col(mock_mongo):
    """Return a Files instance whose ``files`` collection is replaced
    with the ``upload.files`` collection of the mocked Mongo client.
    """
    files = db.Database().files
    files.files = mock_mongo.upload.files
    return files
def user(mock_mongo):
    """Return a User instance for ``test_user`` whose ``users``
    collection is replaced with the ``upload.users`` collection of the
    mocked Mongo client.
    """
    user_instance = db.Database().user("test_user")
    user_instance.users = mock_mongo.upload.users
    return user_instance
def test_create_existing_user():
    """Test that the ``create`` command raises UserExistsError when the
    user already exists.
    """
    db.Database().user("test").create("test_project")

    cli_args = ['upload-rest-api', 'create', 'test', 'test']
    with mock.patch.object(sys, 'argv', cli_args), \
            pytest.raises(db.UserExistsError):
        upload_rest_api.__main__.main()
def tasks_col(mock_mongo):
    """Return a Tasks instance whose ``tasks`` and ``task_messages``
    collections are replaced with the corresponding collections of the
    mocked Mongo client.
    """
    tasks = db.Database().tasks
    tasks.tasks = mock_mongo.upload.tasks
    tasks.task_messages = mock_mongo.upload.task_messages
    return tasks
def post_metadata(fpath, username, storage_id, task_id):
    """Create file metadata in Metax.

    This function creates the metadata in Metax for the file(s) denoted
    by the fpath argument. If fpath is a directory, metadata is created
    for every file found under it. Finally the message and the status
    of the task are updated into the database: the status is ``done``
    only if the POST to Metax did not raise, otherwise ``error``.

    :param str fpath: file path
    :param str username: current user
    :param str storage_id: pas storage identifier in Metax
    :param str task_id: mongo identifier of the task
    """
    root_upload_path = CONFIG["UPLOAD_PATH"]
    status = "error"

    metax_client = md.MetaxClient()
    database = db.Database()
    project = database.user(username).get_project()

    parent_dir, fname = utils.get_upload_path(
        project, fpath, root_upload_path
    )
    target = os.path.join(parent_dir, fname)
    ret_path = utils.get_return_path(project, target, root_upload_path)

    database.tasks.update_message(task_id, "Creating metadata: %s" % ret_path)

    if os.path.isdir(target):
        # POST metadata of all files under the directory
        fpaths = [
            os.path.join(dirpath, name)
            for dirpath, _, names in os.walk(target)
            for name in names
        ]
    elif os.path.isfile(target):
        fpaths = [target]
    else:
        fpaths = None

    if fpaths is None:
        response = {"code": 404, "error": "File not found"}
    else:
        status_code = 200
        try:
            metax_response = metax_client.post_metadata(
                fpaths, root_upload_path, username, storage_id
            )
            status = "done"
        except HTTPError as error:
            logging.error(str(error), exc_info=error)
            metax_response = error.response.json()
            status_code = error.response.status_code

        # Create upload-rest-api response
        response = {"code": status_code, "metax_response": metax_response}

    database.tasks.update_message(task_id, json.dumps(response))
    database.tasks.update_status(task_id, status)
def wrapper(*args, **kwargs):
    # Run ``func`` (taken from the enclosing scope) and, if it raises,
    # flag the task given as the ``task_id`` keyword argument as failed
    # before letting the exception propagate.
    task_id = kwargs["task_id"]
    try:
        result = func(*args, **kwargs)
    except Exception:
        task_collection = db.Database().tasks
        task_collection.update_status(task_id, "error")
        task_collection.update_message(task_id, "Internal server error")
        raise
    return result
def _clean_old_tasks(time_lim):
    """Remove tasks that are older than time_lim.

    The age of a task is the difference between the current time and
    the ``timestamp`` field of the task.

    :param time_lim: expiration time in seconds
    """
    now = time.time()
    task_collection = db.Database().tasks

    for task in task_collection.get_all_tasks():
        age = now - task["timestamp"]
        if age <= time_lim:
            continue
        task_collection.delete_one(task["_id"])
def test_get(capsys):
    """Test that ``get --users`` prints the names of the users."""
    database = db.Database()
    for username in ("test1", "test2"):
        database.user(username).create("test_project")

    cli_args = ['upload-rest-api', 'get', '--users']
    with mock.patch.object(sys, 'argv', cli_args):
        upload_rest_api.__main__.main()

    stdout, _ = capsys.readouterr()
    assert stdout == "test1\ntest2\n"
def clean_project(project, fpath, metax=True):
    """Remove the expired files of a given project.

    The expiration time (``CLEANUP_TIMELIM``) and the upload path are
    read from ``/etc/upload_rest_api.conf``. Every file under fpath
    that is considered expired is cleaned, the empty directories are
    cleaned and the checksums of the deleted files are removed from
    Mongo. If metax is true, the collected file paths are also passed
    to the Metax client for metadata deletion.

    :param project: Project identifier used to search files from Metax
    :param fpath: Path to the dir to cleanup
    :param metax: Boolean. if True metadata is removed also from Metax
    :returns: Number of deleted files
    """
    conf = parse_conf("/etc/upload_rest_api.conf")
    time_lim = conf["CLEANUP_TIMELIM"]
    upload_path = conf["UPLOAD_PATH"]
    now = time.time()

    if metax:
        metax_client = md.MetaxClient(
            url=conf["METAX_URL"],
            user=conf["METAX_USER"],
            password=conf["METAX_PASSWORD"],
            verify=conf["METAX_SSL_VERIFICATION"]
        )
        file_dict = metax_client.get_files_dict(project)
    else:
        metax_client = None
        file_dict = None

    # ``fpaths`` is handed to _clean_file() for each expired file and
    # later to delete_metadata(); presumably _clean_file() fills it in
    fpaths = []
    deleted_files = []

    # Remove all old files
    for dirpath, _, names in os.walk(fpath):
        for name in names:
            candidate = os.path.join(dirpath, name)
            if not _is_expired(candidate, now, time_lim):
                continue
            _clean_file(
                candidate, upload_path, fpaths, file_dict, metax_client
            )
            deleted_files.append(candidate)

    # Remove all empty dirs
    _clean_empty_dirs(fpath)

    # Clean checksums of the deleted files from mongo
    db.Database().checksums.delete(deleted_files)

    # Remove Metax entries of deleted files that are not part of any
    # datasets
    if metax:
        metax_client.delete_metadata(project, fpaths)

    return len(deleted_files)
def test_modify():
    """Test that ``modify`` changes the quota and project of a user."""
    test_user = db.Database().user("test")
    test_user.create("test_project")

    cli_args = [
        'upload-rest-api', 'modify', 'test',
        "--quota", "1",
        "--project", "X"
    ]
    with mock.patch.object(sys, 'argv', cli_args):
        upload_rest_api.__main__.main()

    assert test_user.get_quota() == 1
    assert test_user.get_project() == "X"
def test_auth_user(user, password, result):
    """Test _auth_user() function with different username-password
    combinations.

    :param user: username of user
    :param password: password of user
    :param bool result: Expected result of authentication
    """
    # Create one test user to database
    db.Database().user('test_user').create('test_project', 'test_password')

    # pylint: disable=protected-access
    authenticated = auth._auth_user(user, password)
    assert authenticated is result
def _auth_user(username, password):
    """Authenticate user.

    The digest stored for the user is compared with the digest
    calculated from the given password and the stored salt.

    :param username: name of the user
    :param password: password to check
    :returns: Result of the digest comparison
    """
    account = db.Database().user(username)

    try:
        user_doc = account.get()
    except db.UserNotFoundError:
        # Calculate digest even if user does not exist to avoid
        # leaking information about which users exist
        dummy_digest = b"hash" * 16
        return compare_digest(dummy_digest, db.hash_passwd("passwd", "salt"))

    salt = user_doc["salt"]
    stored_digest = user_doc["digest"]
    return compare_digest(stored_digest, db.hash_passwd(password, salt))
def init_db(mock_mongo):
    """Initialize user db.

    The ``upload`` database is dropped and three users are created
    through the same User instance by changing its ``username``
    between the calls.
    """
    mock_mongo.drop_database("upload")

    # test user
    account = db.Database().user("test")
    account.users = mock_mongo.upload.users
    account.create("test_project", password="******")

    # test2 user with same project
    account.username = "******"
    account.create("test_project", password="******")

    # test3 user with different project
    account.username = "******"
    account.create("project", password="******")
def get_files():
    """Get all files of the user.

    The directory tree of the project directory is returned in the
    response body. If the project directory does not exist, a 404
    response is returned.

    :return: HTTP Response
    """
    username = request.authorization.username
    project = db.Database().user(username).get_project()
    upload_root = current_app.config.get("UPLOAD_PATH")
    project_dir = safe_join(upload_root, secure_filename(project))

    if not os.path.exists(project_dir):
        return utils.make_response(404, "No files found")

    dir_tree = _get_dir_tree(project, project_dir, upload_root)
    response = jsonify(dir_tree)
    response.status_code = 200
    return response
def post_metadata(fpath):
    """POST file metadata to Metax.

    A background task is launched to run the job. The ``Location``
    header and the body of the response contain the URL to be used for
    polling the status of the task. Status code is set to HTTP
    202(Accepted).

    :param fpath: path taken from the request; passed unchanged to the
                  background job
    :returns: HTTP Response
    """
    username = request.authorization.username
    project = db.Database().user(username).get_project()
    upload_root = current_app.config.get("UPLOAD_PATH")

    parent_dir, fname = utils.get_upload_path(project, fpath, upload_root)
    target = os.path.join(parent_dir, fname)

    job_kwargs = {
        "fpath": fpath,
        "username": username,
        "storage_id": current_app.config.get("STORAGE_ID")
    }
    task_id = enqueue_background_job(
        task_func="upload_rest_api.jobs.metadata.post_metadata",
        queue_name=METADATA_QUEUE,
        username=username,
        job_kwargs=job_kwargs
    )

    polling_url = utils.get_polling_url(TASK_STATUS_API_V1.name, task_id)
    body = {
        "file_path": utils.get_return_path(project, target, upload_root),
        "message": "Creating metadata",
        "polling_url": polling_url,
        "status": "pending"
    }
    response = jsonify(body)
    response.headers[b'Location'] = url_for(
        TASK_STATUS_API_V1.name + ".task_status", task_id=task_id
    )
    response.status_code = 202
    return response
def delete_files(fpath, username, task_id):
    """Delete files and metadata denoted by fpath directory under
    user's project.

    The metadata is deleted from Metax first. If that fails, the task
    is flagged with status ``error`` and the exception is re-raised.
    Otherwise the checksums are removed from Mongo, the whole directory
    is recursively removed, the used quota is updated and the task is
    flagged with status ``done``.

    :param str fpath: path to directory
    :param str username: current user
    :param str task_id: mongo identifier of the task
    """
    root_upload_path = CONFIG["UPLOAD_PATH"]

    metax_client = md.MetaxClient()
    database = db.Database()
    project = database.user(username).get_project()

    ret_path = utils.get_return_path(project, fpath, root_upload_path)
    database.tasks.update_message(
        task_id, "Deleting files and metadata: %s" % ret_path
    )

    # Remove metadata from Metax
    try:
        metax_response = metax_client.delete_all_metadata(
            project, fpath, root_upload_path
        )
    except (MetaxError, HTTPError) as error:
        database.tasks.update_status(task_id, "error")
        failure = {"message": str(error)}
        database.tasks.update_message(task_id, json.dumps(failure))
        raise

    # Remove checksum from mongo
    database.checksums.delete_dir(fpath)

    # Remove project directory and update used_quota
    rmtree(fpath)
    database.user(username).update_used_quota(root_upload_path)

    result = {
        "file_path": ret_path,
        "status": "done",
        "metax": metax_response
    }
    database.tasks.update_message(task_id, json.dumps(result))
    database.tasks.update_status(task_id, "done")
def delete_metadata(fpath, username, task_id):
    """Delete file metadata.

    This function deletes the metadata in Metax for the file(s) denoted
    by the fpath argument: the metadata of a single file, or the
    metadata of all files under a directory. Finally the message and
    the status of the task are updated into the database; the status is
    ``done`` only if the deletion did not raise, otherwise ``error``.

    :param str fpath: file path
    :param str username: current user
    :param str task_id: mongo identifier of the task
    """
    root_upload_path = CONFIG["UPLOAD_PATH"]
    status = "error"

    metax_client = md.MetaxClient()
    database = db.Database()
    project = database.user(username).get_project()

    parent_dir, fname = utils.get_upload_path(
        project, fpath, root_upload_path
    )
    target = os.path.join(parent_dir, fname)
    ret_path = utils.get_return_path(project, target, root_upload_path)

    database.tasks.update_message(task_id, "Deleting metadata: %s" % ret_path)

    if os.path.isfile(target):
        # Remove metadata from Metax
        delete_func = metax_client.delete_file_metadata
    elif os.path.isdir(target):
        # Remove all file metadata of files under the directory from Metax
        delete_func = metax_client.delete_all_metadata
    else:
        delete_func = None

    if delete_func is None:
        response = {"code": 404, "error": "File not found"}
    else:
        try:
            metax_response = delete_func(
                project, target, root_upload_path, force=True
            )
        except HTTPError as error:
            logging.error(str(error), exc_info=error)
            response = {
                "file_path": utils.get_return_path(
                    project, target, root_upload_path
                ),
                "metax": error.response.json()
            }
        except md.MetaxClientError as error:
            logging.error(str(error), exc_info=error)
            response = {"code": 400, "error": str(error)}
        else:
            status = "done"
            response = {
                "file_path": utils.get_return_path(
                    project, target, root_upload_path
                ),
                "metax": metax_response
            }

    database.tasks.update_message(task_id, json.dumps(response))
    database.tasks.update_status(task_id, status)
def delete_path(fpath):
    """Delete fpath under user's project.

    A file is deleted right away together with its Metax metadata and
    its checksum, and a 200 response is returned. If fpath resolves to
    a directory, a background job is enqueued to remove it and a 202
    response with the polling URL of the task is returned. Anything
    else results in a 404 response.

    :param fpath: path taken from the request
    :returns: HTTP Response
    """
    upload_root = current_app.config.get("UPLOAD_PATH")
    username = request.authorization.username
    database = db.Database()
    project = database.user(username).get_project()

    parent_dir, fname = utils.get_upload_path(project, fpath)
    target = os.path.join(parent_dir, fname)

    if os.path.isfile(target):
        # Remove metadata from Metax; a client error is reported in the
        # response instead of aborting the deletion
        try:
            metax_response = md.MetaxClient().delete_file_metadata(
                project, target, upload_root
            )
        except md.MetaxClientError as exception:
            metax_response = str(exception)

        # Remove checksum from mongo
        database.checksums.delete_one(os.path.abspath(target))
        os.remove(target)

        database.user(username).update_used_quota(upload_root)
        response = jsonify({
            "file_path": utils.get_return_path(project, target, upload_root),
            "message": "deleted",
            "metax": metax_response
        })
        response.status_code = 200
        return response

    if os.path.isdir(target):
        # Remove the directory and the metadata of the files under it in
        # a background job
        task_id = enqueue_background_job(
            task_func="upload_rest_api.jobs.files.delete_files",
            queue_name=FILES_QUEUE,
            username=username,
            job_kwargs={
                "fpath": target,
                "username": username
            }
        )

        polling_url = utils.get_polling_url(TASK_STATUS_API_V1.name, task_id)
        project_prefix = os.path.join(upload_root, project)
        response = jsonify({
            "file_path": target[len(project_prefix):],
            "message": "Deleting files and metadata",
            "polling_url": polling_url,
            "status": "pending"
        })
        response.headers[b'Location'] = url_for(
            TASK_STATUS_API_V1.name + ".task_status", task_id=task_id
        )
        response.status_code = 202
        return response

    return utils.make_response(404, "File not found")
def post_metadata(self, fpaths, root_upload_path, username, storage_id):
    """Generate file metadata and POST it to Metax in 5k chunks.

    After each POST the identifiers of the successfully created file
    metadata are stored to Mongo.

    :param fpaths: List of files for which to generate the metadata
    :param root_upload_path: root upload directory
    :param username: current user
    :param storage_id: pas storage identifier in Metax
    :returns: Stripped HTTP response returned by Metax. Success list
              contains succesfully generated file metadata in format:
              [
                  {
                      "object": {
                          "identifier": identifier,
                          "file_path": file_path,
                          "checksum": {"value": checksum},
                          "parent_directory": {
                              "identifier": identifier
                          }
                      }
                  },
                  .
                  .
                  .
              ]
    """
    database = db.Database()
    project = database.user(username).get_project()
    checksums = database.checksums.get_checksums()

    stripped_responses = []

    def _post_batch(batch):
        # POST one batch of metadata and record the outcome
        response = self.client.post_file(batch)
        stripped_responses.append(_strip_metax_response(response))

        # Add created identifiers to Mongo
        if "success" in response and response["success"]:
            database.store_identifiers(
                response["success"], root_upload_path, username
            )

    batch = []
    for count, fpath in enumerate(fpaths, 1):
        batch.append(_generate_metadata(
            fpath, root_upload_path, project, storage_id, checksums
        ))

        # POST metadata to Metax every 5k steps
        if count % 5000 == 0:
            _post_batch(batch)
            batch = []

    # POST remaining metadata
    if batch:
        _post_batch(batch)

    # Merge all responses into one response
    merged = {"success": [], "failed": []}
    for stripped in stripped_responses:
        for key in ("success", "failed"):
            if key in stripped:
                merged[key].extend(stripped[key])

    return merged
def test_mongo_cleanup(app, test_auth, monkeypatch, background_job_runner):
    """Test that cleaning files from mongo deletes all files that
    haven't been posted to Metax.

    Fake identifiers inserted to Mongo have to be cleaned, whereas the
    identifiers created by posting metadata have to survive the
    cleanup.
    """
    test_client = app.test_client()

    # Mock Files mongo connection
    def _mock_init(self, client):
        mongo_host = app.config.get("MONGO_HOST")
        mongo_port = app.config.get("MONGO_PORT")
        self.files = pymongo.MongoClient(mongo_host, mongo_port).upload.files

    monkeypatch.setattr(db.Files, "__init__", _mock_init)

    # Mock configuration parsing
    def _mock_conf(fpath):
        if not os.path.isfile(fpath):
            fpath = "include/etc/upload_rest_api.conf"

        conf = run_path(fpath)
        conf["METAX_PASSWORD"] = PASSWORD
        conf["UPLOAD_PATH"] = app.config.get("UPLOAD_PATH")
        conf["CLEANUP_TIMELIM"] = -1
        return conf

    monkeypatch.setattr(clean, "parse_conf", _mock_conf)

    files = db.Database().files

    # ----- Inserting fake identifiers to Mongo and cleaning them
    fake_documents = [
        {"_id": "pid:urn:1", "file_path": "1"},
        {"_id": "pid:urn:2", "file_path": "2"}
    ]
    files.insert(fake_documents)
    assert len(files.get_all_ids()) == 2

    clean.clean_mongo()
    assert not files.get_all_ids()

    # Upload integration.zip, which is extracted by the server
    poll_response = _upload_file(
        test_client, "/v1/archives", test_auth, "tests/data/integration.zip"
    )
    response = background_job_runner(test_client, "upload", poll_response)
    assert response.status_code == 200

    # Generate and POST metadata for all the files in test_project
    poll_response = test_client.post("/v1/metadata/*", headers=test_auth)
    response = background_job_runner(test_client, "metadata", poll_response)
    assert response.status_code == 200

    # Check that generated identifiers were added to Mongo
    assert len(files.get_all_ids()) == 2

    # Check that generated file_paths resolve to actual files
    for file_doc in files.files.find():
        assert os.path.isfile(file_doc["file_path"])

    # Try to clean file documents that still exist in Metax
    clean.clean_mongo()
    assert len(files.get_all_ids()) == 2