def full_model_run_task(run_id: str, burn_in: int, sample_size: int, quiet: bool):
    # Set up directories for output data.
    recreate_dir(FULL_RUN_DATA_DIR)
    s3_client = get_s3_client()

    # Find the calibration chain databases in AWS S3.
    key_prefix = os.path.join(run_id, os.path.relpath(CALIBRATE_DATA_DIR, REMOTE_BASE_DIR))
    chain_db_keys = list_s3(s3_client, key_prefix, key_suffix=".parquet")
    chain_db_keys = [k for k in chain_db_keys if any(t in k for t in TABLES_TO_DOWNLOAD)]

    # Download the calibration chain databases.
    with Timer("Downloading calibration data"):
        for src_key in chain_db_keys:
            download_from_run_s3(s3_client, run_id, src_key, quiet)

    # Run the models for the full time period plus all scenarios.
    db_paths = db.load.find_db_paths(CALIBRATE_DATA_DIR)
    chain_ids = [int(p.split("/")[-1].split("-")[-1]) for p in db_paths]
    num_chains = len(chain_ids)
    with Timer(f"Running full models for {num_chains} chains: {chain_ids}"):
        args_list = [
            (run_id, db_path, chain_id, burn_in, sample_size, quiet)
            for chain_id, db_path in zip(chain_ids, db_paths)
        ]
        chain_ids = run_parallel_tasks(run_full_model_for_chain, args_list)

    # Upload the full model run outputs to AWS S3.
    db_paths = db.load.find_db_paths(FULL_RUN_DATA_DIR)
    with Timer("Uploading full model run data to AWS S3"):
        for db_path in db_paths:
            upload_to_run_s3(s3_client, run_id, db_path, quiet)
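# The task above fans the per-chain model runs out through `run_parallel_tasks`,
# whose implementation isn't shown here. A minimal sketch of what such a helper
# might look like, assuming a simple process pool and that each worker function
# accepts its arguments as a tuple (the name and signature below are hypothetical):
import multiprocessing
from typing import Callable, List, Tuple


def run_parallel_tasks_sketch(func: Callable, args_list: List[Tuple]) -> list:
    # Run `func` once per argument tuple, one worker process per task,
    # and return the results in the same order as `args_list`.
    with multiprocessing.Pool(processes=len(args_list)) as pool:
        return pool.starmap(func, args_list)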
def get(self, request):
    try:
        cog = Cognito()
        cog.delete_user(request.headers['Authorization'])

        # Delete the user from the DB (CASCADE, so child rows are removed as well).
        user = self.get_object(request.data['user_id'])
        user.delete()

        # Wipe the user's folders in the S3 bucket.
        s3_client = get_s3_client(
            request.headers['Access-Key-Id'],
            request.headers['Secret-Key'],
            request.headers['Session-Token'],
        )
        delete_folder_file(s3_client, '/'.join([request.data['user_id'], '']))
        delete_folder_file(s3_client, '/'.join(['trash', request.data['user_id'], '']))

        return Response(status=status.HTTP_200_OK)
    # Requires an ACCESS_TOKEN (the user must be logged in).
    except Exception as e:
        return Response(str(e), status=status.HTTP_401_UNAUTHORIZED)
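# The view above depends on two helpers whose bodies aren't shown: `get_s3_client`,
# which builds a boto3 client from the caller's credentials, and `delete_folder_file`,
# which clears everything under a key prefix. A minimal sketch using standard boto3
# calls; the bucket is passed explicitly here for self-containment, and the names
# and signatures are assumptions rather than the project's actual helpers.
import boto3


def get_s3_client_sketch(access_key_id, secret_access_key, session_token=None):
    # Build an S3 client from the supplied (possibly temporary) credentials.
    return boto3.client(
        's3',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        aws_session_token=session_token,
    )


def delete_folder_file_sketch(s3_client, bucket: str, prefix: str) -> None:
    # List every object under the prefix and delete them page by page
    # (list_objects_v2 pages hold at most 1000 keys, the delete_objects limit).
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        objects = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
        if objects:
            s3_client.delete_objects(Bucket=bucket, Delete={'Objects': objects})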
def calibrate_task(run_id: str, runtime: float, num_chains: int, verbose: bool):
    s3_client = get_s3_client()

    # Set up directories for plots and output data.
    with Timer("Creating calibration directories"):
        for dirpath in CALIBRATE_DIRS:
            recreate_dir(dirpath)

    # Run the actual calibrations.
    with Timer(f"Running {num_chains} calibration chains"):
        args_list = [
            (run_id, runtime, chain_id, num_chains, verbose) for chain_id in range(num_chains)
        ]
        chain_ids = run_parallel_tasks(run_calibration_chain, args_list)

    # Upload the calibration outputs to AWS S3.
    with Timer("Uploading calibration data to AWS S3"):
        for chain_id in chain_ids:
            with Timer(f"Uploading data for chain {chain_id} to AWS S3"):
                src_dir = os.path.join(CALIBRATE_DATA_DIR, f"chain-{chain_id}")
                upload_to_run_s3(s3_client, run_id, src_dir, quiet=not verbose)

    # Create plots from the calibration outputs.
    with Timer("Creating post-calibration plots"):
        app_region = get_app_region(run_id)
        plots.calibration.plot_post_calibration(
            app_region.targets, CALIBRATE_DATA_DIR, CALIBRATE_PLOTS_DIR, priors=[None]
        )

    # Upload the plots to AWS S3.
    with Timer("Uploading plots to AWS S3"):
        upload_to_run_s3(s3_client, run_id, CALIBRATE_PLOTS_DIR, quiet=not verbose)

    # Find the MLE parameter set from all the chains.
    with Timer("Finding maximum likelihood estimate params"):
        database_paths = db.load.find_db_paths(CALIBRATE_DATA_DIR)
        with TemporaryDirectory() as tmp_dir_path:
            collated_db_path = os.path.join(tmp_dir_path, "collated.db")
            db.process.collate_databases(
                database_paths, collated_db_path, tables=["mcmc_run", "mcmc_params"]
            )
            db.store.save_mle_params(collated_db_path, MLE_PARAMS_PATH)

    # Upload the MLE parameter set to AWS S3.
    with Timer("Uploading maximum likelihood estimate params to AWS S3"):
        upload_to_run_s3(s3_client, run_id, MLE_PARAMS_PATH, quiet=not verbose)
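# The MLE step above keys off the `mcmc_run` and `mcmc_params` tables collated into
# a temporary database. As a rough illustration of the idea only (not the actual
# db.store.save_mle_params implementation): pick the run with the highest
# loglikelihood, then keep the parameter rows belonging to that chain and run.
import pandas as pd


def find_mle_params_sketch(mcmc_run_df: pd.DataFrame, mcmc_params_df: pd.DataFrame) -> pd.DataFrame:
    # Illustrative only: locate the maximum-likelihood (chain, run) pair
    # and filter the params table down to it.
    mle_row = mcmc_run_df.loc[mcmc_run_df["loglikelihood"].idxmax()]
    is_mle = (mcmc_params_df["chain"] == mle_row["chain"]) & (mcmc_params_df["run"] == mle_row["run"])
    return mcmc_params_df[is_mle]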
def post(self, request):
    try:
        # Ensure the required fields are present (a missing key raises and returns a 400).
        request.data['user_id']
        request.data['user_password']
        request.data['confirm_user_password']
        request.data['user_email']

        # Reject a blank email.
        if request.data['user_email'] == '':
            return Response(status=status.HTTP_400_BAD_REQUEST)

        # Reject a mismatched password confirmation.
        if request.data['user_password'] != request.data['confirm_user_password']:
            return Response(status=status.HTTP_400_BAD_REQUEST)

        # Sign up via Cognito.
        cog = Cognito()
        cog.sign_up(
            request.data['user_id'],
            request.data['user_password'],
            [
                {'Name': 'email', 'Value': request.data['user_email']},
            ],
        )

        # Save the user's info (collection ID) in the DB.
        serializers = UserSerializer(
            data={
                'user_id': request.data['user_id'],
                'collection_id': 'col_' + request.data['user_id'],
            }
        )
        if serializers.is_valid():
            serializers.save()

        # Create the user's root and trash folders in the DB.
        serializers = FolderSerializer(
            data=[
                {
                    'user_id': request.data['user_id'],
                    'name': request.data['user_id'],
                    'path': '',
                },
                {
                    'user_id': request.data['user_id'],
                    'name': request.data['user_id'],
                    'path': 'trash/',
                },
            ],
            many=True,
        )
        if not serializers.is_valid():
            return Response(
                serializers.errors,
                content_type="application/json",
                status=status.HTTP_400_BAD_REQUEST,
            )
        serializers.save()

        # Create the user's root and trash folders in S3.
        # Create the S3 client.
        s3_client = get_s3_client(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        # Create the key for the root folder in S3.
        upload_folder(s3_client, "/".join([request.data['user_id'], '']))
        # Create the key for the trash folder in S3.
        upload_folder(s3_client, "/".join(['trash', request.data['user_id'], '']))

        # A verification message is sent to the user's email address.
        return Response(status=status.HTTP_201_CREATED)
    except Exception as e:
        return Response(str(e), status=status.HTTP_400_BAD_REQUEST)
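# The sign-up view above relies on an `upload_folder` helper to create the folder
# keys; its body isn't shown. S3 has no real directories, so a "folder" is usually
# just a zero-byte object whose key ends with a trailing slash. A minimal sketch,
# with the bucket passed explicitly; the real helper and its settings may differ.
def upload_folder_sketch(s3_client, bucket: str, folder_key: str) -> None:
    # Create an empty object so the key prefix shows up as a folder in the S3 console.
    s3_client.put_object(Bucket=bucket, Key=folder_key, Body=b'')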
def powerbi_task(run_id: str, quiet: bool):
    s3_client = get_s3_client()

    # Set up directories for plots and output data.
    with Timer("Creating PowerBI directories"):
        for dirpath in POWERBI_DIRS:
            if os.path.exists(dirpath):
                shutil.rmtree(dirpath)
            os.makedirs(dirpath)

    # Find the full model run databases in AWS S3.
    key_prefix = os.path.join(run_id, os.path.relpath(FULL_RUN_DATA_DIR, REMOTE_BASE_DIR))
    chain_db_keys = list_s3(s3_client, key_prefix, key_suffix=".feather")

    # Download the full model run databases.
    with Timer("Downloading full model run data"):
        for src_key in chain_db_keys:
            download_from_run_s3(s3_client, run_id, src_key, quiet)

    # Remove unnecessary data from each full model run database.
    full_db_paths = db.load.find_db_paths(FULL_RUN_DATA_DIR)
    with Timer("Pruning chain databases"):
        get_dest_path = lambda p: os.path.join(POWERBI_PRUNED_DIR, os.path.basename(p))
        for full_db_path in full_db_paths:
            db.process.prune_chain(full_db_path, get_dest_path(full_db_path))

    # Collate data from each pruned full model run database into a single database.
    pruned_db_paths = db.load.find_db_paths(POWERBI_PRUNED_DIR)
    with Timer("Collating pruned databases"):
        db.process.collate_databases(pruned_db_paths, POWERBI_COLLATED_PATH)

    # Calculate uncertainty for model outputs.
    app_region = get_app_region(run_id)
    with Timer("Calculating uncertainty quantiles"):
        db.uncertainty.add_uncertainty_quantiles(POWERBI_COLLATED_PATH, app_region.targets)

    # Remove unnecessary data from the database.
    with Timer("Pruning final database"):
        db.process.prune_final(POWERBI_COLLATED_PATH, POWERBI_COLLATED_PRUNED_PATH)

    # Unpivot database tables so that they're easier to process in PowerBI.
    run_slug = run_id.replace("/", "-")
    dest_db_path = os.path.join(POWERBI_DATA_DIR, f"powerbi-{run_slug}.db")
    with Timer("Applying PowerBI-specific post-processing to the final database"):
        db.process.powerbi_postprocess(POWERBI_COLLATED_PRUNED_PATH, dest_db_path, run_id)

    # Upload the final database to AWS S3.
    with Timer("Uploading PowerBI data to AWS S3"):
        upload_to_run_s3(s3_client, run_id, dest_db_path, quiet)

    # Create uncertainty plots.
    with Timer("Creating uncertainty plots"):
        plots.uncertainty.plot_uncertainty(app_region.targets, dest_db_path, POWERBI_PLOT_DIR)

    # Upload the plots to AWS S3.
    with Timer("Uploading plots to AWS S3"):
        upload_to_run_s3(s3_client, run_id, POWERBI_PLOT_DIR, quiet)
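# Together with the calibration and full-run tasks above, this PowerBI task forms
# the final stage of the pipeline: calibration outputs feed the full model runs,
# and the full-run databases feed the PowerBI post-processing. A hypothetical
# driver showing the order in which the three stages might be chained for a single
# run; the entry point name and the flag values are assumptions.
def run_pipeline_sketch(run_id: str) -> None:
    calibrate_task(run_id, runtime=3600.0, num_chains=4, verbose=False)
    full_model_run_task(run_id, burn_in=50, sample_size=100, quiet=True)
    powerbi_task(run_id, quiet=True)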
def test_full_model_run_task(monkeypatch, tmpdir):
    """
    Test the full model run task.
    """
    # Ensure data is read/written to a transient test directory.
    test_full_data_dir = os.path.join(tmpdir, "data", "full_model_runs")
    test_calibration_data_dir = os.path.join(tmpdir, "data", "calibration_outputs")
    monkeypatch.setattr(full, "REMOTE_BASE_DIR", tmpdir)
    monkeypatch.setattr(full, "FULL_RUN_DATA_DIR", test_full_data_dir)
    monkeypatch.setattr(full, "CALIBRATE_DATA_DIR", test_calibration_data_dir)
    monkeypatch.setattr(s3_settings, "REMOTE_BASE_DIR", tmpdir)
    monkeypatch.setattr(s3_settings, "S3_BUCKET", BUCKET_NAME)

    # Ignore logging config for now.
    monkeypatch.setattr(full, "set_logging_config", lambda *args: None)

    # Create a calibration database as input to the full model run.
    test_db_path = os.path.join(test_calibration_data_dir, "chain-0")
    calib_db = ParquetDatabase(test_db_path)
    mcmc_run_columns = ["accept", "ap_loglikelihood", "chain", "loglikelihood", "run", "weight"]
    mcmc_run_rows = [
        # NB: ap_loglikelihood not used so we can ignore.
        [1, 0.0, 0, -110.0, 0, 1],
        [1, 0.0, 0, -101.0, 1, 2],
        [0, 0.0, 0, -102.0, 2, 0],
        [1, 0.0, 0, -103.2, 3, 4],
        [0, 0.0, 0, -102.1, 4, 0],
        [0, 0.0, 0, -101.4, 5, 0],
        [0, 0.0, 0, -101.6, 6, 0],
        [1, 0.0, 0, -100.0, 7, 2],  # Maximum likelihood run (MLE)
        [0, 0.0, 0, -103.1, 8, 0],
        [1, 0.0, 0, -100.1, 9, 1],
        [1, 0.0, 0, -100.2, 10, 1],
    ]
    mcmc_run_df = pd.DataFrame(mcmc_run_rows, columns=mcmc_run_columns)
    calib_db.dump_df(Table.MCMC, mcmc_run_df)

    mcmc_param_columns = ["chain", "name", "run", "value"]
    mcmc_param_rows = [
        [0, "recovery_rate", 0, 0.0],
        [0, "recovery_rate", 1, 0.1],
        [0, "recovery_rate", 2, 0.2],
        [0, "recovery_rate", 3, 0.3],
        [0, "recovery_rate", 4, 0.4],
        [0, "recovery_rate", 5, 0.5],
        [0, "recovery_rate", 6, 0.6],
        [0, "recovery_rate", 7, 0.7],  # Maximum likelihood run (MLE)
        [0, "recovery_rate", 8, 0.8],
        [0, "recovery_rate", 9, 0.9],
        [0, "recovery_rate", 10, 1.0],
    ]
    mcmc_param_df = pd.DataFrame(mcmc_param_rows, columns=mcmc_param_columns)
    calib_db.dump_df(Table.PARAMS, mcmc_param_df)

    # Upload calibration database to mock AWS S3, then delete local copy.
    s3 = get_s3_client()
    s3.create_bucket(
        Bucket=BUCKET_NAME,
        CreateBucketConfiguration={"LocationConstraint": s3_settings.AWS_REGION},
    )
    upload_to_run_s3(s3, TEST_RUN_ID, test_db_path, quiet=True)
    recreate_dir(test_calibration_data_dir)

    # Ensure our test model is being run.
    def get_app_region(run_id):
        assert run_id == TEST_RUN_ID
        return MockAppRegion()

    monkeypatch.setattr(full, "get_app_region", get_app_region)

    # Run the full model task.
    full_model_run_task(run_id=TEST_RUN_ID, burn_in=2, sample_size=3, quiet=True)

    # Delete local data, download AWS S3 data and check the results.
    recreate_dir(test_full_data_dir)
    key_prefix = os.path.join(TEST_RUN_ID, "data", "full_model_runs")
    chain_db_keys = list_s3(s3, key_prefix, key_suffix=".feather")
    for src_key in chain_db_keys:
        download_from_run_s3(s3, TEST_RUN_ID, src_key, quiet=True)

    full_db_path = os.path.join(test_full_data_dir, "chain-0")
    full_db = FeatherDatabase(full_db_path)
    assert set(full_db.table_names()) == {"outputs", "mcmc_run", "derived_outputs", "mcmc_params"}

    # Expect MCMC params table to be unchanged, other than 1st 2 runs burned in.
    full_mcmc_params_df = full_db.query("mcmc_params")
    assert_frame_equal(full_mcmc_params_df, mcmc_param_df[2:].reset_index(drop=True))

    # Expect MCMC run table to now include 'sampled' and 'parent' columns.
    full_mcmc_run_df = full_db.query("mcmc_run")
    full_mcmc_run_columns = [
        "accept",
        "ap_loglikelihood",
        "chain",
        "loglikelihood",
        "run",
        "weight",
        "sampled",
        "parent",
    ]
    full_mcmc_run_rows = [
        # Expect runs 0 and 1 to be 'burned in'.
        # Expect 'sampled' column to sample 3 / 9 remaining runs.
        # Expect 'parent' column to correctly track run id of last accepted run.
        [0, 0.0, 0, -102.0, 2, 0, 0, 1],
        [1, 0.0, 0, -103.2, 3, 4, 0, 3],
        [0, 0.0, 0, -102.1, 4, 0, 1, 3],
        [0, 0.0, 0, -101.4, 5, 0, 0, 3],
        [0, 0.0, 0, -101.6, 6, 0, 0, 3],
        [1, 0.0, 0, -100.0, 7, 2, 1, 7],  # Maximum likelihood run (MLE)
        [0, 0.0, 0, -103.1, 8, 0, 0, 7],
        [1, 0.0, 0, -100.1, 9, 1, 0, 9],
        [1, 0.0, 0, -100.2, 10, 1, 1, 10],
    ]
    expected_full_mcmc_run_df = pd.DataFrame(full_mcmc_run_rows, columns=full_mcmc_run_columns)
    assert_frame_equal(full_mcmc_run_df, expected_full_mcmc_run_df)