Example #1
def full_model_run_task(run_id: str, burn_in: int, sample_size: int, quiet: bool):
    # Set up directories for output data.
    recreate_dir(FULL_RUN_DATA_DIR)

    s3_client = get_s3_client()

    # Find the calibration chain databases in AWS S3.
    key_prefix = os.path.join(run_id, os.path.relpath(CALIBRATE_DATA_DIR, REMOTE_BASE_DIR))
    chain_db_keys = list_s3(s3_client, key_prefix, key_suffix=".parquet")
    chain_db_keys = [k for k in chain_db_keys if any([t in k for t in TABLES_TO_DOWNLOAD])]

    # Download the calibration chain databases.
    with Timer(f"Downloading calibration data"):
        for src_key in chain_db_keys:
            download_from_run_s3(s3_client, run_id, src_key, quiet)

    # Run the models for the full time period plus all scenarios.
    db_paths = db.load.find_db_paths(CALIBRATE_DATA_DIR)
    chain_ids = [int(p.split("/")[-1].split("-")[-1]) for p in db_paths]
    num_chains = len(chain_ids)
    with Timer(f"Running full models for {num_chains} chains: {chain_ids}"):
        args_list = [
            (run_id, db_path, chain_id, burn_in, sample_size, quiet)
            for chain_id, db_path in zip(chain_ids, db_paths)
        ]
        chain_ids = run_parallel_tasks(run_full_model_for_chain, args_list)

    # Upload the full model run outputs to AWS S3.
    db_paths = db.load.find_db_paths(FULL_RUN_DATA_DIR)
    with Timer(f"Uploading full model run data to AWS S3"):
        for db_path in db_paths:
            upload_to_run_s3(s3_client, run_id, db_path, quiet)
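
A minimal invocation sketch follows; the run_id value and the argument choices below are illustrative assumptions, not values taken from the original project.

# Illustrative only: the run_id format, burn-in and sample size are assumptions.
full_model_run_task(
    run_id="covid_19/manila/1589334000/abc1234",
    burn_in=50,       # discard the first 50 MCMC iterations of each chain
    sample_size=100,  # number of post-burn-in runs to sample per chain
    quiet=True,       # suppress per-file S3 transfer logging
)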
Example #2
    def get(self, request):
        try:
            cog = Cognito()
            cog.delete_user(request.headers['Authorization'])

            # Delete the user from the DB (CASCADE handles any child rows).
            user = self.get_object(request.data['user_id'])
            user.delete()

            # Delete the user's folder from the S3 bucket.
            s3_client = get_s3_client(
                request.headers['Access-Key-Id'],
                request.headers['Secret-Key'],
                request.headers['Session-Token'],
            )
            delete_folder_file(s3_client,
                               '/'.join([request.data['user_id'], '']))
            delete_folder_file(
                s3_client, '/'.join(['trash', request.data['user_id'], '']))

            return Response(status=status.HTTP_200_OK)

        # An access token is required: the user must be logged in.
        except Exception as e:
            return Response(str(e), status=status.HTTP_401_UNAUTHORIZED)
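
The helpers get_s3_client and delete_folder_file are not shown in this snippet. A minimal sketch of what they might look like with boto3 is given below; the bucket name, the module layout and the exact signatures are assumptions.

import boto3

BUCKET_NAME = "my-storage-bucket"  # hypothetical bucket name


def get_s3_client(access_key_id, secret_key, session_token=None):
    # Build an S3 client from the caller-supplied (temporary) credentials.
    return boto3.client(
        "s3",
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_key,
        aws_session_token=session_token,
    )


def delete_folder_file(s3_client, prefix):
    # Delete every object stored under the given key prefix ("folder").
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix=prefix):
        objects = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
        if objects:
            s3_client.delete_objects(Bucket=BUCKET_NAME,
                                     Delete={"Objects": objects})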
Example #3
def calibrate_task(run_id: str, runtime: float, num_chains: int,
                   verbose: bool):
    s3_client = get_s3_client()

    # Set up directories for plots and output data.
    with Timer(f"Creating calibration directories"):
        for dirpath in CALIBRATE_DIRS:
            recreate_dir(dirpath)

    # Run the actual calibrations
    with Timer(f"Running {num_chains} calibration chains"):
        args_list = [(run_id, runtime, chain_id, num_chains, verbose)
                     for chain_id in range(num_chains)]
        chain_ids = run_parallel_tasks(run_calibration_chain, args_list)

    # Upload the calibration outputs to AWS S3.
    with Timer(f"Uploading calibration data to AWS S3"):
        for chain_id in chain_ids:
            with Timer(f"Uploading data for chain {chain_id} to AWS S3"):
                src_dir = os.path.join(CALIBRATE_DATA_DIR, f"chain-{chain_id}")
                upload_to_run_s3(s3_client, run_id, src_dir, quiet=not verbose)

    # Create plots from the calibration outputs.
    with Timer(f"Creating post-calibration plots"):
        app_region = get_app_region(run_id)
        plots.calibration.plot_post_calibration(app_region.targets,
                                                CALIBRATE_DATA_DIR,
                                                CALIBRATE_PLOTS_DIR,
                                                priors=[None])

    # Upload the plots to AWS S3.
    with Timer(f"Uploading plots to AWS S3"):
        upload_to_run_s3(s3_client,
                         run_id,
                         CALIBRATE_PLOTS_DIR,
                         quiet=not verbose)

    # Find the MLE parameter set from all the chains.
    with Timer(f"Finding max likelihood esitmate params"):
        database_paths = db.load.find_db_paths(CALIBRATE_DATA_DIR)
        with TemporaryDirectory() as tmp_dir_path:
            collated_db_path = os.path.join(tmp_dir_path, "collated.db")
            db.process.collate_databases(database_paths,
                                         collated_db_path,
                                         tables=["mcmc_run", "mcmc_params"])
            db.store.save_mle_params(collated_db_path, MLE_PARAMS_PATH)

    # Upload the MLE parameter set to AWS S3.
    with Timer(f"Uploading max likelihood esitmate params to AWS S3"):
        upload_to_run_s3(s3_client, run_id, MLE_PARAMS_PATH, quiet=not verbose)
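
run_parallel_tasks appears in both the calibration and full model run tasks but is not shown. The sketch below, built on concurrent.futures, is one plausible implementation and an assumption; the original may use multiprocessing or a job queue instead.

from concurrent.futures import ProcessPoolExecutor, as_completed


def run_parallel_tasks(task_func, args_list):
    # Run task_func once per argument tuple, each in its own process,
    # and collect the return values (chain ids) as the tasks finish.
    results = []
    with ProcessPoolExecutor(max_workers=len(args_list)) as executor:
        futures = [executor.submit(task_func, *args) for args in args_list]
        for future in as_completed(futures):
            results.append(future.result())

    return results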
Example #4
    def post(self, request):
        try:
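            # Touching the required fields up front raises a KeyError,
            # caught below and returned as a 400 response, if any is missing.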
            request.data['user_id']
            request.data['user_password']
            request.data['confirm_user_password']
            request.data['user_email']

            # If the email is blank, reject the request.
            if (request.data['user_email'] == ''):
                return Response(status=status.HTTP_400_BAD_REQUEST)

            # If the password confirmation does not match, reject the request.
            if (request.data['user_password'] !=
                    request.data['confirm_user_password']):
                return Response(status=status.HTTP_400_BAD_REQUEST)

            # Sign the user up via Cognito.
            cog = Cognito()
            cog.sign_up(request.data['user_id'], request.data['user_password'],
                        [
                            {
                                'Name': 'email',
                                'Value': request.data['user_email']
                            },
                        ])

            # Save the user's info (including the collection ID) to the DB.
            serializers = UserSerializer(
                data={
                    'user_id': request.data['user_id'],
                    'collection_id': 'col_' + request.data['user_id']
                })
            if serializers.is_valid():
                serializers.save()

            # Create the user's root and trash folders in the DB.
            serializers = FolderSerializer(
                data=[
                    {
                        'user_id': request.data['user_id'],
                        'name': request.data['user_id'],
                        'path': '',
                    },
                    {
                        'user_id': request.data['user_id'],
                        'name': request.data['user_id'],
                        'path': 'trash/',
                    },
                ],
                many=True,
            )
            if not serializers.is_valid():
                return Response(serializers.errors,
                                content_type="application/json",
                                status=status.HTTP_400_BAD_REQUEST)
            serializers.save()

            # Create the user's root and trash folders in S3.
            # Create the S3 client.
            s3_client = get_s3_client(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

            # Create the key corresponding to the root folder in S3.
            upload_folder(s3_client, "/".join([request.data['user_id'], '']))

            # Create the key corresponding to the trash folder in S3.
            upload_folder(s3_client,
                          "/".join(['trash', request.data['user_id'], '']))

            # A verification email has been sent to the user.
            return Response(status=status.HTTP_201_CREATED)

        except Exception as e:
            return Response(str(e), status=status.HTTP_400_BAD_REQUEST)
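
upload_folder is not defined in this snippet. Since S3 has no real directories, a minimal sketch (reusing the hypothetical BUCKET_NAME from the sketch after Example #2) would create a zero-byte object whose key ends with a slash:

def upload_folder(s3_client, folder_key):
    # S3 has no real folders: a zero-byte object whose key ends with "/"
    # makes the prefix appear as a folder in the S3 console.
    s3_client.put_object(Bucket=BUCKET_NAME, Key=folder_key, Body=b"")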
Example #5
def powerbi_task(run_id: str, quiet: bool):
    s3_client = get_s3_client()

    # Set up directories for plots and output data.
    with Timer(f"Creating PowerBI directories"):
        for dirpath in POWERBI_DIRS:
            if os.path.exists(dirpath):
                shutil.rmtree(dirpath)

            os.makedirs(dirpath)

    # Find the full model run databases in AWS S3.
    key_prefix = os.path.join(
        run_id, os.path.relpath(FULL_RUN_DATA_DIR, REMOTE_BASE_DIR))
    chain_db_keys = list_s3(s3_client, key_prefix, key_suffix=".feather")

    # Download the full model run databases.
    with Timer(f"Downloading full model run data"):
        for src_key in chain_db_keys:
            download_from_run_s3(s3_client, run_id, src_key, quiet)

    # Remove unnecessary data from each full model run database.
    full_db_paths = db.load.find_db_paths(FULL_RUN_DATA_DIR)
    with Timer(f"Pruning chain databases"):
        def get_dest_path(p):
            return os.path.join(POWERBI_PRUNED_DIR, os.path.basename(p))

        for full_db_path in full_db_paths:
            db.process.prune_chain(full_db_path, get_dest_path(full_db_path))

    # Collate data from each pruned full model run database into a single database.
    pruned_db_paths = db.load.find_db_paths(POWERBI_PRUNED_DIR)
    with Timer(f"Collating pruned databases"):
        db.process.collate_databases(pruned_db_paths, POWERBI_COLLATED_PATH)

    # Calculate uncertainty for model outputs.
    app_region = get_app_region(run_id)
    with Timer(f"Calculating uncertainty quartiles"):
        db.uncertainty.add_uncertainty_quantiles(POWERBI_COLLATED_PATH,
                                                 app_region.targets)

    # Remove unnecessary data from the database.
    with Timer(f"Pruning final database"):
        db.process.prune_final(POWERBI_COLLATED_PATH,
                               POWERBI_COLLATED_PRUNED_PATH)

    # Unpivot database tables so that they're easier to process in PowerBI.
    run_slug = run_id.replace("/", "-")
    dest_db_path = os.path.join(POWERBI_DATA_DIR, f"powerbi-{run_slug}.db")
    with Timer(f"Applying PowerBI specific post-processing final database"):
        db.process.powerbi_postprocess(POWERBI_COLLATED_PRUNED_PATH,
                                       dest_db_path, run_id)

    # Upload the final database to AWS S3.
    with Timer(f"Uploading PowerBI data to AWS S3"):
        upload_to_run_s3(s3_client, run_id, dest_db_path, quiet)

    # Create uncertainty plots
    with Timer(f"Creating uncertainty plots"):
        plots.uncertainty.plot_uncertainty(app_region.targets, dest_db_path,
                                           POWERBI_PLOT_DIR)

    # Upload the plots to AWS S3.
    with Timer(f"Uploading plots to AWS S3"):
        upload_to_run_s3(s3_client, run_id, POWERBI_PLOT_DIR, quiet)
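
The list_s3 helper used here and in the full model run task filters object keys by suffix. A minimal sketch with a boto3 paginator follows; the import path and the use of s3_settings.S3_BUCKET for the bucket name are assumptions based on the test in Example #6 below.

from tasks import settings as s3_settings  # hypothetical import path


def list_s3(s3_client, key_prefix, key_suffix=""):
    # List every object key under key_prefix that ends with key_suffix,
    # e.g. ".feather" or ".parquet".
    keys = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=s3_settings.S3_BUCKET,
                                   Prefix=key_prefix):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(key_suffix):
                keys.append(obj["Key"])

    return keys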
Example #6
def test_full_model_run_task(monkeypatch, tmpdir):
    """
    Test the full model run task.
    """
    # Ensure data is read/written to a transient test directory
    test_full_data_dir = os.path.join(tmpdir, "data", "full_model_runs")
    test_calibration_data_dir = os.path.join(tmpdir, "data",
                                             "calibration_outputs")
    monkeypatch.setattr(full, "REMOTE_BASE_DIR", tmpdir)
    monkeypatch.setattr(full, "FULL_RUN_DATA_DIR", test_full_data_dir)
    monkeypatch.setattr(full, "CALIBRATE_DATA_DIR", test_calibration_data_dir)
    monkeypatch.setattr(s3_settings, "REMOTE_BASE_DIR", tmpdir)
    monkeypatch.setattr(s3_settings, "S3_BUCKET", BUCKET_NAME)

    # Ignore logging config for now
    monkeypatch.setattr(full, "set_logging_config", lambda *args: None)

    # Create a calibration database as input to the full model run
    test_db_path = os.path.join(test_calibration_data_dir, "chain-0")
    calib_db = ParquetDatabase(test_db_path)
    mcmc_run_columns = [
        "accept", "ap_loglikelihood", "chain", "loglikelihood", "run", "weight"
    ]
    mcmc_run_rows = [
        # NB: ap_loglikelihood is not used, so its values can be ignored.
        [1, 0.0, 0, -110.0, 0, 1],
        [1, 0.0, 0, -101.0, 1, 2],
        [0, 0.0, 0, -102.0, 2, 0],
        [1, 0.0, 0, -103.2, 3, 4],
        [0, 0.0, 0, -102.1, 4, 0],
        [0, 0.0, 0, -101.4, 5, 0],
        [0, 0.0, 0, -101.6, 6, 0],
        [1, 0.0, 0, -100.0, 7, 2],  # Maximum likelihood run (MLE)
        [0, 0.0, 0, -103.1, 8, 0],
        [1, 0.0, 0, -100.1, 9, 1],
        [1, 0.0, 0, -100.2, 10, 1],
    ]
    mcmc_run_df = pd.DataFrame(mcmc_run_rows, columns=mcmc_run_columns)
    calib_db.dump_df(Table.MCMC, mcmc_run_df)

    mcmc_param_columns = ["chain", "name", "run", "value"]
    mcmc_param_rows = [
        [0, "recovery_rate", 0, 0.0],
        [0, "recovery_rate", 1, 0.1],
        [0, "recovery_rate", 2, 0.2],
        [0, "recovery_rate", 3, 0.3],
        [0, "recovery_rate", 4, 0.4],
        [0, "recovery_rate", 5, 0.5],
        [0, "recovery_rate", 6, 0.6],
        [0, "recovery_rate", 7, 0.7],  # Maximum likelihood run (MLE)
        [0, "recovery_rate", 8, 0.8],
        [0, "recovery_rate", 9, 0.9],
        [0, "recovery_rate", 10, 1.0],
    ]
    mcmc_param_df = pd.DataFrame(mcmc_param_rows, columns=mcmc_param_columns)
    calib_db.dump_df(Table.PARAMS, mcmc_param_df)

    # Upload calibration database to mock AWS S3, then delete local copy
    s3 = get_s3_client()
    s3.create_bucket(Bucket=BUCKET_NAME,
                     CreateBucketConfiguration={
                         "LocationConstraint": s3_settings.AWS_REGION
                     })
    upload_to_run_s3(s3, TEST_RUN_ID, test_db_path, quiet=True)
    recreate_dir(test_calibration_data_dir)

    # Ensure our test model is being run.
    def get_app_region(run_id):
        assert run_id == TEST_RUN_ID
        return MockAppRegion()

    monkeypatch.setattr(full, "get_app_region", get_app_region)

    # Run the full model task
    full_model_run_task(run_id=TEST_RUN_ID,
                        burn_in=2,
                        sample_size=3,
                        quiet=True)

    # Delete local data, download AWS S3 data and check the results
    recreate_dir(test_full_data_dir)
    key_prefix = os.path.join(TEST_RUN_ID, "data", "full_model_runs")
    chain_db_keys = list_s3(s3, key_prefix, key_suffix=".feather")
    for src_key in chain_db_keys:
        download_from_run_s3(s3, TEST_RUN_ID, src_key, quiet=True)

    full_db_path = os.path.join(test_full_data_dir, "chain-0")
    full_db = FeatherDatabase(full_db_path)
    assert set(full_db.table_names()) == {
        "outputs", "mcmc_run", "derived_outputs", "mcmc_params"
    }

    # Expect the MCMC params table to be unchanged, apart from the first
    # two runs being dropped as burn-in.
    full_mcmc_params_df = full_db.query("mcmc_params")
    assert_frame_equal(full_mcmc_params_df,
                       mcmc_param_df[2:].reset_index(drop=True))

    # Expect MCMC run table to now include 'sampled' and 'parent' columns.
    full_mcmc_run_df = full_db.query("mcmc_run")
    full_mcmc_run_columns = [
        "accept",
        "ap_loglikelihood",
        "chain",
        "loglikelihood",
        "run",
        "weight",
        "sampled",
        "parent",
    ]
    full_mcmc_run_rows = [
        # Expect runs 0 and 1 to be 'burned in' (dropped).
        # Expect the 'sampled' column to mark 3 of the 9 remaining runs.
        # Expect the 'parent' column to track the run id of the last accepted run.
        [0, 0.0, 0, -102.0, 2, 0, 0, 1],
        [1, 0.0, 0, -103.2, 3, 4, 0, 3],
        [0, 0.0, 0, -102.1, 4, 0, 1, 3],
        [0, 0.0, 0, -101.4, 5, 0, 0, 3],
        [0, 0.0, 0, -101.6, 6, 0, 0, 3],
        [1, 0.0, 0, -100.0, 7, 2, 1, 7],  # Maximum likelihood run (MLE)
        [0, 0.0, 0, -103.1, 8, 0, 0, 7],
        [1, 0.0, 0, -100.1, 9, 1, 0, 9],
        [1, 0.0, 0, -100.2, 10, 1, 1, 10],
    ]
    expected_full_mcmc_run_df = pd.DataFrame(full_mcmc_run_rows,
                                             columns=full_mcmc_run_columns)
    assert_frame_equal(full_mcmc_run_df, expected_full_mcmc_run_df)
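
The test calls get_s3_client() and create_bucket without ever touching real AWS, so the S3 API must be mocked somewhere in the test setup, which is not shown here. One common approach is an autouse pytest fixture built on the moto library; the sketch below is an assumption about that setup, not the project's actual fixture.

import pytest
from moto import mock_s3  # moto < 5.0; newer releases expose mock_aws instead


@pytest.fixture(autouse=True)
def mocked_s3():
    # Intercept all boto3 S3 calls for the duration of each test.
    with mock_s3():
        yield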