def main():
    config, model_name, datasets = parse_args()

    for dname in datasets:
        dset = config.datasets[dname]
        outpath = config.get_model_results_path(model_name, dname, "agg",
                                                "csv")

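        # feature window: projection_months of history ending blackout_months before min_left_censor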
        max_date = dset.min_left_censor - relativedelta(
            months=dset.blackout_months)
        min_date = max_date - relativedelta(months=dset.projection_months)
        feature_params = {"left_censor": min_date, "right_censor": max_date}

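        # target window: projection_months starting at min_left_censor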
        left_censor = dset.min_left_censor
        right_censor = left_censor + relativedelta(
            months=dset.projection_months)
        target_params = {
            "left_censor": left_censor,
            "right_censor": right_censor
        }

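        # per-member PMPM predictions from the individual model, keyed by member id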
        indv_pmpm_path = config.get_model_results_path(model_name, dname,
                                                       "indv_pmpm", "json")
        indv_pmpm = utils.load_file(indv_pmpm_path)
        mbrs, preds = zip(*indv_pmpm.items())

        # group-to-split map
        split_map = None
        if dset.is_split:
            split_map = utils.load_file(dset.group_to_split_path)

        seg = Segment.load_from_file(dset.ldm_path, "parquet")
        seg.repartition(2000)
        slice_date = (dset.min_left_censor -
                      relativedelta(months=dset.blackout_months) -
                      relativedelta(days=1)).date()

        print("Processing dataset {}".format(dname))
        group_seg = GroupSegment(df=seg.df, slice_date=slice_date)
        # append indv pmpm predictions
        group_seg.append_member_predictions(mbrs, preds)
        featurizer = get_featurizer(feature_params, target_params)
        df = featurizer.featurize(group_seg).toPandas()

        # add "trend" and "cost_fraction|last_4_month"
        begin = (min_date + relativedelta(days=1)).strftime('%Y-%m-%d')
        left = min_date.strftime('%Y-%m-%d')
        right = max_date.strftime('%Y-%m-%d')
        df["trend"] = ((df[f'Coverage|None|{right}|GroupNumMembers'] -
                        df[f'Coverage|None|{begin}|GroupNumMembers']) /
                       (df[f'Coverage|{left}|{right}|GroupMemberMonths']))
        left_4m = (max_date - relativedelta(months=4)).strftime('%Y-%m-%d')
        df["cost_fraction|last_4_month"] = (
            df[f'Claims|{left_4m}|{right}|GroupAllowedAmount'] /
            (df[f'Claims|{left}|{right}|GroupAllowedAmount'] + 1))

        if dset.is_split:
            df['split'] = df.GROUP.map(split_map)
        df.to_csv(outpath, index=False)
def main():
    config, model_name, dataset_names, overwrite = parse_args()

    model_cache_path = config.get_model_cache_path(model_name, "indv")
    cache = utils.load_file(model_cache_path)

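    # score each dataset with the cached models and write per-member predictions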
    for dname in dataset_names:

        out_fn = config.get_model_results_path(model_name, dname, "indv",
                                               "json")
        if overwrite or utils.validate_path(out_fn):

            path_train_mat = config.get_features_selected_matrix_path(
                dname, model_name)
            orig_dict = load_matrix_data_from_npz(path_train_mat)
            X_test = orig_dict["matrix"]
            mbrs_test = orig_dict["MBR"]

            result_dict = {}
            preds = cache['reg'].predict(X_test)
            preds_fac = cache['reg_fac'].predict(X_test)
            result_dict['reg'] = get_preds_dict(mbrs_test, preds)
            result_dict['reg_fac'] = get_preds_dict(mbrs_test, preds_fac)
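            # run every cached classifier (cache keys prefixed with 'clf')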
            for key in cache:
                if not key.startswith('clf'):
                    continue
                preds = cache[key].predict(X_test)
                result_dict[key] = get_preds_dict(mbrs_test, preds, digits=3)

            print('writing results to', out_fn)
            utils.write_file(result_dict, out_fn)
def run():
    config, model_name, datasets, overwrite = parse_args()

    for dname in datasets:
        dset = config.datasets[dname]
        path_train_mat = config.get_features_selected_matrix_path(
            dname, model_name)
        ind_preds_path = config.get_model_results_path(model_name, dname,
                                                       'indv', 'json')
        base_dest_path = dset.plots_path
        is_gs = base_dest_path.startswith('gs://')
        mat_dict = load_matrix_data_from_npz(path_train_mat)
        ind_preds = utils.load_file(ind_preds_path)

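        # when the dataset is split, compute metrics separately for each of train/test/evaluate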
        if dset.is_split:
            splits = ['train', 'test', 'evaluate']
        else:
            splits = ['all']

        for split in splits:
            if dset.is_split:
                dest_path = os.path.join(base_dest_path, model_name, split)
            else:
                dest_path = os.path.join(base_dest_path, model_name)

            if overwrite or utils.validate_path(dest_path):

                if not is_gs and not os.path.exists(dest_path):
                    os.makedirs(dest_path)

                mask = (mat_dict["splits"] == split)
                X, y_true, mbrs = (mat_dict['matrix'][mask, :],
                                   mat_dict['costs'][mask],
                                   mat_dict['MBR'][mask])
                print('number of target values in {} split'.format(split),
                      len(y_true))
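                # member-level prediction: base cost regression times the projection-factor regression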
                y_pred = np.array([
                    ind_preds['reg'][s] * ind_preds['reg_fac'][s] for s in mbrs
                ])

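                # prior-year baseline: allowed cost over the 12 months ending at the blackout cutoff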
                sd = dset.min_left_censor - relativedelta(
                    months=dset.blackout_months)
                sd_minus_1_yr = sd - relativedelta(years=1)
                right = sd.strftime('%Y-%m-%d')
                left = sd_minus_1_yr.strftime('%Y-%m-%d')
                # no prior-cost baseline if the 1-year window would start before the dataset's min_date
                if left < dset.min_date.strftime('%Y-%m-%d'):
                    y_prior = None
                else:
                    feature_names = mat_dict['column_names']
                    col = f'Claims|{left}|{right}|AllowedCost'
                    idx = np.where(feature_names == col)[0][0]
                    y_prior = X[:, idx].toarray().reshape(-1)

                metrics = get_overall_metrics(y_true, y_prior, y_pred)
                df_metrics = get_thresh_dependent_metrics(
                    y_true, y_prior, ind_preds, mbrs, dest_path)
                utils.write_file(metrics,
                                 os.path.join(dest_path, 'metrics.json'))
                df_metrics.to_csv(os.path.join(dest_path,
                                               'thresh_dependent_metrics.csv'),
                                  index=False)
def run():
    config, model_name, threshs, overwrite = parse_args()

    model = config.models[model_name]
    path_model_cache = config.get_model_cache_path(model_name, "indv")

    seen_dsets = set()
    mat_dicts = {}

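    # include validation datasets so their matrices are loaded; they are used for early stopping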
    datasets = model.train_datasets
    if model.validation_datasets:
        datasets = model.train_datasets + model.validation_datasets

    for ds in datasets:
        dname = ds.dataset_name
        dset = config.datasets[dname]
        if dset not in seen_dsets:
            path_train_mat = config.get_features_selected_matrix_path(
                dname, model_name)
            if overwrite or utils.validate_path(path_model_cache):
                mat_dicts[dset] = load_matrix_data_from_npz(path_train_mat)
            seen_dsets.add(dset)

    if overwrite or utils.validate_path(path_model_cache):

        # featurizer and selected-feature indices are only needed when (re)training
        feature_fn = list(seen_dsets)[0].feature_dicts_path
        featurizer = utils.load_file(feature_fn.replace('jsons', 'joblib'))
        feature_indices = list(mat_dicts.values())[0]["column_indices"]

        # X,y for costs_ref
        X_train, y_train = get_features_targets(config,
                                                model,
                                                mat_dicts,
                                                'costs_ref',
                                                train_sets=True)

        if model.validation_datasets:
            X_valid, y_valid = get_features_targets(config,
                                                    model,
                                                    mat_dicts,
                                                    'costs_ref',
                                                    train_sets=False)
        # X,y for costs_proj_fac
        X_train_fac, y_train_fac = get_features_targets(config,
                                                        model,
                                                        mat_dicts,
                                                        'costs_proj_fac',
                                                        train_sets=True)

        if model.validation_datasets:
            X_valid_fac, y_valid_fac = get_features_targets(config,
                                                            model,
                                                            mat_dicts,
                                                            'costs_proj_fac',
                                                            train_sets=False)

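        # LightGBM hyperparameters; the large iteration budget is only used when a validation set enables early stopping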
        lgb_reg_params = {
            'objective': 'regression',
            'min_gain_to_split': 75,
            'min_data_in_leaf': 100,
            'num_leaves': 100,
            'num_iterations': 5000 if model.validation_datasets else 100
        }

        lgb_clf_params = {
            'objective': 'binary',
            'min_gain_to_split': 75,
            'learning_rate': .01,
            'min_data_in_leaf': 100,
            'num_leaves': 100,
            'num_iterations': 5000 if model.validation_datasets else 250
        }

        valid_params = {'early_stopping_rounds': 5, 'verbose': 10}

        models = {}
        # train costs_ref model
        reg = GBRT("indv_model", model_package="lgbm", params=lgb_reg_params)
        if model.validation_datasets:
            reg.fit(features=X_train,
                    targets=y_train,
                    validation_features=X_valid,
                    validation_targets=y_valid,
                    **valid_params)
        else:
            reg.fit(features=X_train, targets=y_train)
        fimp = reg.model.feature_importances_
        print(f'Number of features found to be important: '
              f'{(fimp != 0).sum():,}')
        models['reg'] = reg

        # train costs_proj_fac
        reg_fac = GBRT("indv_model_fac",
                       model_package="lgbm",
                       params=lgb_reg_params)
        if model.validation_datasets:
            reg_fac.fit(features=X_train_fac,
                        targets=y_train_fac,
                        validation_features=X_valid_fac,
                        validation_targets=y_valid_fac,
                        **valid_params)
        else:
            reg_fac.fit(features=X_train_fac, targets=y_train_fac)
        fimp = reg_fac.model.feature_importances_
        print(f'Number of features found to be important: '
              f'{(fimp != 0).sum():,}')
        models['reg_fac'] = reg_fac

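        # bundle the cost regressor, featurizer, and selected feature indices into a model zip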
        client = Client(project_id='lumiata-internal-6f5a',
                        disable_rest_client=True)
        model_base_dir = os.path.dirname(path_model_cache)
        client.save_model_zip(output_dir=model_base_dir,
                              model=reg,
                              featurizer=featurizer,
                              target_variable="allowed_cost",
                              feature_selection_indices=feature_indices)

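        # one binary high-cost classifier per threshold; the model name encodes the threshold in thousands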
        for thresh in threshs:
            name = int(thresh // 1000)
            targets_train = y_train > thresh
            clf = GBDT(f"clf_model_{name}k",
                       model_package="lgbm",
                       params=lgb_clf_params)
            if model.validation_datasets:
                targets_valid = y_valid > thresh
                clf.fit(features=X_train,
                        targets=targets_train,
                        validation_features=X_valid,
                        validation_targets=targets_valid,
                        **valid_params)
            else:
                clf.fit(features=X_train, targets=targets_train)
            fimp = clf.model.feature_importances_
            print(f'Number of features found to be important: '
                  f'{(fimp != 0).sum():,}')
            models[f'clf_{name}k'] = clf

        utils.write_file(models, path_model_cache)
def run():
    config, model_name, datasets, full_matrix, overwrite = parse_args()
    sc = SparkContext()
    sql = SQLContext(sc)

    # Create full matrices for all datasets
    full_matrices = {}
    data_dicts = {}
    for dname in datasets:
        dset = config.datasets[dname]
        split_map = utils.load_file(dset.sha1id_to_split_path)
        covered_mbr = utils.load_file(dset.sha1ids_covered_on_left_censor_path)
        covered_mbr = set(covered_mbr)

        vector_fn = dset.feature_dicts_path
        target_fn = dset.target_values_path
        feature_df = sql.read.parquet(vector_fn)
        feature_df = feature_df.repartition(2000)
        target_df = sql.read.parquet(target_fn)
        target_df = target_df.repartition(200)
        featurizer = utils.load_file(vector_fn.replace('jsons', 'joblib'))
        target_featurizer = utils.load_file(
            target_fn.replace('jsons', 'joblib'))

        print("Processing dataset {}".format(dname))

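        # vectorize features and targets; member ids must line up row-for-row (asserted below)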
        member_ids, feature_array = featurizer.to_array(feature_df)
        column_names = featurizer.get_array_column_names()
        target_member_ids, target_array = target_featurizer.to_array(target_df)
        assert (member_ids == target_member_ids).sum() == len(member_ids)

        target_array = target_array.toarray()
        member_ids = member_ids.reshape(-1)

        # add-on target columns not yet covered by the target featurizer; pull them directly from the dataframe
        expected_columns = set(
            ['MBR'] + list(target_featurizer.column_feature_map.keys()))
        feature_columns = set(target_df.columns)
        diff_columns = sorted(list(feature_columns - expected_columns))[::-1]
        (cost_ref_idx, cost_fac_idx, pmpm_ref_idx, pmpm_fac_idx) = \
            find_target_extra_index(diff_columns)
        target_array_adj = np.array(target_df.select(diff_columns).collect())
        data_dict = {
            "MBR": member_ids,
            "column_names": np.array(column_names),
            "column_indices": np.arange(len(column_names)),
            "costs": target_array[:, 0],
            "pmpm": target_array[:, 1],
            "costs_ref": target_array_adj[:, cost_ref_idx],
            "pmpm_ref": target_array_adj[:, pmpm_ref_idx],
            "costs_proj_fac": target_array_adj[:, cost_fac_idx],
            "pmpm_proj_fac": target_array_adj[:, pmpm_fac_idx],
            "covered_on_left_censor": np.array([m in covered_mbr for m in member_ids]),
            "splits": np.array([split_map[m] for m in member_ids])
        }
        full_matrices[dset] = feature_array
        data_dicts[dset] = data_dict

        matrix_path = dset.matrix_path
        if full_matrix and (overwrite or utils.validate_path(matrix_path)):
            save_npz_data(matrix_path, data_dict, feature_array)

    # select features based on feature prevalence of the training members
    model = config.models[model_name]
    feat_prev = model.indv_feature_prev if model.indv_feature_prev else 0.0002
    stack_matrix = None
    for ds in model.train_datasets:
        dset = config.datasets[ds.dataset_name]
        splits_to_use = ds.splits
        mask = (np.isin(data_dicts[dset]["splits"], splits_to_use)
                & data_dicts[dset]["covered_on_left_censor"])
        mat = full_matrices[dset][mask, :]
        if stack_matrix is None:
            stack_matrix = mat
        else:
            stack_matrix = vstack((stack_matrix, mat))
    cols = feature_prev_filter(stack_matrix, feat_prev)

    # save feature-selected matrices
    for dname in datasets:
        dset = config.datasets[dname]
        outpath = config.get_features_selected_matrix_path(dname, model_name)
        dd = data_dicts[dset]
        dd["column_names"] = dd["column_names"][cols]
        dd["column_indices"] = dd["column_indices"][cols]
        if overwrite or utils.validate_path(outpath):
            save_npz_data(outpath, dd, full_matrices[dset][:, cols])

    print("Finished creating matrices")
def run():
    config, model_name = parse_args()
    model = config.models[model_name]
    path_model_cache = config.get_model_cache_path(model_name, "indv_pmpm")

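    # load the feature-selected matrix for every distinct train/validation dataset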
    seen_dsets = set()
    mat_dicts = {}

    datasets = model.train_datasets
    if model.validation_datasets:
        datasets = model.train_datasets + model.validation_datasets

    for ds in datasets:
        dname = ds.dataset_name
        dset = config.datasets[dname]
        if dset not in seen_dsets:
            path_train_mat = config.get_features_selected_matrix_path(
                dname, model_name)
            mat_dicts[dset] = load_matrix_data_from_npz(path_train_mat)
            seen_dsets.add(dset)

    feature_fn = list(seen_dsets)[0].feature_dicts_path
    featurizer = utils.load_file(feature_fn.replace('jsons', 'joblib'))
    feature_indices = list(mat_dicts.values())[0]["column_indices"]

    X_train, y_train = get_features_targets(config,
                                            model,
                                            mat_dicts,
                                            train_sets=True)

    if model.validation_datasets:
        X_valid, y_valid = get_features_targets(config,
                                                model,
                                                mat_dicts,
                                                train_sets=False)

    lgb_reg_params = {
        'objective': 'regression',
        'min_gain_to_split': 75,
        'min_data_in_leaf': 100,
        'num_leaves': 100,
        'num_iterations': 5000 if model.validation_datasets else 100
    }
    valid_params = {'early_stopping_rounds': 5, 'verbose': 10}

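    # fit the PMPM regressor, using early stopping when validation data is available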
    reg = GBRT("indv_pmpm_model", model_package="lgbm", params=lgb_reg_params)
    if model.validation_datasets:
        reg.fit(features=X_train,
                targets=y_train,
                validation_features=X_valid,
                validation_targets=y_valid,
                **valid_params)
    else:
        reg.fit(features=X_train, targets=y_train)
    fimp = reg.model.feature_importances_
    print(f'Number of features found to be important: '
          f'{(fimp != 0).sum():,}')

    client = Client(project_id='lumiata-internal-6f5a',
                    disable_rest_client=True)
    model_base_dir = os.path.dirname(path_model_cache)
    client.save_model_zip(output_dir=model_base_dir,
                          model=reg,
                          featurizer=featurizer,
                          target_variable="pmpm_cost",
                          feature_selection_indices=feature_indices)

    utils.write_file({'model': reg}, path_model_cache)