def main():
    config, model_name, datasets = parse_args()
    for dname in datasets:
        dset = config.datasets[dname]
        outpath = config.get_model_results_path(model_name, dname, "agg", "csv")

        # feature window: projection_months ending at the blackout cutoff
        max_date = dset.min_left_censor - relativedelta(months=dset.blackout_months)
        min_date = max_date - relativedelta(months=dset.projection_months)
        feature_params = {"left_censor": min_date, "right_censor": max_date}

        # target window: projection_months starting at the left censor
        left_censor = dset.min_left_censor
        right_censor = left_censor + relativedelta(months=dset.projection_months)
        target_params = {"left_censor": left_censor, "right_censor": right_censor}

        # individual PMPM predictions to roll up to the group level
        indv_pmpm_path = config.get_model_results_path(model_name, dname,
                                                       "indv_pmpm", "json")
        indv_pmpm = utils.load_file(indv_pmpm_path)
        mbrs, preds = zip(*indv_pmpm.items())

        # group-to-split map
        split_map = None
        if dset.is_split:
            split_map = utils.load_file(dset.group_to_split_path)

        seg = Segment.load_from_file(dset.ldm_path, "parquet")
        seg.repartition(2000)
        # slice the segment on the day before the feature window's right censor
        slice_date = (max_date - relativedelta(days=1)).date()

        print("Processing dataset {}".format(dname))
        group_seg = GroupSegment(df=seg.df, slice_date=slice_date)

        # append indv pmpm predictions
        group_seg.append_member_predictions(mbrs, preds)

        featurizer = get_featurizer(feature_params, target_params)
        df = featurizer.featurize(group_seg).toPandas()

        # add "trend" and "cost_fraction|last_4_month"
        begin = (min_date + relativedelta(days=1)).strftime('%Y-%m-%d')
        left = min_date.strftime('%Y-%m-%d')
        right = max_date.strftime('%Y-%m-%d')
        df["trend"] = ((df[f'Coverage|None|{right}|GroupNumMembers']
                        - df[f'Coverage|None|{begin}|GroupNumMembers'])
                       / df[f'Coverage|{left}|{right}|GroupMemberMonths'])
        left_4m = (max_date - relativedelta(months=4)).strftime('%Y-%m-%d')
        df["cost_fraction|last_4_month"] = (
            df[f'Claims|{left_4m}|{right}|GroupAllowedAmount']
            / (df[f'Claims|{left}|{right}|GroupAllowedAmount'] + 1))

        if dset.is_split:
            df['split'] = df.GROUP.map(split_map)

        df.to_csv(outpath, index=False)
def main():
    config, model_name, dataset_names, overwrite = parse_args()
    model_cache_path = config.get_model_cache_path(model_name, "indv")
    cache = utils.load_file(model_cache_path)

    for dname in dataset_names:
        out_fn = config.get_model_results_path(model_name, dname, "indv", "json")
        if overwrite or utils.validate_path(out_fn):
            # load the feature-selected matrix for this dataset
            path_train_mat = config.get_features_selected_matrix_path(
                dname, model_name)
            orig_dict = load_matrix_data_from_npz(path_train_mat)
            X_test = orig_dict["matrix"]
            mbrs_test = orig_dict["MBR"]

            result_dict = {}
            # regression predictions: reference cost and projection factor
            preds = cache['reg'].predict(X_test)
            preds_fac = cache['reg_fac'].predict(X_test)
            result_dict['reg'] = get_preds_dict(mbrs_test, preds)
            result_dict['reg_fac'] = get_preds_dict(mbrs_test, preds_fac)

            # classifier predictions (cache keys starting with 'clf')
            for key in cache:
                if key[:3] != 'clf':
                    continue
                preds = cache[key].predict(X_test)
                result_dict[key] = get_preds_dict(mbrs_test, preds, digits=3)

            print('writing results to', out_fn)
            utils.write_file(result_dict, out_fn)
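# `get_preds_dict` is called above but not defined in this section. The sketch
# below is an assumption based on the call sites (pair member IDs with their
# predictions, rounded so the JSON output stays compact); the default of
# digits=2 is a guess, and the real helper may differ.
def get_preds_dict(mbrs, preds, digits=2):
    # cast to plain float so numpy scalars serialize cleanly to JSON
    return {str(m): round(float(p), digits) for m, p in zip(mbrs, preds)}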
def run():
    config, model_name, datasets, overwrite = parse_args()
    for dname in datasets:
        dset = config.datasets[dname]
        path_train_mat = config.get_features_selected_matrix_path(
            dname, model_name)
        ind_preds_path = config.get_model_results_path(model_name, dname,
                                                       'indv', 'json')
        base_dest_path = dset.plots_path
        is_gs = 'gs://' in base_dest_path

        mat_dict = load_matrix_data_from_npz(path_train_mat)
        ind_preds = utils.load_file(ind_preds_path)

        if dset.is_split:
            splits = ['train', 'test', 'evaluate']
        else:
            splits = ['all']

        for split in splits:
            if dset.is_split:
                dest_path = os.path.join(base_dest_path, model_name, split)
            else:
                dest_path = os.path.join(base_dest_path, model_name)
            if overwrite or utils.validate_path(dest_path):
                if not is_gs and not os.path.exists(dest_path):
                    os.makedirs(dest_path)

                mask = (mat_dict["splits"] == split)
                X, y_true, mbrs = (mat_dict['matrix'][mask, :],
                                   mat_dict['costs'][mask],
                                   mat_dict['MBR'][mask])
                print('number of target values in {} split'.format(split),
                      len(y_true))

                # combined prediction: reference cost times projection factor
                y_pred = np.array([
                    ind_preds['reg'][s] * ind_preds['reg_fac'][s] for s in mbrs
                ])

                # prior-year allowed cost, used as a baseline
                sd = dset.min_left_censor - relativedelta(
                    months=dset.blackout_months)
                sd_minus_1_yr = sd - relativedelta(years=1)
                right = sd.strftime('%Y-%m-%d')
                left = sd_minus_1_yr.strftime('%Y-%m-%d')

                # check min_date: skip the prior-cost baseline if the
                # prior-year window starts before the data begins
                if left < dset.min_date.strftime('%Y-%m-%d'):
                    y_prior = None
                else:
                    feature_names = mat_dict['column_names']
                    col = f'Claims|{left}|{right}|AllowedCost'
                    idx = np.where(feature_names == col)[0][0]
                    y_prior = X[:, idx].toarray().reshape(-1)

                metrics = get_overall_metrics(y_true, y_prior, y_pred)
                df_metrics = get_thresh_dependent_metrics(
                    y_true, y_prior, ind_preds, mbrs, dest_path)

                utils.write_file(metrics,
                                 os.path.join(dest_path, 'metrics.json'))
                df_metrics.to_csv(
                    os.path.join(dest_path, 'thresh_dependent_metrics.csv'),
                    index=False)
def run():
    config, model_name, threshs, overwrite = parse_args()
    model = config.models[model_name]
    path_model_cache = config.get_model_cache_path(model_name, "indv")

    # load the feature-selected matrices for every train/validation dataset
    seen_dsets = set()
    mat_dicts = {}
    datasets = model.train_datasets
    if model.validation_datasets:
        datasets = model.train_datasets + model.validation_datasets
    for ds in datasets:
        dname = ds.dataset_name
        dset = config.datasets[dname]
        if dset not in seen_dsets:
            path_train_mat = config.get_features_selected_matrix_path(
                dname, model_name)
            if overwrite or utils.validate_path(path_model_cache):
                mat_dicts[dset] = load_matrix_data_from_npz(path_train_mat)
            seen_dsets.add(dset)

    if overwrite or utils.validate_path(path_model_cache):
        # featurizer and selected feature indices, needed to export the model
        feature_fn = list(seen_dsets)[0].feature_dicts_path
        featurizer = utils.load_file(feature_fn.replace('jsons', 'joblib'))
        feature_indices = list(mat_dicts.values())[0]["column_indices"]

        # X, y for costs_ref
        X_train, y_train = get_features_targets(config, model, mat_dicts,
                                                'costs_ref', train_sets=True)
        if model.validation_datasets:
            X_valid, y_valid = get_features_targets(config, model, mat_dicts,
                                                    'costs_ref',
                                                    train_sets=False)

        # X, y for costs_proj_fac
        X_train_fac, y_train_fac = get_features_targets(config, model,
                                                        mat_dicts,
                                                        'costs_proj_fac',
                                                        train_sets=True)
        if model.validation_datasets:
            X_valid_fac, y_valid_fac = get_features_targets(config, model,
                                                            mat_dicts,
                                                            'costs_proj_fac',
                                                            train_sets=False)

        lgb_reg_params = {
            'objective': 'regression',
            'min_gain_to_split': 75,
            'min_data_in_leaf': 100,
            'num_leaves': 100,
            'num_iterations': 5000 if model.validation_datasets else 100
        }
        lgb_clf_params = {
            'objective': 'binary',
            'min_gain_to_split': 75,
            'learning_rate': .01,
            'min_data_in_leaf': 100,
            'num_leaves': 100,
            'num_iterations': 5000 if model.validation_datasets else 250
        }
        valid_params = {'early_stopping_rounds': 5, 'verbose': 10}

        models = {}

        # train costs_ref model
        reg = GBRT("indv_model", model_package="lgbm", params=lgb_reg_params)
        if model.validation_datasets:
            reg.fit(features=X_train, targets=y_train,
                    validation_features=X_valid, validation_targets=y_valid,
                    **valid_params)
        else:
            reg.fit(features=X_train, targets=y_train)
        fimp = reg.model.feature_importances_
        print(f'Number of features found to be important: '
              f'{(fimp != 0).sum():,}')
        models['reg'] = reg

        # train costs_proj_fac model
        reg_fac = GBRT("indv_model_fac", model_package="lgbm",
                       params=lgb_reg_params)
        if model.validation_datasets:
            reg_fac.fit(features=X_train_fac, targets=y_train_fac,
                        validation_features=X_valid_fac,
                        validation_targets=y_valid_fac,
                        **valid_params)
        else:
            reg_fac.fit(features=X_train_fac, targets=y_train_fac)
        fimp = reg_fac.model.feature_importances_
        print(f'Number of features found to be important: '
              f'{(fimp != 0).sum():,}')
        models['reg_fac'] = reg_fac

        # export the reference-cost model as a deployable zip
        client = Client(project_id='lumiata-internal-6f5a',
                        disable_rest_client=True)
        model_base_dir = os.path.dirname(path_model_cache)
        client.save_model_zip(output_dir=model_base_dir,
                              model=reg,
                              featurizer=featurizer,
                              target_variable="allowed_cost",
                              feature_selection_indices=feature_indices)

        # train one binary classifier per cost threshold
        for thresh in threshs:
            name = int(thresh // 1000)
            targets_train = y_train > thresh
            clf = GBDT(f"clf_model_{name}k", model_package="lgbm",
                       params=lgb_clf_params)
            if model.validation_datasets:
                targets_valid = y_valid > thresh
                clf.fit(features=X_train, targets=targets_train,
                        validation_features=X_valid,
                        validation_targets=targets_valid,
                        **valid_params)
            else:
                clf.fit(features=X_train, targets=targets_train)
            fimp = clf.model.feature_importances_
            print(f'Number of features found to be important: '
                  f'{(fimp != 0).sum():,}')
            models[f'clf_{name}k'] = clf

        utils.write_file(models, path_model_cache)
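# `get_features_targets` is shared by this training script and the PMPM one
# below but is not defined in this section. The sketch below is an assumption
# inferred from the call sites and from how the matrices are built: stack the
# feature-selected matrices of the model's train (or validation) datasets,
# keep only the configured splits and members covered on the left censor, and
# return the requested target column. The default target name here is a guess;
# the real helper may differ.
import numpy as np
from scipy.sparse import vstack


def get_features_targets(config, model, mat_dicts, target="pmpm",
                         train_sets=True):
    dataset_list = (model.train_datasets if train_sets
                    else model.validation_datasets)
    X_parts, y_parts = [], []
    for ds in dataset_list:
        dd = mat_dicts[config.datasets[ds.dataset_name]]
        # restrict to the splits configured for this dataset and to members
        # covered on the left censor date
        mask = np.isin(dd["splits"], ds.splits) & dd["covered_on_left_censor"]
        X_parts.append(dd["matrix"][mask, :])
        y_parts.append(dd[target][mask])
    return vstack(X_parts), np.concatenate(y_parts)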
def run():
    config, model_name, datasets, full_matrix, overwrite = parse_args()
    sc = SparkContext()
    sql = SQLContext(sc)

    # Create full matrices for all datasets
    full_matrices = {}
    data_dicts = {}
    for dname in datasets:
        dset = config.datasets[dname]
        split_map = utils.load_file(dset.sha1id_to_split_path)
        covered_mbr = set(
            utils.load_file(dset.sha1ids_covered_on_left_censor_path))

        vector_fn = dset.feature_dicts_path
        target_fn = dset.target_values_path
        feature_df = sql.read.parquet(vector_fn).repartition(2000)
        target_df = sql.read.parquet(target_fn).repartition(200)
        featurizer = utils.load_file(vector_fn.replace('jsons', 'joblib'))
        target_featurizer = utils.load_file(
            target_fn.replace('jsons', 'joblib'))

        print("Processing dataset {}".format(dname))
        member_ids, feature_array = featurizer.to_array(feature_df)
        column_names = featurizer.get_array_column_names()
        target_member_ids, target_array = target_featurizer.to_array(target_df)
        # features and targets must be aligned on the same members, in order
        assert (member_ids == target_member_ids).sum() == len(member_ids)
        target_array = target_array.toarray()
        member_ids = member_ids.reshape(-1)

        # add-on target columns not handled by the target featurizer
        # (needs a featurizer update to support them natively)
        expected_columns = set(
            ['MBR'] + list(target_featurizer.column_feature_map.keys()))
        target_columns = set(target_df.columns)
        diff_columns = sorted(list(target_columns - expected_columns))[::-1]
        (cost_ref_idx, cost_fac_idx,
         pmpm_ref_idx, pmpm_fac_idx) = find_target_extra_index(diff_columns)
        target_array_adj = np.array(target_df.select(diff_columns).collect())

        data_dict = {
            "MBR": member_ids,
            "column_names": np.array(column_names),
            "column_indices": np.arange(len(column_names)),
            "costs": target_array[:, 0],
            "pmpm": target_array[:, 1],
            # map each add-on target column to its matching key
            "costs_ref": target_array_adj[:, cost_ref_idx],
            "pmpm_ref": target_array_adj[:, pmpm_ref_idx],
            "costs_proj_fac": target_array_adj[:, cost_fac_idx],
            "pmpm_proj_fac": target_array_adj[:, pmpm_fac_idx],
            "covered_on_left_censor":
                np.array([m in covered_mbr for m in member_ids]),
            "splits": np.array([split_map[m] for m in member_ids])
        }
        full_matrices[dset] = feature_array
        data_dicts[dset] = data_dict

        matrix_path = dset.matrix_path
        if full_matrix and (overwrite or utils.validate_path(matrix_path)):
            save_npz_data(matrix_path, data_dict, feature_array)

    # select features based on feature prevalence of the training members
    model = config.models[model_name]
    feat_prev = model.indv_feature_prev if model.indv_feature_prev else 0.0002
    stack_matrix = None
    for ds in model.train_datasets:
        dset = config.datasets[ds.dataset_name]
        splits_to_use = ds.splits
        mask = (np.isin(data_dicts[dset]["splits"], splits_to_use)
                & data_dicts[dset]["covered_on_left_censor"])
        mat = full_matrices[dset][mask, :]
        if stack_matrix is None:
            stack_matrix = mat
        else:
            stack_matrix = vstack((stack_matrix, mat))
    cols = feature_prev_filter(stack_matrix, feat_prev)

    # save feature-selected matrices
    for dname in datasets:
        dset = config.datasets[dname]
        outpath = config.get_features_selected_matrix_path(dname, model_name)
        dd = data_dicts[dset]
        dd["column_names"] = dd["column_names"][cols]
        dd["column_indices"] = dd["column_indices"][cols]
        if overwrite or utils.validate_path(outpath):
            save_npz_data(outpath, dd, full_matrices[dset][:, cols])

    print("Finished creating matrices")
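# `feature_prev_filter` is called above but not defined in this section. The
# sketch below is an assumption based on its usage (return the indices of
# columns whose non-zero prevalence among the stacked training members meets
# the threshold); the real implementation may differ.
import numpy as np


def feature_prev_filter(matrix, min_prev):
    # fraction of rows with a non-zero entry in each column of the sparse matrix
    prevalence = (np.asarray((matrix != 0).sum(axis=0)).reshape(-1)
                  / matrix.shape[0])
    return np.where(prevalence >= min_prev)[0]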
def run():
    config, model_name = parse_args()
    model = config.models[model_name]
    path_model_cache = config.get_model_cache_path(model_name, "indv_pmpm")

    # load the feature-selected matrices for every train/validation dataset
    seen_dsets = set()
    mat_dicts = {}
    datasets = model.train_datasets
    if model.validation_datasets:
        datasets = model.train_datasets + model.validation_datasets
    for ds in datasets:
        dname = ds.dataset_name
        dset = config.datasets[dname]
        if dset not in seen_dsets:
            path_train_mat = config.get_features_selected_matrix_path(
                dname, model_name)
            mat_dicts[dset] = load_matrix_data_from_npz(path_train_mat)
            seen_dsets.add(dset)

    # featurizer and selected feature indices, needed to export the model
    feature_fn = list(seen_dsets)[0].feature_dicts_path
    featurizer = utils.load_file(feature_fn.replace('jsons', 'joblib'))
    feature_indices = list(mat_dicts.values())[0]["column_indices"]

    X_train, y_train = get_features_targets(config, model, mat_dicts,
                                            train_sets=True)
    if model.validation_datasets:
        X_valid, y_valid = get_features_targets(config, model, mat_dicts,
                                                train_sets=False)

    lgb_reg_params = {
        'objective': 'regression',
        'min_gain_to_split': 75,
        'min_data_in_leaf': 100,
        'num_leaves': 100,
        'num_iterations': 5000 if model.validation_datasets else 100
    }
    valid_params = {'early_stopping_rounds': 5, 'verbose': 10}

    # train the individual PMPM regression model
    reg = GBRT("indv_pmpm_model", model_package="lgbm", params=lgb_reg_params)
    if model.validation_datasets:
        reg.fit(features=X_train, targets=y_train,
                validation_features=X_valid, validation_targets=y_valid,
                **valid_params)
    else:
        reg.fit(features=X_train, targets=y_train)
    fimp = reg.model.feature_importances_
    print(f'Number of features found to be important: '
          f'{(fimp != 0).sum():,}')

    # export the PMPM model as a deployable zip
    client = Client(project_id='lumiata-internal-6f5a',
                    disable_rest_client=True)
    model_base_dir = os.path.dirname(path_model_cache)
    client.save_model_zip(output_dir=model_base_dir,
                          model=reg,
                          featurizer=featurizer,
                          target_variable="pmpm_cost",
                          feature_selection_indices=feature_indices)

    utils.write_file({'model': reg}, path_model_cache)