import numpy as np
import pandas as pd
import pytest

from fife import processors


def test_process_single_column(setup_config, setup_dataframe):
    """Test that PanelDataProcessor.process_single_column() drops degenerate
    columns, correctly casts categorical columns, and does not modify the
    individual identifier column."""
    errors_list = []
    indiv_id_col = setup_config["INDIVIDUAL_IDENTIFIER"]
    # A column is degenerate if it is entirely null or has fewer than two
    # unique values.
    degenerate_cols = [
        col
        for col in setup_dataframe
        if setup_dataframe[col].isnull().all()
        or setup_dataframe[col].nunique() < 2
    ]
    categorical_cols = [
        col
        for col in setup_dataframe
        if "categorical_var" in col and col not in degenerate_cols
    ]
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    processed_col = data_processor.process_single_column(indiv_id_col)
    if not processed_col.equals(setup_dataframe[indiv_id_col]):
        errors_list.append(
            f"Individual identifier column {indiv_id_col} modified.")
    for degenerate_col in degenerate_cols:
        processed_col = data_processor.process_single_column(degenerate_col)
        if processed_col is not None:
            errors_list.append(
                f"Degenerate column {degenerate_col} not dropped from dataframe."
            )
    for categorical_col in categorical_cols:
        processed_col = data_processor.process_single_column(categorical_col)
        if not isinstance(processed_col.dtype, pd.api.types.CategoricalDtype):
            errors_list.append(
                f"Categorical column {categorical_col} not cast to categorical dtype."
            )
    assert not errors_list, "Errors occurred: \n{}".format(
        "\n".join(errors_list))
def test_sort_panel_data(setup_config, setup_dataframe):
    """Test that sort_panel_data re-sorts swapped observations."""
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    first_two_rows_already_sorted = data_processor.data.iloc[[1, 2], :].copy()
    # Swap the two rows by position; .to_numpy() prevents pandas from
    # aligning the assigned frame on its index, which would silently make
    # the swap a no-op.
    data_processor.data.iloc[[1, 2], :] = data_processor.data.iloc[
        [2, 1], :
    ].to_numpy()
    data_processor.data = data_processor.sort_panel_data()
    assert data_processor.data.iloc[[1, 2], :].equals(
        first_two_rows_already_sorted)
def test_check_panel_consistency(setup_config, setup_dataframe):
    """Test that check_panel_consistency raises an error if an observation
    is duplicated."""
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_processor.data = pd.concat(
        [data_processor.data, data_processor.data.iloc[[1], :]]
    )
    with pytest.raises(AssertionError):
        data_processor.check_panel_consistency()
def test_flag_final_periods(setup_config, setup_dataframe):
    """Test that flag_final_periods flags observations in the maximum period,
    and only those observations, as being in the final period."""
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    final_periods_flag = data_processor.flag_final_periods(1)
    final_period = np.max(data_processor.data["FILE_DATE"])
    pseudo_final_period_flag = data_processor.data["FILE_DATE"] == final_period
    assert final_periods_flag.equals(pseudo_final_period_flag)
def test_process_all_columns(setup_config, setup_dataframe):
    """Test that PanelDataProcessor.process_all_columns() replaces the data
    attribute of the PanelDataProcessor instance with a pandas DataFrame."""
    errors_list = []
    for parallelize in [True, False]:
        data_processor = processors.PanelDataProcessor(
            config=setup_config, data=setup_dataframe
        )
        data_processor.process_all_columns(parallelize=parallelize)
        if not isinstance(data_processor.data, pd.DataFrame):
            errors_list.append(
                f"Data attribute returned when parallelize={parallelize} "
                "is not an instance of pd.DataFrame."
            )
    assert not errors_list, "Errors occurred: \n{}".format(
        "\n".join(errors_list))
def test_flag_event_observed(setup_config, setup_dataframe):
    """Test that individuals not observed in the maximum period have all
    observations flagged as event observed."""
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    data_processor.data["not_right_censored"] = data_processor.flag_event_observed()
    data_processor.data["max_date_by_person"] = data_processor.data.groupby(
        data_processor.config["INDIVIDUAL_IDENTIFIER"]
    )[data_processor.config["TIME_IDENTIFIER"]].transform("max")
    date_is_less_than_max_for_event_observed_obs = (
        data_processor.data.loc[
            data_processor.data["not_right_censored"], "max_date_by_person"
        ]
        < data_processor.data[data_processor.config["TIME_IDENTIFIER"]].max()
    )
    assert date_is_less_than_max_for_event_observed_obs.all()
def test_flag_validation_individuals(setup_config, setup_dataframe):
    """Test that the validation set receives approximately its configured
    share of observations and contains all observations of each individual
    therein."""
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    error_tolerance = 0.1
    data_processor.data["validation"] = data_processor.flag_validation_individuals()
    share_in_validation_sample = np.mean(data_processor.data["validation"])
    share_approximately_correct = (
        (data_processor.config["VALIDATION_SHARE"] - error_tolerance)
        <= share_in_validation_sample
        <= (data_processor.config["VALIDATION_SHARE"] + error_tolerance)
    )
    # Each individual's mean validation flag should be exactly 0 or 1; a
    # fractional rate means an individual is split across the two sets.
    rates_individuals_within_validation_group = data_processor.data.groupby(
        data_processor.config["INDIVIDUAL_IDENTIFIER"]
    )["validation"].mean()
    individual_consistently_in_validation_group = (
        rates_individuals_within_validation_group == 1
    ) | (rates_individuals_within_validation_group == 0)
    assert share_approximately_correct
    assert individual_consistently_in_validation_group.all()
def test_PanelDataProcessor(setup_config, setup_dataframe):
    """Test that PanelDataProcessor binds config and data arguments."""
    data_processor = processors.PanelDataProcessor(
        config=setup_config, data=setup_dataframe
    )
    assert data_processor.config == setup_config
    assert data_processor.data.equals(setup_dataframe)
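# The tests above rely on `setup_config` and `setup_dataframe` pytest
# fixtures supplied elsewhere (e.g., in conftest.py). A minimal illustrative
# sketch follows; the identifier names SSNSCR and FILE_DATE and the exact
# column set are assumptions based on the column references in the tests,
# not the package's actual fixtures.


@pytest.fixture
def setup_config():
    """Return a minimal configuration for PanelDataProcessor tests."""
    return {
        "INDIVIDUAL_IDENTIFIER": "SSNSCR",
        "TIME_IDENTIFIER": "FILE_DATE",
        "VALIDATION_SHARE": 0.25,
        "SEED": 9999,
    }


@pytest.fixture
def setup_dataframe():
    """Return a small balanced panel with numeric, categorical, and
    degenerate columns."""
    n_persons, n_periods = 100, 4
    rng = np.random.default_rng(9999)
    return pd.DataFrame(
        {
            "SSNSCR": np.repeat(np.arange(n_persons), n_periods),
            "FILE_DATE": np.tile(np.arange(n_periods), n_persons),
            "numeric_var_1": rng.normal(size=n_persons * n_periods),
            "categorical_var_1": rng.choice(
                ["A", "B", "C"], size=n_persons * n_periods
            ),
            "constant_var": 1,  # degenerate: fewer than two unique values
            "all_null_var": np.nan,  # degenerate: entirely null
        }
    )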
import json
import os
import pickle
import sys
from time import time

import numpy as np
import pandas as pd
from sklearn.calibration import calibration_curve

from fife import lgb_modelers, pd_modelers, processors, tf_modelers, utils

# parse_config and read_data are called below but defined elsewhere in the
# package; this file is an excerpt.


def main():
    """Execute default FIFE pipeline from data to survival forecasts and metrics."""
    checkpoint_time = time()
    config = parse_config()
    if config.get("EXIT_COL_PATH"):
        raise NotImplementedError(
            "Forecasting exit circumstances from the command line is not yet "
            "supported. Try LGBExitModeler from the FIFE Python package."
        )
    if config.get("STATE_COL"):
        raise NotImplementedError(
            "Forecasting future state from the command line is not yet "
            "supported. Try LGBStateModeler from the FIFE Python package."
        )
    utils.make_results_reproducible(config["SEED"])
    utils.redirect_output_to_log(path=config["RESULTS_PATH"])
    utils.print_copyright()
    utils.print_config(config)
    data = read_data(config)
    print(f"I/O setup time: {time() - checkpoint_time} seconds")
    checkpoint_time = time()
    data_processor = processors.PanelDataProcessor(config, data)
    data_processor.build_processed_data()
    utils.save_intermediate_data(
        data_processor.data,
        "Processed_Data",
        file_format="pickle",
        path=config["RESULTS_PATH"],
    )
    print(f"Data processing time: {time() - checkpoint_time} seconds")
    by_feature = config.get("BY_FEATURE", "")
    if by_feature != "" and by_feature not in data.columns:
        raise ValueError(
            "The selected feature for 'BY_FEATURE' is not in the dataset. "
            "Check spelling or the original dataset to ensure that you are "
            "entering the correct feature name."
        )
    checkpoint_time = time()
    utils.ensure_folder_existence(f'{config["RESULTS_PATH"]}/Intermediate/Models')
    test_intervals = config.get("TEST_INTERVALS", config.get("TEST_PERIODS", 0) - 1)
    if config.get("TREE_MODELS"):
        modeler_class = lgb_modelers.LGBSurvivalModeler
    elif config.get("PROPORTIONAL_HAZARDS"):
        modeler_class = tf_modelers.ProportionalHazardsModeler
    else:
        modeler_class = tf_modelers.FeedforwardNeuralNetworkModeler
    modeler = modeler_class(config=config, data=data_processor.data)
    modeler.n_intervals = (
        test_intervals if test_intervals > 0 else modeler.set_n_intervals()
    )
    if not config.get("TIME_ID_AS_FEATURE"):
        modeler.numeric_features.remove(config["TIME_IDENTIFIER"])
    if config.get("HYPER_TRIALS", 0) > 0:
        params = modeler.hyperoptimize(config["HYPER_TRIALS"])
    else:
        params = None
    modeler.build_model(n_intervals=modeler.n_intervals, params=params)
    modeler.save_model(path=f"{config['RESULTS_PATH']}/Intermediate/Models/")
    print(f"Model training time: {time() - checkpoint_time} seconds")
    checkpoint_time = time()
    if test_intervals > 0:
        # Save metrics
        max_test_intervals = int((len(set(modeler.data["_period"])) - 1) / 2)
        evaluation_subset = modeler.data["_period"] == (
            modeler.data["_period"].max() - min(test_intervals, max_test_intervals)
        )
        if by_feature != "":
            values = list(set(data[by_feature]))
            for feature_value in values:
                evaluation_subset_by_feature = (
                    modeler.data[by_feature] == feature_value
                )
                evaluation_subset_comparison = (
                    evaluation_subset & evaluation_subset_by_feature
                )
                utils.save_output_table(
                    modeler.evaluate(evaluation_subset_comparison),
                    f"Metrics_{feature_value}",
                    path=config["RESULTS_PATH"],
                )
        utils.save_output_table(
            modeler.evaluate(evaluation_subset),
            "Metrics",
            path=config["RESULTS_PATH"],
        )
        # Save counts by quantile
        utils.save_output_table(
            modeler.tabulate_survival_by_quantile(
                n_quantiles=config["QUANTILES"],
                subset=evaluation_subset,
            ),
            "Counts_by_Quantile",
            index=False,
            path=config["RESULTS_PATH"],
        )
        # Save forecast errors
        actuals = np.array(
            [
                modeler.data[evaluation_subset]["_duration"] > time_horizon
                for time_horizon in range(test_intervals)
            ]
        ).T
        predictions = modeler.predict(evaluation_subset)
        utils.save_output_table(
            pd.DataFrame(
                predictions - actuals,
                columns=[
                    f"{time_horizon + 1}-period Forecast Error"
                    for time_horizon in range(test_intervals)
                ],
            ),
            "Forecast_Errors",
            index=False,
            path=config["RESULTS_PATH"],
        )
        # Save calibration errors
        predicted_share, actual_share = calibration_curve(
            actuals.flatten(),
            predictions.flatten(),
            n_bins=8,
            strategy="quantile",
        )
        calibration_errors = pd.DataFrame(
            [predicted_share, actual_share, actual_share - predicted_share]
        ).T
        calibration_errors.columns = [
            "Predicted Share",
            "Actual Share",
            "Calibration Error",
        ]
        calibration_errors.index.name = "Quantile"
        calibration_errors.index = calibration_errors.index + 1
        utils.save_output_table(
            calibration_errors,
            "Calibration_Errors",
            path=config["RESULTS_PATH"],
        )
    else:
        # Save forecasts
        individual_predictions = modeler.forecast()
        utils.save_output_table(
            individual_predictions, "Survival_Curves", path=config["RESULTS_PATH"]
        )
        # Save aggregated forecasts with uncertainty intervals
        utils.save_output_table(
            utils.compute_aggregation_uncertainty(individual_predictions),
            "Aggregate_Survival_Bounds",
            index=False,
            path=config["RESULTS_PATH"],
        )
        # Save and plot actual, fitted, and forecasted retention rates
        lead_periods = config["RETENTION_INTERVAL"]
        time_ids = pd.factorize(
            modeler.data[modeler.config["TIME_IDENTIFIER"]], sort=True
        )[0]
        retention_rates = modeler.tabulate_retention_rates(
            lead_periods=lead_periods, time_ids=time_ids
        )
        utils.save_output_table(
            retention_rates, "Retention_Rates", path=config["RESULTS_PATH"]
        )
        axes = retention_rates.plot()
        axes.set_ylabel(f"{lead_periods}-period Retention Rate")
        earliest_period = data_processor.data[
            data_processor.config["TIME_IDENTIFIER"]
        ].min()
        axes.set_xlabel(f"Periods Since {earliest_period}")
        utils.save_plot("Retention_Rates", path=config["RESULTS_PATH"])
        # Plot SHAP values for a subset of observations in the final period
        sample_size = config.get("SHAP_SAMPLE_SIZE", 0)
        if (
            isinstance(modeler, lgb_modelers.GradientBoostedTreesModeler)
            and sample_size > 0
        ):
            shap_observations = (
                modeler.data[modeler.data["_predict_obs"]]
                .sample(n=sample_size)
                .sort_index()
            )
            subset = modeler.data.index.isin(shap_observations.index)
            shap_values = modeler.compute_shap_values(subset=subset)
            utils.plot_shap_values(
                shap_values,
                shap_observations[
                    modeler.categorical_features + modeler.numeric_features
                ],
                modeler.data[subset][
                    modeler.categorical_features + modeler.numeric_features
                ],
                config["TIME_IDENTIFIER"],
                path=config["RESULTS_PATH"],
            )
    print(f"Output production time: {time() - checkpoint_time} seconds")
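# A configuration file for the pipeline above is a JSON object whose keys
# match those read via config[...] and config.get(...) in main(). A minimal
# illustrative example follows; the key names come from the code above, but
# the values and file layout are assumptions:
#
# {
#     "DATA_FILE_PATH": "Input_Data/example.csv",
#     "RESULTS_PATH": "FIFE_results",
#     "SEED": 9999,
#     "INDIVIDUAL_IDENTIFIER": "SSNSCR",
#     "TIME_IDENTIFIER": "FILE_DATE",
#     "TEST_INTERVALS": 4,
#     "TREE_MODELS": true,
#     "HYPER_TRIALS": 0,
#     "QUANTILES": 5,
#     "RETENTION_INTERVAL": 1,
#     "SHAP_SAMPLE_SIZE": 128
# }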
def main():
    """Execute default FIFE pipeline from data to forecasts and metrics."""
    # Set up I/O
    checkpoint_time = time()
    if len(sys.argv) > 1:
        with open(sys.argv[1], 'r') as file:
            config = json.load(file)
    else:
        print('No configuration file specified.')
        candidate_configs = [
            file for file in os.listdir() if file.endswith('.json')
        ]
        assert len(candidate_configs) >= 1, (
            'No json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".')
        assert len(candidate_configs) <= 1, (
            'Multiple json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".')
        print(f'Using {candidate_configs[0]} as configuration file.')
        with open(candidate_configs[0], 'r') as file:
            config = json.load(file)
    utils.make_results_reproducible(config['SEED'])
    utils.redirect_output_to_log(path=config['RESULTS_PATH'])
    print('Produced using FIFE: Finite-Interval Forecasting Engine')
    print('Copyright (c) 2018 - 2020, Institute for Defense Analyses (IDA)')
    print('Please cite using the suggested citation in the LICENSE file.\n')
    utils.print_config(config)

    # Process data
    data = utils.import_data_file(config['DATA_FILE_PATH'])
    if config['INDIVIDUAL_IDENTIFIER'] == '':
        config['INDIVIDUAL_IDENTIFIER'] = data.columns[0]
        print('Individual identifier column name not given; assumed to be '
              f'leftmost column ({config["INDIVIDUAL_IDENTIFIER"]})')
    if config['TIME_IDENTIFIER'] == '':
        config['TIME_IDENTIFIER'] = data.columns[1]
        print('Time identifier column name not given; assumed to be '
              f'second-leftmost column ({config["TIME_IDENTIFIER"]})')
    data_processor = processors.PanelDataProcessor(config, data)
    data_processor.build_processed_data()
    print(f'Data processing time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save intermediate files
    utils.save_maps(data_processor.categorical_maps,
                    'Categorical_Maps',
                    path=config['RESULTS_PATH'])
    utils.save_maps(data_processor.numeric_ranges,
                    'Numeric_Ranges',
                    path=config['RESULTS_PATH'])
    utils.save_intermediate_data(data_processor.data,
                                 'Processed_Data',
                                 file_format='pickle',
                                 path=config['RESULTS_PATH'])

    # Train and save model
    utils.ensure_folder_existence(
        f'{config["RESULTS_PATH"]}/Intermediate/Models')
    categorical_features = list(data_processor.categorical_maps.keys())
    if config.get('TREE_MODELS'):
        modeler = lgb_modelers.GradientBoostedTreesModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        for i, lead_specific_model in enumerate(modeler.model):
            lead_path = (f'{config["RESULTS_PATH"]}/Intermediate/Models/'
                         f'{i + 1}-lead_GBT_Model.json')
            with open(lead_path, 'w') as file:
                json.dump(lead_specific_model.dump_model(), file, indent=4)
    elif config.get('PROPORTIONAL_HAZARDS'):
        modeler = tf_modelers.ProportionalHazardsModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(
            f'{config["RESULTS_PATH"]}/Intermediate/Models/PH_Model.h5')
    else:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(
            f'{config["RESULTS_PATH"]}/Intermediate/Models/FFNN_Model.h5')
    print(f'Model training time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save metrics and forecasts
    utils.save_output_table(modeler.evaluate(modeler.data['_validation']
                                             & ~modeler.data['_test']),
                            'Metrics',
                            path=config['RESULTS_PATH'])
    individual_predictions = modeler.forecast()
    utils.save_output_table(individual_predictions,
                            'Survival_Curves',
                            path=config['RESULTS_PATH'])
    utils.save_output_table(
        utils.compute_aggregation_uncertainty(individual_predictions),
        'Aggregate_Survival_Bounds',
        index=False,
        path=config['RESULTS_PATH'])

    # Save and plot retention rates
    lead_periods = config['RETENTION_INTERVAL']
    time_ids = pd.factorize(modeler.data[modeler.config['TIME_IDENTIFIER']],
                            sort=True)[0]
    retention_rates = modeler.tabulate_retention_rates(
        lead_periods=lead_periods, time_ids=time_ids)
    utils.save_output_table(retention_rates,
                            'Retention_Rates',
                            path=config['RESULTS_PATH'])
    axes = retention_rates.plot()
    axes.set_ylabel(f'{lead_periods}-period Retention Rate')
    earliest_period = data_processor.numeric_ranges.loc[
        data_processor.config['TIME_IDENTIFIER'], 'Minimum']
    axes.set_xlabel(f'Periods Since {earliest_period}')
    utils.save_plot('Retention_Rates', path=config['RESULTS_PATH'])

    # Save event counts by quantile
    utils.save_output_table(modeler.tabulate_survival_by_quantile(
        modeler.data['_validation'] & ~modeler.data['_test'],
        n_quantiles=config['QUANTILES']),
                            'Counts_by_Quantile',
                            index=False,
                            path=config['RESULTS_PATH'])

    # Plot SHAP values for a subset of observations in the final period
    if isinstance(modeler, lgb_modelers.GradientBoostedTreesModeler):
        subset = modeler.data.index.isin(data_processor.raw_subset.index)
        shap_values = modeler.compute_shap_values(subset=subset)
        utils.plot_shap_values(
            shap_values,
            data_processor.raw_subset[modeler.categorical_features +
                                      modeler.numeric_features],
            modeler.data[subset][modeler.categorical_features +
                                 modeler.numeric_features],
            config['TIME_IDENTIFIER'],
            path=config['RESULTS_PATH'])

    # Save metrics for interacted fixed effects model
    if set() < set(config['FIXED_EFFECT_FEATURES']) <= set(data_processor.data):
        ife_modeler = pd_modelers.InteractedFixedEffectsModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        ife_modeler.build_model()
        with open(f'{config["RESULTS_PATH"]}/Intermediate/Models/IFE_Model.p',
                  'wb') as file:
            pickle.dump(ife_modeler.model, file)
        subset = ife_modeler.data['_validation'] & ~ife_modeler.data['_test']
        utils.save_output_table(ife_modeler.evaluate(subset),
                                'IFE_Metrics',
                                path=config['RESULTS_PATH'])
        ife_quantiles = ife_modeler.tabulate_survival_by_quantile(
            subset, n_quantiles=config['QUANTILES'])
        utils.save_output_table(ife_quantiles,
                                'IFE_Counts_by_Quantile',
                                index=False,
                                path=config['RESULTS_PATH'])
    print(f'Output production time: {time() - checkpoint_time} seconds')
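# Standard command-line entry point (assumed; the original module presumably
# ends with a guard like this so that `python -m fife` or the `fife` console
# script runs the pipeline):
if __name__ == '__main__':
    main()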