def test_ffnnm_hyperoptimize(setup_ffnnm_dataframe, setup_config):
    """Check that hyperoptimize() hands back a dict of parameters.

    The fixture dataframe is prepared in place (object columns cast to
    categorical, FILE_DATE replaced by factorized integer codes) before a
    FeedforwardNeuralNetworkModeler is built on it.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe; must contain the columns
            FILE_DATE, _validation, _test, _predict_obs and _duration.
        setup_config: Fixture configuration mapping; must contain the key
            MIN_SURVIVORS_IN_TRAIN.
    """
    frame = setup_ffnnm_dataframe
    failures = []

    # Cast every object-dtype column to the pandas categorical dtype.
    for column in frame.select_dtypes(include=["object"]).columns.tolist():
        frame[column] = frame[column].astype("category")

    # Replace FILE_DATE with the integer codes produced by pd.factorize.
    date_codes = pd.factorize(frame["FILE_DATE"])[0]
    frame["FILE_DATE"] = pd.Series(date_codes)

    # Rows flagged as neither validation, test, nor prediction observations.
    is_training = (~frame["_validation"]
                   & ~frame["_test"]
                   & ~frame["_predict_obs"])
    duration_counts = frame[is_training]["_duration"].value_counts()

    # Largest duration value whose count exceeds the configured minimum.
    min_survivors = setup_config["MIN_SURVIVORS_IN_TRAIN"]
    n_intervals = duration_counts[duration_counts > min_survivors].index.max()

    modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
        config=setup_config,
        data=frame,
    )
    modeler.n_intervals = n_intervals

    # NOTE(review): the argument 2 is presumably a trial count -- confirm
    # against hyperoptimize()'s signature.
    params = modeler.hyperoptimize(2)
    if not isinstance(params, dict):
        failures.append("Parameter set is not a dict.")

    assert not failures, "Errors occurred: \n" + "\n".join(failures)
def test_ffnnm_construct_embedding_network(setup_ffnnm_dataframe, setup_config):
    """Check that construct_embedding_network() yields a Keras Model.

    The fixture dataframe is prepared in place (two named columns cast to
    categorical, FILE_DATE replaced by factorized integer codes). Any
    exception raised while building the modeler or the network is recorded
    as a failure message instead of propagating.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe; must contain FILE_DATE,
            _validation, _test, _predict_obs, _duration and the two
            categorical columns named below.
        setup_config: Fixture configuration mapping; must contain
            MIN_SURVIVORS_IN_TRAIN, INDIVIDUAL_IDENTIFIER and
            TIME_IDENTIFIER.
    """
    frame = setup_ffnnm_dataframe
    settings = setup_config
    checks = []
    failures = []

    # Rows flagged as neither validation, test, nor prediction observations.
    is_training = (~frame["_validation"]
                   & ~frame["_test"]
                   & ~frame["_predict_obs"])
    duration_counts = frame[is_training]["_duration"].value_counts()

    # Largest duration value whose count exceeds the configured minimum.
    min_survivors = settings["MIN_SURVIVORS_IN_TRAIN"]
    n_intervals = duration_counts[duration_counts > min_survivors].index.max()

    for column in ("nonmixed_categorical_var",
                   "consistent_mixed_categorical_var"):
        frame[column] = frame[column].astype("category")

    # Keep only the integer codes from factorize; the uniques are unused.
    frame["FILE_DATE"] = pd.factorize(frame["FILE_DATE"])[0]

    try:
        # Blank identifiers default to the first and second columns.
        for key, position in (("INDIVIDUAL_IDENTIFIER", 0),
                              ("TIME_IDENTIFIER", 1)):
            if settings[key] == "":
                settings[key] = frame.columns[position]

        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=settings, data=frame)
        modeler.n_intervals = n_intervals
        modeler.model = modeler.construct_embedding_network()

        is_keras_model = isinstance(modeler.model, Model)
        checks.append(is_keras_model)
        if not is_keras_model:
            failures.append("Model not of type tensorflow.keras.Model")
    except Exception as error:
        checks.append(False)
        failures.append(str(error))

    assert all(checks), "Errors occurred: \n" + "\n".join(failures)
def test_ffnnm_train(setup_ffnnm_dataframe, setup_config):
    """Check that train() returns a Keras Model whose weights have changed.

    The fixture dataframe is prepared in place (two named columns cast to
    categorical, FILE_DATE replaced by factorized integer codes). Weights are
    captured before and after training; the test fails when every weight
    array is element-wise unchanged. Any exception raised inside the
    modelling steps is recorded as a failure message instead of propagating.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe; must contain FILE_DATE,
            _validation, _test, _predict_obs, _duration and the two
            categorical columns named below.
        setup_config: Fixture configuration mapping; must contain
            MIN_SURVIVORS_IN_TRAIN.
    """
    frame = setup_ffnnm_dataframe
    settings = setup_config
    checks = []
    failures = []

    # Rows flagged as neither validation, test, nor prediction observations.
    is_training = (~frame["_validation"]
                   & ~frame["_test"]
                   & ~frame["_predict_obs"])
    duration_counts = frame[is_training]["_duration"].value_counts()

    # Largest duration value whose count exceeds the configured minimum.
    min_survivors = settings["MIN_SURVIVORS_IN_TRAIN"]
    n_intervals = duration_counts[duration_counts > min_survivors].index.max()

    for column in ("nonmixed_categorical_var",
                   "consistent_mixed_categorical_var"):
        frame[column] = frame[column].astype("category")

    # Keep only the integer codes from factorize; the uniques are unused.
    frame["FILE_DATE"] = pd.factorize(frame["FILE_DATE"])[0]

    try:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=settings, data=frame)
        modeler.n_intervals = n_intervals

        # Fill missing numeric values with the configured placeholder.
        filled_numeric = modeler.data[modeler.numeric_features].fillna(
            modeler.config["NON_CAT_MISSING_VALUE"])
        modeler.data[modeler.numeric_features] = filled_numeric

        modeler.model = modeler.construct_embedding_network()
        weights_before = modeler.model.get_weights()
        modeler.model = modeler.train()
        weights_after = modeler.model.get_weights()

        # One flag per weight array: True when that array is unchanged.
        # Index-based lookup is deliberate so a shorter post-training list
        # raises rather than being silently truncated.
        unchanged_flags = [
            np.all(np.equal(before, weights_after[position]))
            for position, before in enumerate(weights_before)
        ]
        weights_moved = not all(unchanged_flags)
        checks.append(weights_moved)
        if not weights_moved:
            failures.append("Model weights have not changed, suggesting "
                            "failure to train")

        is_keras_model = isinstance(modeler.model, Model)
        checks.append(is_keras_model)
        if not is_keras_model:
            failures.append("Model not of type tensorflow.keras.Model")
    except Exception as error:
        checks.append(False)
        failures.append(str(error))

    assert all(checks), "Errors occurred: \n" + "\n".join(failures)
def test_ffnnm_train(setup_ffnnm_dataframe, setup_config):
    """Check that train() returns a Keras Model whose weights have changed.

    FILE_DATE in the fixture dataframe is replaced in place by factorized
    integer codes, and the categorical column names are passed to the
    modeler explicitly. Weights are captured before and after training; the
    test fails when every weight array is element-wise unchanged. Any
    exception raised inside the modelling steps is recorded as a failure
    message instead of propagating.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe; must contain FILE_DATE,
            _validation, _test, _predict_obs and _duration.
        setup_config: Fixture configuration mapping; must contain
            MIN_SURVIVORS_IN_TRAIN.
    """
    frame = setup_ffnnm_dataframe
    settings = setup_config
    checks = []
    failures = []

    # Rows flagged as neither validation, test, nor prediction observations.
    is_training = (~frame['_validation']
                   & ~frame['_test']
                   & ~frame['_predict_obs'])
    duration_counts = frame[is_training]['_duration'].value_counts()

    # Largest duration value whose count exceeds the configured minimum.
    min_survivors = settings['MIN_SURVIVORS_IN_TRAIN']
    n_intervals = duration_counts[duration_counts > min_survivors].index.max()

    categorical_columns = [
        'nonmixed_categorical_var',
        'consistent_mixed_categorical_var',
    ]

    # Keep only the integer codes from factorize; the uniques are unused.
    frame['FILE_DATE'] = pd.factorize(frame['FILE_DATE'])[0]

    try:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=settings,
            data=frame,
            categorical_features=categorical_columns)
        modeler.n_intervals = n_intervals

        # Fill every missing value with the configured placeholder.
        placeholder = modeler.config['NON_CAT_MISSING_VALUE']
        modeler.data = modeler.data.fillna(placeholder)

        modeler.model = modeler.construct_embedding_network()
        weights_before = modeler.model.get_weights()
        modeler.model = modeler.train()
        weights_after = modeler.model.get_weights()

        # One flag per weight array: True when that array is unchanged.
        # Index-based lookup is deliberate so a shorter post-training list
        # raises rather than being silently truncated.
        unchanged_flags = [
            np.all(np.equal(before, weights_after[position]))
            for position, before in enumerate(weights_before)
        ]
        weights_moved = not all(unchanged_flags)
        checks.append(weights_moved)
        if not weights_moved:
            failures.append('Model weights have not changed, suggesting '
                            'failure to train')

        is_keras_model = isinstance(modeler.model, ke.training.Model)
        checks.append(is_keras_model)
        if not is_keras_model:
            failures.append('Model not of type keras.engine.training.Model')
    except Exception as error:
        checks.append(False)
        failures.append(str(error))

    assert all(checks), 'Errors occurred: \n' + '\n'.join(failures)
def test_ffnnm_init(setup_ffnnm_dataframe, setup_config):
    """Check that FeedforwardNeuralNetworkModeler can be instantiated.

    The modeler is built with an empty categorical feature list. Any
    exception raised during setup or construction is recorded as a failure
    message instead of propagating.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe handed to the modeler.
        setup_config: Fixture configuration mapping; must contain
            INDIVIDUAL_IDENTIFIER and TIME_IDENTIFIER.
    """
    frame = setup_ffnnm_dataframe
    settings = setup_config
    checks = []
    failures = []

    try:
        # Blank identifiers default to the first and second columns.
        for key, position in (('INDIVIDUAL_IDENTIFIER', 0),
                              ('TIME_IDENTIFIER', 1)):
            if settings[key] == '':
                settings[key] = frame.columns[position]

        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=settings, data=frame, categorical_features=[])

        instantiated = isinstance(
            modeler, tf_modelers.FeedforwardNeuralNetworkModeler)
        checks.append(instantiated)
        if not instantiated:
            failures.append('Modeler did not instantiate properly')
    except Exception as error:
        checks.append(False)
        failures.append(str(error))

    assert all(checks), 'Errors occurred: \n' + '\n'.join(failures)
def test_ffnnm_init(setup_ffnnm_dataframe, setup_config):
    """Check that FeedforwardNeuralNetworkModeler can be instantiated.

    Any exception raised during setup or construction is recorded as a
    failure message instead of propagating.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe handed to the modeler.
        setup_config: Fixture configuration mapping; must contain
            INDIVIDUAL_IDENTIFIER and TIME_IDENTIFIER.
    """
    frame = setup_ffnnm_dataframe
    settings = setup_config
    checks = []
    failures = []

    try:
        # Blank identifiers default to the first and second columns.
        for key, position in (("INDIVIDUAL_IDENTIFIER", 0),
                              ("TIME_IDENTIFIER", 1)):
            if settings[key] == "":
                settings[key] = frame.columns[position]

        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=settings, data=frame)

        instantiated = isinstance(
            modeler, tf_modelers.FeedforwardNeuralNetworkModeler)
        checks.append(instantiated)
        if not instantiated:
            failures.append("Modeler did not instantiate properly")
    except Exception as error:
        checks.append(False)
        failures.append(str(error))

    assert all(checks), "Errors occurred: \n" + "\n".join(failures)
def test_ffnnm_construct_embedding_network(setup_ffnnm_dataframe, setup_config):
    """Check that construct_embedding_network() yields a Keras Model.

    FILE_DATE in the fixture dataframe is replaced in place by factorized
    integer codes, and the categorical column names are passed to the
    modeler explicitly. Any exception raised while building the modeler or
    the network is recorded as a failure message instead of propagating.

    Args:
        setup_ffnnm_dataframe: Fixture dataframe; must contain FILE_DATE,
            _validation, _test, _predict_obs and _duration.
        setup_config: Fixture configuration mapping; must contain
            MIN_SURVIVORS_IN_TRAIN, INDIVIDUAL_IDENTIFIER and
            TIME_IDENTIFIER.
    """
    frame = setup_ffnnm_dataframe
    settings = setup_config
    checks = []
    failures = []

    # Rows flagged as neither validation, test, nor prediction observations.
    is_training = (~frame['_validation']
                   & ~frame['_test']
                   & ~frame['_predict_obs'])
    duration_counts = frame[is_training]['_duration'].value_counts()

    # Largest duration value whose count exceeds the configured minimum.
    min_survivors = settings['MIN_SURVIVORS_IN_TRAIN']
    n_intervals = duration_counts[duration_counts > min_survivors].index.max()

    categorical_columns = [
        'nonmixed_categorical_var',
        'consistent_mixed_categorical_var',
    ]

    # Keep only the integer codes from factorize; the uniques are unused.
    frame['FILE_DATE'] = pd.factorize(frame['FILE_DATE'])[0]

    try:
        # Blank identifiers default to the first and second columns.
        for key, position in (('INDIVIDUAL_IDENTIFIER', 0),
                              ('TIME_IDENTIFIER', 1)):
            if settings[key] == '':
                settings[key] = frame.columns[position]

        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=settings,
            data=frame,
            categorical_features=categorical_columns)
        modeler.n_intervals = n_intervals
        modeler.model = modeler.construct_embedding_network()

        is_keras_model = isinstance(modeler.model, ke.training.Model)
        checks.append(is_keras_model)
        if not is_keras_model:
            failures.append('Model not of type keras.engine.training.Model')
    except Exception as error:
        checks.append(False)
        failures.append(str(error))

    assert all(checks), 'Errors occurred: \n' + '\n'.join(failures)
def main():
    """Execute default FIFE pipeline from data to forecasts and metrics.

    The configuration is read from the JSON file named by the first
    command-line argument, or, when no argument is given, from the single
    ``.json`` file in the current directory. The pipeline then processes the
    data, trains one modeler (chosen by the TREE_MODELS and
    PROPORTIONAL_HAZARDS config entries, otherwise a feedforward neural
    network), and saves the model, metrics, forecasts, retention rates and
    quantile counts under ``config['RESULTS_PATH']``. If
    FIXED_EFFECT_FEATURES names a non-empty set of columns that are all
    present in the processed data, an interacted fixed effects model is also
    trained, saved and evaluated.

    Raises:
        AssertionError: If no argument is given and the current directory
            contains zero or more than one ``.json`` file.
    """
    # Set up I/O
    checkpoint_time = time()
    if len(sys.argv) > 1:
        with open(sys.argv[1], 'r') as file:
            config = json.load(file)
    else:
        print('No configuration file specified.')
        candidate_configs = [
            file for file in os.listdir() if file.endswith('.json')
        ]
        # NOTE: asserts are kept so the raised exception type is unchanged;
        # be aware they are skipped when Python runs with -O.
        assert len(candidate_configs) >= 1, ((
            'No json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".'))
        assert len(candidate_configs) <= 1, ((
            'Multiple json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".'))
        print(f'Using {candidate_configs[0]} as configuration file.')
        with open(candidate_configs[0], 'r') as file:
            config = json.load(file)
    utils.make_results_reproducible(config['SEED'])
    utils.redirect_output_to_log(path=config['RESULTS_PATH'])
    print('Produced using FIFE: Finite-Interval Forecasting Engine')
    print('Copyright (c) 2018 - 2020, Institute for Defense Analyses (IDA)')
    print('Please cite using the suggested citation in the LICENSE file.\n')
    utils.print_config(config)

    # Process data
    data = utils.import_data_file(config['DATA_FILE_PATH'])
    if config['INDIVIDUAL_IDENTIFIER'] == '':
        config['INDIVIDUAL_IDENTIFIER'] = data.columns[0]
        print('Individual identifier column name not given; assumed to be '
              f'leftmost column ({config["INDIVIDUAL_IDENTIFIER"]})')
    if config['TIME_IDENTIFIER'] == '':
        config['TIME_IDENTIFIER'] = data.columns[1]
        print('Time identifier column name not given; assumed to be '
              f'second-leftmost column ({config["TIME_IDENTIFIER"]})')
    data_processor = processors.PanelDataProcessor(config, data)
    data_processor.build_processed_data()
    print(f'Data processing time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save intermediate files
    utils.save_maps(data_processor.categorical_maps,
                    'Categorical_Maps',
                    path=config['RESULTS_PATH'])
    utils.save_maps(data_processor.numeric_ranges,
                    'Numeric_Ranges',
                    path=config['RESULTS_PATH'])
    utils.save_intermediate_data(data_processor.data,
                                 'Processed_Data',
                                 file_format='pickle',
                                 path=config['RESULTS_PATH'])

    # Train and save model
    # Single source of truth for the model directory, so every model
    # artifact below is written into the folder that is ensured to exist.
    models_dir = f'{config["RESULTS_PATH"]}/Intermediate/Models'
    utils.ensure_folder_existence(models_dir)
    categorical_features = list(data_processor.categorical_maps.keys())
    if config.get('TREE_MODELS'):
        modeler = lgb_modelers.GradientBoostedTreesModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        # modeler.model is iterated as a sequence of per-lead models, each
        # dumped to its own JSON file numbered from 1.
        for i, lead_specific_model in enumerate(modeler.model):
            lead_path = f'{models_dir}/{i + 1}-lead_GBT_Model.json'
            with open(lead_path, 'w') as file:
                json.dump(lead_specific_model.dump_model(), file, indent=4)
    elif config.get('PROPORTIONAL_HAZARDS'):
        modeler = tf_modelers.ProportionalHazardsModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(f'{models_dir}/PH_Model.h5')
    else:
        modeler = tf_modelers.FeedforwardNeuralNetworkModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(f'{models_dir}/FFNN_Model.h5')
    print(f'Model training time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save metrics and forecasts
    utils.save_output_table(
        modeler.evaluate(modeler.data['_validation']
                         & ~modeler.data['_test']),
        'Metrics',
        path=config['RESULTS_PATH'])
    individual_predictions = modeler.forecast()
    utils.save_output_table(individual_predictions,
                            'Survival_Curves',
                            path=config['RESULTS_PATH'])
    utils.save_output_table(
        utils.compute_aggregation_uncertainty(individual_predictions),
        'Aggregate_Survival_Bounds',
        index=False,
        path=config['RESULTS_PATH'])

    # Save and plot retention rates
    lead_periods = config['RETENTION_INTERVAL']
    # Integer codes for the time identifier, ordered by sorted unique value.
    time_ids = pd.factorize(modeler.data[modeler.config['TIME_IDENTIFIER']],
                            sort=True)[0]
    retention_rates = modeler.tabulate_retention_rates(
        lead_periods=lead_periods, time_ids=time_ids)
    utils.save_output_table(retention_rates,
                            'Retention_Rates',
                            path=config['RESULTS_PATH'])
    axes = retention_rates.plot()
    axes.set_ylabel(f'{lead_periods}-period Retention Rate')
    earliest_period = data_processor.numeric_ranges.loc[
        data_processor.config["TIME_IDENTIFIER"], "Minimum"]
    axes.set_xlabel(f'Periods Since {earliest_period}')
    utils.save_plot('Retention_Rates', path=config['RESULTS_PATH'])

    # Save event counts by quantile
    utils.save_output_table(
        modeler.tabulate_survival_by_quantile(
            modeler.data['_validation'] & ~modeler.data['_test'],
            n_quantiles=config['QUANTILES']),
        'Counts_by_Quantile',
        index=False,
        path=config['RESULTS_PATH'])

    # Plot SHAP values for a subset of observations in the final period
    if isinstance(modeler, (lgb_modelers.GradientBoostedTreesModeler)):
        subset = modeler.data.index.isin(data_processor.raw_subset.index)
        shap_values = modeler.compute_shap_values(subset=subset)
        utils.plot_shap_values(
            shap_values,
            data_processor.raw_subset[modeler.categorical_features
                                      + modeler.numeric_features],
            modeler.data[subset][modeler.categorical_features
                                 + modeler.numeric_features],
            config['TIME_IDENTIFIER'],
            path=config['RESULTS_PATH'])

    # Save metrics for interacted fixed effects model
    # Runs only when FIXED_EFFECT_FEATURES is non-empty and every listed
    # feature is a column of the processed data.
    if ((set() < set(config['FIXED_EFFECT_FEATURES']) <= set(
            data_processor.data))):
        ife_modeler = pd_modelers.InteractedFixedEffectsModeler(
            config=config,
            data=data_processor.data,
            categorical_features=categorical_features)
        ife_modeler.build_model()
        # Fix: the original path omitted the separator after RESULTS_PATH,
        # unlike every other model path, so it could point outside the
        # directory created above.
        with open(f'{models_dir}/IFE_Model.p', 'wb') as file:
            pickle.dump(ife_modeler.model, file)
        subset = ife_modeler.data['_validation'] & ~ife_modeler.data['_test']
        utils.save_output_table(ife_modeler.evaluate(subset),
                                'IFE_Metrics',
                                path=config['RESULTS_PATH'])
        ife_quantiles = ife_modeler.tabulate_survival_by_quantile(
            subset, n_quantiles=config['QUANTILES'])
        utils.save_output_table(ife_quantiles,
                                'IFE_Counts_by_Quantile',
                                index=False,
                                path=config['RESULTS_PATH'])

    print(f'Output production time: {time() - checkpoint_time} seconds')