Example #1
def test_process_single_column(setup_config, setup_dataframe):
    """Test that PanelDataProcessor.process_single_column() drops degenerate
    columns, correctly casts categorical columns, and does not modify individual
    identifier column."""
    errors_list = []
    indiv_id_col = setup_config["INDIVIDUAL_IDENTIFIER"]
    degenerate_cols = [
        col for col in setup_dataframe
        if setup_dataframe[col].isnull().all()
        or (setup_dataframe[col].nunique() < 2)
    ]
    categorical_cols = [
        col for col in setup_dataframe
        if ("categorical_var" in col) and (col not in degenerate_cols)
    ]
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    processed_col = data_processor.process_single_column(indiv_id_col)
    if not processed_col.equals(setup_dataframe[indiv_id_col]):
        errors_list.append(
            f"Individual identifier column {indiv_id_col} modified.")
    for degenerate_col in degenerate_cols:
        processed_col = data_processor.process_single_column(degenerate_col)
        if processed_col is not None:
            errors_list.append(
                f"Degenerate column {degenerate_col} not dropped from dataframe."
            )
    for categorical_col in categorical_cols:
        processed_col = data_processor.process_single_column(categorical_col)
        if not isinstance(processed_col.dtype, pd.api.types.CategoricalDtype):
            errors_list.append(
                f"Categorical column {categorical_col} not cast to categorical dtype."
            )
    assert not errors_list, "Errors occurred: \n{}".format(
        "\n".join(errors_list))
Example #2
def test_sort_panel_data(setup_config, setup_dataframe):
    """Test that sort_panel_data re-sorts swapped observations."""
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    rows_before_swap = data_processor.data.iloc[[1, 2], :].copy()
    data_processor.data.iloc[[1, 2], :] = data_processor.data.iloc[[2, 1], :]
    data_processor.data = data_processor.sort_panel_data()
    assert data_processor.data.iloc[[1, 2], :].equals(rows_before_swap)
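
The method under test only needs to restore identifier order for this check to pass. A minimal stand-in, assuming sort_panel_data orders rows by the configured individual and time identifiers (the real method may do more):

def sort_panel_data_sketch(data, config):
    # Assumed behavior: order observations by individual, then by period,
    # and renumber rows so positional comparisons line up.
    return data.sort_values(
        [config["INDIVIDUAL_IDENTIFIER"], config["TIME_IDENTIFIER"]]
    ).reset_index(drop=True)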
Example #3
def test_check_panel_consistency(setup_config, setup_dataframe):
    """Test that check_panel_consistency raises an error if an
    observation is duplicated.
    """
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    data_processor.data = pd.concat(
        [data_processor.data, data_processor.data.iloc[[1], :]])
    with pytest.raises(AssertionError):
        data_processor.check_panel_consistency()
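
The consistency check this test exercises can be stated directly. A minimal sketch, assuming the only invariant is that no (individual, period) pair appears twice (the real check may enforce more):

def check_panel_consistency_sketch(data, config):
    # Assumed contract: each (individual, period) pair appears at most once.
    duplicated = data.duplicated(
        subset=[config["INDIVIDUAL_IDENTIFIER"], config["TIME_IDENTIFIER"]])
    assert not duplicated.any(), "Duplicated observations found."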
Example #4
def test_flag_final_periods(setup_config, setup_dataframe):
    """Test that flag_final_periods flags observations in the maximum period
    and only those observations as being in the final period.
    """
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    final_periods_flag = data_processor.flag_final_periods(1)
    final_period = np.max(data_processor.data["FILE_DATE"])
    pseudo_final_period_flag = data_processor.data["FILE_DATE"] == final_period
    assert final_periods_flag.equals(pseudo_final_period_flag)
Example #5
def test_process_all_columns(setup_config, setup_dataframe):
    """Test that PanelDataProcessor.process_all_columns() replaces the data
    attribute of the PanelDataProcessor instance with a pandas DataFrame."""
    errors_list = []
    for parallelize in [True, False]:
        data_processor = processors.PanelDataProcessor(
            config=setup_config, data=setup_dataframe)
        data_processor.process_all_columns(parallelize=parallelize)
        if not isinstance(data_processor.data, pd.DataFrame):
            errors_list.append(
                f"Data attribute returned when parallelize={parallelize} "
                "is not an instance of pd.DataFrame.")
    assert not errors_list, "Errors occurred: \n{}".format(
        "\n".join(errors_list))
Example #6
def test_flag_event_observed(setup_config, setup_dataframe):
    """Test that individuals not observed in the maximum period have
    all observations flagged as event observed."""
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    data_processor.data["not_right_censored"] = \
        data_processor.flag_event_observed()
    data_processor.data["max_date_by_person"] = (data_processor.data.groupby(
        data_processor.config["INDIVIDUAL_IDENTIFIER"])[
            data_processor.config["TIME_IDENTIFIER"]].transform(max))
    date_is_less_than_max_for_event_observed_obs = (
        data_processor.data.loc[data_processor.data["not_right_censored"],
                                "max_date_by_person"] <
        data_processor.data[data_processor.config["TIME_IDENTIFIER"]].max())
    assert np.prod(date_is_less_than_max_for_event_observed_obs)
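
Restated outside the test, the contract being checked is simple: an event is observed exactly when the individual's last period precedes the panel's last period. A standalone sketch of that rule (an assumed contract for flag_event_observed, not its actual implementation):

def flag_event_observed_sketch(data, config):
    # An individual still present in the panel's final period is
    # right-censored; everyone else has their event observed.
    last_period_by_person = data.groupby(
        config["INDIVIDUAL_IDENTIFIER"]
    )[config["TIME_IDENTIFIER"]].transform("max")
    return last_period_by_person < data[config["TIME_IDENTIFIER"]].max()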
Example #7
def test_flag_validation_individuals(setup_config, setup_dataframe):
    """Test that validation set is given share of observations and contains
    all observations of each individual therein.
    """
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    error_tolerance = 0.1
    data_processor.data["validation"] = (
        data_processor.flag_validation_individuals())
    share_in_validation_sample = np.mean(data_processor.data["validation"])
    share_approximately_correct = (
        (data_processor.config["VALIDATION_SHARE"] - error_tolerance) <=
        share_in_validation_sample) and (
            share_in_validation_sample <=
            (data_processor.config["VALIDATION_SHARE"] + error_tolerance))
    rates_individuals_within_validation_group = data_processor.data.groupby(
        data_processor.config["INDIVIDUAL_IDENTIFIER"])["validation"].mean()
    individual_consistently_in_validation_group = (
        rates_individuals_within_validation_group
        == 1) | (rates_individuals_within_validation_group == 0)
    assert share_approximately_correct
    assert np.mean(individual_consistently_in_validation_group) == 1
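
A sketch of the sampling behavior these assertions imply: draw whole individuals at the VALIDATION_SHARE rate, never splitting a person across sets. This is an assumed implementation, not FIFE's actual one:

import numpy as np

def flag_validation_individuals_sketch(data, config, seed=None):
    # Sample whole individuals so each person's observations land entirely
    # inside or outside the validation set.
    ids = data[config["INDIVIDUAL_IDENTIFIER"]].unique()
    rng = np.random.default_rng(seed)
    n_validation = round(len(ids) * config["VALIDATION_SHARE"])
    validation_ids = rng.choice(ids, size=n_validation, replace=False)
    return data[config["INDIVIDUAL_IDENTIFIER"]].isin(validation_ids)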
Example #8
def test_PanelDataProcessor(setup_config, setup_dataframe):
    """Test that PanelDataProcessor binds config and data arguments."""
    data_processor = processors.PanelDataProcessor(config=setup_config,
                                                   data=setup_dataframe)
    assert data_processor.config == setup_config and data_processor.data.equals(
        setup_dataframe)
Example #9
def main():
    """Execute default FIFE pipeline from data to survival forecasts and metrics."""
    checkpoint_time = time()
    config = parse_config()
    if config.get("EXIT_COL_PATH"):
        raise NotImplementedError(
            "Forecasting exit circumstances from the command line is not yet supported. Try LGBExitModeler from the FIFE Python package."
        )
    if config.get("STATE_COL"):
        raise NotImplementedError(
            "Forecasting future state from the command line is not yet supported. Try LGBStateModeler from the FIFE Python package."
        )
    utils.make_results_reproducible(config["SEED"])
    utils.redirect_output_to_log(path=config["RESULTS_PATH"])
    utils.print_copyright()
    utils.print_config(config)
    data = read_data(config)
    print(f"I/O setup time: {time() - checkpoint_time} seconds")

    checkpoint_time = time()
    data_processor = processors.PanelDataProcessor(config, data)
    data_processor.build_processed_data()
    utils.save_intermediate_data(
        data_processor.data,
        "Processed_Data",
        file_format="pickle",
        path=config["RESULTS_PATH"],
    )
    print(f"Data processing time: {time() - checkpoint_time} seconds")

    by_feature = config.get("BY_FEATURE", "")
    if by_feature != "" and by_feature not in data.columns:
        raise ValueError(
            "The selected feature for 'BY_FEATURE' is not in the dataset. Check spelling or the original dataset to ensure that you are entering the correct feature name."
        )

    checkpoint_time = time()
    utils.ensure_folder_existence(f'{config["RESULTS_PATH"]}/Intermediate/Models')
    test_intervals = config.get("TEST_INTERVALS", config.get("TEST_PERIODS", 0) - 1)
    if config.get("TREE_MODELS"):
        modeler_class = lgb_modelers.LGBSurvivalModeler
    elif config.get("PROPORTIONAL_HAZARDS"):
        modeler_class = tf_modelers.ProportionalHazardsModeler
    else:
        modeler_class = tf_modelers.FeedforwardNeuralNetworkModeler
    modeler = modeler_class(config=config, data=data_processor.data)
    modeler.n_intervals = (
        test_intervals if test_intervals > 0 else modeler.set_n_intervals()
    )
    if not config.get("TIME_ID_AS_FEATURE"):
        modeler.numeric_features.remove(config["TIME_IDENTIFIER"])
    if config.get("HYPER_TRIALS", 0) > 0:
        params = modeler.hyperoptimize(config["HYPER_TRIALS"])
    else:
        params = None
    modeler.build_model(n_intervals=modeler.n_intervals, params=params)
    modeler.save_model(path=f"{config['RESULTS_PATH']}/Intermediate/Models/")
    print(f"Model training time: {time() - checkpoint_time} seconds")

    checkpoint_time = time()
    if test_intervals > 0:

        # Save metrics
        max_test_intervals = int((len(set(modeler.data["_period"])) - 1) / 2)

        evaluation_subset = modeler.data["_period"] == (
            modeler.data["_period"].max() - min(test_intervals, max_test_intervals)
        )

        if by_feature != "":
            values = list(set(data[by_feature]))

            for feature_value in values:

                evaluation_subset_by_feature = modeler.data[by_feature] == feature_value
                evaluation_subset_comparison = (
                    evaluation_subset & evaluation_subset_by_feature
                )

                utils.save_output_table(
                    modeler.evaluate(evaluation_subset_comparison),
                    f"Metrics_{feature_value}",
                    path=config["RESULTS_PATH"],
                )

        utils.save_output_table(
            modeler.evaluate(evaluation_subset),
            "Metrics",
            path=config["RESULTS_PATH"],
        )

        # Save counts by quantile
        utils.save_output_table(
            modeler.tabulate_survival_by_quantile(
                n_quantiles=config["QUANTILES"],
                subset=evaluation_subset,
            ),
            "Counts_by_Quantile",
            index=False,
            path=config["RESULTS_PATH"],
        )

        # Save forecast errors
        actuals = np.array(
            [
                modeler.data[evaluation_subset]["_duration"] > time_horizon
                for time_horizon in range(test_intervals)
            ]
        ).T
        predictions = modeler.predict(evaluation_subset)
        utils.save_output_table(
            pd.DataFrame(
                predictions - actuals,
                columns=[
                    f"{time_horizon + 1}-period Forecast Error"
                    for time_horizon in range(test_intervals)
                ],
            ),
            "Forecast_Errors",
            index=False,
            path=config["RESULTS_PATH"],
        )

        # Save calibration errors
        actual_share, predicted_share = calibration_curve(
            actuals.flatten(), predictions.flatten(), n_bins=8, strategy="quantile"
        )
        calibration_errors = pd.DataFrame(
            [predicted_share, actual_share, actual_share - predicted_share]
        ).T
        calibration_errors.columns = [
            "Predicted Share",
            "Actual Share",
            "Calibration Error",
        ]
        calibration_errors.index.name = "Quantile"
        calibration_errors.index = calibration_errors.index + 1
        utils.save_output_table(
            calibration_errors,
            "Calibration_Errors",
            path=config["RESULTS_PATH"],
        )

    else:

        # Save forecasts
        individual_predictions = modeler.forecast()
        utils.save_output_table(
            individual_predictions, "Survival_Curves", path=config["RESULTS_PATH"]
        )

        # Save aggregated forecasts with uncertainty intervals
        utils.save_output_table(
            utils.compute_aggregation_uncertainty(individual_predictions),
            "Aggregate_Survival_Bounds",
            index=False,
            path=config["RESULTS_PATH"],
        )

        # Save and plot actual, fitted, and forecasted retention rates
        lead_periods = config["RETENTION_INTERVAL"]
        time_ids = pd.factorize(
            modeler.data[modeler.config["TIME_IDENTIFIER"]], sort=True
        )[0]
        retention_rates = modeler.tabulate_retention_rates(
            lead_periods=lead_periods, time_ids=time_ids
        )
        utils.save_output_table(
            retention_rates, "Retention_Rates", path=config["RESULTS_PATH"]
        )
        axes = retention_rates.plot()
        axes.set_ylabel(f"{lead_periods}-period Retention Rate")
        earliest_period = data_processor.data[
            data_processor.config["TIME_IDENTIFIER"]
        ].min()
        axes.set_xlabel(f"Periods Since {earliest_period}")
        utils.save_plot("Retention_Rates", path=config["RESULTS_PATH"])

        # Plot SHAP values for a subset of observations in the final period
        sample_size = config.get("SHAP_SAMPLE_SIZE", 0)
        if (
            isinstance(modeler, lgb_modelers.GradientBoostedTreesModeler)
            and sample_size > 0
        ):
            shap_observations = (
                modeler.data[modeler.data["_predict_obs"]]
                .sample(n=sample_size)
                .sort_index()
            )
            subset = modeler.data.index.isin(shap_observations.index)
            shap_values = modeler.compute_shap_values(subset=subset)
            utils.plot_shap_values(
                shap_values,
                shap_observations[
                    modeler.categorical_features + modeler.numeric_features
                ],
                modeler.data[subset][
                    modeler.categorical_features + modeler.numeric_features
                ],
                config["TIME_IDENTIFIER"],
                path=config["RESULTS_PATH"],
            )

    print(f"Output production time: {time() - checkpoint_time} seconds")
Example #10
def main():
    """Execute default FIFE pipeline from data to forecasts and metrics."""
    # Set up I/O
    checkpoint_time = time()
    if len(sys.argv) > 1:
        with open(sys.argv[1], 'r') as file:
            config = json.load(file)
    else:
        print('No configuration file specified.')
        candidate_configs = [
            file for file in os.listdir() if file.endswith('.json')
        ]
        assert len(candidate_configs) >= 1, (
            'No json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".')
        assert len(candidate_configs) <= 1, (
            'Multiple json files found in current directory. '
            'Please specify a configuration file in your command, '
            'e.g., "fife example_config.json".')
        print(f'Using {candidate_configs[0]} as configuration file.')
        with open(candidate_configs[0], 'r') as file:
            config = json.load(file)

    utils.make_results_reproducible(config['SEED'])
    utils.redirect_output_to_log(path=config['RESULTS_PATH'])
    print('Produced using FIFE: Finite-Interval Forecasting Engine')
    print('Copyright (c) 2018 - 2020, Institute for Defense Analyses (IDA)')
    print('Please cite using the suggested citation in the LICENSE file.\n')
    utils.print_config(config)

    # Process data
    data = utils.import_data_file(config['DATA_FILE_PATH'])
    if config['INDIVIDUAL_IDENTIFIER'] == '':
        config['INDIVIDUAL_IDENTIFIER'] = data.columns[0]
        print('Individual identifier column name not given; assumed to be '
              f'leftmost column ({config["INDIVIDUAL_IDENTIFIER"]})')
    if config['TIME_IDENTIFIER'] == '':
        config['TIME_IDENTIFIER'] = data.columns[1]
        print('Time identifier column name not given; assumed to be '
              f'second-leftmost column ({config["TIME_IDENTIFIER"]})')
    data_processor = processors.PanelDataProcessor(config, data)
    data_processor.build_processed_data()
    print(f'Data processing time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save intermediate files
    utils.save_maps(data_processor.categorical_maps,
                    'Categorical_Maps',
                    path=config['RESULTS_PATH'])
    utils.save_maps(data_processor.numeric_ranges,
                    'Numeric_Ranges',
                    path=config['RESULTS_PATH'])
    utils.save_intermediate_data(data_processor.data,
                                 'Processed_Data',
                                 file_format='pickle',
                                 path=config['RESULTS_PATH'])

    # Train and save model
    utils.ensure_folder_existence(
        f'{config["RESULTS_PATH"]}/Intermediate/Models')
    categorical_features = list(data_processor.categorical_maps.keys())
    if config.get('TREE_MODELS'):
        modeler = \
            lgb_modelers.GradientBoostedTreesModeler(
                config=config, data=data_processor.data,
                categorical_features=categorical_features)
        modeler.build_model()
        for i, lead_specific_model in enumerate(modeler.model):
            lead_path = (f'{config["RESULTS_PATH"]}/Intermediate/Models/'
                         f'{i + 1}-lead_GBT_Model.json')
            with open(lead_path, 'w') as file:
                json.dump(lead_specific_model.dump_model(), file, indent=4)
    elif config.get('PROPORTIONAL_HAZARDS'):
        modeler = \
            tf_modelers.ProportionalHazardsModeler(
                config=config, data=data_processor.data,
                categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(
            f'{config["RESULTS_PATH"]}/Intermediate/Models/PH_Model.h5')
    else:
        modeler = \
            tf_modelers.FeedforwardNeuralNetworkModeler(
                config=config, data=data_processor.data,
                categorical_features=categorical_features)
        modeler.build_model()
        modeler.model.save(
            f'{config["RESULTS_PATH"]}/Intermediate/Models/FFNN_Model.h5')
    print(f'Model training time: {time() - checkpoint_time} seconds')
    checkpoint_time = time()

    # Save metrics and forecasts
    utils.save_output_table(modeler.evaluate(modeler.data['_validation']
                                             & ~modeler.data['_test']),
                            'Metrics',
                            path=config['RESULTS_PATH'])
    individual_predictions = modeler.forecast()
    utils.save_output_table(individual_predictions,
                            'Survival_Curves',
                            path=config['RESULTS_PATH'])
    utils.save_output_table(
        utils.compute_aggregation_uncertainty(individual_predictions),
        'Aggregate_Survival_Bounds',
        index=False,
        path=config['RESULTS_PATH'])

    # Save and plot retention rates
    lead_periods = config['RETENTION_INTERVAL']
    time_ids = pd.factorize(modeler.data[modeler.config['TIME_IDENTIFIER']],
                            sort=True)[0]
    retention_rates = modeler.tabulate_retention_rates(
        lead_periods=lead_periods, time_ids=time_ids)
    utils.save_output_table(retention_rates,
                            'Retention_Rates',
                            path=config['RESULTS_PATH'])
    axes = retention_rates.plot()
    axes.set_ylabel(f'{lead_periods}-period Retention Rate')
    earliest_period = data_processor.numeric_ranges.loc[
        data_processor.config["TIME_IDENTIFIER"], "Minimum"]
    axes.set_xlabel(f'Periods Since {earliest_period}')
    utils.save_plot('Retention_Rates', path=config['RESULTS_PATH'])

    # Save event counts by quantile
    utils.save_output_table(modeler.tabulate_survival_by_quantile(
        modeler.data['_validation'] & ~modeler.data['_test'],
        n_quantiles=config['QUANTILES']),
                            'Counts_by_Quantile',
                            index=False,
                            path=config['RESULTS_PATH'])

    # Plot SHAP values for a subset of observations in the final period
    if isinstance(modeler, lgb_modelers.GradientBoostedTreesModeler):
        subset = modeler.data.index.isin(data_processor.raw_subset.index)
        shap_values = modeler.compute_shap_values(subset=subset)
        utils.plot_shap_values(
            shap_values,
            data_processor.raw_subset[modeler.categorical_features +
                                      modeler.numeric_features],
            modeler.data[subset][modeler.categorical_features +
                                 modeler.numeric_features],
            config['TIME_IDENTIFIER'],
            path=config['RESULTS_PATH'])

    # Save metrics for interacted fixed effects model
    if (set() < set(config['FIXED_EFFECT_FEATURES'])
            <= set(data_processor.data)):
        ife_modeler = \
            pd_modelers.InteractedFixedEffectsModeler(
                config=config, data=data_processor.data,
                categorical_features=categorical_features)
        ife_modeler.build_model()
        with open(f'{config["RESULTS_PATH"]}Intermediate/Models/IFE_Model.p',
                  'wb') as file:
            pickle.dump(ife_modeler.model, file)
        subset = ife_modeler.data['_validation'] & ~ife_modeler.data['_test']
        utils.save_output_table(ife_modeler.evaluate(subset),
                                'IFE_Metrics',
                                path=config['RESULTS_PATH'])
        ife_quantiles = ife_modeler.tabulate_survival_by_quantile(
            subset, n_quantiles=config['QUANTILES'])
        utils.save_output_table(ife_quantiles,
                                'IFE_Counts_by_Quantile',
                                index=False,
                                path=config['RESULTS_PATH'])

    print(f'Output production time: {time() - checkpoint_time} seconds')
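
Both entry points run from the command line (e.g., "fife example_config.json", as the assertion messages above note). For use as a library, the same pieces compose directly. A minimal sketch, assuming a config dict and panel DataFrame are already in hand and that the build_model and forecast defaults suffice:

from fife import lgb_modelers, processors

# config: a dict like the sketch after Example #9; data: a panel DataFrame.
data_processor = processors.PanelDataProcessor(config=config, data=data)
data_processor.build_processed_data()
modeler = lgb_modelers.LGBSurvivalModeler(config=config,
                                          data=data_processor.data)
modeler.build_model()  # Example #10 also calls this with no arguments
individual_predictions = modeler.forecast()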