Beispiel #1
0
def get_scoring_case_data_paths() -> Tuple[str, str]:
    """Build the paths of the scoring case train and test CSV files.

    Both files are looked up under ``cases/data/scoring`` and each relative
    path is joined onto ``str(project_root())``.

    :return: tuple ``(train_path, test_path)``
    """
    relative_paths = [
        os.path.join('cases', 'data', 'scoring', csv_name)
        for csv_name in ('scoring_train.csv', 'scoring_test.csv')
    ]
    # project_root() is resolved separately for every file
    train_path, test_path = [
        os.path.join(str(project_root()), relative_path)
        for relative_path in relative_paths
    ]
    return train_path, test_path
Beispiel #2
0
def get_cancer_case_data_paths() -> Tuple[str, str]:
    """Build the paths of the cancer benchmark train and test CSV files.

    Both files are looked up under ``cases/data/benchmark`` and each relative
    path is joined onto ``str(project_root())``.

    :return: tuple ``(train_path, test_path)``
    """
    relative_paths = [
        os.path.join('cases', 'data', 'benchmark', csv_name)
        for csv_name in ('cancer_train.csv', 'cancer_test.csv')
    ]
    # project_root() is resolved separately for every file
    train_path, test_path = [
        os.path.join(str(project_root()), relative_path)
        for relative_path in relative_paths
    ]
    return train_path, test_path
Beispiel #3
0
def test_credit_scoring_problem():
    """Smoke-test the credit scoring case on the simple classification data.

    The same CSV file (located in the ``data`` folder next to this test
    module) is passed as both the train and the test set; the returned
    ROC AUC must exceed 0.5.
    """
    tests_dir = str(os.path.dirname(__file__))
    sample_csv = os.path.join(tests_dir, 'data/simple_classification.csv')

    # train and test deliberately point to the same file
    train_csv = os.path.join(str(project_root()), sample_csv)
    test_csv = os.path.join(str(project_root()), sample_csv)

    time_budget = timedelta(minutes=0.1)
    roc_auc_test = run_credit_scoring_problem(train_csv,
                                              test_csv,
                                              max_lead_time=time_budget)
    assert roc_auc_test > 0.5
Beispiel #4
0
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     is_visualise=False):
    """Fit three chains on a time series and report their validation metrics.

    The chains are: the one returned by ``get_composite_lstm_chain()``, a
    single-node ``'lstm'`` chain and a single-node ``'ridge'`` chain. Each is
    fitted on the train dataset and scored with
    ``calculate_validation_metric`` on the validation dataset.

    :param train_file_path: path of the train CSV, joined onto the project root
    :param test_file_path: path of the validation CSV, joined onto the project root
    :param forecast_length: forwarded to ``TsForecastingParams``
    :param max_window_size: forwarded to ``TsForecastingParams``
    :param is_visualise: forwarded to ``calculate_validation_metric``
    :return: the metric obtained for the single-node ``'ridge'`` chain
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    def _load_time_series(relative_path):
        # resolve the path against the project root and read it as a time series
        absolute_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(absolute_path,
                                  task=task_to_solve,
                                  data_type=DataTypesEnum.ts)

    dataset_to_train = _load_time_series(train_file_path)
    # a dataset for a final validation of the composed model
    dataset_to_validate = _load_time_series(test_file_path)

    composite_chain = get_composite_lstm_chain()

    ridge_chain = Chain()
    ridge_chain.add_node(PrimaryNode('ridge'))

    lstm_chain = Chain()
    lstm_chain.add_node(PrimaryNode('lstm'))

    def _fit_and_score(model_chain, plot_label):
        # fit on the train part, then evaluate the forecast for the validation part
        model_chain.fit(input_data=dataset_to_train, verbose=False)
        forecast = model_chain.predict(dataset_to_validate)
        return calculate_validation_metric(forecast, dataset_to_validate,
                                           plot_label, is_visualise)

    # evaluation order: composite chain, LSTM-only chain, ridge-only chain
    rmse_composite = _fit_and_score(composite_chain,
                                    f'full-composite_{forecast_length}')
    rmse_lstm_only = _fit_and_score(lstm_chain,
                                    f'full-lstm-only_{forecast_length}')
    rmse_simple = _fit_and_score(ridge_chain,
                                 f'full-simple_{forecast_length}')

    print(f'RMSE composite: {rmse_composite}')
    print(f'RMSE simple: {rmse_simple}')
    print(f'RMSE LSTM only: {rmse_lstm_only}')

    return rmse_simple
Beispiel #5
0
def test_metocean_forecasting_problem():
    """Smoke-test the metocean forecasting case on the simple time series.

    The same CSV file (located in the ``data`` folder next to this test
    module) is used as both the train and the test set; the returned metric
    must be below 50.
    """
    tests_dir = str(os.path.dirname(__file__))
    sample_csv = os.path.join(tests_dir, 'data/simple_time_series.csv')

    # train and test deliberately point to the same file
    train_csv = os.path.join(str(project_root()), sample_csv)
    test_csv = os.path.join(str(project_root()), sample_csv)

    rmse = run_metocean_forecasting_problem(train_csv,
                                            test_csv,
                                            forecast_length=1,
                                            max_window_size=1)
    assert rmse < 50
Beispiel #6
0
def create_multi_clf_examples_from_excel(file_path: str,
                                         return_df: bool = False):
    """Convert an Excel file into CSV example data under ``examples/data``.

    The target directory and CSV name are derived from ``file_path``: the
    path is split on ``'.'`` and ``'/'`` and the second-to-last component
    (the file name without extension for a path like ``dir/name.xlsx``) is
    used.

    :param file_path: path of the Excel file to read
    :param return_df: if True, the whole data frame is saved as one CSV;
        otherwise the train and test splits are saved separately
    :return: ``(df, full_file_path)`` when ``return_df`` is True, otherwise
        ``(full_train_file_path, full_test_file_path)``
    """
    df = pd.read_excel(file_path)
    train, test = split_data(df)
    file_dir_name = file_path.replace('.', '/').split('/')[-2]
    file_csv_name = f'{file_dir_name}.csv'
    directory_names = ['examples', 'data', file_dir_name]
    ensure_directory_exists(directory_names)
    if return_df:
        path = os.path.join(*directory_names, file_csv_name)
        full_file_path = os.path.join(str(project_root()), path)
        save_file_to_csv(df, full_file_path)
        return df, full_file_path

    full_train_file_path, full_test_file_path = get_split_data_paths(
        directory_names)
    save_file_to_csv(train, full_train_file_path)
    # the test file must receive the test split (the train split used to be
    # written here, so the test CSV duplicated the train data)
    save_file_to_csv(test, full_test_file_path)
    return full_train_file_path, full_test_file_path
Beispiel #7
0
class ComposerVisualiser:
    """Static helpers that draw chains and the history of a composing run.

    Intermediate PNG frames and the resulting GIF are written to
    ``temp_path``, a ``tmp`` directory that is created (if missing) when the
    class body is executed.
    """

    # 'tmp/' is placed in the directory that contains the joined root path
    root_parent_path = os.path.join('../', str(project_root()))
    root_parent_path_dirname = os.path.dirname(root_parent_path)
    temp_path = os.path.join(root_parent_path_dirname, 'tmp/')
    if 'tmp' not in os.listdir(root_parent_path_dirname):
        os.mkdir(temp_path)
    # file name prefix of the merged frames that are assembled into the GIF
    gif_prefix = 'for_gif_'

    @staticmethod
    def visualise(chain: Chain, save_path: Optional[str] = None):
        """Draw a single chain.

        The figure is shown interactively when ``save_path`` is empty and
        saved to ``save_path`` otherwise. Any exception is caught and
        reported with ``print`` (best-effort visualisation).
        """
        try:
            graph, node_labels = as_nx_graph(chain=chain)
            pos = node_positions(graph.to_undirected())
            plt.figure(figsize=(10, 16))
            nx.draw(graph,
                    pos=pos,
                    with_labels=True,
                    labels=node_labels,
                    font_size=12,
                    font_family='calibri',
                    font_weight='bold',
                    node_size=7000,
                    width=2.0,
                    node_color=colors_by_node_labels(node_labels),
                    cmap='Set3')
            if not save_path:
                plt.show()
            else:
                plt.savefig(save_path)
        except Exception as ex:
            print(f'Visualisation failed with {ex}')

    @staticmethod
    def _draw_chain_frame(chain, title, path):
        """Draw ``chain`` with the given title, save it to ``path`` and reset pyplot."""
        graph, node_labels = as_nx_graph(chain=chain)
        pos = node_positions(graph.to_undirected())
        plt.rcParams['axes.titlesize'] = 20
        plt.rcParams['axes.labelsize'] = 20
        plt.rcParams['figure.figsize'] = [10, 10]
        plt.title(title)
        nx.draw(graph,
                pos=pos,
                with_labels=True,
                labels=node_labels,
                font_size=12,
                font_family='calibri',
                font_weight='bold',
                # node size is scaled by the length of the chain being drawn
                node_size=scaled_node_size(chain.length),
                width=2.0,
                node_color=colors_by_node_labels(node_labels),
                cmap='Set3')
        plt.savefig(path, bbox_inches='tight')

        plt.cla()
        plt.clf()
        plt.close('all')

    @staticmethod
    def _visualise_chains(chains, fitnesses):
        """Save two frames per chain: the current chain and the best one so far.

        Frames are written to ``temp_path`` as ``ch_<id>.png`` and
        ``best_ch_<id>.png``. A chain becomes the new best one when its
        fitness is not greater than the best fitness seen before it.
        ``fitnesses`` is copied, so the caller's list is not modified.
        """
        fitnesses = deepcopy(fitnesses)
        last_best_chain = chains[0]

        prev_fit = fitnesses[0]

        for ch_id, chain in enumerate(chains):
            ComposerVisualiser._draw_chain_frame(
                chain, 'Current chain',
                f'{ComposerVisualiser.temp_path}ch_{ch_id}.png')

            # keep the running minimum of the fitness and the chain behind it
            if fitnesses[ch_id] > prev_fit:
                fitnesses[ch_id] = prev_fit
            else:
                last_best_chain = chain
            prev_fit = fitnesses[ch_id]

            # the helper sizes nodes by the best chain's own length (the
            # current chain's length used to be applied to the best chain)
            ComposerVisualiser._draw_chain_frame(
                last_best_chain, f'Best chain after {round(ch_id)} evals',
                f'{ComposerVisualiser.temp_path}best_ch_{ch_id}.png')

    @staticmethod
    def _visualise_convergence(fitness_history):
        """Save one convergence frame per evaluation to ``temp_path``.

        The history is first turned into its running minimum and plotted
        with the sign flipped; frame ``<k>.png`` (``k`` starting from 1)
        marks evaluation ``k - 1`` with a vertical line.
        """
        fitness_history = deepcopy(fitness_history)
        prev_fit = fitness_history[0]
        for fit_id, fit in enumerate(fitness_history):
            if fit > prev_fit:
                fitness_history[fit_id] = prev_fit
            prev_fit = fitness_history[fit_id]
        ts_set = list(range(len(fitness_history)))
        df = pd.DataFrame({
            'ts': ts_set,
            'fitness': [-f for f in fitness_history]
        })

        for ind, ts in enumerate(ts_set, start=1):
            plt.rcParams['axes.titlesize'] = 20
            plt.rcParams['axes.labelsize'] = 20
            plt.rcParams['figure.figsize'] = [10, 10]

            plt.plot(df['ts'], df['fitness'], label='Composer')
            plt.xlabel('Evaluation', fontsize=18)
            plt.ylabel('Best ROC AUC', fontsize=18)

            plt.axvline(x=ts, color='black')
            plt.legend(loc='upper left')

            path = f'{ComposerVisualiser.temp_path}{ind}.png'
            plt.savefig(path, bbox_inches='tight')

            plt.cla()
            plt.clf()
            plt.close('all')

    @staticmethod
    def visualise_history(chains, fitnesses):
        """Build an animated GIF of the composing history in ``temp_path``.

        Old PNG/GIF files are removed first, the PNG frames are removed again
        at the end. Any exception is caught and reported with ``print``.
        """
        print('START VISUALISATION')
        try:
            ComposerVisualiser._clean(with_gif=True)
            ComposerVisualiser._visualise_chains(chains, fitnesses)
            ComposerVisualiser._visualise_convergence(fitnesses)
            ComposerVisualiser._merge_images(len(chains))
            ComposerVisualiser._combine_gifs()
            ComposerVisualiser._clean()
        except Exception as ex:
            print(f'Visualisation failed with {ex}')

    @staticmethod
    def _merge_images(num_images):
        """Concatenate the three frames of every index side by side.

        For each index from 1 to ``num_images - 1`` the current-chain,
        best-chain and convergence frames are pasted left to right into one
        image saved as ``<gif_prefix><index>.png``.
        """
        for img_idx in range(1, num_images):
            frame_paths = [
                f'{ComposerVisualiser.temp_path}ch_{img_idx}.png',
                f'{ComposerVisualiser.temp_path}best_ch_{img_idx}.png',
                f'{ComposerVisualiser.temp_path}{img_idx}.png'
            ]
            images = []
            try:
                for frame_path in frame_paths:
                    images.append(Image.open(frame_path))
                widths, heights = zip(*(i.size for i in images))

                total_width = sum(widths)
                max_height = max(heights)

                new_im = Image.new('RGB', (total_width, max_height))

                x_offset = 0
                for im in images:
                    new_im.paste(im, (x_offset, 0))
                    x_offset += im.size[0]

                new_im.save(
                    f'{ComposerVisualiser.temp_path}{ComposerVisualiser.gif_prefix}{img_idx}.png'
                )
            finally:
                # release the file handles held by the opened frames
                for im in images:
                    im.close()

    @staticmethod
    def _combine_gifs():
        """Assemble the merged frames into ``final_<timestamp>.gif``.

        Frames are ordered by the integer index embedded in their file name.
        """
        prefix = f'{ComposerVisualiser.temp_path}{ComposerVisualiser.gif_prefix}'
        suffix = '.png'

        def _frame_index(file_name):
            return int(file_name[len(prefix):len(file_name) - len(suffix)])

        # glob once so that names and indices always refer to the same files
        files = sorted(iglob(f'{prefix}*{suffix}'), key=_frame_index)

        with get_writer(
                f'{ComposerVisualiser.temp_path}final_{str(time())}.gif',
                mode='I',
                duration=0.5) as writer:
            for filename in files:
                image = imread(filename)
                writer.append_data(image)

    @staticmethod
    def _clean(with_gif=False):
        """Remove PNG files (and GIF files if ``with_gif``) from ``temp_path``.

        Failures are printed and otherwise ignored.
        """
        try:
            files = glob(f'{ComposerVisualiser.temp_path}*.png')
            if with_gif:
                files += glob(f'{ComposerVisualiser.temp_path}*.gif')
            for file in files:
                remove(file)
        except Exception as ex:
            print(ex)
    chain_evo_composed.fit(input_data=dataset_to_compose, verbose=True)

    if is_visualise:
        ComposerVisualiser.visualise(chain_evo_composed)

    # the quality assessment for the obtained composite models
    roc_on_valid_evo_composed = calculate_validation_metric(
        chain_evo_composed, dataset_to_validate)

    print(f'Composed ROC AUC is {round(roc_on_valid_evo_composed, 3)}')

    return roc_on_valid_evo_composed


if __name__ == '__main__':
    # the dataset was obtained from https://www.kaggle.com/c/GiveMeSomeCredit

    # relative locations of the data:
    # - the train file is used as a train and test set during composition
    # - the test file is used for a final validation of the composed model
    file_path_train = 'cases/data/scoring/scoring_train.csv'
    file_path_test = 'cases/data/scoring/scoring_test.csv'

    # both locations are resolved against the project root
    full_path_train = os.path.join(str(project_root()), file_path_train)
    full_path_test = os.path.join(str(project_root()), file_path_test)

    run_credit_scoring_problem(
        full_path_train,
        full_path_test,
        is_visualise=True,
    )
Beispiel #9
0
def run_metocean_forecasting_problem(train_file_path,
                                     test_file_path,
                                     forecast_length=1,
                                     max_window_size=64,
                                     with_visualisation=True):
    """Compose a forecasting chain with a fixed structure and evaluate it.

    The chain from ``get_composite_lstm_chain()`` is passed as the initial
    chain to ``FixedStructureComposer``; the composed chain is fitted on the
    train dataset and scored on the validation dataset with
    ``calculate_validation_metric``.

    :param train_file_path: path of the train CSV, joined onto the project root
    :param test_file_path: path of the validation CSV, joined onto the project root
    :param forecast_length: forwarded to ``TsForecastingParams``
    :param max_window_size: forwarded to ``TsForecastingParams``
    :param with_visualisation: if True, the composed chain is drawn; the flag
        is also forwarded to ``calculate_validation_metric`` as ``is_visualise``
    :return: the validation metric of the composed chain
    """
    # specify the task to solve
    forecasting_params = TsForecastingParams(forecast_length=forecast_length,
                                             max_window_size=max_window_size)
    task_to_solve = Task(TaskTypesEnum.ts_forecasting, forecasting_params)

    def _load_time_series(relative_path):
        # resolve the path against the project root and read it as a time series
        absolute_path = os.path.join(str(project_root()), relative_path)
        return InputData.from_csv(absolute_path,
                                  task=task_to_solve,
                                  data_type=DataTypesEnum.ts)

    dataset_to_train = _load_time_series(train_file_path)
    # a dataset for a final validation of the composed model
    dataset_to_validate = _load_time_series(test_file_path)

    metric_function = MetricsRepository().metric_by_id(
        RegressionMetricsEnum.RMSE)

    ref_chain = get_composite_lstm_chain()

    primary_models = ['trend_data_model', 'residual_data_model']
    secondary_models = [
        'rfr', 'linear', 'ridge', 'lasso', 'additive_data_model'
    ]

    composer = FixedStructureComposer()

    requirement_options = dict(
        primary=primary_models,
        secondary=secondary_models,
        max_arity=2,
        max_depth=4,
        pop_size=10,
        num_of_generations=10,
        crossover_prob=0,
        mutation_prob=0.8,
        max_lead_time=datetime.timedelta(minutes=20),
    )
    composer_requirements = GPComposerRequirements(**requirement_options)

    chain = composer.compose_chain(data=dataset_to_train,
                                   initial_chain=ref_chain,
                                   composer_requirements=composer_requirements,
                                   metrics=metric_function,
                                   is_visualise=False)

    if with_visualisation:
        ComposerVisualiser.visualise(chain)

    chain.fit(input_data=dataset_to_train, verbose=False)
    forecast = chain.predict(dataset_to_validate)
    rmse_on_valid = calculate_validation_metric(
        forecast,
        dataset_to_validate,
        f'full-composite_{forecast_length}',
        is_visualise=with_visualisation)

    print(f'RMSE composite: {rmse_on_valid}')

    return rmse_on_valid
def run_oil_forecasting_problem(train_file_path,
                                train_file_path_crm,
                                forecast_length,
                                max_window_size,
                                is_visualise=False,
                                well_id='Unknown'):
    """Run a sliding-window forecasting experiment for one well.

    Three chains are refitted from scratch on every window: a single
    ``'lstm'`` node on the plain data, a single ``'lstm'`` node on the CRM
    data and the chain from ``get_comp_chain()`` on the CRM data. The
    per-window predictions are accumulated with ``merge_datasets`` and
    evaluated together by ``calculate_validation_metric``.

    :param train_file_path: path of the plain data CSV, joined onto the
        project root; the validation dataset is read from this same file
    :param train_file_path_crm: path of the CRM data CSV, joined onto the
        project root
    :param forecast_length: forwarded to ``TsForecastingParams``
    :param max_window_size: forwarded to ``TsForecastingParams``
    :param is_visualise: forwarded to ``calculate_validation_metric``
    :param well_id: label printed and forwarded to
        ``calculate_validation_metric``
    :return: the result of ``calculate_validation_metric``; its first eight
        items are printed as RMSE and DTW values
    """
    # specify the task to solve
    task_to_solve = Task(
        TaskTypesEnum.ts_forecasting,
        TsForecastingParams(forecast_length=forecast_length,
                            max_window_size=max_window_size,
                            return_all_steps=False,
                            make_future_prediction=False))

    full_path_train = os.path.join(str(project_root()), train_file_path)
    dataset_to_train = InputData.from_csv(full_path_train,
                                          task=task_to_solve,
                                          data_type=DataTypesEnum.ts,
                                          delimiter=',')

    # a dataset for a final validation of the composed model
    # (read from the train file; validation windows are shifted subsets of it)
    full_path_test = os.path.join(str(project_root()), train_file_path)
    dataset_to_validate = InputData.from_csv(full_path_test,
                                             task=task_to_solve,
                                             data_type=DataTypesEnum.ts,
                                             delimiter=',')

    full_path_train_crm = os.path.join(str(project_root()),
                                       train_file_path_crm)
    dataset_to_train_crm = InputData.from_csv(full_path_train_crm,
                                              task=task_to_solve,
                                              data_type=DataTypesEnum.ts,
                                              delimiter=',')

    dataset_to_validate_crm = copy(dataset_to_train_crm)

    prediction_full = None
    prediction_full_crm = None
    prediction_full_crm_opt = None

    forecast_window_shift_num = 4

    depth = 100

    for forecasting_step in range(forecast_window_shift_num):
        # the train window starts at depth * step and ends at depth * (step + 3)
        start = depth * forecasting_step
        end = depth * 2 + depth * (forecasting_step + 1)

        dataset_to_train_local = dataset_to_train.subset(start, end)
        dataset_to_train_local_crm = dataset_to_train_crm.subset(start, end)

        # the validation window is the train window shifted forward by depth
        dataset_to_validate_local = dataset_to_validate.subset(
            start + depth, end + depth)
        dataset_to_validate_local_crm = dataset_to_validate_crm.subset(
            start + depth, end + depth)

        chain_simple = Chain(PrimaryNode('lstm'))
        chain_simple_crm = Chain(PrimaryNode('lstm'))
        chain_crm_opt = get_comp_chain()

        chain_simple.fit_from_scratch(input_data=dataset_to_train_local,
                                      verbose=False)
        chain_simple_crm.fit_from_scratch(
            input_data=dataset_to_train_local_crm, verbose=False)
        chain_crm_opt.fit_from_scratch(input_data=dataset_to_train_local_crm,
                                       verbose=False)

        prediction = chain_simple.predict(dataset_to_validate_local)
        prediction_crm = chain_simple_crm.predict(
            dataset_to_validate_local_crm)
        prediction_crm_opt = chain_crm_opt.predict(
            dataset_to_validate_local_crm)

        prediction_full = merge_datasets(prediction_full, prediction,
                                         forecasting_step)
        prediction_full_crm = merge_datasets(prediction_full_crm,
                                             prediction_crm, forecasting_step)
        prediction_full_crm_opt = merge_datasets(prediction_full_crm_opt,
                                                 prediction_crm_opt,
                                                 forecasting_step)

    rmse_on_valid_simple = calculate_validation_metric(
        prediction_full, prediction_full_crm, prediction_full_crm_opt,
        dataset_to_validate, well_id, is_visualise)

    print(well_id)
    print(f'RMSE CRM: {round(rmse_on_valid_simple[0])}')
    print(f'RMSE ML: {round(rmse_on_valid_simple[1])}')
    print(f'RMSE ML with CRM: {round(rmse_on_valid_simple[2])}')
    print(f'Evo RMSE ML with CRM: {round(rmse_on_valid_simple[3])}')

    print(f'DTW CRM: {round(rmse_on_valid_simple[4])}')
    print(f'DTW ML: {round(rmse_on_valid_simple[5])}')
    print(f'DTW ML with CRM: {round(rmse_on_valid_simple[6])}')
    print(f'DTW RMSE ML with CRM: {round(rmse_on_valid_simple[7])}')

    return rmse_on_valid_simple


if __name__ == '__main__':
    # the dataset was obtained from Volve dataset of oil field

    for well in ['5351', '5599', '7078', '7289', '7405f']:
        # plain production data of the well
        file_path_train = f'../production_forecasting/data/oil_prod_X{well}.csv'
        full_path_train = os.path.join(str(project_root()), file_path_train)

        # CRM data of the same well
        full_path_train_crm = os.path.join(
            str(project_root()),
            f'../production_forecasting/data/oil_crm_prod_X{well}.csv')

        run_oil_forecasting_problem(
            full_path_train,
            full_path_train_crm,
            forecast_length=100,
            max_window_size=100,
            is_visualise=True,
            well_id=well,
        )