Example #1
def cli(sys_argv: List[str]):
    """Command line interface to merge two dataset

    :param sys_argv: list of command line arguments
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('path_to_dataset_1',
                        type=str,
                        help='Path to a pickled dataset')

    parser.add_argument('path_to_dataset_2',
                        type=str,
                        help='Path to a pickled dataset')

    parser.add_argument('--out_path',
                        type=str,
                        help='Path to save the merged dataset')

    args = parser.parse_args(sys_argv)

    dataset1 = file_utils.pickle2dataframe(args.path_to_dataset_1)
    dataset2 = file_utils.pickle2dataframe(args.path_to_dataset_2)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement
    merged_dataset = pd.concat([dataset1, dataset2])
    file_utils.dataframe2pickle(merged_dataset, args.out_path)
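
Every example on this page goes through the project's `file_utils` module, whose implementation is not shown. The following is a minimal sketch of what `pickle2dataframe` and `dataframe2pickle` could look like, assuming they are thin wrappers around pandas' pickle I/O; the real helpers may differ.

import pandas as pd
from pandas import DataFrame


def pickle2dataframe(path: str) -> DataFrame:
    # Assumed behaviour: load a pickled DataFrame from disk
    return pd.read_pickle(path)


def dataframe2pickle(dataframe: DataFrame, path: str) -> None:
    # Assumed behaviour: serialize the DataFrame to a pickle file on disk
    dataframe.to_pickle(path)

With helpers like these, the merge CLI above could be invoked as `cli(['a.pkl', 'b.pkl', '--out_path', 'merged.pkl'])`, where the file names are purely illustrative.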
Example #2
def load_data(self, path, transform_numerical, transform_categorical):
    """Load the pickled data set, keep the configured features and apply the optional transforms"""
    raw_data = file_utils.pickle2dataframe(path)
    # Keep only the features this data set was configured with
    data = raw_data[self.all_features]
    # Apply the optional transforms in sequence on the selected features
    if transform_numerical:
        data = transform_numerical(data)
    if transform_categorical:
        data = transform_categorical(data)
    return data
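
The `transform_numerical` and `transform_categorical` arguments are treated as optional callables that take a DataFrame and return a transformed DataFrame. A hypothetical numerical transform, reusing the `PX_LAST` column name from Example #5 purely for illustration:

import numpy as np
from pandas import DataFrame


def log_px_last(data: DataFrame) -> DataFrame:
    # Return a copy with the price column log-scaled; the input is left untouched
    transformed = data.copy()
    transformed['PX_LAST'] = np.log(transformed['PX_LAST'])
    return transformed

Such a function would then be passed as `transform_numerical=log_px_last` when constructing the dataset.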
Example #3
def get_categorical_processor(data_path: str,
                              features: List[str],
                              save_path: str = None) -> Preprocessor:
    """Load a data set saved as a pickle and fit a CategoricalPreprocessor

    :param data_path: Path to the data on which the preprocessor will be fitted
    :param features: List of categorical features to preprocess
    :param save_path: Path where to save the parameters of the preprocessor
    :return: fitted categorical preprocessor
    """
    train_data: DataFrame = file_utils.pickle2dataframe(data_path)
    train_data = train_data[features].dropna()
    encoder = CategoricalPreprocessor()
    encoder.fit(train_data)
    if save_path:
        encoder.save(save_path)
    return encoder
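
A hypothetical call, with the path and feature names chosen only for illustration:

# Fit the categorical encoder on the training pickle and persist its parameters
encoder = get_categorical_processor(
    data_path='train.pkl',
    features=['Sector', 'Currency'],
    save_path='categorical_params.pkl')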
Example #4
def __init__(self,
             path,
             numerical_features=None,
             categorical_features=None,
             output_features=None,
             transform_numerical=None,
             transform_categorical=None):
    self.numerical_features = numerical_features
    self.categorical_features = categorical_features
    self.output_features = output_features
    self.transform_numerical = transform_numerical
    self.transform_categorical = transform_categorical
    # Union of every feature group; used to select columns in load_data
    self.all_features = self.get_all_features(numerical_features,
                                              categorical_features,
                                              output_features)
    self.raw_data = file_utils.pickle2dataframe(path)
    self.data = self.load_data(path, transform_numerical,
                               transform_categorical)
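
The constructor relies on `self.get_all_features`, which is not part of the excerpt. A plausible sketch, assuming it merely concatenates the three feature groups while skipping the ones left as `None`:

@staticmethod
def get_all_features(numerical_features, categorical_features, output_features):
    # Collect every configured feature, treating a missing group as empty
    all_features = []
    for group in (numerical_features, categorical_features, output_features):
        if group:
            all_features.extend(group)
    return all_features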
Example #5
def cli(sys_argv: List[str]):
    """Command line interface to split the full data set into train, validation and test sets

    :param sys_argv: list of command line arguments
    """
    data = file_utils.pickle2dataframe(FULL_DATA_PATH)
    data = data[data.PX_LAST > 0.]
    data['LOG_PX_LAST'] = np.log(data.PX_LAST)

    valuation_dates = data['ValuationDate']
    valuation_years = [dates.to_pydatetime().year for dates in valuation_dates]
    valuation_months = [dates.to_pydatetime().month for dates in valuation_dates]

    years = set(valuation_years)
    months = set(valuation_months)

    data['ValuationYear'] = valuation_years
    data['ValuationMonth'] = valuation_months

    data = data[data.ValuationYear == 2017]

    train_months = list(range(1, 9))
    valid_months = [9, 10]
    test_months = [11, 12]

    all_index = data.index

    # remove rows with missing values
    data_complete = data.copy().dropna()

    # split data
    train_data = data_complete[data_complete.ValuationMonth.isin(train_months)]
    valid_data = data_complete[data_complete.ValuationMonth.isin(valid_months)]
    test_data = data_complete[data_complete.ValuationMonth.isin(test_months)]

    # data_missing contains only rows with missing data
    missing_index = [i for i in all_index if i not in data_complete.index]
    data_missing = data.loc[missing_index]
    train_data_missing = data_missing[data_missing.ValuationMonth.isin(train_months)]

    # save the data to pickles
    file_utils.dataframe2pickle(train_data, constants.TRAIN_PATH)
    file_utils.dataframe2pickle(valid_data, constants.VALID_PATH)
    file_utils.dataframe2pickle(test_data, constants.TEST_PATH)
    file_utils.dataframe2pickle(train_data_missing, constants.TRAIN_PATH_MISSING)
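
As a side note on the index bookkeeping above, the list comprehension that rebuilds `missing_index` scans `data_complete.index` once per row; pandas can compute the same set difference directly. The resulting row set is identical, although `Index.difference` returns it sorted:

# Vectorized equivalent of the comprehension over all_index
missing_index = data.index.difference(data_complete.index)
data_missing = data.loc[missing_index]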
Example #6
def get_numerical_processor(data_path: str,
                            features: List[str],
                            scale: Tuple[int, int],
                            apply_log: bool,
                            save_path: str = None) -> Preprocessor:
    """Load a training set saved as a pickle and train a NumericalPreprocessor

    :param data_path: Path to the data on which the preprocessor will be fitted
    :param features: List of numerical features to preprocess
    :param scale: See `preprocessors.NumericalPreprocessor` docstring
    :param apply_log: See `preprocessors.NumericalPreprocessor` docstring
    :param save_path: Path where to save the parameters of the preprocessor
    :return: fitted numerical preprocessor
    """
    train_data = file_utils.pickle2dataframe(data_path)
    train_data = train_data[features]
    normalizer = NumericalPreprocessor(scale, apply_log)
    normalizer.fit(train_data)
    if save_path:
        normalizer.save(save_path)
    return normalizer
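
A hypothetical call, with the path, feature names and scale chosen only for illustration:

# Fit the numerical scaler on the training pickle and persist its parameters
normalizer = get_numerical_processor(
    data_path='train.pkl',
    features=['PX_LAST', 'Volume'],
    scale=(0, 1),
    apply_log=True,
    save_path='numerical_params.pkl')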
Example #7
def cli(sys_argv: List[str]):
    """Command line interface to train the models

    :param sys_argv: list of command line arguments
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--mse_loss', action='store_true')

    parser.add_argument(
        '--writer_path',
        type=str,
        default=None,
        help='Path to the pickled writer. Use this option if visualizing losses')

    parser.add_argument('--price_predictions', action='store_true')

    parser.add_argument(
        '--target_data',
        type=str,
        default=None,
        help='Path to the pickled target data set. Use this option if visualizing predictions')

    parser.add_argument(
        '--target_feature',
        type=str,
        default=None,
        help='Name of the target feature. Use this option if visualizing predictions')

    parser.add_argument(
        '--pred',
        type=str,
        default=None,
        help='Path to the pickled predictions. Use this option if visualizing predictions')

    parser.add_argument('--output_dir',
                        type=str,
                        default=RESULTS_DIR,
                        help='Path to the directory where to save results')

    parser.add_argument('--output_filename',
                        type=str,
                        help='Name of the figure')

    args = parser.parse_args(sys_argv)

    save_path = '{}/figure_{}'.format(args.output_dir, args.output_filename)

    if args.mse_loss:
        writer: Dict[str, List[float]] = file_utils.load_pickle(args.writer_path)
        print(' [visualize] Saving MSE figure at `{}`'.format(save_path))
        plot_utils.plot_train_valid_rmse_loss(writer['train_loss'],
                                              writer['valid_loss'], save_path)
        print(' [visualize] Done')
        return

    if args.price_predictions:
        dataframe = file_utils.pickle2dataframe(args.target_data)
        targets = dataframe[args.target_feature]
        predictions = file_utils.load_pickle(args.pred)

        plot_utils.scatter_plot_predictions(predictions, targets, save_path)
        return

    exit(
        'You need to specify `--mse_loss` or `--price_predictions` in command line arguments'
    )
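
Two hypothetical invocations of this CLI, one per branch; every path and the `PX_LAST` feature name are placeholders:

# Plot the training/validation RMSE curves stored in a pickled writer dict
cli(['--mse_loss', '--writer_path', 'writer.pkl',
     '--output_filename', 'rmse.png'])

# Scatter-plot pickled predictions against the target column of a data set
cli(['--price_predictions', '--target_data', 'test.pkl',
     '--target_feature', 'PX_LAST', '--pred', 'predictions.pkl',
     '--output_filename', 'predictions.png'])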