Example #1
def start_full_training_run(train_df, label, params, verbose_eval,
                            num_boost_round, early_stopping_rounds,
                            output_filepath):
    """"
    Starts a full training run with the provided parameters.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param verbose_eval: The interval at which training information is printed
    to the console.
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    """
    with timer("Building model and start training"):
        train_lgb_df = lgb.Dataset(data=train_df, label=label)
        valid_sets = [train_lgb_df]
        lgbm_model = lgb.train(params=params,
                               train_set=train_lgb_df,
                               num_boost_round=num_boost_round,
                               valid_sets=valid_sets,
                               verbose_eval=verbose_eval,
                               early_stopping_rounds=early_stopping_rounds)
    with timer("Saving trained model"):
        save_model(output_filepath, lgbm_model)
def main(data_dir, output_filepath):
    """ Runs data feature engineering scripts to turn interim data from
        (../interim) into data which is ready for usage in ML models
        (saved in ../processed).
        :param data_dir: Directory that contains the data
        :param output_filepath: Directory where processed results will be saved.
    """
    with open("src/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    with timer("Loading interim data"):
        train_df, test_df = load_interim_data(data_dir + "/interim")

    train_df, test_df = build_features(train_df, test_df, cfg=cfg)

    if cfg["exclude_faulty_rows"]:
        with timer("Exclude faulty data and outliers"):
            train_df = exclude_faulty_readings(train_df, data_dir + "/external")

    if cfg["add_leaks_to_train"]:
        with timer("Adding Leak Label to training set"):
            train_df = add_leaked_data(train_df, test_df)

    with timer("Sort training set"):
        train_df.sort_values("timestamp", inplace=True)
        train_df.reset_index(drop=True, inplace=True)

    with timer("Save processed data"):
        save_processed_data(output_filepath, train_df, test_df)
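
The timer helper used as a context manager throughout these examples is not shown. A minimal sketch, assuming it simply logs the elapsed wall time of the wrapped block (note that Example #12 below uses a timer with a different, function-style signature from another codebase):

import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # Plausible stand-in for the helper used in these snippets, not the
    # original implementation: print how long the wrapped block took.
    start = time.time()
    try:
        yield
    finally:
        print("[{}] done in {:.1f} s".format(name, time.time() - start))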
Example #3
def start_cv_run(train_df, label, params, splits, verbose_eval,
                 early_stopping_rounds, output_filepath):
    """
    Starts a Cross Validation Run with the parameters provided.
    Scores will be documented and models will be saved.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param splits: Integer describing the number of folds / splitting fraction.
    :param verbose_eval: The interval at which training information is printed
    to the console.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    """
    output_filepath = output_filepath + "_cv"
    with timer("Performing " + str(splits) + " fold cross-validation"):
        kf = KFold(n_splits=splits, shuffle=False)
        for i, (train_index,
                test_index) in enumerate(kf.split(train_df, label)):
            with timer("~~~~ Fold %d of %d ~~~~" % (i + 1, splits)):
                x_train, x_valid = train_df.iloc[train_index], train_df.iloc[
                    test_index]
                y_train, y_valid = label[train_index], label[test_index]
                cat_features = list(
                    x_train.select_dtypes(include=['category']).columns)

                ctb_model = ctb.CatBoostRegressor(**params)
                ctb_model.fit(x_train,
                              y_train,
                              cat_features=cat_features,
                              eval_set=(x_valid, y_valid),
                              verbose_eval=verbose_eval,
                              early_stopping_rounds=early_stopping_rounds)
                save_model(output_filepath, ctb_model)
def start_full_training_run(train_df, label, params, num_boost_round,
                            early_stopping_rounds, output_filepath):
    """"
    Starts a full training run with the provided parameters.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    """
    with timer("Building model and start training"):
        train_dmatrix = xgb.DMatrix(data=train_df, label=label)
        evals = [(train_dmatrix, 'eval')]
        verbose_eval = True
        xgb_model = xgb.train(params=params,
                              dtrain=train_dmatrix,
                              num_boost_round=num_boost_round,
                              evals=evals,
                              verbose_eval=verbose_eval,
                              early_stopping_rounds=early_stopping_rounds)

    with timer("Saving trained model"):
        save_model(output_filepath, xgb_model)
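
save_model is likewise not shown in these snippets. A hedged sketch, assuming it writes the model to a timestamped file inside the given directory; LightGBM Boosters, XGBoost Boosters and CatBoost models all expose a save_model(path) method, so one helper can cover every framework used here:

import os
import time

def save_model(output_filepath, model):
    # Hypothetical helper; the directory layout and file naming are
    # assumptions, not the original implementation.
    os.makedirs(output_filepath, exist_ok=True)
    filename = time.strftime("%Y%m%d-%H%M%S") + ".model"
    model.save_model(os.path.join(output_filepath, filename))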
Example #5
def start_full_by_building_run(train_df, label, params, splits, verbose_eval,
                               num_boost_round, early_stopping_rounds,
                               output_filepath):
    """
    Trains a model for each of the buildings. Expect a long wall time, as
    there are more than 1,000 buildings.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param splits: Integer describing the number of folds / splitting fraction.
    :param verbose_eval: The interval at which training information is printed
    to the console.
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    """
    output_main_dir = output_filepath + "_by_building"
    train_df["label"] = label
    train_df = train_df.drop(columns=["site_id"])
    train_df = train_df.groupby("building_id")
    buildings = [name for name, _ in train_df]

    for b in buildings:
        click.echo("Starting training for Building " + str(b) + ".")
        train_by_building = train_df.get_group(b)
        train_by_building = train_by_building.reset_index(drop=True)
        label = train_by_building["label"]
        train_by_building = train_by_building.drop(
            columns=["building_id", "label"])
        with timer("Performing " + str(splits) + " fold cross-validation on building " \
                   + str(b)):
            kf = KFold(n_splits=splits, shuffle=False)
            for i, (train_index,
                    test_index) in enumerate(kf.split(train_by_building,
                                                      label)):
                with timer("~~~~ Fold %d of %d ~~~~" % (i + 1, splits)):
                    x_train, x_valid = train_by_building.iloc[
                        train_index], train_by_building.iloc[test_index]
                    y_train, y_valid = label[train_index], label[test_index]

                    train_lgb_df = lgb.Dataset(data=x_train, label=y_train)
                    valid_lgb_df = lgb.Dataset(data=x_valid, label=y_valid)

                    valid_sets = [train_lgb_df, valid_lgb_df]
                    evals_result = dict()
                    lgbm_model = lgb.train(
                        params=params,
                        train_set=train_lgb_df,
                        num_boost_round=num_boost_round,
                        valid_sets=valid_sets,
                        valid_names=["train_loss", "eval"],
                        verbose_eval=verbose_eval,
                        evals_result=evals_result,
                        early_stopping_rounds=early_stopping_rounds)
                    output_filepath = output_main_dir + "/" + str(b)
                    save_model(output_filepath, lgbm_model)
Example #6
    def setTimer(self, timer=None, value=0, startTimer=False):
        if timer is not None:
            if isinstance(value, t.timer):
                self.__timers[timer] = value
            elif isinstance(value, (float, int)):
                self.__timers[timer] = t.timer(value)
            else:
                self.__timers[timer] = t.timer(0)

            self.__timers[timer].reset(startTimer)
            return self.__timers[timer]
        return None
Example #7
def main(data_dir, output_dir):
    """
    Runs data processing scripts to turn raw data (data_dir/raw) and external
    data (data_dir/external) into cleaned data ready for feature engineering
    (saved in output_dir).
    :param data_dir: Directory that contains the raw data
    :param output_dir: Directory where results will be saved.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')
    with open("src/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    with timer("Loading data"):
        train_df = load_main_csv(data_dir + "/raw/train.csv")
        test_df = load_main_csv(data_dir + "/raw/test.csv")
        weather_train_df = load_weather_csv(data_dir +
                                            "/raw/weather_train.csv")
        weather_test_df = load_weather_csv(data_dir + "/raw/weather_test.csv")
        building_df = load_building_csv(data_dir +
                                        "/raw/building_metadata.csv")
        site_df = load_site_csv(data_dir + "/external/site_info.csv")

    with timer("Merging main and building"):
        train_df = train_df.merge(building_df, on="building_id", how="left")
        test_df = test_df.merge(building_df, on="building_id", how="left")

    if cfg["include_feels_like"]:
        with timer("Create feels_like_temp"):
            weather_train_df = create_feels_like(weather_train_df)
            weather_test_df = create_feels_like(weather_test_df)

    if cfg["impute_weather_data"]:
        with timer("Impute missing weather data"):
            weather_train_df = impute_weather_data(weather_train_df)
            weather_test_df = impute_weather_data(weather_test_df)

    with timer("Merging weather and site"):
        weather_train_df = weather_train_df.merge(site_df,
                                                  on="site_id",
                                                  how="left")
        weather_test_df = weather_test_df.merge(site_df,
                                                on="site_id",
                                                how="left")

    if cfg["localize_timestamps"]:
        with timer("Localizing weather timestamp"):
            weather_train_df = localize_weather_timestamp(weather_train_df)
            weather_test_df = localize_weather_timestamp(weather_test_df)

    with timer("Merging main and weather"):
        train_df = train_df.merge(weather_train_df,
                                  on=["site_id", "timestamp"],
                                  how="left")
        test_df = test_df.merge(weather_test_df,
                                on=["site_id", "timestamp"],
                                how="left")

    with timer("Saving cleansed data"):
        save_joined_data(train_df, test_df, output_dir)
Example #8
def main(mode, input_filepath, output_filepath):
    """
    Collects prepared data and starts training a CatBoost model. Parameters
    can be specified by editing src/config.yml.
    :param mode: Specifies mode to run. Now only cv (cross validation)
    is supported.
    :param input_filepath: Directory that contains the processed data.
    :param output_filepath: Directory that will contain the trained models.
    """
    random.seed(1337)
    with timer("Loading processed training data"):
        train_df, label = load_processed_training_data(input_filepath)

    ###########################################################################
    # DEFINE PARAMETERS FOR THE CATBOOST MODEL                               #
    ###########################################################################
    with open("src/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    params = cfg["ctb_params"]
    early_stopping_rounds = cfg["ctb_early_stopping_rounds"]
    splits = cfg["ctb_splits_for_cv"]
    verbose_eval = cfg["ctb_verbose_eval"]
    ###########################################################################

    if mode == "cv":
        start_cv_run(train_df, label, params, splits, verbose_eval,
                     early_stopping_rounds, output_filepath)
    else:
        raise ValueError("Choose a valid mode: 'cv'")
def train_ctb_model(mode, input_filepath, output_filepath, cfg):
    """
    Collects prepared data and starts training a CatBoost model.
    :param mode: Specifies mode to run. Now only cv (cross validation) is supported.
    :param input_filepath: Directory that contains the processed data.
    :param output_filepath: Directory that will contain the trained models.
    :param cfg: Config read from src/config.yml.
    """
    with timer("Loading processed training data"):
        train_df, label = load_processed_training_data(input_filepath, cfg["columns"])

    ###########################################################################
    # DEFINE PARAMETERS FOR THE CATBOOST MODEL                               #
    ###########################################################################
    params = cfg["ctb_params"]
    early_stopping_rounds = cfg["ctb_early_stopping_rounds"]
    splits = cfg["ctb_splits_for_cv"]
    verbose_eval = cfg["ctb_verbose_eval"]
    ###########################################################################

    if mode == "cv":
        start_cv_run(train_df, label, params, splits, verbose_eval,
                     early_stopping_rounds, output_filepath)
    else:
        raise ValueError("Choose a valid mode: 'cv'")
Example #10
def create_submission_file(submission_path,
                           row_ids,
                           predictions,
                           use_leaks=False):
    """
    Creates a submission file which fulfills the upload conditions for the
    Kaggle challenge.
    :param submission_path: The path for the submission CSV file
    :param row_ids: A vector with the matching row ids for the predicted labels
    :param predictions: Vector containing the predicted labels for the test data
    :param use_leaks: Indicates if leaks will be added to the submission or not
    """
    if use_leaks:
        with timer("Adding leaks to submission file"):
            predictions = add_leaks_to_submission(predictions)

    submission = pd.DataFrame({
        "row_id": row_ids,
        "meter_reading": predictions
    })

    validate_submission(submission)

    submission_dir = os.path.dirname(submission_path)
    os.makedirs(submission_dir, exist_ok=True)
    submission.to_csv(submission_path, index=False)
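
A possible invocation, assuming row_ids and predictions come from one of the predict_* helpers shown in the later examples; the submission path here is illustrative only:

create_submission_file("data/submissions/submission.csv",
                       row_ids, predictions, use_leaks=False)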
Example #11
def start_cv_run(train_df, label, params, splits, verbose_eval,
                 num_boost_round, early_stopping_rounds, output_filepath):
    """
    Starts a Cross Validation Run with the parameters provided.
    Scores will be documented and models will be saved.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param splits: Integer describing the number of folds / splitting fraction.
    :param verbose_eval: The interval at which training information is printed
    to the console.
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    """
    output_filepath = output_filepath + "_cv"
    cv_results = []
    with timer("Performing " + str(splits) + " fold cross-validation"):
        kf = KFold(n_splits=splits, shuffle=False)
        for i, (train_index,
                test_index) in enumerate(kf.split(train_df, label)):
            with timer("~~~~ Fold %d of %d ~~~~" % (i + 1, splits)):
                x_train, x_valid = train_df.iloc[train_index], train_df.iloc[
                    test_index]
                y_train, y_valid = label[train_index], label[test_index]

                train_lgb_df = lgb.Dataset(data=x_train, label=y_train)
                valid_lgb_df = lgb.Dataset(data=x_valid, label=y_valid)

                valid_sets = [train_lgb_df, valid_lgb_df]
                evals_result = dict()
                lgbm_model = lgb.train(
                    params=params,
                    train_set=train_lgb_df,
                    num_boost_round=num_boost_round,
                    valid_sets=valid_sets,
                    valid_names=["train_loss", "eval"],
                    verbose_eval=verbose_eval,
                    evals_result=evals_result,
                    early_stopping_rounds=early_stopping_rounds)
                save_model(output_filepath, lgbm_model)

                cv_results.append(evals_result)
        evaluate_cv_results(cv_results)
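
evaluate_cv_results is not shown. A minimal sketch, assuming each evals_result dict has the shape produced by lgb.train above with valid_names=["train_loss", "eval"] and that the configured metric is rmse:

import numpy as np

def evaluate_cv_results(cv_results):
    # Hypothetical fold summary; the real implementation is not part of
    # these snippets.
    best_scores = [min(res["eval"]["rmse"]) for res in cv_results]
    print("Best RMSE per fold:", ["%.4f" % s for s in best_scores])
    print("Mean RMSE: %.4f (+/- %.4f)" % (np.mean(best_scores),
                                          np.std(best_scores)))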
Example #12
def main():
    display_surface = timer('Initialisation', initialise_pygame)

    ProgramVariables.game_state = GameState.RUNNING

    running_loop(display_surface)

    pygame.quit()
    sys.exit()
Example #13
def predict_with_lgbm_meter(test_df, row_ids, model_filepath):
    """"
    Takes a given directory which contains four models (one for each
    meter type) and then predicts the rows with the respective model
    :param test_df: DataFrame containing the test data
    :param row_ids: A vector with the matching row ids for the predicted labels
    :param model_filepath: Directory that contains the trained model
    :return: Vector containing the predicted labels for the test data
    """

    with timer("Loading models in directory" + model_filepath):
        models_in_dir = sorted(os.listdir(model_filepath))
        test_by_meter = []
        row_id_by_meter = []
        for i in range(4):
            is_meter = test_df["meter"] == i
            test_temp = test_df[is_meter]
            row_temp = row_ids[is_meter]
            test_by_meter.append(test_temp)
            row_id_by_meter.append(row_temp)

    predictions = []
    row_ids_prediction = []
    with timer("Predicting values"):
        for model, test, row in zip(models_in_dir, test_by_meter,
                                    row_id_by_meter):
            del test["meter"]
            lgbm_model = lgb.Booster(model_file=model_filepath + "/" + model)

            predictions_current = lgbm_model.predict(test)
            predictions.extend(list(np.expm1(predictions_current)))
            row_ids_prediction.extend(row)

    # Order the predictions by merging them to the original row ids
    pred_df = pd.DataFrame({"row_id": row_ids_prediction, "pred": predictions})
    row_ids_df = pd.DataFrame({"true_row_ids": row_ids})
    pred_ordered_df = row_ids_df.merge(pred_df,
                                       left_on="true_row_ids",
                                       right_on="row_id",
                                       how="left")
    predictions = pred_ordered_df["pred"].copy(deep=True)
    predictions[predictions < 0] = 0
    return predictions
Example #14
def predict_with_ctb(test_df, row_ids, model_filepath):
    """
    Loads the specified model and predicts the target variable, which is
    returned as a list.
    :param test_df: DataFrame containing the test data
    :param row_ids: A vector with the matching row ids for the predicted labels
    :param model_filepath: Directory that contains the trained model
    :return: Vector containing the predicted labels for the test data
    """
    if os.path.isdir(model_filepath):
        click.echo("Loading models in directory" + model_filepath)
        models_in_dir = os.listdir(model_filepath)
        num_models = len(models_in_dir)
        predictions = np.zeros(len(row_ids))

        for i, model in enumerate(models_in_dir, start=1):
            with timer("Loading model [" + str(i) + "/" + str(num_models) +
                       "]"):
                ctb_model = ctb.CatBoostRegressor()
                ctb_model.load_model(model_filepath + "/" + model)

            with timer("Predicting values [" + str(i) + "/" + str(num_models) +
                       "]"):
                predictions_current = ctb_model.predict(test_df)
                predictions += np.expm1(predictions_current)

        predictions = predictions / num_models
        predictions[predictions < 0] = 0
        return predictions

    else:
        with timer("Loading model " + model_filepath):
            ctb_model = ctb.CatBoostRegressor()
            ctb_model.load_model(model_filepath)

        with timer("Predicting values"):
            predictions = ctb_model.predict(test_df)
            # Invert log and set possible neg. values to 0
            predictions = np.expm1(predictions)
        predictions[predictions < 0] = 0
        return predictions
Example #15
def predict_with_xgb(test_df, model_filepath):
    """
    Loads the specified model and predicts the target variable, which is
    returned as a list.
    :param test_df: DataFrame containing the test data
    :param model_filepath: Directory that contains the trained model
    :return: Vector containing the predicted labels for the test data
    """
    test_dmatrix = xgb.DMatrix(test_df)
    del test_df

    with timer("Loading model " + model_filepath):
        xgb_model = xgb.Booster()
        xgb_model.load_model(model_filepath)

    with timer("Predicting values"):
        predictions = xgb_model.predict(test_dmatrix)
        # Invert log and set possible neg. values to 0
        predictions = np.expm1(predictions)
        predictions[predictions < 0] = 0
    return predictions
def start_cv_run(train_df, label, params, num_boost_round,
                 early_stopping_rounds):
    """
    Starts a Cross Validation Run with the parameters provided.
    Scores will be documented and models will be saved.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    """
    cv_results = []
    splits = 5
    with timer("Performing " + str(splits) + " fold cross-validation"):
        kf = KFold(n_splits=splits, shuffle=True, random_state=1337)
        for i, (train_index,
                test_index) in enumerate(kf.split(train_df, label)):
            with timer("~~~~ Fold %d of %d ~~~~" % (i + 1, splits)):
                x_train, x_valid = train_df.iloc[train_index], train_df.iloc[
                    test_index]
                y_train, y_valid = label[train_index], label[test_index]

                train_dmatrix = xgb.DMatrix(x_train, y_train)
                valid_dmatrix = xgb.DMatrix(x_valid, y_valid)

                evals = [(train_dmatrix, 'train_loss'),
                         (valid_dmatrix, 'eval')]
                verbose_eval = True
                evals_result = dict()
                xgb_model = xgb.train(
                    params=params,
                    dtrain=train_dmatrix,
                    num_boost_round=num_boost_round,
                    evals=evals,
                    verbose_eval=verbose_eval,
                    evals_result=evals_result,
                    early_stopping_rounds=early_stopping_rounds)
                cv_results.append(evals_result)
        evaluate_xgb_cv_results(cv_results)
Example #17
def main(input_filepath, model_type, model_path):
    """
    Loads a trained model and testing data to create a submission file which is
    ready for uploading to the Kaggle challenge.
    :param input_filepath: Directory that contains the processed data
    :param model_type: Choose according to the boosting framework (xgb, lgbm,
    ctb) and whether prediction is done by meter or by building.
    :param model_path: Directory that contains the trained model
    """
    with open("src/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    with timer("Loading testing data"):
        test_df = pd.read_pickle(input_filepath + "/test_data.pkl")

    row_ids = test_df["row_id"]
    del test_df["row_id"]

    if model_type == "xgb":
        predictions = predict_with_xgb(test_df, model_path)

    elif model_type == "lgbm":
        predictions = predict_with_lgbm(test_df, row_ids, model_path)

    elif model_type == "ctb":
        predictions = predict_with_ctb(test_df, row_ids, model_path)

    elif model_type == "lgbm_meter":
        predictions = predict_with_lgbm_meter(test_df, row_ids, model_path)

    elif model_type == "lgbm_building":
        predictions = predict_with_lgbm_building(test_df, row_ids, model_path)

    else:
        raise ValueError(model_type +
                         " is not a valid model type to predict from")

    with timer("Creating submission file"):
        create_submission_file(row_ids, predictions, cfg["use_leaks"])
Example #18
def start_full_by_meter_run(train_df, label, params, verbose_eval,
                            num_boost_round, early_stopping_rounds,
                            output_filepath):
    """
    Divides the data into the four meter types and trains a model on each one.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param verbose_eval: The interval at which training information is printed
    to the console.
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    """
    output_filepath = output_filepath + "_by_meter"
    train_by_meter = []
    label_by_meter = []
    for i in range(4):
        is_meter = train_df["meter"] == i
        train_temp = train_df[is_meter]
        label_temp = label[is_meter]
        train_by_meter.append(train_temp)
        label_by_meter.append(label_temp)

    with timer("Building models and start training"):
        for (train, label) in zip(train_by_meter, label_by_meter):
            del train["meter"]
            train_lgb_df = lgb.Dataset(data=train, label=label)
            valid_sets = [train_lgb_df]
            lgbm_model = lgb.train(params=params,
                                   train_set=train_lgb_df,
                                   num_boost_round=num_boost_round,
                                   valid_sets=valid_sets,
                                   verbose_eval=verbose_eval,
                                   early_stopping_rounds=early_stopping_rounds)
            with timer("Saving trained model"):
                save_model(output_filepath, lgbm_model)
Example #19
def load_processed_test_data(input_filepath, columns):
    """
    Loads processed data and returns a DataFrame together with a separate
    row_id vector.
    :param input_filepath: Directory that contains the processed data.
    :param columns: The list of columns to load.
    :return: Tuple with the test data and a vector with the matching row ids.
    """
    test_df = pd.read_pickle(input_filepath + "/test_data.pkl")
    row_ids = test_df["row_id"]

    with timer("Dropping unnecessary columns"):
        test_df = drop_unnecessary_columns(test_df, columns)

    return test_df, row_ids
Example #20
def load_processed_training_data(input_filepath, columns):
    """
    Loads processed data and returns a DataFrame together with a separate
    label vector.
    :param input_filepath: Directory that contains the processed data.
    :param columns: The list of columns to load.
    :return: Tuple with the training data and a vector with the matching labels.
    """
    train_df = pd.read_pickle(input_filepath + "/train_data.pkl")
    label = np.log1p(train_df["meter_reading"])

    with timer("Dropping unnecessary columns"):
        train_df = drop_unnecessary_columns(train_df, columns)

    return train_df, label
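
Because the label is log1p-transformed here, the predict_with_* helpers above apply np.expm1 to the raw model outputs to get back to the original meter_reading scale; the two functions are exact inverses:

import numpy as np

# Round trip: expm1 undoes the log1p applied to the training labels.
readings = np.array([0.0, 1.0, 250.3])
assert np.allclose(np.expm1(np.log1p(readings)), readings)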
Example #21
def main(mode, input_filepath, output_filepath):
    """
    Collects prepared data and starts training a LightGBM model. Parameters
    can be specified by editing src/config.yml.
    :param mode: Specifies mode to run. Options are full (no validation set,
    single fold), cv (cross validation), by_meter (training by meter type),
    by_building (training by building id).
    :param input_filepath: Directory that contains the processed data.
    :param output_filepath: Directory that will contain the trained models.
    """
    random.seed(1337)
    with timer("Loading processed training data"):
        train_df, label = load_processed_training_data(input_filepath)

    ###########################################################################
    # DEFINE PARAMETERS FOR THE LGBM MODEL                                     #
    ###########################################################################
    with open("src/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    params = cfg["lgbm_params"]
    num_boost_round = cfg["lgbm_num_boost_round"]
    early_stopping_rounds = cfg["lgbm_early_stopping_rounds"]
    splits = cfg["lgbm_splits_for_cv"]
    verbose_eval = cfg["lgbm_verbose_eval"]
    ###########################################################################

    if mode == "full":
        start_full_training_run(train_df, label, params, verbose_eval,
                                num_boost_round, early_stopping_rounds,
                                output_filepath)
    elif mode == "cv":
        start_cv_run(train_df, label, params, splits, verbose_eval,
                     num_boost_round, early_stopping_rounds, output_filepath)
    elif mode == "by_meter":
        start_full_by_meter_run(train_df, label, params, verbose_eval,
                                num_boost_round, early_stopping_rounds,
                                output_filepath)
    elif mode == "by_building":
        start_full_by_building_run(train_df, label, params, splits,
                                   verbose_eval, num_boost_round,
                                   early_stopping_rounds, output_filepath)
    else:
        raise ValueError("Choose a valid mode: 'full', 'cv'")
def train_xgb_model(mode, input_filepath, output_filepath, cfg):
    """
    Collects prepared data and starts training an XGBoost model. Keep in mind
    that XGBoost does not accept NA values, so the corresponding option to set
    these to zero in the preprocessing steps has to be enabled.
    :param mode: Specifies mode to run. Options are full (no validation set, single fold) and cv (cross validation).
    :param input_filepath: Directory that contains the processed data.
    :param output_filepath: Directory that will contain the trained models.
    :param cfg: Config read from src/config.yml.
    """
    with timer("Loading processed training data"):
        train_df, label = load_processed_training_data(input_filepath,
                                                       cfg["columns"])

    ###########################################################################
    # DEFINE PARAMETERS FOR THE XGB MODEL                                     #
    ###########################################################################

    params = {
        "objective": "reg:squarederror",
        "tree_method": "exact",
        "eval_metric": "rmse",
        "booster": "gbtree",
        "verbosity": "1",
    }

    num_boost_round = 5
    early_stopping_rounds = 2

    ###########################################################################

    if mode == "full":
        start_full_training_run(train_df, label, params, num_boost_round,
                                early_stopping_rounds, output_filepath)

    elif mode == "cv":
        start_cv_run(train_df, label, params, num_boost_round,
                     early_stopping_rounds)

    else:
        raise ValueError(
            "Choose a valid mode: 'full' for a full training run or 'cv' "
            "for cross validation")
Example #23
def train_lgbm_model(mode, input_filepath, output_filepath, cfg):
    """
    Collects prepared data and starts training a LightGBM model.
    :param mode: Specifies mode to run. Options are full (no validation set, single fold), cv (cross validation),
    by_meter (training by meter type), by_building (training by building id).
    :param input_filepath: Directory that contains the processed data.
    :param output_filepath: Directory that will contain the trained models.
    :param cfg: Config read from src/config.yml.
    """
    with timer("Loading processed training data"):
        train_df, label = load_processed_training_data(input_filepath,
                                                       cfg["columns"])

    params = cfg["lgbm_params"]
    num_boost_round = cfg["lgbm_num_boost_round"]
    early_stopping_rounds = cfg["lgbm_early_stopping_rounds"]
    splits = cfg["lgbm_splits_for_cv"]
    verbose_eval = cfg["lgbm_verbose_eval"]
    grouped_on_building = cfg["lgbm_cv_grouped_on_building"]
    ###########################################################################

    if mode == "full":
        start_full_training_run(train_df, label, params, verbose_eval,
                                num_boost_round, early_stopping_rounds,
                                output_filepath)
    elif mode == "cv":
        start_cv_run(train_df, label, params, splits, verbose_eval,
                     num_boost_round, early_stopping_rounds, output_filepath,
                     grouped_on_building)
    elif mode == "by_meter":
        start_full_by_meter_run(train_df, label, params, verbose_eval,
                                num_boost_round, early_stopping_rounds,
                                output_filepath)
    elif mode == "by_building":
        start_full_by_building_run(train_df, label, params, splits,
                                   verbose_eval, num_boost_round,
                                   early_stopping_rounds, output_filepath)
    else:
        raise ValueError("Choose a valid mode: 'full', 'cv'")
Example #24
    def __init__(self,
                 gravity=(0, 500),
                 fps=30,
                 parent=None,
                 threaded=False,
                 background_color=None,
                 resource_ref_name=None,
                 qApp=None):
        QtGui.QGraphicsScene.__init__(self, parent=parent)
        pm.Space.__init__(self, threaded=threaded)
        bp.blueprint.__init__(self, 'ch_space')

        self.__objs = []

        self.__timerManager = timerManager()
        self.timerManager = self.getTimerManager

        self.__eventManager = bp.eventManager()
        self.eventManager = self.getEventManager

        self.gravity = pm.Vec2d(gravity)

        self.__app = qApp or app

        self.__bodies = {}

        self.__fps = 0
        self.__hz = 0
        self.setFPS(fps)

        self.__running = False
        self.__timer = timer()
        self.__timer.reset(self.__running)

        self.setBackgroundColor(color=background_color)

        self.set_ref_name(resource_ref_name)
        self.add_self_to_catalog()
Example #25
def main():
    """
    Conducts a hyperparameter search using the hyperopt package. Parameters
    can be defined in the respective script. The results are saved in
    data/hyperopt.
    """
    ################################################################################
    # SET PARAMETER FOR SEARCH HERE
    ################################################################################
    params_hyperopt = {
        "num_leaves": scope.int(hp.quniform("num_leaves", 5, 4096, 10)),
        "min_data_in_leaf":
        scope.int(hp.quniform("min_data_in_leaf", 10, 50, 1)),
        "feature_fraction": hp.uniform("feature_fraction", 0.4, 1.0),
        "min_split_gain": hp.uniform("min_split_gain", 0, 0.5),
        "bagging_fraction": hp.uniform("bagging_fraction", 0.4, 1.0),
        "reg_alpha": hp.uniform("reg_alpha", 0.0, 4.0),
        "reg_lambda": hp.uniform("reg_lambda", 0.0, 4.0),
    }

    params_static = {
        "learning_rate": 0.05,
        "num_threads": 20,
        "device_type": "cpu",
        "verbosity": -1
    }

    splits = 5
    max_evals = 200

    ################################################################################
    with timer("Loading processed training data"):
        train_df, label = load_processed_training_data("data/processed")
        train_set = lgb.Dataset(train_df, label)

    def objective_function(params):
        kf = KFold(n_splits=splits, shuffle=False)
        final_params = dict(params, **params_static)
        cv_results = lgb.cv(final_params,
                            train_set,
                            folds=kf,
                            num_boost_round=10000,
                            early_stopping_rounds=50,
                            metrics="rmse",
                            seed=1337)

        score = min(cv_results["rmse-mean"])
        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best_param = fmin(objective_function,
                      params_hyperopt,
                      algo=tpe.suggest,
                      max_evals=max_evals,
                      trials=trials,
                      rstate=np.random.RandomState(1337))

    print("The search proposes these hyperparameters:")
    print(best_param)

    os.makedirs("data/hyperopt/lgbm", exist_ok=True)
    with open("data/hyperopt/lgbm/best_param.pkl", "wb") as handle:
        pickle.dump(best_param, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open("data/hyperopt/lgbm/trials.pkl", "wb") as handle:
        pickle.dump(trials, handle, protocol=pickle.HIGHEST_PROTOCOL)
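
The persisted results can be reloaded later, for example to feed the proposed parameters into a training run:

import pickle

# Reload the pickled search output written above.
with open("data/hyperopt/lgbm/best_param.pkl", "rb") as handle:
    best_param = pickle.load(handle)
print(best_param)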
Example #26
def start_cv_run(train_df, label, params, splits, verbose_eval,
                 num_boost_round, early_stopping_rounds, output_filepath,
                 grouped_on_building):
    """
    Starts a Cross Validation Run with the parameters provided.
    Scores will be documented and models will be saved.
    :param train_df: DataFrame which contains the training data.
    :param label: A vector which contains the labels of the training data.
    :param params: Dictionary with the model parameters
    :param splits: Integer describing the number of folds / splitting fraction.
    :param verbose_eval: The interval at which training information is printed
    to the console.
    :param num_boost_round: Maximum number of rounds / estimators for the training.
    :param early_stopping_rounds: If no improvement of the validation score
    occurs within n rounds, the training will be stopped.
    :param output_filepath: Directory that will contain the trained models.
    :param grouped_on_building: Logical indicating whether cv should be done
    by grouping the folds on building_id. Note that if set to True, building_id
    must not be included in the drop section of the config file.
    """
    if grouped_on_building:
        if 'building_id' not in train_df.columns:
            raise ValueError(
                "For grouped cv, the cross validation is grouped on "
                "building_id. Therefore it must be excluded from the drop "
                "section in the config file before using make data. Note "
                "that the building_id is still not included in the model; "
                "it is needed for specifying the folds only and will be "
                "dropped afterwards.")
        output_filepath = output_filepath + "_grouped_cv"
        is_meter0 = (train_df.meter == 0).values
        train_df = train_df[is_meter0].reset_index(drop=True)
        label = label[is_meter0].reset_index(drop=True)
        groups = train_df.building_id
        train_df = train_df.drop(columns="building_id")
        gkf = GroupKFold(n_splits=splits)
        indices = gkf.split(train_df, label, groups)
    else:
        output_filepath = output_filepath + "_cv"
        kf = KFold(n_splits=splits, shuffle=False)
        indices = kf.split(train_df, label)
    cv_results = []
    with timer("Performing " + str(splits) + " fold cross-validation"):
        for i, (train_index, test_index) in enumerate(indices):
            with timer("~~~~ Fold %d of %d ~~~~" % (i + 1, splits)):
                x_train, x_valid = train_df.iloc[train_index], train_df.iloc[
                    test_index]
                y_train, y_valid = label[train_index], label[test_index]

                train_lgb_df = lgb.Dataset(data=x_train, label=y_train)
                valid_lgb_df = lgb.Dataset(data=x_valid, label=y_valid)

                valid_sets = [train_lgb_df, valid_lgb_df]
                evals_result = dict()
                lgbm_model = lgb.train(
                    params=params,
                    train_set=train_lgb_df,
                    num_boost_round=num_boost_round,
                    valid_sets=valid_sets,
                    valid_names=["train_loss", "eval"],
                    verbose_eval=verbose_eval,
                    evals_result=evals_result,
                    early_stopping_rounds=early_stopping_rounds)
                save_model(output_filepath, lgbm_model)

                cv_results.append(evals_result)
        evaluate_cv_results(cv_results)
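
What the grouped variant buys: GroupKFold guarantees that no building_id ever appears on both sides of a split, so the validation score measures generalisation to unseen buildings. A minimal self-contained illustration:

import pandas as pd
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({"x": range(8),
                   "building_id": [1, 1, 2, 2, 3, 3, 4, 4]})
gkf = GroupKFold(n_splits=2)
for train_idx, valid_idx in gkf.split(df, groups=df["building_id"]):
    # No building is shared between the training and validation fold.
    assert not (set(df["building_id"].iloc[train_idx])
                & set(df["building_id"].iloc[valid_idx]))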
def build_features(*dfs, cfg):
    with timer("Encoding categorical features"):
        dfs = [encode_categorical_data(df) for df in dfs]

    with timer("Encoding timestamp features"):
        dfs = [encode_timestamp(df, circular=cfg["circular_timestamp_encoding"]) for df in dfs]

    with timer("Create area per floor feature"):
        dfs = [calculate_area_per_floor(df) for df in dfs]

    if cfg["log_transform_square_feet"]:
        with timer("Taking the log of selected features"):
            dfs = [calculate_square_feet_log(df) for df in dfs]

    if cfg["log_transform_area_per_floor"]:
        with timer("Taking the log of area per floor"):
            dfs = [calculate_area_per_floor_log(df) for df in dfs]

    if cfg["label_square_feet_outlier"]:
        with timer("Create outlier label for square feet"):
            dfs = [label_square_feet_outlier(df) for df in dfs]

    if cfg["label_area_per_floor_outlier"]:
        with timer("Create outlier label for area per floor"):
            dfs = [label_area_per_floor_outlier(df) for df in dfs]

    with timer("Calculating age of buildings"):
        dfs = [calculate_age_of_building(df) for df in dfs]

    if cfg["encode_wind_direction"]:
        with timer("Encoding wind_direction features"):
            dfs = [encode_wind_direction(df) for df in dfs]

    with timer("Calculate relative humidity"):
        dfs = [calculate_relative_humidity(df) for df in dfs]

    if cfg["include_feels_like"]:
        with timer("Create feels_like_temp"):
            dfs = [calculate_feels_like_temp(df) for df in dfs]

    if cfg["fill_na_with_zero"]:
        dfs = [df.fillna(0) for df in dfs]

    if cfg["add_lag_features"]:
        with timer("Adding Lag Features"):
            dfs = [add_lag_features(df, cfg["lag_columns"], cfg["lag_windows"]) for df in dfs]

    return dfs
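
The circular option of encode_timestamp is not shown. A common approach, sketched here as an assumption, maps cyclical fields such as hour-of-day onto the unit circle so that 23:00 and 00:00 end up close together:

import numpy as np

def encode_hour_circular(df):
    # Hypothetical sketch of a sin/cos encoding like the one
    # encode_timestamp(circular=True) may apply.
    hour = df["timestamp"].dt.hour
    df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
    df["hour_cos"] = np.cos(2 * np.pi * hour / 24)
    return df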
Example #28
def main(input_filepath, output_filepath):
    """ Runs data feature engineering scripts to turn interim data from
        (../interim) into data which is ready for usage in ML models
        (saved in ../processed).
        :param input_filepath: Directory that contains the interim data
        :param output_filepath: Directory where processed results will be saved.
    """
    with open("src/config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    with timer("Loading interim data"):
        train_df, test_df = load_interim_data(input_filepath)

    with timer("Encoding categorical features"):
        train_df = encode_categorical_data(train_df)
        test_df = encode_categorical_data(test_df)

    with timer("Encoding timestamp features"):
        train_df = encode_timestamp(train_df, circular=cfg["circular_timestamp_encoding"])
        test_df = encode_timestamp(test_df, circular=cfg["circular_timestamp_encoding"])
    
    with timer("Create area per floor feature"):
        train_df["area_per_floor"] = train_df["square_feet"] / train_df["floor_count"]
        test_df["area_per_floor"] = test_df["square_feet"] / test_df["floor_count"]

    if cfg["log_transform_square_feet"]:
        with timer("Taking the log of selected features"):
            train_df["square_feet"] = np.log(train_df["square_feet"])
            test_df["square_feet"] = np.log(test_df["square_feet"])
    
    if cfg["log_transform_area_per_floor"]:
        with timer("Taking the log of area per floor"):
            train_df["area_per_floor"] = np.log(train_df["area_per_floor"])
            test_df["area_per_floor"] = np.log(test_df["area_per_floor"])
    
    if cfg["label_square_feet_outlier"]:
        with timer("Create outlier label for square feet"):
            train_df["outlier_square_feet"] = label_outlier("square_feet", train_df)
            test_df["outlier_square_feet"] = label_outlier("square_feet", test_df)
    
    if cfg["label_area_per_floor_outlier"]:
        with timer("Create outlier label for area per floor"):
            train_df["outlier_area_per_floor"] = label_outlier("area_per_floor", train_df)
            test_df["outlier_area_per_floor"] = label_outlier("area_per_floor", test_df)

    with timer("Calculating age of buildings"):
        train_df = calculate_age_of_building(train_df)
        test_df = calculate_age_of_building(test_df)

    if cfg["encode_wind_direction"]:
        with timer("Encoding wind_direction features"):
            train_df = encode_wind_direction(train_df)
            test_df = encode_wind_direction(test_df)

    if cfg["fill_na_with_zero"]:
        train_df = train_df.fillna(0)
        test_df = test_df.fillna(0)

    if cfg["add_lag_features"]:
        with timer("Adding Lag Features"):
            train_df = add_lag_features(train_df, cfg["lag_columns"], cfg["lag_windows"])
            test_df = add_lag_features(test_df, cfg["lag_columns"], cfg["lag_windows"])

    if cfg["exclude_faulty_rows"]:
        with timer("Exclude faulty data and outliers"):
            train_df = exclude_faulty_readings(train_df)

    if cfg["add_leaks_to_train"]:
        with timer("Adding Leak Label to training set"):
            train_df = add_leaked_data(train_df, test_df)

    with timer("Sort training set"):
        train_df.sort_values("timestamp", inplace=True)
        train_df.reset_index(drop=True, inplace=True)

    with timer("Dropping specified columns"):
        train_df = drop_columns(train_df, cfg["drop"])
        test_df = drop_columns(test_df, cfg["drop"])

    with timer("Save processed data"):
        save_processed_data(output_filepath, train_df, test_df)
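
label_outlier is not shown either. A hedged sketch, assuming a standard 1.5 * IQR rule on the given column:

def label_outlier(column, df):
    # Hypothetical implementation; the real helper may use a different
    # rule. Returns 1 for rows outside the 1.5 * IQR whiskers, else 0.
    q1, q3 = df[column].quantile([0.25, 0.75])
    lower = q1 - 1.5 * (q3 - q1)
    upper = q3 + 1.5 * (q3 - q1)
    return ((df[column] < lower) | (df[column] > upper)).astype(int)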