Example #1
    def run(self):

        # Create the output folders for every target of this task
        for target in self.output().values():
            utils.create_folder(target.path)

        # Read in X and y
        X_train = utils.load_data(
            self.input()['select_features']['X_train_filtered'].path)
        y_train = utils.load_data(self.input()['prepare_features']['y'].path)

        if not self.model['estimators']:
            raise ValueError("Please provide a list of estimators to train!")

        # Iterate over the configured estimators, running a grid search to tune
        # each one's hyper-parameters
        for model in self.model['estimators']:
            LOGGER.info('{}: Tuning model - {}'.format(repr(self),
                                                       model["estimator"]))

            grid_search = self.do_grid_search(model, X_train, y_train)
            self.best_model_per_model_type(model, grid_search)

        # Log the best score per estimator type, then save the package of best models
        for best_model in self.best_estimator_per_model:
            LOGGER.info('{}: BEST {} - {}'.format(
                repr(self), best_model["model"]["estimator_type"],
                str(best_model["best_score"])))

        utils.save_data(self.best_estimator_per_model,
                        self.output()['model_package'].path)
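The do_grid_search() helper is not shown in this snippet. As a rough sketch of the kind of search it presumably wraps, the following standalone example tunes a classifier with scikit-learn's GridSearchCV on toy data; the estimator and parameter grid are illustrative, not taken from the source.

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    # Toy data standing in for the X_train / y_train loaded in the task above
    X_train, y_train = make_classification(n_samples=200, n_features=10, random_state=0)

    # Illustrative grid; in the task, the grids come from self.model["estimators"]
    param_grid = {"n_estimators": [100, 300], "max_depth": [3, 5, None]}

    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=0),
        param_grid,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    print(grid_search.best_score_)       # best mean cross-validated accuracy
    print(grid_search.best_estimator_)   # estimator refitted with the best parameters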
Example #2
    def run(self):
        # Create the output folders for every target of this task
        for target in self.output().values():
            utils.create_folder(target.path)

        final_model_package = {}

        if len(self.model["estimators"]) > 1:
            # Read best models
            best_models = utils.load_data(
                self.input()["cv"]["model_package"].path)

            # Read in X and y
            X_train = utils.load_data(
                self.input()['select_features']['X_train_filtered'].path)
            y_train = utils.load_data(
                self.input()['prepare_features']['y'].path)

            estimators = []
            for model in best_models:
                estimators.append(
                    (model["model"]["estimator_type"], model["best_model"]))

            eclf = VotingClassifier(estimators=estimators, voting="soft")

            LOGGER.info('{}: Fitting ensemble model '.format(repr(self)))
            eclf.fit(X_train, y_train)

            # Package model
            final_model_package["final_model"] = eclf

        # Save ensemble model
        utils.save_data(final_model_package,
                        self.output()["ensemble_model"].path)
Example #3
    def run(self):
        # Create the output folders for every target of this task
        for target in self.output().values():
            utils.create_folder(target.path)

        # Load data
        training_data = utils.load_data("data/train.csv")
        test_data = utils.load_data("data/test.csv")

        # Combine train and test for data transformation.
        # This is rarely a good strategy in real-world applications, since it leaks test-set
        # information into the imputation, but for the purpose of improving the Kaggle score
        # we include the test rows here to get a better imputation.
        # NOTE: pd.concat replaces DataFrame.append, which is deprecated (removed in pandas 2.0);
        # this assumes pandas is imported as pd at module level.
        combined = pd.concat([training_data, test_data], ignore_index=True)
        combined.drop(columns=["Survived"], inplace=True)

        # Fit and transform raw features
        LOGGER.info('{}: Transforming raw features'.format(repr(self)))
        transformer = TitanicFeatureTransformer()
        combined_transformed = transformer.fit_transform(combined)

        # Split back into train and test features
        X_train = combined_transformed[:len(training_data)]
        X_test = combined_transformed[len(training_data):]
        y = training_data["Survived"]

        # Save
        LOGGER.info('{}: Saving transformer and transformed features. {} rows of train data, '
                    '{} rows of test data, {} columns.'.format(repr(self), str(len(X_train)), str(len(X_test)), str(X_train.shape[1])))
        utils.save_data(transformer, self.output()["transformer"].path)
        utils.save_data(X_train, self.output()["X_train"].path)
        utils.save_data(X_test, self.output()["X_test"].path)
        utils.save_data(y, self.output()["y"].path)
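TitanicFeatureTransformer is defined elsewhere in the source project. The sketch below is a hypothetical, minimal stand-in showing the scikit-learn transformer pattern the task relies on: fit_transform plus the get_column_order() accessor used later by the feature-selection step. The column names and imputation logic are assumptions, not the project's actual transformer.

    import pandas as pd
    from sklearn.base import BaseEstimator, TransformerMixin

    class MinimalTitanicTransformer(BaseEstimator, TransformerMixin):
        """Hypothetical stand-in: imputes Age with the median seen at fit time
        and encodes Sex as a single binary column."""

        def fit(self, X, y=None):
            self.age_median_ = X["Age"].median()
            self.column_order_ = ["Age", "Sex_male"]
            return self

        def transform(self, X):
            out = pd.DataFrame({
                "Age": X["Age"].fillna(self.age_median_),
                "Sex_male": (X["Sex"] == "male").astype(int),
            })
            return out[self.column_order_].values

        def get_column_order(self):
            # Mirrors the get_column_order() call in the feature-selection task
            return self.column_order_

    df = pd.DataFrame({"Age": [22.0, None, 35.0], "Sex": ["male", "female", "male"]})
    transformer = MinimalTitanicTransformer()
    print(transformer.fit_transform(df))   # 3 x 2 array, missing Age imputed with 28.5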
Example #4
    def run(self):
        # Create the output folders for every target of this task
        for target in self.output().values():
            utils.create_folder(target.path)

        # Load data
        X_train = utils.load_data(self.input()["prepare_features"]["X_train"].path)
        X_test = utils.load_data(self.input()["prepare_features"]["X_test"].path)
        y = utils.load_data(self.input()["prepare_features"]["y"].path)
        transformer = utils.load_data(self.input()["prepare_features"]["transformer"].path)

        # Feature selection
        LOGGER.info('{}: Selecting features'.format(repr(self)))
        feature_selection_clf = utils.import_object(self.model["feature_selection"]["estimator"])()
        feature_selection_clf.set_params(**self.model["feature_selection"]["parameter_values"])

        feature_selection_model = SelectFromModel(feature_selection_clf)
        feature_selection_model.fit(X_train, y)

        X_train_filtered = feature_selection_model.transform(X_train)
        X_test_filtered = feature_selection_model.transform(X_test)

        # Save
        LOGGER.info('{}: Saving feature selection model and selected features. {} columns of data '
                    'selected.'.format(repr(self), str(X_train_filtered.shape[1])))

        columns = transformer.get_column_order()
        support_mask = feature_selection_model.get_support()  # True for columns that were kept
        columns_selected = list(compress(columns, support_mask))
        LOGGER.info('{}: Selected columns: {}'.format(repr(self), ",".join(columns_selected)))

        utils.save_data(X_train_filtered, self.output()["X_train_filtered"].path)
        utils.save_data(X_test_filtered, self.output()["X_test_filtered"].path)
        utils.save_data(feature_selection_model, self.output()["feature_selection"].path)
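A self-contained sketch of the SelectFromModel workflow used above, on toy data: features whose importance falls below the threshold (the mean importance by default) are dropped, and get_support() yields the boolean mask of the columns that were kept.

    from itertools import compress

    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import SelectFromModel

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    columns = ["f{}".format(i) for i in range(X.shape[1])]

    # Keep only the features whose importance reaches the threshold (mean importance by default)
    selector = SelectFromModel(RandomForestClassifier(random_state=0))
    selector.fit(X, y)

    support_mask = selector.get_support()         # boolean mask, True for retained columns
    X_filtered = selector.transform(X)

    print(list(compress(columns, support_mask)))  # names of the retained columns
    print(X_filtered.shape)                       # (200, number of retained columns)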
Example #5
    def run(self):
        # Create the output folders for every target of this task
        for target in self.output().values():
            utils.create_folder(target.path)

        # Read prediction data
        pred_data = utils.load_data("data/test.csv")
        predict_folder = os.path.dirname(self.output()["predictions"].path)

        # Read models and transform data
        best_individual_models = utils.load_data(
            self.input()["cv"]["model_package"].path)
        final_model = utils.load_data(
            self.input()["ensemble_clf"]["ensemble_model"].path)
        X_test_filtered = utils.load_data(
            self.input()["select_features"]["X_test_filtered"].path)

        for m in best_individual_models:
            clf = m["best_model"]
            prediction_df = self.make_prediction(clf, X_test_filtered,
                                                 pred_data["PassengerId"])

            utils.save_data(
                prediction_df,
                os.path.join(
                    predict_folder, m["model"]["estimator_type"] + "_" +
                    str(m["best_score"]) + ".csv"))

        if len(self.model["estimators"]) > 1:
            eclf = final_model["final_model"]
            prediction_df = self.make_prediction(eclf, X_test_filtered,
                                                 pred_data["PassengerId"])

            utils.save_data(
                prediction_df,
                os.path.join(predict_folder, "EnsembleClassifier.csv"))

        utils.save_data("", self.output()["predictions"].path)
Example #6
    def run(self):
        # Create the output folders for every target of this task
        for target in self.output().values():
            utils.create_folder(target.path)

        utils.save_data(self.model["build_description"], self.output()["log_name"].path)
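All of the run() methods above assume the surrounding Luigi task plumbing: output() returns a dict of targets (hence the .path lookups), and a keyed requires() is what makes self.input()["task_name"][...] resolve to another task's targets. A minimal sketch of that structure follows; the task names and paths are illustrative, not the source project's.

    import luigi

    class PrepareFeatures(luigi.Task):
        def output(self):
            # A dict of targets, so downstream tasks can look up paths by key
            return {
                "X_train": luigi.LocalTarget("build/prepare_features/X_train.pkl"),
                "y": luigi.LocalTarget("build/prepare_features/y.pkl"),
            }

        def run(self):
            ...  # produce the files behind the targets returned by output()

    class TrainModel(luigi.Task):
        def requires(self):
            # Keyed requirements make self.input()["prepare_features"][...] work
            return {"prepare_features": PrepareFeatures()}

        def output(self):
            return {"model_package": luigi.LocalTarget("build/train_model/model_package.pkl")}

        def run(self):
            x_train_path = self.input()["prepare_features"]["X_train"].path
            ...  # load the inputs, fit, and write self.output()["model_package"].path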