Example #1
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier


class BasePredictions:
    def __init__(self, **kwargs):
        self.X = kwargs.get("X")
        self.y = kwargs.get("y")
        self.combs = kwargs.get("combs")
        self.clf = kwargs.get("clf")

        # arguments with default values
        self.top = kwargs.get("top", 20)
        self.mean_fpr = np.linspace(0, 1, 100)
        self.n_estimators = kwargs.get("n_estimators", 500)
        self.class_weight = kwargs.get("class_weight", "balanced")
        self.min_samples_split = kwargs.get("min_samples_split", 3)
        self.min_samples_leaf = kwargs.get("min_samples_leaf", 3)
        self.colsample_bytree = kwargs.get("colsample_bytree", 0.6)
        self.learning_rate = kwargs.get("learning_rate", 0.1)
        self.random_state = kwargs.get("random_state", 125)
        self.max_depth = kwargs.get("max_depth", None)
        self.objective = kwargs.get("objective", "binary:logistic")
        self.scale_pos_weight = kwargs.get("scale_pos_weight", 1)

        self._set_classifier()

        self.predicted = dict()
        self.topfeat = dict()
        self.fpr = dict()
        self.tpr = dict()
        self.tprs = dict()
        self.auc = dict()
        self.precision = dict()
        self.recall = dict()
        self.avprec = dict()

    def _set_classifier(self):
        if self.clf.lower() == "randomforest":
            self.clf = RandomForestClassifier(
                bootstrap=True,
                class_weight=self.class_weight,
                max_depth=self.max_depth,
                n_estimators=self.n_estimators,
                max_features='sqrt',
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                random_state=self.random_state,
                n_jobs=-1)
        elif self.clf.lower() == "xgboost":
            if self.max_depth is None:
                self.max_depth = 5

            self.clf = xgb.XGBClassifier(
                learning_rate=self.learning_rate,
                colsample_bytree=self.colsample_bytree,
                random_state=self.random_state,
                max_depth=self.max_depth,
                n_estimators=self.n_estimators,
                scale_pos_weight=self.scale_pos_weight,
                objective=self.objective,
                n_jobs=-1)
        else:
            raise ValueError("Only 'randomforest' and 'xgboost' are supported.")
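
A minimal usage sketch for the class above; the data is synthetic and the keyword values are illustrative, not from the original source:

import numpy as np
import pandas as pd

# Synthetic stand-ins for X and y; any DataFrame/label pair works here.
X_demo = pd.DataFrame(np.random.rand(100, 4), columns=list("abcd"))
y_demo = np.random.randint(0, 2, size=100)

preds = BasePredictions(X=X_demo, y=y_demo, clf="randomforest", top=10)
print(preds.clf)  # the configured RandomForestClassifier instance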
Example #2

    # This method assumes check_cv (sklearn.model_selection), is_classifier
    # (sklearn.base), the sklearn.ensemble tree models, AutomatminerError, and
    # a module-level logger are imported, and that the surrounding class
    # defines self.mode, self.rs, self._log_prefix, and get_reduced_features.
    def fit(self, X, y, tree="rf", recursive=True, cv=5):
        """
        Fits a tree model to the data (X) and target (y) to determine the
        selected features.

        Args:
            X (pandas.DataFrame): input data; a numpy array is NOT accepted,
                since X.columns supplies the feature names
            y (pandas.Series): target values used for fitting the tree model;
                a pandas object is required because y.iloc is used below
            tree (str or instantiated sklearn tree-based model): if a model is
                passed in directly, it must have the .feature_importances_
                attribute
            recursive (bool): whether to recursively reduce the features (True)
                or reduce them only once (False)
            cv (int or sklearn cross-validation object): the number of folds,
                or an instantiated cross-validation splitter

        Returns:
            self, with the class attribute .selected_features set
        """
        m0 = len(X.columns)
        if isinstance(tree, str):
            if tree.lower() in ["rf", "random forest", "randomforest"]:
                if self.mode.lower() in ["classification", "classifier"]:
                    tree = RandomForestClassifier(random_state=self.rs)
                else:
                    tree = RandomForestRegressor(random_state=self.rs)
            elif tree.lower() in ["gb", "gbt", "gradient boosting"]:
                if self.mode.lower() in ["classification", "classifier"]:
                    tree = GradientBoostingClassifier(random_state=self.rs)
                else:
                    tree = GradientBoostingRegressor(random_state=self.rs)
            else:
                raise AutomatminerError(
                    "Unsupported tree_type {}!".format(tree))

        cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
        all_feats = []
        for train, _ in cv.split(X, y, groups=None):
            Xtrn = X.iloc[train]
            ytrn = y.iloc[train]
            all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
        # take the union of selected features of each fold
        self.selected_features = list(set(all_feats))
        logger.info(
            self._log_prefix +
            "Finished tree-based feature reduction of {} initial features to "
            "{}".format(m0, len(self.selected_features)))
        return self
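
fit() above pools the features selected on each CV fold and keeps their union. The following is a self-contained sketch of that idea using only standard sklearn; SelectFromModel stands in for the unshown get_reduced_features helper, and the data and threshold are illustrative assumptions:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import check_cv

X_arr, y_arr = make_classification(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X_arr, columns=["f{}".format(i) for i in range(10)])
y = pd.Series(y_arr)

cv = check_cv(cv=5, y=y, classifier=True)
all_feats = []
for train, _ in cv.split(X, y, groups=None):
    # Reduce features on this fold only, then pool the survivors.
    selector = SelectFromModel(RandomForestClassifier(random_state=0),
                               threshold="median")
    selector.fit(X.iloc[train], y.iloc[train])
    all_feats += list(X.columns[selector.get_support()])

selected_features = sorted(set(all_feats))  # union across folds
print(selected_features)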
Example #3
import sys

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# load_data, build_model, optimize_model, evaluate_model, and save_model are
# assumed to be defined elsewhere in the surrounding module.


def main():
    if len(sys.argv) == 4:
        database_filepath, classifier, model_filepath = sys.argv[1:]
        print('Loading data...\n    DATABASE: {}'.format(database_filepath))
        x, y, category_names = load_data(database_filepath)
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.2)

        print('Building model...')
        if classifier.lower() == 'rf':
            message = 'Model used: RandomForest\n'
            classifier = RandomForestClassifier()
            cls_params = {
                'cls__estimator__n_estimators': [50, 100, 200],
                'cls__estimator__max_depth': [None, 1, 3, 5]
            }
        elif classifier.lower() == 'ad':
            message = 'Model used: AdaBoost\n'
            classifier = AdaBoostClassifier()
            cls_params = {
                'cls__estimator__base_estimator': [
                    DecisionTreeClassifier(max_depth=1),
                    RandomForestClassifier(max_depth=1)
                ],
                'cls__estimator__n_estimators': [25, 50, 100],
            }
        elif classifier.lower() == 'dt':
            message = 'Model used: DecisionTree\n'
            classifier = DecisionTreeClassifier()
            cls_params = {'cls__estimator__max_depth': [None, 1, 3, 5]}
        else:
            raise ValueError(f'{classifier.lower()} is not a valid '
                             f'classifier choice')

        params = {
            'vect__max_features': [None, 20, 50, 100],
            'tfidf__use_idf': [True, False],
            'tfidf__norm': ['l1', 'l2'],
            **cls_params
        }
        message += f'Parameters for GridSearch:\n{params}'
        print(message)
        model = build_model(classifier=classifier)

        print('Training model...')
        best_estimator = optimize_model(model=model,
                                        params=params,
                                        x=x_train,
                                        y=y_train)

        print('Evaluating model...')
        evaluate_model(best_estimator, x_test, y_test)

        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(best_estimator, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument, the classifier choice (rf, ad, or dt) '
              'as the second argument, and the filepath of the pickle file '
              'to save the model to as the third argument. \n\nExample: '
              'python train_classifier.py ../data/DisasterResponse.db rf '
              'classifier.pkl')
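
build_model and optimize_model are not shown in this example. Given the grid-key prefixes ('vect__', 'tfidf__', 'cls__estimator__') and the 'Parameters for GridSearch' message, they presumably look roughly like the sketch below; the pipeline step names are inferred from those keys, and everything here is an assumption rather than the original code:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline


def build_model(classifier):
    # Step names must match the grid-key prefixes: 'vect', 'tfidf', 'cls'.
    return Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('cls', MultiOutputClassifier(classifier)),
    ])


def optimize_model(model, params, x, y):
    # 'cls__estimator__*' keys reach through MultiOutputClassifier to the
    # wrapped classifier; cv=3 is an arbitrary choice for this sketch.
    search = GridSearchCV(model, param_grid=params, cv=3)
    search.fit(x, y)
    return search.best_estimator_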