Esempio n. 1
0
    def fit(self, tree, model_name):
        """Fit a tree-based classifier, then evaluate, persist, and report it.

        Parameters
        ----------
        tree : estimator class (e.g. a sklearn tree/ensemble classifier).
        model_name : str
            Label used in logs and file names; 'DT' selects the reduced
            (single-tree) constructor-argument set.
        """
        self.initialize(model_name)
        self.load()
        self.log_params()

        # Arguments shared by single trees and ensembles.
        shared_kwargs = dict(max_depth=self.max_depth,
                             min_samples_split=self.min_samples_split,
                             min_samples_leaf=self.min_samples_leaf,
                             max_features=self.max_features)
        if model_name == 'DT':
            model = tree(**shared_kwargs)
        else:
            # Ensemble-only knobs on top of the shared ones.
            model = tree(n_estimators=self.n_estimators,
                         bootstrap=self.bootstrap,
                         n_jobs=self.n_jobs,
                         verbose=self.verbose,
                         **shared_kwargs)

        model.fit(self.X_train.values, self.Y_train.values)
        print(f"{model_name} is fitted")

        if self.should_cross_val:
            # k-fold CV on the full data set, separate from the train/test fit.
            scores = cross_val_score(model,
                                     self.X,
                                     self.Y,
                                     cv=self.k,
                                     verbose=0)
            self.log.info(
                f"---- Cross validation with {self.k} groups----\n\nThe results on each split"
                + str(scores) + "\n")
            self.log.info(
                f"The average of the cross validation is {np.mean(scores):.2f}\n"
            )

            print(
                f"|- Cross validation is done for {model_name}. Accuracy: {np.mean(scores):.2f} -|"
            )

        evaluate_classification(
            ['OnTrain', self.X_train, self.Y_train, self.dates_train],
            ['OnTest', self.X_test, self.Y_test, self.dates_test],
            direc=self.directory,
            model=model,
            model_name=model_name,
            logger=self.log,
            slicer=1)

        # Persist the fitted estimator next to its reports.
        joblib.dump(model, self.directory + f"/{model_name}.pkl")

        plot_roc(
            pd.get_dummies(self.Y_test, drop_first=False).values,
            model.predict_proba(self.X_test), self.dl.classes_, self.directory)

        # Plotting the Importances
        report_feature_importance(self.directory, model.feature_importances_,
                                  self.X, self.Y, self.n_top_features,
                                  model_name, self.log)
Esempio n. 2
0
    def fit(self):
        """Fit an L1-regularised logistic regression, then evaluate, plot,
        persist, and report feature importances.

        Uses liblinear (required for the 'l1' penalty) with a fixed C=100.
        """
        model = LogisticRegression(C = 100, fit_intercept = True, penalty= 'l1', solver = 'liblinear')
        model.fit(self.X_train, self.Y_train)

        self.dl.log.info("---Model Coeffs---\n" + str(model.coef_))

        # Map each feature name to its fitted coefficient; 'c' is the intercept.
        # NOTE(review): `coeffs` is never used after this point — presumably it
        # was meant to be logged or returned; confirm intent with the author.
        coeffs = dict(zip(self.X.columns, model.coef_[0]))
        coeffs['c'] = model.intercept_[0]

        evaluate_classification(['OnTrain', self.X_train, self.Y_train, self.dates_train],
                                ['OnTest', self.X_test, self.Y_test, self.dates_test],
                                direc = self.directory,
                                model = model,
                                model_name = self.model_name,
                                logger = self.log,
                                slicer = 1)

        plot_roc(pd.get_dummies(self.Y_test, drop_first = False).values,
                model.predict_proba(self.X_test),
                self.dl.classes_,
                self.directory)

        # Persist the fitted estimator (plain literal: no placeholders needed).
        joblib.dump(model, self.directory + "/Logit.pkl")

        # Plotting the Importances
        report_feature_importance(self.directory, model.coef_[0], self.X, self.Y,
                                    self.n_top_features, "Logit", self.log)
Esempio n. 3
0
    def fit(self):
        """Train an SVC on the training split, evaluate it on both splits,
        persist it, and (for a linear kernel only) report feature importances.
        """
        self.set_params()
        self.log_params()

        classifier = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
        classifier.fit(self.X_train, self.Y_train)
        self.model = classifier

        train_split = ['OnTrain', self.X_train, self.Y_train, self.dates_train]
        test_split = ['OnTest', self.X_test, self.Y_test, self.dates_test]
        evaluate_classification(train_split,
                                test_split,
                                direc=self.directory,
                                model=self.model,
                                model_name=self.model_name,
                                logger=self.log,
                                slicer=1)

        joblib.dump(self.model, self.directory + f"/{self.model_name}.pkl")

        # Plotting the Importances
        # coef_ only exists for a linear kernel, so guard on it.
        if self.kernel == 'linear':
            report_feature_importance(self.directory, self.model.coef_[0],
                                      self.X_train.columns,
                                      self.n_top_features, self.model_name,
                                      self.log)
Esempio n. 4
0
    def fit(self, n = 5):
        """Fit a K-nearest-neighbours classifier and log evaluation reports.

        Parameters
        ----------
        n : int, default 5
            Number of neighbours (passed to KNeighborsClassifier).
        """
        self.log.info(f'KNN Classifier is about to be fit on {self.name} with n = {n}')
        model = KNeighborsClassifier(n_neighbors=n, n_jobs = -1)
        model.fit(self.X_train, self.Y_train)

        # BUG FIX: the original passed the undefined local `model_name`
        # (NameError at runtime); use an explicit 'KNN' label instead,
        # matching the log message above.
        evaluate_classification(['OnTrain', self.X_train, self.Y_train, self.dates_train],
                                ['OnTest', self.X_test, self.Y_test, self.dates_test],
                                direc = self.directory,
                                model = model,
                                model_name = 'KNN',
                                logger = self.log,
                                slicer = 1)
Esempio n. 5
0
    def get_report(self):
        """Reload the persisted model and log classification reports for the
        train and test splits (against the original, un-transformed labels).
        """
        self.load_model()

        train_pred = self.predict_set(self.X_train)
        test_pred = self.predict_set(self.X_test)

        on_train = ['OnTrain', self.X_train, self.Y_original_train,
                    self.dates_train, train_pred]
        on_test = ['OnTest', self.X_test, self.Y_original_test,
                   self.dates_test, test_pred]
        evaluate_classification(on_train,
                                on_test,
                                direc = self.directory,
                                model = self.model,
                                model_name = self.model_name,
                                logger = self.log,
                                slicer = 1)
Esempio n. 6
0
    def fit(self):
        """Random-guess baseline: sample predicted labels i.i.d. from the
        empirical class distribution of Y, then log evaluation reports.
        """
        # Empirical class priors, normalised to sum to 1.
        priors = self.Y.value_counts()
        priors = priors / priors.sum()

        def _draw(count):
            # Sample `count` labels according to the class priors.
            return np.random.choice(priors.index, size=count, p=priors.values)

        # Keep the original RNG call order: train first, then test.
        train_guess = _draw(len(self.Y_train))
        test_guess = _draw(len(self.Y_test))

        evaluate_classification(
            ['OnTrain', self.X_train, self.Y_train, self.dates_train,
             train_guess],
            ['OnTest', self.X_test, self.Y_test, self.dates_test, test_guess],
            direc=self.directory,
            model_name=self.model_name,
            logger=self.log,
            slicer=1)