Example #1
def test():
    '''
    Trains the model and prints its cross-validation score.
    '''
    matplotlib.rcParams['backend'] = 'Qt5Agg'  # force the Qt5 interactive backend
    matplotlib.get_backend()
    D = DataManager(data_name, data_dir)
    # Load the model
    mdl = model()

    Prepro = prepro.Preprocessor()
    #D.data['X_train'] = Prepro.removeOutliers(D.data['X_train'])
    #D.data['Y_train'] = Prepro.removeOutliers(D.data['Y_train'])
    X_train = D.data['X_train']
    Y_train = D.data['Y_train'].ravel()

    # Test training
    mdl.fit(X_train, Y_train)

    # Test prediction
    Y_hat_train = mdl.predict(D.data['X_train'])
    Y_hat_valid = mdl.predict(D.data['X_valid'])
    Y_hat_test = mdl.predict(D.data['X_test'])

    metric_name, scoring_function = get_metric()
    scores = cross_val_score(mdl,
                             X_train,
                             Y_train,
                             cv=5,
                             scoring=make_scorer(scoring_function))
    print('\nCV score (95 perc. CI): %0.2f (+/- %0.2f)' %
          (scores.mean(), scores.std() * 2))
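
For reference, the cross_val_score/make_scorer pattern used above can be reproduced in isolation. A minimal sketch, assuming scikit-learn and its bundled iris data, with balanced_accuracy_score standing in for whatever metric get_metric() returns:

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
mdl = RandomForestClassifier(random_state=0)

# make_scorer wraps a plain metric function into the scorer object
# expected by cross_val_score's `scoring` argument.
scores = cross_val_score(mdl, X, y, cv=5,
                         scoring=make_scorer(balanced_accuracy_score))
print('CV score (95 perc. CI): %0.2f (+/- %0.2f)'
      % (scores.mean(), scores.std() * 2))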
Example #2
 def find_best_params(self, speed):
     """
     Search for the best number of dimensions and number of features.
     DO NOT USE THIS FUNCTION UNLESS YOU HAVE A POWERFUL COMPUTER AND/OR A LOT OF TIME, OR USE A HIGH SPEED (>4)
     :param speed: The step applied to the number of features and dimensions on each loop
     :return: The best number of dimensions and features
     """
     print(speed)
     scores = [[0] * 200 for _ in range(200)]  # [[0] * 200] * 200 would alias the same row 200 times
     Y_train = D.data['Y_train'].ravel()
     for i in range(1, 200, speed):
         M = RandomForestClassifier(n_estimators=136,
                                    max_depth=None,
                                    min_samples_split=2,
                                    random_state=1)
         feature_selection = SelectKBest(chi2, k=i)
         feature_selection.fit(D.data['X_train'], Y_train)
         X_train = feature_selection.transform(D.data['X_train'])
         for j in range(1, 200, speed):
             tmpM = clone(M)  # fresh unfitted copy (needs `from sklearn.base import clone`); `tmpM = M` would alias the same estimator
             pca = PCA(n_components=j)
             pca.fit(D.data['X_train'])  # PCA is unsupervised, so no y is needed
             X_train = pca.transform(D.data['X_train'])
             tmpM.fit(X_train, Y_train)
             metric_name, scoring_function = get_metric()
             scrs = cross_val_score(tmpM,
                                    X_train,
                                    Y_train,
                                    cv=5,
                                    scoring=make_scorer(scoring_function))
             scores[i][j] = scrs.mean()
     max_pos = np.argmax(scores)  # index into the flattened 200x200 score grid
     self.best_features_nb = max_pos // 200  # row: SelectKBest k
     self.best_dim_nb = max_pos % 200  # column: PCA n_components
     print(self.best_features_nb, self.best_dim_nb)
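
A note on the grid bookkeeping above: with NumPy the same score grid and argmax arithmetic can be written more directly. A minimal sketch (np.unravel_index replaces the manual // and % steps):

import numpy as np

scores = np.zeros((200, 200))  # one independent cell per (k, n_components) pair
scores[3, 7] = 0.91            # e.g. the score for k=3 features and 7 components

# Recover the best (row, column) pair in one call.
best_k, best_dim = np.unravel_index(np.argmax(scores), scores.shape)
print(best_k, best_dim)        # -> 3 7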
Example #3
def f_test_models(X_train, Y_train):

    metric_name, scoring_function = get_metric()
    model_name = ["Nearest Neighbors", "Decision Tree", "Random Forest",
                  "AdaBoost", "Naive Bayes"]
    model_list = [
        KNeighborsClassifier(3),
        DecisionTreeClassifier(max_depth=10),
        RandomForestClassifier(max_depth=10, n_estimators=20),
        AdaBoostClassifier(),
        GaussianNB(),
        ]

    s_train = []
    s_test = []
    for m in model_list:
        s_prime = cross_validate(m, X_train, Y_train, cv=5,
                                 scoring=make_scorer(scoring_function),
                                 return_train_score=True)
        s_train.append(s_prime['train_score'].mean())
        s_test.append(s_prime['test_score'].mean())
    d = {'Score_train': s_train,
         'Score_test': s_test}

    # Plot
    sd = pd.DataFrame(d, index=model_name)
    ax = sd.plot.bar()
    ax.set_ylabel("Score")
    ax.set_xlabel("Model")
    plt.show()
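
The train-versus-test comparison above is a quick overfitting check: a large gap between train_score and test_score flags models that memorise the training set. A self-contained sketch of the same cross_validate pattern, assuming scikit-learn's iris data and the default accuracy metric:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
res = cross_validate(DecisionTreeClassifier(max_depth=10), X, y,
                     cv=5, return_train_score=True)
# Mean train and test scores across the 5 folds.
print(pd.DataFrame(res)[['train_score', 'test_score']].mean())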
Example #4
def f_test_estimator(X_train, Y_train):
    s = []
    var = []
    nb_arbre = np.linspace(15, 120, num=6).astype(int)  # Numbers of trees to test
    metric_name, scoring_function = get_metric()
    
    for i in range(len(nb_arbre)):
        clf = RandomForestClassifier(random_state=42, n_estimators=nb_arbre[i])
        M_prime = model(clf)
        scores = cross_val_score(M_prime, X_train, Y_train, cv=5,
                                 scoring=make_scorer(scoring_function))
        s.append(scores.mean())
        var.append(scores.std())
    # Plot
    plt.figure(figsize=(6, 6))
    plt.xlabel("n_estimators")
    plt.ylabel('Score')
    plt.title('Score results of RandomForest with cross-validation')
    plt.errorbar(nb_arbre, s, var, label='Test set')
    plt.legend()
    plt.show()
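
scikit-learn also ships validation_curve, which performs the same n_estimators sweep with cross-validation in one call. A minimal sketch on the bundled iris data (the param_range values mirror the nb_arbre grid above):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve

X, y = load_iris(return_X_y=True)
param_range = np.linspace(15, 120, num=6).astype(int)
train_scores, test_scores = validation_curve(
    RandomForestClassifier(random_state=42), X, y,
    param_name="n_estimators", param_range=param_range, cv=5)
print(test_scores.mean(axis=1))  # mean CV score for each n_estimators value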
Example #5
def best_param_MODEL(logistic, distributions):
    """
    Uses RandomizedSearchCV to find the best hyper-parameters for the given
    model and returns the fitted search object.

    Parameters
    ----------
    logistic: the model (estimator) to tune
    distributions: dictionary mapping parameter names to the distributions
        or lists of values to sample from

    Returns
    ------
    search: the fitted search object (best parameters in `search.best_params_`)

    """
    metric_name, scoring_function = get_metric()

    clf = RandomizedSearchCV(logistic, distributions, random_state=0,
                             scoring=make_scorer(scoring_function))
    search = clf.fit(X_train, Y_train)  # X_train/Y_train come from module scope
    print(search.best_params_)  # report the winning parameter set
    return search
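
A possible call site, sketched with a logistic regression and hypothetical search ranges (loguniform, available in SciPy >= 1.4, samples C on a log scale, the usual choice for a regularisation strength):

from scipy.stats import loguniform
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

X_train, Y_train = load_iris(return_X_y=True)
distributions = {'C': loguniform(1e-4, 1e2)}  # hypothetical range, for illustration
clf = RandomizedSearchCV(LogisticRegression(max_iter=1000), distributions,
                         random_state=0)
search = clf.fit(X_train, Y_train)
print(search.best_params_)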
Example #6
    def __init__(self,
                 X_train,
                 y_train,
                 models_list,
                 models_name,
                 preprocessing_name=None,
                 scoring_function=None):
        '''
        This constructor initialises the datasets, scoring function, and models
        such that they can be used by the other methods.

        The default scoring function (scoring_function = None) is the provided one.
        '''
        self.X_train = X_train
        self.y_train = y_train
        if scoring_function is None:
            _, self.scoring_function = get_metric()
        else:
            self.scoring_function = scoring_function
        self.models_list = models_list
        self.models_name = models_name
        self.preprocessing_name = preprocessing_name
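
A runnable sketch of the same default-argument pattern, with a stub get_metric and a hypothetical class name (the snippet does not show the real one):

from sklearn.metrics import accuracy_score

def get_metric():
    # stand-in for the project's get_metric(): returns (name, callable)
    return 'accuracy', accuracy_score

class ModelBenchmark:  # hypothetical name for the enclosing class
    def __init__(self, X_train, y_train, scoring_function=None):
        self.X_train = X_train
        self.y_train = y_train
        # fall back to the project-provided metric when none is supplied
        if scoring_function is None:
            _, self.scoring_function = get_metric()
        else:
            self.scoring_function = scoring_function

bench = ModelBenchmark([[0], [1]], [0, 1])
print(bench.scoring_function)  # -> accuracy_score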
Example #7
 def find_best_pca(self):
     """
     Find the best dimensions number using the PCA (Principal Component Analysis).
     :return: The best dimensions number
     """
     for i in range(1, 200, 1):
         M = RandomForestClassifier(n_estimators=136,
                                    max_depth=None,
                                    min_samples_split=2,
                                    random_state=1)
         pca = PCA(n_components=i)
         pca.fit(D.data['X_train'])  # PCA is unsupervised, so no y is needed
         X_train = pca.transform(D.data['X_train'])
         Y_train = D.data['Y_train'].ravel()
         M.fit(X_train, Y_train)
         metric_name, scoring_function = get_metric()
         scores = cross_val_score(M,
                                  X_train,
                                  Y_train,
                                  cv=5,
                                  scoring=make_scorer(scoring_function))
         self.pca_scores.append(scores.mean())
     self.best_dim_nb = self.pca_scores.index(max(self.pca_scores)) + 1  # scores[0] holds n_components=1
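
Loops like the one above refit PCA by hand for every candidate dimension; a Pipeline combined with GridSearchCV expresses the same search more compactly and cross-validates the whole chain. A minimal sketch on scikit-learn's bundled digits data:

from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = load_digits(return_X_y=True)
pipe = Pipeline([('pca', PCA()),
                 ('rf', RandomForestClassifier(n_estimators=136, random_state=1))])
# Sweep the number of kept components; PCA is refit inside every CV fold.
grid = GridSearchCV(pipe, {'pca__n_components': [5, 10, 20, 40]}, cv=5)
grid.fit(X, y)
print(grid.best_params_)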
Example #8
 def find_best_features(self):
     """
     Execute the model with different quantity of features (1 to 200) and return the quantity of features who give the best model's score.
     :return: The best features number
     """
     for i in range(1, 200, 1):
         M = RandomForestClassifier(n_estimators=136,
                                    max_depth=None,
                                    min_samples_split=2,
                                    random_state=1)
         feature_selection = SelectKBest(chi2, k=i)
         Y_train = D.data['Y_train'].ravel()
         feature_selection.fit(D.data['X_train'], Y_train)
         X_train = feature_selection.transform(D.data['X_train'])
         M.fit(X_train, Y_train)
         metric_name, scoring_function = get_metric()
         scores = cross_val_score(M,
                                  X_train,
                                  Y_train,
                                  cv=5,
                                  scoring=make_scorer(scoring_function))
         self.features_scores.append(scores.mean())
     self.best_features_nb = self.features_scores.index(
         max(self.features_scores)) + 1  # scores[0] holds k=1
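
The same sweep can be written as a pipeline search, mirroring the PCA version above. One caveat baked into the sketch: chi2 is only defined for non-negative feature values, so f_classif or mutual_info_classif are the drop-in alternatives for signed data.

from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

X, y = load_digits(return_X_y=True)  # pixel intensities are non-negative, as chi2 requires
pipe = Pipeline([('kbest', SelectKBest(chi2)),
                 ('rf', RandomForestClassifier(n_estimators=136, random_state=1))])
grid = GridSearchCV(pipe, {'kbest__k': [5, 10, 20, 40]}, cv=5)
grid.fit(X, y)
print(grid.best_params_)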
Example #9
def main():

    # Choose at run time between Preprocessed and Raw

    print("Here we goooo!\n Preprocessed (0, default) or Raw (1) ?")
    try:
        choice = int(input())
    except ValueError:
        print("ERROR: Please enter a NUMBER. Closing the program.")
        exit()
    
    warnings.filterwarnings("ignore")
    np.seterr(divide='ignore', invalid='ignore')
    metric_name, scoring_function = get_metric()

    # Choose the appropriate directory and model
    if choice == 1:
        directory = DIRECTORY + "Raw_Results/"
        data_dir = DATA_DIR_RAW
        clf = ModelRaw()

    else:
        directory = DIRECTORY + "Preprocessed_Results/"
        data_dir = DATA_DIR_PRE
        clf = ModelPreprocessed()

    #Create Directory
    if not os.path.exists(directory):
        os.makedirs(directory)


    # Load data as a pandas DataFrame
    d_train = load_train(data_dir, DATA_NAME)

    #Transform to numpy
    X = d_train.drop(columns=['target']).to_numpy()
    y = d_train['target'].to_numpy()


    # Split the training data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


    # Train the classifier
    clf.fit(X_train, y_train)


    #Score and ROC curve
    accuracy = clf.score(X_test, y_test)
    print("accuracy =", accuracy)
    y_proba = clf.predict_proba(X_test)
    y_decision = y_proba[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_decision, pos_label=1)
    plot_ROC(fpr, tpr, directory=directory)


    #Confusion Matrix
    y_test_pre = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_test_pre)
    plot_mat_conf(cm, directory=directory)



    if True:  # set to False to skip the model-comparison plots below
        # Plot scores of 5 different models
        model_name = ["Nearest Neighbors","Decision Tree",
                      "Random Forest",  "AdaBoost",
                      "Naive Bayes"]
        model_list = [
            KNeighborsClassifier(3),
            DecisionTreeClassifier(max_depth=10),
            RandomForestClassifier(max_depth=10, n_estimators=20),
            AdaBoostClassifier(),
            GaussianNB(),
            ]

        s_train = []
        s_test = []
        for i in range(len(model_list)):
            s_prime = cross_validate(ModelPreprocessed(classifier=model_list[i]),
                                     X, y, cv=3,
                                     scoring=make_scorer(scoring_function),
                                     return_train_score=True)
            s_train.append(s_prime['train_score'].mean())
            s_test.append(s_prime['test_score'].mean())

        plot_test_model(s_train, s_test, model_name, directory=directory)


        #plot score with different values of n_estimators
        s = []
        var = []
        n_est = np.linspace(15,120, num=4).astype(int)
        metric_name, scoring_function = get_metric()

        for i in range(len(n_est)):
            clf_prime = RandomForestClassifier(random_state=42, n_estimators=n_est[i])
            scores = cross_val_score(ModelPreprocessed(classifier=clf_prime), X, y,
                                     cv=5, scoring=make_scorer(scoring_function))
            s.append(scores.mean())
            var.append(scores.std())
        plot_test_estimator(n_est, s, var, directory=directory)

        # Plot decision surfaces for DecisionTree, RandomForest and AdaBoost
        plot_decision_surface_tree_classif(X_train, y_train, directory=directory)
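
For the ROC and confusion-matrix steps, recent scikit-learn versions (>= 1.0) also provide ready-made display helpers that can stand in for hand-rolled plot_ROC/plot_mat_conf functions. A minimal sketch on the bundled breast-cancer data:

import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

RocCurveDisplay.from_estimator(clf, X_test, y_test)         # ROC curve with AUC
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)  # confusion matrix
plt.show()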
Example #10
	def cross_validation_Classifier(self):
		# 5-fold cross-validation of self.M on (self.x, self.y) using the project metric
		metric_name1, scoring_function1 = get_metric()
		return cross_val_score(self.M, self.x, self.y, cv=5,
		                       scoring=make_scorer(scoring_function1))
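
A runnable version of the same method, with a hypothetical Wrapper class and accuracy_score standing in for the metric returned by get_metric():

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

class Wrapper:  # hypothetical stand-in for the snippet's enclosing class
    def __init__(self, M, x, y):
        self.M, self.x, self.y = M, x, y

    def cross_validation_Classifier(self):
        return cross_val_score(self.M, self.x, self.y, cv=5,
                               scoring=make_scorer(accuracy_score))

X, y = load_iris(return_X_y=True)
scores = Wrapper(RandomForestClassifier(random_state=0), X, y).cross_validation_Classifier()
print('CV score: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))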
Example #11
	    KNeighborsClassifier(1),
	    DecisionTreeClassifier(max_depth=10),
	    #RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1),
	    RandomForestClassifier(n_estimators=116, max_depth=None, min_samples_split=2, random_state=1),
	    MLPClassifier(alpha=1, max_iter=1000),
	    AdaBoostClassifier(),
	    GaussianNB(),
	    QuadraticDiscriminantAnalysis(),
	    ExtraTreesClassifier()
	]



	X_train = D.data['X_train']
	Y_train = D.data['Y_train'].ravel()
	metric_name, scoring_function = get_metric()

	#compareModel(model_name, model_list)


	#M_Model = model(RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0))
	#doBestModel(M_Model)

	model_listS = [
	    ('rf', ExtraTreesClassifier()),
	    #('knb', KNeighborsClassifier(1)),
	    ('rfc', RandomForestClassifier(n_estimators=116, max_depth=None, min_samples_split=2, random_state=1)),
	    #('rfc1', MLPClassifier(alpha=1, max_iter=1000)),
	    #('rfc2', GaussianNB()),
	    #('rfc3', QuadraticDiscriminantAnalysis()),