def ovrSVM(self, X, Y, X_test):
    """Train a one-vs-rest linear SVM on (X, Y) and predict labels for X_test.

    :param X: training feature matrix
    :param Y: training labels
    :param X_test: feature matrix to classify
    :return: predicted labels for X_test
    """
    # n_jobs must be passed by keyword: in current scikit-learn the second
    # positional argument of OneVsRestClassifier is not n_jobs.
    ovr_classifier = OneVsRestClassifier(LinearSVC(C=100, random_state=0),
                                         n_jobs=100)
    # print() function instead of the Python 2 print statement.
    print(ovr_classifier.get_params())
    ovr_classifier.fit(X, Y)
    return ovr_classifier.predict(X_test)
def fit_model(X_train, Y_train):
    """Tune a one-vs-rest XGBoost classifier with randomized search.

    :param X_train: training feature matrix
    :param Y_train: training labels
    :return: the fitted RandomizedSearchCV object
    """
    model_to_set = OneVsRestClassifier(XGBClassifier())
    # Column/row sampling fractions only; tree-shape parameters (max_depth,
    # n_estimators, min_child_weight) were explored in earlier runs.
    parameters = {
        'estimator__colsample_bylevel': [0.7, 0.8],
        'estimator__colsample_bytree': [0.7, 0.8],
        'estimator__colsample_bynode': [0.7, 0.8],
        'estimator__subsample': [0.7, 0.8],
    }
    print(model_to_set.get_params())
    model_tuning = RandomizedSearchCV(estimator=model_to_set,
                                      param_distributions=parameters,
                                      n_jobs=-1,
                                      n_iter=15,
                                      cv=5,
                                      verbose=1)
    model_tuning.fit(X_train, Y_train)
    print(model_tuning.best_params_)
    return model_tuning
def SVMTraining(XEstimate, XValidate, Parameters, class_labels):
    """Fit a one-vs-rest RBF SVM whose hyper-parameters are grid-searched.

    :param XEstimate: training feature matrix
    :param XValidate: validation feature matrix
    :param Parameters: hyper-parameter grid handed to GridSearchCV
    :param class_labels: training labels
    :return: dict with validation predictions, estimated parameters and
             the fitted classifier
    """
    base_svc = SVC(kernel='rbf', probability=True)
    tuned_svc = GridSearchCV(base_svc, Parameters)
    ovr_model = OneVsRestClassifier(tuned_svc)
    ovr_model.fit(XEstimate, class_labels)
    predictions = ovr_model.predict(XValidate)
    return {
        "Yvalidate": predictions,
        "EstParameters": ovr_model.get_params(),
        "clf": ovr_model,
    }
class CustomSVCImplementation(ModelImplementation):
    """One-vs-rest wrapper around an SVC exposing fit / predict / proba."""

    def __init__(self, log: Log = None, **params: Optional[dict]):
        super().__init__(log)
        # With explicit params build the SVC from them; otherwise fall
        # back to a balanced linear SVC with probability estimates.
        if params:
            self.inner_model = SVC(**params)
        else:
            self.inner_model = SVC(kernel='linear', probability=True,
                                   class_weight='balanced')
        self.params = params
        self.model = OneVsRestClassifier(self.inner_model)
        self.classes = None

    def fit(self, train_data):
        """ Method fit model on a dataset

        :param train_data: data to train the model
        """
        self.classes = np.unique(train_data.target)
        self.model.fit(train_data.features, train_data.target)
        return self.model

    def predict(self, input_data, is_fit_pipeline_stage: Optional[bool] = None):
        """ Method make prediction with labels of classes

        :param input_data: data with features to process
        :param is_fit_pipeline_stage: is this fit or predict stage for pipeline
        """
        return self.model.predict(input_data.features)

    def predict_proba(self, input_data):
        """ Method make prediction with probabilities of classes

        :param input_data: data with features to process
        """
        return self.model.predict_proba(input_data.features)

    def get_params(self):
        """ Method return parameters, which can be optimized for particular operation """
        return self.model.get_params()

    @property
    def classes_(self):
        # Mirrors the classes seen during the last fit (None before fit).
        return self.classes
def prepare_data_frame_and_build_model(visu=True):
    """Build a TF-IDF + one-vs-rest logistic-regression model and evaluate it.

    :param visu: when True, print the evaluation metrics at the end
    """
    df, mb, labels = prepare_data_frame(load_raw=False)
    tfidf_vect = TfidfVectorizer(max_features=50000)
    x_train, x_test, y_train, y_test = train_test_split(
        df[['clean_x', 'title_not_modified']], labels)
    # Fit the vectorizer on the training texts only, then reuse it on the
    # test split so no test information leaks into the vocabulary.
    x_train_tf_idf = tfidf_vect.fit_transform(x_train['clean_x'])
    x_test_tf_idf = tfidf_vect.transform(x_test['clean_x'])
    # LogisticRegression gave the best score/time trade-off; RandomForest
    # (0.60939, very slow), SVC and DecisionTree (0.471, slow fit) were
    # tried previously and rejected.
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_train_tf_idf, y_train)
    print("Classifier parameters :")
    print(clf.get_params())
    # `seuil` is a module-level decision threshold applied to the
    # multi-label probabilities -- assumed defined at module scope; confirm.
    threshold_decision = np.vectorize(lambda t: 1 if t > seuil else 0)
    y_pred = threshold_decision(clf.predict_proba(x_test_tf_idf))
    f1score = f1_score(y_test, y_pred, average='micro')
    eval1 = hamming_score(y_test, y_pred)
    eval2 = true_positive(y_test, y_pred)
    eval3 = false_positive(y_test, y_pred)
    eval4 = true_negative(y_test, y_pred)
    eval5 = false_negative(y_test, y_pred)
    for _ in range(10):
        # randrange excludes len(x_test); the previous
        # randint(0, len(x_test)) could return an out-of-range index
        # and raise an IndexError.
        random_pos = random.randrange(len(x_test))
        y_p = predict(overview=x_test['clean_x'].values[random_pos],
                      multilabel_binarizer=mb,
                      classifier=clf,
                      tfidf_vect=tfidf_vect)
        print("Title : ", x_test['title_not_modified'].values[random_pos],
              x_test['clean_x'].values[random_pos])
        print('Predicted : ', y_p[0])
        print('Actual :', mb.inverse_transform(y_test)[random_pos])
        print("__________________________________________________")
    for _ in range(5):
        random_pos = random.randrange(len(x_test))
        # NOTE(review): tfidf_vect receives the TF-IDF matrix and
        # tfidf_matrix receives the vectorizer -- the two keyword
        # arguments look swapped; verify against get_nearest_films'
        # signature before changing.
        get_nearest_films(overview=x_test['clean_x'].values[random_pos],
                          title=x_test['title_not_modified']
                          .values[random_pos],
                          tfidf_vect=x_train_tf_idf,
                          tfidf_matrix=tfidf_vect,
                          df=x_train)
    if visu:
        print("Hamming SCORE ", eval1)
        print('F1 SCORE ', f1score)
        print("Taux de vrai positifs ", eval2)
        print("Taux de faux positifs ", eval3)
        print("Taux de vrai négatifs ", eval4)
        print("Taux de faux négatifs ", eval5)
class Classifier(object):
    '''Classifier base class. Uses OneVsRest for multiclass problems'''

    def __init__(self, clf, x_train, y_train):
        """Wrap clf (in OneVsRest when >2 classes) and fit it on the data.

        :param clf: base estimator exposing fit/predict_proba/get_params
        :param x_train: training feature matrix
        :param y_train: training labels
        """
        n_classes = len(set(y_train))
        if n_classes > 2:
            self.clf = OneVsRestClassifier(clf)
        else:
            self.clf = clf
        self.clf.fit(x_train, y_train)

    def __call__(self, x_val):
        """Return class-probability predictions for x_val."""
        return self.clf.predict_proba(x_val)

    def describe(self):
        """Return the classifier's non-callable parameters as a dict."""
        # dict.iteritems() is Python 2 only and raises AttributeError on
        # Python 3; items() works on both recent versions.
        return {k: v
                for k, v in self.clf.get_params().items()
                if not callable(v)}
def SVMTraining(XEstimate, XValidate, Parameters, class_labels):
    """Train a one-vs-rest RBF SVM with grid-searched hyper-parameters.

    :param XEstimate: training feature matrix
    :param XValidate: validation feature matrix
    :param Parameters: grid of SVC hyper-parameters for GridSearchCV
    :param class_labels: training labels
    :return: dict with validation predictions, estimated parameters and
             the fitted classifier
    """
    svcClassifier = SVC(kernel='rbf', probability=True)
    gridSearcher = GridSearchCV(svcClassifier, Parameters)
    clf = OneVsRestClassifier(gridSearcher)
    # get_params() must be CALLED: printing the bound method itself only
    # shows its repr, not the parameter values.
    print(clf.get_params())
    clf.fit(XEstimate, class_labels)
    Yvalidate = clf.predict(XValidate)
    EstParameters = clf.get_params()
    return {"Yvalidate": Yvalidate,
            "EstParameters": EstParameters,
            "clf": clf}
class CustomSVC:
    """Thin wrapper over a one-vs-rest, class-balanced linear SVC."""

    def __init__(self):
        # Populated on the first call to fit().
        self.fitted_model = None
        self.classes_ = None

    def fit(self, train_data: np.array, target_data: np.array):
        """Fit a fresh one-vs-rest SVC and remember the class labels."""
        base_estimator = SVC(kernel='linear', probability=True,
                             class_weight='balanced')
        self.fitted_model = OneVsRestClassifier(base_estimator)
        self.classes_ = np.unique(target_data)
        self.fitted_model.fit(train_data, target_data)
        return self.fitted_model

    def predict(self, data_to_predict: np.array):
        """Return predicted class labels for the given samples."""
        return self.fitted_model.predict(data_to_predict)

    def predict_proba(self, data_to_predict: np.array):
        """Return per-class probability estimates for the given samples."""
        return self.fitted_model.predict_proba(data_to_predict)

    def get_params(self):
        """Expose the underlying fitted model's parameters."""
        return self.fitted_model.get_params()
def train(self, input, output, nb_validation_split=1, shuffle_dataset=[True],
          kernel=['rbf'], degree=[3], gamma=['auto'], c=[1.0]):
    """Grid-search a one-vs-rest SVC over the given hyper-parameter lists.

    :param input: feature matrix
    :param output: target matrix (one column per label)
    :param nb_validation_split: number of cross-validation splits
    :param shuffle_dataset: candidate values for dataset shuffling
    :param kernel: candidate SVC kernels
    :param degree: candidate polynomial degrees
    :param gamma: candidate gamma values
    :param c: candidate C values
    :return: (template classifier, fitted grid-search result)
    """
    # Build the template estimator with fixed scalar settings only;
    # GridSearchCV sets kernel/degree/gamma/C on clones from param_grid.
    # The previous code passed the candidate LISTS straight into SVC
    # (kernel=['rbf'], ...), which is not a valid estimator configuration.
    classifier = OneVsRestClassifier(SVC(coef0=0.0,
                                         tol=1e-3,
                                         shrinking=True,
                                         verbose=1,
                                         max_iter=-1))
    x_train, x_test, y_train, y_test = train_test_split(input, output,
                                                        test_size=0.33)
    # TODO : degree parameter should only be used when kernel is poly
    param_grid = dict(estimator__C=c,
                      estimator__gamma=gamma,
                      estimator__kernel=kernel,
                      estimator__degree=degree)
    cv = MlUtils.get_cross_validation(nb_validation_split, shuffle_dataset,
                                      output.shape[1])
    print(classifier.get_params().keys())
    grid_search = GridSearchCV(classifier, param_grid, cv=cv)
    grid_result = grid_search.fit(x_train, y_train)
    MlUtils.print_gridsearch_results(grid_result)
    # NOTE: `classifier` is the unfitted template; the tuned model lives
    # in grid_result.best_estimator_.
    return classifier, grid_result
start_time = time.time()
# Grid-search an RBF SVM (wrapped in one-vs-rest) over C and gamma.
model = OneVsRestClassifier(svm.SVC(kernel='rbf', cache_size=500))
param_grid = {
    "estimator__C": [0.005, 0.05, 500],
    "estimator__gamma": [0.001, 0.01, 1, 10, 100, 1000]
}
# GridSearchCV takes `scoring`, not the long-removed `score_func` keyword.
# 'f1_micro' matches the original intent of scoring with f1_score on a
# multi-class problem -- confirm against the label layout.
clf_grid = GridSearchCV(model, param_grid=param_grid, scoring='f1_micro')
clf_grid.fit(train_X_ch2, train_y)
print("--- %s seconds ---" % (time.time() - start_time))
# Inspect the search results (the previous code referenced an undefined
# `clf` here instead of `clf_grid`).
clf_grid.best_estimator_
clf_grid.best_params_
clf_grid.get_params()

# Refit with hand-picked hyper-parameters on the chi2-selected features.
clf2 = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1000, gamma=0.001))
print("--- %s seconds ---" % (time.time() - start_time))
clf2.fit(train_X_ch2, train_y)
prediction = clf2.predict(test_X_ch2)
result = prediction.tolist()
misc.writeResult("./result/submission_ch2_fit_2000_1000_0.0001_lemma.csv",
                 result)

# Same model on the PCA-reduced features.
clf2 = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1000, gamma=0.001))
clf2.fit(train_X_pca, train_y)
# Predict with the PCA model just fitted (previously this called
# `clf.predict`, an undefined name).
prediction = clf2.predict(test_X_pca)
result = prediction.tolist()
misc.writeResult("./result/submission_pca_fit_2000_1000_0.0001.csv", result)
start_time = time.time()
# Grid-search an RBF SVM (wrapped in one-vs-rest) over C and gamma.
model = OneVsRestClassifier(svm.SVC(kernel='rbf', cache_size=500))
param_grid = {
    "estimator__C": [0.005, 0.05, 500],
    "estimator__gamma": [0.001, 0.01, 1, 10, 100, 1000]
}
# GridSearchCV takes `scoring`, not the long-removed `score_func` keyword.
# 'f1_micro' matches the original intent of scoring with f1_score on a
# multi-class problem -- confirm against the label layout.
clf_grid = GridSearchCV(model, param_grid=param_grid, scoring='f1_micro')
clf_grid.fit(train_X_ch2, train_y)
print("--- %s seconds ---" % (time.time() - start_time))
# Inspect the search results (the previous code referenced an undefined
# `clf` here instead of `clf_grid`).
clf_grid.best_estimator_
clf_grid.best_params_
clf_grid.get_params()

# Refit with hand-picked hyper-parameters on the chi2-selected features.
clf2 = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1000, gamma=0.001))
print("--- %s seconds ---" % (time.time() - start_time))
clf2.fit(train_X_ch2, train_y)
prediction = clf2.predict(test_X_ch2)
result = prediction.tolist()
misc.writeResult("./result/submission_ch2_fit_2000_1000_0.0001_lemma.csv",
                 result)

# Same model on the PCA-reduced features.
clf2 = OneVsRestClassifier(svm.SVC(kernel='rbf', C=1000, gamma=0.001))
clf2.fit(train_X_pca, train_y)
# Predict with the PCA model just fitted (previously this called
# `clf.predict`, an undefined name).
prediction = clf2.predict(test_X_pca)
result = prediction.tolist()
misc.writeResult("./result/submission_pca_fit_2000_1000_0.0001.csv", result)
# scoring = "arrucary" grid_no_up = GridSearchCV(XGB, param_grid=params_xgb, cv=kf, scoring='accuracy').fit(X_train_resampled, y_train_resampled) print(grid_no_up.best_score_) print(grid_no_up.best_params_) print(grid_no_up.cv_results_) # Use OneVsRestClassifier xgb_ovr_clf = OneVsRestClassifier( xgboost.XGBClassifier(objective="multi:softmax", num_class=3)) # Get params' key pprint(xgb_ovr_clf.get_params()) # another way to print params xgb_ovr_clf.estimator.get_params().keys() # Set ranges of parameters # booster types: booster = ['gbtree'] random_grid = { 'estimator__n_estimators': [100, 200, 300], # Number of trees 'estimator__max_depth': [6, 8, 10], # Maximum number of levels in tree 'estimator__validate_parameters': [ True ], # When set to True, XGBoost will perform validation of input parameters to check whether a parameter is used or not. 'estimator__min_child_weight': [ 1, 2, 3 ], # the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. #Smaller weight, smaller samples. If too big, will result in overfitiing 'estimator__gamma':
# Now, we go ahead and train the hyper parameters of the classifier using GridSearch and CV. # In[335]: tuned_parameters = [{ 'estimator__alpha': [0.001, 0.01, 0.1, 0.5], 'estimator__penalty': ['l1', 'l2', 'elasticnet'], 'estimator__loss': ['log', 'modified_huber'] }] scores = ['preceision'] #['precision', 'recall'] estimator = OneVsRestClassifier( SGDClassifier(random_state=0, learning_rate='optimal', class_weight='balanced', n_iter=100)) estimator.get_params().keys() gsearch = GridSearchCV(estimator, tuned_parameters, cv=5, scoring='average_precision') gsearch.fit(X_train, Y_train) print gsearch.best_score_ print gsearch.best_params_ # # Top Informative Features for Each Aspect # We train our model using the optimized paramters obtain from the GridSearch. Now, we can find a the most informative features in for each category. # In[336]: best_classifier = OneVsRestClassifier( SGDClassifier(alpha=0.001,
def linearKernel(parameters):
    """Cross-validate a linear one-vs-rest SVM, tuning C on each outer fold.

    For each of 5 stratified outer folds, the remaining data is split
    80/20 into training/validation sets; the C value (from the module-level
    `cvalues`) with the best validation accuracy is refitted on the full
    outer-training data and scored on the held-out fold.

    :param parameters: unused -- kept for interface compatibility.
    """
    test_accuracy = []
    test_accuracy_with_params = []
    k_test_fold = StratifiedKFold(5)
    for (train, test) in (k_test_fold.split(normalized_glass_features,
                                            glass_type)):
        # test fold
        test_dataset_features = normalized_glass_features[test]
        test_dataset_types = glass_type[test]
        # Rest of the data
        training_dataset_features = normalized_glass_features[train]
        training_dataset_types = glass_type[train]
        # Splitting rest of the data into 80-20% such as 20% for
        # validation set
        (training_features, validation_features, training_glassType,
         validation_glassType) = train_test_split(training_dataset_features,
                                                  training_dataset_types,
                                                  train_size=0.80,
                                                  random_state=1)
        validation_acuracies = []
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended replacement for elapsed-time measurement.
        start_time_training = time.perf_counter()
        # Training different models with different hyperparameters
        for cvalue in cvalues:
            classifier = OneVsRestClassifier(SVC(kernel="linear", C=cvalue))
            classifier.fit(training_features, training_glassType)
            validation_true, validation_pred = validation_glassType, \
                classifier.predict(validation_features)
            accuracy_Validationset = metrics.accuracy_score(
                validation_true, validation_pred)
            validation_acuracies.append(
                (classifier.get_params().get('estimator__C'),
                 accuracy_Validationset))
        validation_acuracies.sort(key=lambda val: val[1])
        print("sorted validation_acuracies :", validation_acuracies)
        # optimal hyperparameter with accuracy
        print(" optimal hyperparameter with accuracy is :",
              validation_acuracies[-1])
        # Training a new model on the entire 4 folds with optimal
        # Hyper Parameters
        classifier_1 = OneVsRestClassifier(
            SVC(kernel="linear", C=validation_acuracies[-1][0]))
        classifier_1.fit(training_dataset_features, training_dataset_types)
        end_time_training = time.perf_counter()
        print("Time taken to train for one fold for linear kernel is :",
              (end_time_training - start_time_training))
        test_true, test_pred = test_dataset_types, classifier_1.predict(
            test_dataset_features)
        accuracy_test = metrics.accuracy_score(test_true, test_pred)
        print("accuracy_test :", accuracy_test)
        test_accuracy_with_params.append(
            (classifier_1.get_params().get('estimator__C'), accuracy_test))
        test_accuracy.append(accuracy_test)
    print("Test Accuracies for all fold with params :",
          test_accuracy_with_params)
    print("Test Accuracies for all fold:", test_accuracy)
    average_accuracy = sum(test_accuracy) / len(test_accuracy)
    print("average accuracy for linear SVM is :", average_accuracy)
def ovrSVM(self, X, Y, X_test):
    """Train a one-vs-rest linear SVM on (X, Y) and predict labels for X_test.

    :param X: training feature matrix
    :param Y: training labels
    :param X_test: feature matrix to classify
    :return: predicted labels for X_test
    """
    # n_jobs must be passed by keyword: in current scikit-learn the second
    # positional argument of OneVsRestClassifier is not n_jobs.
    ovr_classifier = OneVsRestClassifier(LinearSVC(C=100, random_state=0),
                                         n_jobs=100)
    # print() function instead of the Python 2 print statement.
    print(ovr_classifier.get_params())
    ovr_classifier.fit(X, Y)
    return ovr_classifier.predict(X_test)