validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
scoring = 'accuracy'

# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

print("-------- accuracy evaluation of the different models --------")
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    msg = "%s: %f (%f), accuracy score: %f" % (
        name, cv_results.mean(), cv_results.std(), accuracy_score(Y_validation, predictions))
    print(msg)
params = {'penalty': ['l1', 'l2'], 'C': [1, 2, 3, 5, 10]}
lr = LogisticRegression(random_state=0, solver='liblinear')  # liblinear supports both l1 and l2 penalties
clf = GridSearchCV(lr, param_grid=params, scoring=accuracy_scorer, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score: {}'.format(clf.best_score_))
print('Best parameters: {}'.format(clf.best_params_))

lr_best = LogisticRegression(penalty='l1', C=1, random_state=0, solver='liblinear')

# In[ ]:

params = {'kernel': ['linear', 'rbf'], 'C': [1, 3, 5, 10], 'degree': [3, 5, 10]}
svc = SVC(probability=True, random_state=0)
clf = GridSearchCV(svc, param_grid=params, scoring=accuracy_scorer, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)
print('Best score: {}'.format(clf.best_score_))
print('Best parameters: {}'.format(clf.best_params_))

svc_best = SVC(C=10, degree=3, kernel='linear', probability=True, random_state=0)

# In[ ]:

voting_clf = VotingClassifier(estimators=[('rf', rf_best), ('bag', bag_best), ('gbc', gbc_best),
                                          ('lr', lr_best), ('svc', svc_best)],
                              voting='hard')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
import cv2, glob, random, math, numpy as np, dlib, itertools
from sklearn.svm import SVC

__author__ = "Paul van Gent, 2016"  # Please leave this line in

emotions = ["anger", "contempt", "disgust", "fear", "happiness", "neutral", "sadness", "surprise"]  # Emotion list
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")  # Or set this to whatever you named the downloaded file
clf = SVC(kernel='linear', probability=True, tol=1e-3)  # , verbose=True  # Set the classifier as a support vector machine with a linear kernel

def get_files(emotion):
    """Get the file list for an emotion, randomly shuffle it, and split 80/20."""
    files = glob.glob("dataset\\%s\\*" % emotion)
    random.shuffle(files)
    training = files[:int(len(files) * 0.8)]  # first 80% of the file list
    prediction = files[-int(len(files) * 0.2):]  # last 20% of the file list
    return training, prediction

def get_landmarks(image):
    detections = detector(image, 1)
    for k, d in enumerate(detections):  # For each detected face instance individually
        shape = predictor(image, d)  # Draw facial landmarks with the predictor class
        xlist = []
        ylist = []
        for i in range(1, 68):  # Store X and Y coordinates in two lists
            xlist.append(float(shape.part(i).x))
            ylist.append(float(shape.part(i).y))
        xmean = np.mean(xlist)  # Mean of both axes to determine the centre of gravity
        ymean = np.mean(ylist)
        xcentral = [(x - xmean) for x in xlist]  # Distance between each point and the central point on both axes
        ycentral = [(y - ymean) for y in ylist]
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.mixture import GMM  # GaussianMixture in newer scikit-learn versions
from sklearn.svm import SVC
from sklearn import ensemble

train = pd.read_csv('train.csv', header=None)
train_y = pd.read_csv('trainLabels.csv', header=None)
test = pd.read_csv('test.csv', header=None)
#test.ix[:, 1:11].hist()

n_pca = 21
n_gmm = 4
pca = PCA(n_components=n_pca, whiten=True).fit(train)
train_pca = pca.transform(train)
X_train, X_val, y_train, y_val = \
    train_test_split(train_pca, train_y, test_size=0.2, random_state=0)

gmm = GMM(n_components=n_gmm, covariance_type='full').fit(X_train)
svc = SVC().fit(gmm.predict_proba(X_train), y_train)
svc.score(gmm.predict_proba(X_val), y_val)
forest = ensemble.ExtraTreesClassifier(n_estimators=400).fit(
    gmm.predict_proba(X_train), y_train)
forest.score(gmm.predict_proba(X_val), y_val)

test_pca = pca.transform(test)
gmm_all = GMM(n_components=n_gmm, covariance_type='full').fit(train_pca)
svc_all = SVC().fit(gmm_all.predict_proba(train_pca), train_y)
pred_svc = svc_all.predict(gmm_all.predict_proba(test_pca))
forest_all = ensemble.RandomForestClassifier(n_estimators=400).fit(
    gmm_all.predict_proba(train_pca), train_y)
pred_forest = forest_all.predict(gmm_all.predict_proba(test_pca))
def fit(self, train_data, train_labels, val_data, val_labels):
    """
    Fits to training data.

    Args:
        train_data (ndarray): Training data.
        train_labels (ndarray): Training labels.
        val_data (ndarray): Validation data.
        val_labels (ndarray): Validation labels.
    """
    # Mark training samples with -1 and validation samples with 0 so that
    # PredefinedSplit always evaluates on the appended validation set.
    split = np.append(-np.ones(train_labels.shape, dtype=np.float32),
                      np.zeros(val_labels.shape, dtype=np.float32))
    ps = PredefinedSplit(split)
    sh = train_data.shape
    train_data = np.append(train_data, val_data, axis=0)
    train_labels = np.append(train_labels, val_labels, axis=0)
    del val_data, val_labels

    if self.kernel == 'linear':
        if self.probability:
            clf = SVC(kernel='linear', class_weight='balanced', random_state=6,
                      decision_function_shape='ovr', max_iter=1000,
                      probability=self.probability, **self.scikit_args)
        else:
            clf = LinearSVC(class_weight='balanced', dual=False, random_state=6,
                            multi_class='ovr', max_iter=1000, **self.scikit_args)
        # Cross-validate over these parameters
        params = {'C': 2.0**np.arange(-9, 16, 2, dtype=float)}
    elif self.kernel == 'rbf':
        clf = SVC(random_state=6, class_weight='balanced', cache_size=16000,
                  decision_function_shape='ovr', max_iter=1000, tol=1e-4,
                  probability=self.probability, **self.scikit_args)
        params = {'C': 2.0**np.arange(-9, 16, 2, dtype=float),
                  'gamma': 2.0**np.arange(-15, 4, 2, dtype=float)}

    # Coarse search
    gs = GridSearchCV(clf, params, refit=False, n_jobs=self.n_jobs,
                      verbose=self.verbosity, cv=ps)
    gs.fit(train_data, train_labels)

    # Fine-tuned search around the best coarse parameters
    if self.kernel == 'linear':
        best_C = np.log2(gs.best_params_['C'])
        params = {'C': 2.0**np.linspace(best_C - 2, best_C + 2, 10, dtype=float)}
    elif self.kernel == 'rbf':
        best_C = np.log2(gs.best_params_['C'])
        best_G = np.log2(gs.best_params_['gamma'])
        params = {'C': 2.0**np.linspace(best_C - 2, best_C + 2, 10, dtype=float),
                  'gamma': 2.0**np.linspace(best_G - 2, best_G + 2, 10, dtype=float)}

    self.gs = GridSearchCV(clf, params, refit=self.refit, n_jobs=self.n_jobs,
                           verbose=self.verbosity, cv=ps)
    self.gs.fit(train_data, train_labels)

    if not self.refit:
        # Use the fine-search optimum, then refit on the original training portion only
        clf.set_params(C=self.gs.best_params_['C'])
        if self.kernel == 'rbf':
            clf.set_params(gamma=self.gs.best_params_['gamma'])
        self.gs = clf
        self.gs.fit(train_data[:sh[0]], train_labels[:sh[0]])
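# A small illustration of the PredefinedSplit trick used above: entries marked
# -1 are always kept in the training split, and entries marked 0 form the single
# validation fold. The toy arrays here are assumptions for illustration only.
import numpy as np
from sklearn.model_selection import PredefinedSplit

toy_split = np.array([-1, -1, -1, 0, 0])
ps_demo = PredefinedSplit(toy_split)
for train_idx, test_idx in ps_demo.split():
    print(train_idx, test_idx)  # -> [0 1 2] [3 4]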
    y, test_size=0.2, random_state=0)

## Data without information about depth
X_train_ND, X_test_ND = (np.delete(arr=X_train, obj=[0, 4, 6], axis=1),
                         np.delete(arr=X_test, obj=[0, 4, 6], axis=1))

####### II: Classification #######

# Define classifiers
nb = GaussianNB()
knn = KNeighborsClassifier()
svc = SVC(probability=True)

## Fit classifiers without depth:
fit_nb_ND = nb.fit(X_train_ND, y_train)
fit_knn_ND = knn.fit(X_train_ND, y_train)
fit_svc_ND = svc.fit(X_train_ND, y_train)

# Predict with classifiers
## Save methods in a dict to iterate over them.
methods = {"Naive Bayes": nb, "KNN": knn, "SVM": svc}

## With depth:
accuracies = []
precisions = []
for method_name, method in methods.items():
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
#clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)
clf = clf.fit(X_train_pca, y_train)
#clf = cv2.createFisherFaceRecognizer()
#clf.train(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
#print(clf.best_estimator_)

# Save the classifier
joblib.dump(clf, "recognition_clf.pkl", compress=3)

###############################################################################
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)  # No need to fit on the test set; the scaler is already fitted to the training set

# Fitting the classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel="rbf", random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix (classification evaluation metric)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : %s " % (cm))

# Visualizing the Training Set Results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
x1Values = np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01)
x2Values = np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
def main():
    st.title("Binary Classification Web App")
    st.sidebar.title("Binary Classification Web App")
    st.markdown("Are your mushrooms edible or poisonous? 🍄")
    st.sidebar.markdown("Are your mushrooms edible or poisonous? 🍄")

    # st.cache: unless the function name or arguments are changed, the data is
    # cached and the cached result is reused on reruns.
    # Label encoding: converts labels into numeric, machine-readable form so
    # that machine learning algorithms can better decide how those labels
    # should be operated on. It is an important pre-processing step for
    # structured datasets in supervised learning.
    @st.cache(persist=True)
    def load_data():
        data = pd.read_csv("mushrooms.csv")
        labelencoder = LabelEncoder()
        for col in data.columns:
            data[col] = labelencoder.fit_transform(data[col])
        #st.write(data)  # to check the dataset after label encoding
        return data

    @st.cache(persist=True)
    def split(df):
        y = df.type
        x = df.drop(columns=['type'])
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
        return x_train, x_test, y_train, y_test

    def plot_metrics(metrics_list):
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model, x_test, y_test, display_labels=class_names)
            st.pyplot()
        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()
        if 'Precision-Recall Curve' in metrics_list:
            st.subheader('Precision-Recall Curve')
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()

    df = load_data()
    class_names = ['edible', 'poisonous']  # for the confusion matrix
    x_train, x_test, y_train, y_test = split(df)

    # Take user input of hyperparameters
    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox("Classifier",
                                      ("Support Vector Machine (SVM)", "Logistic Regression", "Random Forest"))

    if classifier == 'Support Vector Machine (SVM)':
        st.sidebar.subheader("Model Hyperparameters")
        # Choose parameters
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='C_SVM')
        kernel = st.sidebar.radio("Kernel", ("rbf", "linear"), key='kernel')
        gamma = st.sidebar.radio("Gamma (Kernel Coefficient)", ("scale", "auto"), key='gamma')
        metrics = st.sidebar.multiselect("What metrics to plot?",
                                         ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Support Vector Machine (SVM) Results")
            model = SVC(C=C, kernel=kernel, gamma=gamma)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Logistic Regression':
        st.sidebar.subheader("Model Hyperparameters")
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='C_LR')
        max_iter = st.sidebar.slider("Maximum number of iterations", 100, 500, key='max_iter')
        metrics = st.sidebar.multiselect("What metrics to plot?",
                                         ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Logistic Regression Results")
            model = LogisticRegression(C=C, penalty='l2', max_iter=max_iter)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if classifier == 'Random Forest':
        st.sidebar.subheader("Model Hyperparameters")
        n_estimators = st.sidebar.number_input("The number of trees in the forest", 100, 5000,
                                               step=10, key='n_estimators')
        max_depth = st.sidebar.number_input("The maximum depth of the tree", 1, 20, step=1,
                                            key='max_depth')  # was key='n_estimators', a duplicate widget key
        bootstrap = st.sidebar.radio("Bootstrap samples when building trees", ('True', 'False'), key='bootstrap')
        metrics = st.sidebar.multiselect("What metrics to plot?",
                                         ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Random Forest Results")
            # The radio widget returns the string 'True'/'False'; convert it to a bool
            model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                           bootstrap=(bootstrap == 'True'), n_jobs=-1)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if st.sidebar.checkbox("Show raw data", False):
        st.subheader("Mushroom Data Set (Classification)")
        st.write(df)
        st.markdown(
            "This [data set](https://archive.ics.uci.edu/ml/datasets/Mushroom) includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms "
            "in the Agaricus and Lepiota Family (pp. 500-525). Each species is identified as definitely edible, definitely poisonous, "
            "or of unknown edibility and not recommended. This latter class was combined with the poisonous one."
        )
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling (Z-score; it standardizes the data)
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the classification model to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualizing the Training set results (change the variables to see the test set results)
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
nifti_masker = NiftiMasker(mask_img=mask_filename, sessions=session,
                           smoothing_fwhm=4, standardize=True,
                           memory="nilearn_cache", memory_level=1)
func_filename = haxby_dataset.func[0]
X = nifti_masker.fit_transform(func_filename)
# Restrict to non-rest data
X = X[condition_mask]
session = session[condition_mask]

###########################################################################
# Build the decoder that we will use

# Define the prediction function to be used.
# Here we use a Support Vector Classification, with a linear kernel.
from sklearn.svm import SVC
svc = SVC(kernel='linear')

# Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on the F-test,
# namely ANOVA. We set the number of features to be selected to 500.
from sklearn.feature_selection import SelectKBest, f_classif
feature_selection = SelectKBest(f_classif, k=500)

# We have our classifier (SVC) and our feature selection (SelectKBest); now
# we can plug them together in a *pipeline* that performs the two operations
# successively:
from sklearn.pipeline import Pipeline
anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

###########################################################################
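# A minimal usage sketch of the ANOVA + SVC pipeline built above; the label
# vector `y` (the condition labels) and the 5-fold cross-validation are
# assumptions for illustration, not part of the original snippet.
from sklearn.model_selection import cross_val_score

anova_svc.fit(X, y)            # runs SelectKBest, then fits the SVC on the 500 kept features
y_pred = anova_svc.predict(X)  # applies the same feature selection before predicting
cv_scores = cross_val_score(anova_svc, X, y, cv=5)
print("Mean CV accuracy: %.3f" % cv_scores.mean())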
#from sklearn import preprocessing
#le = preprocessing.LabelEncoder()
#bankdata = bankdata.apply(le.fit_transform)

droplist = ['class']
X = bankdata.drop(droplist, axis=1)
y = bankdata['class']

# The algorithm starts here; everything above just prepares the input CSV data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
#labels = np.unique(X); print(labels)

from sklearn.svm import SVC
clf = SVC()  # kernel='rbf'
#clf = SVC(kernel='poly', degree=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
X = data[:, 0:4]
Y = data[:, 4]

val_size = 0.2
scoring = "accuracy"
(X_train, X_val, Y_train, Y_val) = model_selection.train_test_split(X, Y, test_size=val_size)

models = {
    "LR": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "LDA": LinearDiscriminantAnalysis(solver='lsqr'),
    "KNN": KNeighborsClassifier(),
    "DTC": DecisionTreeClassifier(),
    "NB": GaussianNB(),
    "SVC": SVC(),
    "MLP": MLPClassifier(),
}

results = []
for name, model in models.items():
    kfold = model_selection.KFold(n_splits=10)
    cross_res = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append((name, cross_res))

for name, res in results:
    print("{:6} {:2.4} {:2.4}".format(name, res.mean(), res.std()))
    # Hyperparameter search over all possible dimensions for PCA reduction
    # 'pca__n_components': np.arange(1, 17),
    # 'svm__gamma': np.arange(0.001, 0.1, 0.001)
}

svm_classification_pipeline = Pipeline(
    [
        # Apply PCA to SVM Classification
        #('pca', PCA()),
        # Apply scaling to SVM Classification
        #('scale', StandardScaler()),
        ('svm', SVC())
    ]
)

_accuracy_grid_search(values_train, hdi_class_train,
                      svm_classification_pipeline, classification_svm_parameters)

# ## u)

# In[17]:

classification_svm_parameters = {
    # Use linear kernel for SVM Classification
##### Splitting data into train and test set
x_train, x_test, y_train, y_test = train_test_split(data['cleaned_text'], data['labels'],
                                                    test_size=0.2, random_state=10)

############### Fit frequency-based word embeddings (TF-IDF) to our data set to turn the text into word vectors
vectorizer = TfidfVectorizer(lowercase=True, stop_words=STOPWORDS)
vectorizer.fit(x_train)
x_train_vect = vectorizer.transform(x_train)
x_test_vect = vectorizer.transform(x_test)

############# Build our classifier with a linear support vector machine
model = SVC(C=1, kernel='linear', class_weight='balanced')
model.fit(x_train_vect, y_train)
y_pred = model.predict(x_test_vect)
cm = confusion_matrix(y_test, y_pred)  ########## confusion matrix for the test set

pipeline = make_pipeline(vectorizer, model)  #### save our model with the pipeline function for future analysis

def predict(text):
    score = pipeline.predict([clean_text(text)])
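# A minimal sketch of persisting the pipeline "for future analysis", as the
# comment above intends; joblib and the file name 'svm_pipeline.joblib' are
# assumptions here, not part of the original snippet.
import joblib

joblib.dump(pipeline, 'svm_pipeline.joblib')          # stores the TF-IDF vectorizer and SVC together
loaded_pipeline = joblib.load('svm_pipeline.joblib')
# The loaded pipeline applies the TF-IDF transform and the SVC in one call:
# loaded_pipeline.predict([clean_text("some new document")])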
# Provided to give you a starting point. Try a variety of classifiers.

# StratifiedShuffleSplit cross-validator:
# provides train/test indices to split data into train/test sets.
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving
# the percentage of samples for each class. (A usage sketch follows the
# classifier definitions below.)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()

# SVM
from sklearn.svm import SVC
svm_clf = SVC()

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=25)

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier
ab_clf = AdaBoostClassifier()
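# A minimal sketch of the StratifiedShuffleSplit described above, evaluating
# the classifiers just defined; `features`, `labels`, and the split settings
# are assumptions for illustration, not part of the original snippet.
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
for clf in (nb_clf, svm_clf, dt_clf, rf_clf, ab_clf):
    scores = []
    for train_idx, test_idx in sss.split(features, labels):
        clf.fit(features[train_idx], labels[train_idx])
        scores.append(accuracy_score(labels[test_idx], clf.predict(features[test_idx])))
    print(clf.__class__.__name__, sum(scores) / len(scores))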
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=6, max_features=7)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(confusion_matrix(y_test, pred_rfc))
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))

rfc.fit(X_train_all, y_train_all)
pred_all_rfc = rfc.predict(X_test_all)
sub_rfc = pd.DataFrame()
sub_rfc['PassengerId'] = df_test['PassengerId']
sub_rfc['Survived'] = pred_all_rfc
#sub_rfc.to_csv('randforest.csv', index=False)

from sklearn.svm import SVC
svc = SVC(gamma=0.01, C=100)  #, probability=True)
svc.fit(X_train_sc, y_train_sc)
pred_svc = svc.predict(X_test_sc)
print(confusion_matrix(y_test_sc, pred_svc))
print(classification_report(y_test_sc, pred_svc))
print(accuracy_score(y_test_sc, pred_svc))

svc.fit(X_train_all_sc, y_train_all_sc)
pred_all_svc = svc.predict(X_test_all_sc)
sub_svc = pd.DataFrame()
sub_svc['PassengerId'] = df_test['PassengerId']
sub_svc['Survived'] = pred_all_svc
sub_svc.to_csv('svc.csv', index=False)
# In[ ]:

from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(max_iter=100)
logmodel.fit(X_train, y_train)
ypred = logmodel.predict(X_test)
print(logmodel.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *4. SVM*

# In[ ]:

from sklearn.svm import SVC
modelsvc = SVC(probability=True, gamma='auto')
modelsvc.fit(X_train, y_train)
ypred = modelsvc.predict(X_test)
print(modelsvc.score(X_train, y_train))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test, ypred))

# *6. Decision Tree*

# In[ ]:

from sklearn.tree import DecisionTreeClassifier
dmodel = DecisionTreeClassifier()
dmodel.fit(X_train, y_train)
ypred = dmodel.predict(X_test)
print(dmodel.score(X_train, y_train))
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the classifier to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
feats = []
humor = []
for key in dict.keys():
    value = dict[key]
    feats.append(value[0].tolist())
    humor.append(value[1].tolist())
feats = np.array(feats)
humor = np.array(humor)

if options.clf == 'GaussianProc':
    clf = GaussianProcessClassifier()
elif options.clf == "SVC":
    clf = SVC()
elif options.clf == "LinearSVC":
    clf = LinearSVC(max_iter=10000, dual=False)
elif options.clf == "DecisionTree":
    clf = DecisionTreeClassifier()
elif options.clf == "RandomForest":
    clf = RandomForestClassifier()
elif options.clf == "AdaBoost":
    clf = AdaBoostClassifier(n_estimators=100)
elif options.clf == "XGBoost":
    clf = XGBClassifier()
elif options.clf == "KNN":
    clf = KNeighborsClassifier(n_neighbors=5)
elif options.clf == "GaussianNB":
    clf = GaussianNB()
elif options.clf == "RBF":
# Summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# Accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :', accuracy_score(y_test, y_pred))

"""### **Support Vector Machine (SVM)**"""

# Support Vector Machine (SVM)
# Importing the library
from sklearn.svm import SVC
# Creating the local variable classifier
classifier = SVC(kernel='linear', random_state=0)
# Training the model
classifier.fit(X_train, y_train)
# Predicting the value of y
y_pred = classifier.predict(X_test)

# Importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
# Accuracy score of the model
def get_res(x_train, y_train, x_test, y_test):
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)
    lg = LogisticRegression(penalty='l2')
    lg.fit(x_train, y_train)
    dtc = DecisionTreeClassifier()
    dtc.fit(x_train, y_train)
    gb = GradientBoostingClassifier(n_estimators=200)
    gb.fit(x_train, y_train)
    ab = AdaBoostClassifier()
    ab.fit(x_train, y_train)
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    svm = SVC()
    svm.fit(x_train, y_train)
    mnb = MultinomialNB(alpha=0.01)
    mnb.fit(x_train, y_train)
    bnb = BernoulliNB(alpha=1.0, binarize=0.31, fit_prior=True, class_prior=None)
    bnb.fit(x_train, y_train)
    rtc = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=47)
    rtc.fit(x_train, y_train)

    num_list = [
        knn.score(x_test, y_test), lg.score(x_test, y_test),
        dtc.score(x_test, y_test), gb.score(x_test, y_test),
        ab.score(x_test, y_test), gnb.score(x_test, y_test),
        svm.score(x_test, y_test), mnb.score(x_test, y_test),
        bnb.score(x_test, y_test), rtc.score(x_test, y_test)
    ]
    name_list = [
        'KNN', 'Logistic', 'DecisionTree', 'GradientBoosting', 'AdaBoost',
        'GaussianNB', 'SVC', 'MultinomialNB', 'BernoulliNB', 'RandomForest'
    ]
    plt.title('title')
    num_list = np.around(num_list, decimals=3)
    autolabel(
        plt.bar(range(len(num_list)), num_list,
                color=['r', 'b'],  # a list of colors; the bare string 'rb' is not a valid color spec
                tick_label=name_list, width=0.4))
    plt.show()
forest['target_size'] = forest['size_category']
forest = forest.drop('size_category', axis=1)
forest.columns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

train, test = train_test_split(forest, test_size=0.3)
test.head()
forest.shape

train_X = train.iloc[:, 0:45]
train_X.columns
train_y = train.iloc[:, -1]
test_X = test.iloc[:, 0:45]
test_y = test.iloc[:, -1]

# Kernel = linear
model_linear = SVC(kernel="linear")
model_linear.fit(train_X, train_y)
pred_test_linear = model_linear.predict(test_X)
np.mean(pred_test_linear == test_y)  # Accuracy = 1.0

# Kernel = poly
model_poly = SVC(kernel="poly")
model_poly.fit(train_X, train_y)
pred_test_poly = model_poly.predict(test_X)
np.mean(pred_test_poly == test_y)  # Accuracy = 1.0

# Kernel = rbf
model_rbf = SVC(kernel="rbf")
model_rbf.fit(train_X, train_y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Fitting the SVM to the Training Set
from sklearn.svm import SVC
cl = SVC(kernel='linear', random_state=0)
cl.fit(X_train, Y_train)

# Predicting the test set results
y_pred = cl.predict(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, Y_set = X_train, Y_train
X1, X2 = np.meshgrid(
import pandas as pd

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

def classify_test(X_train, y_train, X_test, y_test):
    for clf in classifiers:
        try:
            clf.fit(X_train, y_train)
        except Exception:
            print('{} is wrong'.format(clf.__class__.__name__))
        else:
            name = clf.__class__.__name__
            print("=" * 30)
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

print(' ')
print('=============================')
print('Bernoulli SVC Classifier:')
classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
classifierBi.classify_many(test)
for pdist in classifierBi.prob_classify_many(test):
    print(pdist.prob('human'), pdist.prob('auto'))
for i in range(len(classifierBi.classify_many(test))):
    print(classifierBi.classify_many(test)[i])

classifierSVC = SklearnClassifier(SVC(), sparse=True).train(train_set)
classifierSVC.classify_many(test)
# svc = nltk.classify.accuracy(classifierSVC, test_set)
# print('accuracy is %.2f' % round(svc * 100, 4), '%')

def run_svc():  # renamed from SVC() so the function no longer shadows the imported SVC class
    classifierBi = SklearnClassifier(BernoulliNB()).train(train_set)
    return classifierSVC.classify_many(test)

# Note: the original passed the bare string "SVC" to timeit, which never called the function
print("Performance of running Bernoulli SVC Classifier on test set: ", timeit.timeit(
    "run_svc()", setup="from __main__ import run_svc", number=1))

print(' ')
print('=============================')
print('Linear SVC Classifier:')
classifierLinSVC = SklearnClassifier(LinearSVC(),
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results

# Import the three supervised learning models from sklearn
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# TODO: Initialize the three models
clf_A = SVC()
clf_B = DecisionTreeClassifier(min_samples_split=20)
clf_C = AdaBoostClassifier()

# Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set, i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100
# HINT: samples_1 is 1% of samples_100
samples_100 = len(y_train)
samples_10 = len(y_train) // 10
samples_1 = len(y_train) // 100

# Collect results on the learners
results = {}
results = train_predict(clf_A, samples_1, X_train, y_train, X_test, y_test)
def train(args):
    print("train call")
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    labels = pd.read_csv(fname, header=None).as_matrix()[:, 1]
    labels = list(map(itemgetter(1),
                      map(os.path.split,
                          map(os.path.dirname, labels))))  # Get the directory.
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).as_matrix()
    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)
    nClasses = len(le.classes_)
    print("Training for {} classes.".format(nClasses))

    if args.classifier == 'LinearSvm':
        clf = SVC(C=1, kernel='linear', probability=True)
    elif args.classifier == 'GridSearchSvm':
        print("""
        Warning: In our experiences, using a grid search over SVM
        hyper-parameters only gives marginally better performance
        than a linear SVM with C=1 and is not worth the extra
        computations of performing a grid search.
        """)
        param_grid = [
            {'C': [1, 10, 100, 1000],
             'kernel': ['linear']},
            {'C': [1, 10, 100, 1000],
             'gamma': [0.001, 0.0001],
             'kernel': ['rbf']}
        ]
        clf = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
    elif args.classifier == 'GMM':  # Doesn't work best
        clf = GMM(n_components=nClasses)
    # ref:
    # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
    elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
        # works better with C = 1 and gamma = 2
        clf = SVC(C=1, kernel='rbf', probability=True, gamma=2)
    elif args.classifier == 'DecisionTree':  # Doesn't work best
        clf = DecisionTreeClassifier(max_depth=20)
    elif args.classifier == 'GaussianNB':
        clf = GaussianNB()
    # ref: https://jessesw.com/Deep-Learning/
    elif args.classifier == 'DBN':
        from nolearn.dbn import DBN
        clf = DBN([embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # input nodes, hidden nodes, output nodes
                  learn_rates=0.3,
                  # Smaller steps mean a possibly more accurate result, but the
                  # training will take longer
                  learn_rate_decays=0.9,
                  # a factor the initial learning rate will be multiplied by
                  # after each iteration of the training
                  epochs=300,  # number of iterations
                  # dropouts=0.25,  # Express the percentage of nodes that
                  # will be randomly dropped as a decimal.
                  verbose=1)

    if args.ldaDim > 0:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=args.ldaDim)),
                        ('clf', clf_final)])
    clf.fit(embeddings, labelsNum)

    fName = "{}/classifier.pkl".format(args.workDir)
    print("Saving classifier to '{}'".format(fName))
    with open(fName, 'wb') as f:  # binary mode, as pickle requires
        pickle.dump((le, clf), f)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]
second_pc = pca.components_[1]
#print(var, sum(var), eigenfaces.shape, ei_mean.shape, X_train_pca.shape)

###############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# for sklearn version 0.16 or prior, the class_weight parameter value is 'auto'
# Grid search finds the best C and gamma parameters to use with the RBF kernel
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced', probability=True), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
y_proba = clf.predict_proba(X_test_pca)
print("done in %0.3fs" % (time() - t0))

# Save the trained model's variables
with open('Clasificador.pkl', 'wb') as f:  # binary mode, as the original note "Python 3: open(..., 'wb')" indicated