def ensemble(train_features, train_labels, test_features, test_labels):
    print("\n\nEnsemble")
    print("===================================================================")
    ks = [1, 3, 5]
    for k in ks:
        print("k = ", k)
        m_knn = knc(n_neighbors=k)
        parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
        svr = svm.SVC(probability=True)
        m_svm = GridSearchCV(svr, parameters).fit(train_features, train_labels)
        m_mlp = MLPClassifier(max_iter=1000)
        clf1 = VotingClassifier(estimators=[('knn', m_knn), ('svm', m_svm), ('mlp', m_mlp)],
                                voting='hard')
        clf2 = VotingClassifier(estimators=[('knn', m_knn), ('svm', m_svm), ('mlp', m_mlp)],
                                voting='soft')
        clf1.fit(train_features, train_labels)
        clf2.fit(train_features, train_labels)
        result1 = clf1.predict(test_features)
        result2 = clf2.predict(test_features)
        printResult(test_labels, result1)
        printResult(test_labels, result2)
    print("===================================================================")
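# printResult() is defined elsewhere in this project and is not shown here; a minimal
# stand-in sketch, assuming it only reports accuracy and a confusion matrix (the real
# helper may print more):
from sklearn.metrics import accuracy_score, confusion_matrix

def printResult(test_labels, result):
    # accuracy plus the full confusion matrix for one set of predictions
    print("accuracy:", accuracy_score(test_labels, result))
    print(confusion_matrix(test_labels, result))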
def model_data(training_data):
    dtc = DecisionTreeClassifier(random_state=9, min_samples_split=5)
    dtc.fit(training_data['data'], training_data['result'])

    nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    nn.fit(training_data['data'], training_data['result'])

    svc = SVC(C=100, kernel="linear")
    svc.fit(training_data['data'], training_data['result'])

    rfc = RFC(n_estimators=10, criterion='entropy', max_depth=10, min_samples_split=5,
              bootstrap=True, random_state=None)
    rfc.fit(training_data['data'], training_data['result'])

    knc_map = knc(n_neighbors=15, weights='distance')
    knc_map.fit(training_data['data'], training_data['result'])

    gbc_map = gbc(n_estimators=150, verbose=0)
    gbc_map.fit(training_data['data'], training_data['result'])

    return {
        'Decision Tree Classifier': dtc,
        'Neural Networks': nn,
        'Support Vector Machines': svc,
        'Random Forest Classification': rfc,
        'k Nearest Neighbours': knc_map,
        'Gradient Boosting Classifier': gbc_map
    }
def voxelizePart(grid, scale, translate, dims, cloud, labels, partId, outputPath):
    bbmin = np.min(cloud, axis=0)
    bbmax = np.max(cloud, axis=0)
    center = 0.5 * (bbmax - bbmin)

    w1s = np.where(grid == 1)
    grid_xyz = [[x, y, z] for x, y, z in zip(w1s[0], w1s[1], w1s[2])]
    grid_xyz = np.array(grid_xyz)

    grid_xyz_sc = []
    for p in grid_xyz:
        trans_p = [0, 0, 0]
        trans_p[0] = scale * ((1 / scale) * center[0] - 0.5 + float((p[0] + 0.5) / dims)) + translate[0]
        trans_p[1] = scale * ((1 / scale) * center[1] - 0.5 + float((p[1] + 0.5) / dims)) + translate[1]
        trans_p[2] = scale * ((1 / scale) * center[2] - 0.5 + float((p[2] + 0.5) / dims)) + translate[2]
        grid_xyz_sc.append(trans_p)
    grid_xyz_sc = np.array(grid_xyz_sc)
    # grid_xyz_sc is now in the same coordinate frame as the point-cloud

    clf = knc(n_neighbors=1)
    clf.fit(cloud, labels)
    voxelLabels = clf.predict(grid_xyz_sc)

    partIndices = voxelLabels == partId
    partVoxelIndices = grid_xyz[partIndices, :]
    partvox = np.zeros((dims, dims, dims, 1))
    partvox[partVoxelIndices[:, 0], partVoxelIndices[:, 2], partVoxelIndices[:, 1], 0] = 1
    partvox = partvox.astype('int')

    partbinvox = binvox_rw.Voxels(partvox, (dims, dims, dims), [0, 0, 0], 1, 'xzy')
    partname = 'model_' + str(partId) + '.binvox'
    binvox_rw.write(partbinvox, open(os.path.join(outputPath, partname), 'wb'))
def bias_variance(data, nbrs, bootstrap=10):
    result_dict = {}
    for bootstrap_num in range(bootstrap):
        train, test = split(data)
        classifier = knc(n_neighbors=nbrs)
        # first two columns are coordinates; the label is assumed to sit in the last column
        coordinates_train, labels_train = train[:, :2], train[:, -1]
        classifier.fit(coordinates_train, labels_train)
        for instance in test:
            coordinates_test = instance[:2]
            pred = classifier.predict([coordinates_test])[0]
            result_dict.setdefault(tuple(instance[:-1]), []).append(pred)
    return result_dict
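# Hypothetical usage sketch for bias_variance(): the project's `split` helper is not
# shown above, so a simple stand-in splitter and synthetic [x, y, label] data are
# assumed here purely for illustration.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as knc

def split(data, frac=0.7):
    # stand-in splitter (assumption): random 70/30 train/test partition
    idx = np.random.permutation(len(data))
    cut = int(frac * len(data))
    return data[idx[:cut]], data[idx[cut:]]

xy = np.random.rand(200, 2)
labels = (xy[:, 0] > xy[:, 1]).astype(int)
data = np.column_stack([xy, labels])

preds_by_point = bias_variance(data, nbrs=5, bootstrap=10)
# points whose prediction flips between bootstrap rounds hint at variance in the classifier
unstable = [p for p, preds in preds_by_point.items() if len(set(preds)) > 1]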
def runAlgo(filename):
    # Read a given dataset from CSV and run it through six different
    # classifiers: 3 plain classifiers and 3 ensembles.
    print(filename)
    d, t = load_csv(filename)
    runModel("Decision Tree", tree.DecisionTreeClassifier(), d, t)
    runModel("KNearest Neig", knc(), d, t)
    runModel("Neural Networ", MLPClassifier(hidden_layer_sizes=(30, 30, 30)), d, t)
    runModel("Bagging      ", BaggingClassifier(tree.DecisionTreeClassifier(),
                                                max_samples=0.9, max_features=0.9), d, t)
    runModel("Random Forest", RandomForestClassifier(n_estimators=100), d, t)
    runModel("AdaBoost     ", AdaBoostClassifier(n_estimators=100), d, t)
def knn(f_train, l_train, f_test):
    from sklearn.neighbors import KNeighborsClassifier as knc
    import time

    clf = knc(n_neighbors=3)

    start_time = time.time()
    clf.fit(f_train, l_train)
    print("Training Time: %s seconds" % (time.time() - start_time))

    start_time = time.time()
    pre = clf.predict(f_test)
    print("Predicting Time: %s seconds" % (time.time() - start_time))

    return pre
def handwritingClassTestSKL():
    # class labels of the training set
    hwLabels = []
    # list the files in the trainingDigits directory
    trainingFileList = listdir("trainingDigits")
    # number of files in the directory
    m = len(trainingFileList)
    # initialize the training matrix
    trainingMat = np.zeros((m, 1024))
    # parse the class of each training sample from its file name
    for i in range(m):
        # get the file name
        fileNameStr = trainingFileList[i]
        #fileStr = fileNameStr.split('.')[0]
        # get the digit class
        classNumStr = int(fileNameStr.split('_')[0])
        # append the class to hwLabels
        hwLabels.append(classNumStr)
        # store each file's 1x1024 vector in the trainingMat matrix
        trainingMat[i, :] = img2vector("trainingDigits/%s" % fileNameStr)

    """ SK-learn method """
    # build the kNN classifier
    neigh = knc(n_neighbors=3, algorithm='auto')
    # fit the model: trainingMat is the training matrix, hwLabels are the corresponding labels
    neigh.fit(trainingMat, hwLabels)

    # list the files in the testDigits directory
    testFileList = listdir("testDigits")
    # error counter
    errorCount = 0.0
    # number of test samples
    mTest = len(testFileList)
    # parse the class of each test file and run the classification test
    for i in range(mTest):
        # get the file name
        fileNameStr = testFileList[i]
        #fileStr = fileNameStr.split('.')[0]
        # get the digit class
        classNumStr = int(fileNameStr.split('_')[0])
        # get the 1x1024 vector for this test sample
        vectorUnderTest = img2vector("testDigits/%s" % fileNameStr)
        # get the prediction
        #classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        classifierResult = neigh.predict(vectorUnderTest)[0]
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\n the total number of errors is: %d" % errorCount)
    print("\n the total error rate is: %.2f%%" % (errorCount / float(mTest) * 100))
def run(menu, value):
    dataset = Dataset(menu.getValue("d") + ".data")
    percent = int(menu.getValue("p")) / 100
    nNeighbors = int(menu.getValue("k"))
    algorithm = menu.getValue("a")

    # note: train_test_split returns the larger split first, so with this unpacking
    # the `test_size` fraction ends up being used as the training set
    te_data, tr_data, te_target, tr_target = ms.train_test_split(
        dataset.data, dataset.target, test_size=percent)

    classifier = Classifier(nNeighbors) if algorithm == "Quade" else knc(
        n_neighbors=nNeighbors)
    classifier.fit(tr_data, tr_target)
    predicted_target = classifier.predict(te_data)

    print("Accuracy of Algorithm:", accuracy(te_target, predicted_target))
def knn_function(train_features, train_labels, test_features, test_labels):
    print("\n\nKNN")
    print("===================================================================")
    ks = [1, 3, 5]
    for k in ks:
        knn = knc(n_neighbors=k)
        knn.fit(train_features, train_labels)
        result = knn.predict(test_features)
        print("k = ", k)
        printResult(test_labels, result)
    print("===================================================================")
def train_model_knc(features, labels):
    # Scaling is very important for distance based classifiers
    scaler = StandardScaler()
    clf_knc = knc()

    # Transforms are applied exactly in the order specified
    estimators = [('sscaler', scaler), ('knc', clf_knc)]

    # p = 2 corresponds to Euclidean distance, p = 1 corresponds to Manhattan distance
    params_dict = {'knc__n_neighbors': [5, 8, 10, 15, 20, 25, 30],
                   'knc__weights': ['uniform', 'distance'],
                   'knc__p': [1, 2]}

    clf = GridSearchCV(Pipeline(estimators), params_dict, scoring='roc_auc', cv=5)
    clf.fit(features, labels)

    print("Best estimator: ", clf.best_estimator_)
    print("Best score: %.4f" % (clf.best_score_))
    #print ("Best grid scores: ", clf.grid_scores_)

    return clf
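# Hedged usage sketch for train_model_knc() on synthetic binary data; the dataset,
# split, and all names below are illustrative assumptions, not part of the original
# snippet (the roc_auc scoring in the grid search expects a binary target).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as knc

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

best_knn = train_model_knc(X_train, y_train)
# the fitted pipeline re-applies the same StandardScaler before predicting
test_scores = best_knn.predict_proba(X_test)[:, 1]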
def make_prediction_grid(points, outcomes, limits, steps=1, k=5):
    (x_min, x_max, y_min, y_max) = limits
    xs = np.arange(x_min, x_max, steps)
    ys = np.arange(y_min, y_max, steps)

    knn = knc(n_neighbors=k)
    knn.fit(points, outcomes)

    (xx, yy) = np.meshgrid(xs, ys)
    prediction_grid = np.zeros(xx.shape, dtype=int)
    for i, x in enumerate(xs):
        for j, y in enumerate(ys):
            p = np.array([x, y])
            prediction_grid[j, i] = knn.predict([p])[0]

    return (xx, yy, prediction_grid)
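# Hedged usage sketch for make_prediction_grid() on synthetic 2-D points; matplotlib,
# the synthetic data, and the plot styling are assumptions, not part of the original
# snippet.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as knc

points = np.random.rand(60, 2) * 10
outcomes = (points[:, 0] + points[:, 1] > 10).astype(int)

xx, yy, grid = make_prediction_grid(points, outcomes, (0, 10, 0, 10), steps=0.2, k=5)
plt.contourf(xx, yy, grid, alpha=0.4)                  # decision regions
plt.scatter(points[:, 0], points[:, 1], c=outcomes)    # training points
plt.show()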
def covid_knn(trainfile, testfile, process_rank):
    with open(trainfile) as covidfile:
        cases = pd.read_csv(covidfile, index_col="id")
    with open(testfile) as casefile:
        tests = pd.read_csv(casefile, index_col="id")

    features = ['age', 'bmi', 'HbA1c']
    cases = normalizeDF(cases, features)
    features.append('resp_disease')

    X = cases[features].values
    y = cases['death_risk'].values

    # split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # knn classification
    classifier = knc(n_neighbors=6)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # knn testing with input files
    tests = normalizeDF(tests, features)
    Z = tests[features].values
    Z_pred = classifier.predict(Z)
    print("The predictions for: " + testfile)
    print(Z_pred)

    if process_rank == 0:
        accuracy = acc(y_test, y_pred) * 100
        print("Accuracy of the model is: ")
        print(accuracy)
from time import time
from split import data_split

X_train, X_test, Y_train, Y_test = data_split()

from sklearn.neighbors import KNeighborsClassifier as knc

t0 = time()
clf = knc(n_neighbors=20)
clf.fit(X_train, Y_train)
print("Training Time: " + str(round(time() - t0, 3)) + "s")

t1 = time()
pred = clf.predict(X_test)
print("Testing Time: " + str(round(time() - t1, 3)) + "s")

from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, pred))
    model.fit(X, y)
    X_test = [row[:-1] for row in test_data]
    y_real = [row[-1] for row in test_data]
    y_pred = model.predict(X_test)
    print report(y_real, y_pred)
    tp = lambda x: 1 if x == 'spam' else 0
    real = [tp(v) for v in y_real]
    pred = [tp(v) for v in y_pred]
    print mean_absolute_error(real, pred)
    print mean_squared_error(real, pred)

if __name__ == '__main__':
    if len(sys.argv) > 2:
        train_fpath, test_fpath = sys.argv[1:]
        train_data = import_csv(train_fpath)
        test_data = import_csv(test_fpath)

        ''' DECISION TREE '''
        cf = dtc(criterion='gini', max_depth=50)
        classify(cf, train_data, test_data, 'decision_tree')

        ''' NEAREST NEIGHBORS '''
        cf = knc(n_neighbors=1, metric='hamming')
        classify(cf, train_data, test_data, 'knearest_neighbors')

        ''' NAIVE BAYES '''
        cf = mnb(alpha=100.0)
        classify(cf, train_data, test_data, 'naive_bayes')
    else:
        print "Usage python %s [train_csv_file] [test_csv_file]" % sys.argv[0]
test_data = np.array(test)
print("CONVERTED TO NUMPY ...")
print()

print("SPLITTING TRAIN DATA ...")
print()
features = []
target = []

# obtaining categories in target
target = train['Category'].values
features = train_data[0:, 1:5]

model = knc(n_neighbors=9)
#model = knc(n_neighbors=9, n_jobs=-1)
# print('TRAINING THE MODEL USING...', model)
print()

print('MODELLING DATA...')
# splitting into train and target variables
print()
model.fit(features, target)
print('MODELLING DONE...')
print()
print('PREDICTING TEST SET...')
from sklearn.preprocessing import StandardScaler as ss

df = pd.read_csv('Social_Network_Ads.csv')
#sns.scatterplot(df['EstimatedSalary'], df['Purchased'])
df.drop('User ID', inplace=True, axis=1)
#print(df.info())

gen = pd.get_dummies(df['Gender'], drop_first=True)
df.drop('Gender', inplace=True, axis=1)
#print(gen.head())
dff = pd.concat([df, gen], axis=1)
#print(dff.info())

x = dff.drop('Purchased', axis=1)
y = dff['Purchased']
print(y.head())

sss = ss()
xx = sss.fit_transform(x)

xtrain, xtest, ytrain, ytest = train_test_split(xx, y, test_size=0.3, random_state=101)

cm = knc(n_neighbors=3)
cm.fit(xtrain, ytrain)
pdata = cm.predict(xtest)

creport = cr(ytest, pdata)
print(creport)
y = train_data['Survived']
X = train_data.drop(['Survived'], axis=1)

# # ML

# In[ ]:

from sklearn.tree import DecisionTreeClassifier as dtc
model1 = dtc()
model1.fit(X, y)

from sklearn.neighbors import KNeighborsClassifier as knc
model2 = knc(n_neighbors=5)
model2.fit(X, y)

from sklearn.svm import SVC
model4 = SVC(C=1.0, kernel='rbf', degree=3)
model4.fit(X, y)

from sklearn.ensemble import RandomForestClassifier as rfc
model3 = rfc(n_estimators=100, max_depth=3, max_features=0.5, min_samples_leaf=32)
model3.fit(X, y)
def classify(s):
    import pandas as pd
    import numpy
    import pandas_montecarlo
    from scipy.stats import shapiro, kruskal, f_oneway
    from sklearn.ensemble import RandomForestClassifier as rfc
    from sklearn.neighbors import KNeighborsClassifier as knc
    from sklearn.svm import SVC as svc
    from sklearn.linear_model import LogisticRegression as lgr

    ## RandomForest Classifier with monte carlo simulated training set
    numpy.random.seed(s)

    #df = pd.read_csv("mc_test_data.csv")
    #df = pd.read_csv("rndf_filt_data.csv")
    df = pd.read_csv("data.csv")

    # random forest selected the following columns as most predictive
    df = df[['diagnosis', 'area_worst', 'concave points_mean', 'concave points_worst',
             'perimeter_worst', 'radius_worst']]
    #print(df.head())
    #df = df.drop(["id","Unnamed: 32"],axis=1)
    #df = df.drop(["Unnamed: 0"],axis=1)
    df = df.replace({'diagnosis': "M"}, 1)
    df = df.replace({'diagnosis': "B"}, 0)

    # split dataset for mc seed and testing
    df_mc, df = numpy.split(df, [int(.7 * len(df))])

    # split dataset by class
    #df_1 = pd.read_csv("mc_data_M.csv").drop(["Unnamed: 0"],axis=1)
    #df_0 = pd.read_csv("mc_data_B.csv").drop(["Unnamed: 0"],axis=1)
    df_1 = df_mc.loc[df_mc.diagnosis == 1]
    df_0 = df_mc.loc[df_mc.diagnosis == 0]
    df_1 = df_1.drop(["diagnosis"], axis=1)
    df_0 = df_0.drop(["diagnosis"], axis=1)

    # simulate class 0 data
    mc_sim_df_0 = pd.DataFrame()
    mc_sim_df_0['diagnosis'] = ['0'] * len(df_0.index)
    for col in df_0.columns:
        col_sim = df_0[col].montecarlo(sims=2, bust=0, goal=0).data
        col_sim = col_sim.drop(["original"], axis=1)
        for col2 in col_sim.columns:
            mc_sim_df_0[col] = col_sim[col2]
        #if(shapiro(mc_sim_df_1[col])[1]>0.05):
        #    print(kruskal(mc_sim_df_1[col],df_1[col]))
        #else:
        #    print(f_oneway(mc_sim_df_1[col],df_1[col]))

    # simulate class 1 data
    mc_sim_df_1 = pd.DataFrame()
    mc_sim_df_1['diagnosis'] = ['1'] * len(df_1.index)
    for col in df_1.columns:
        col_sim = df_1[col].montecarlo(sims=2, bust=0, goal=0).data
        col_sim = col_sim.drop(["original"], axis=1)
        for col2 in col_sim.columns:
            mc_sim_df_1[col] = col_sim[col2]
        #if(shapiro(mc_sim_df_1[col])[1]>0.05):
        #    print(kruskal(mc_sim_df_1[col],df_1[col]))
        #else:
        #    print(f_oneway(mc_sim_df_1[col],df_1[col]))

    #diag = mc_sim_df_1.append(mc_sim_df_0)['diagnosis']
    mc_sim_df = mc_sim_df_1.append(mc_sim_df_0)
    # shuffling dataframe for good luck
    #mc_sim_df = mc_sim_df.sample(frac=1)
    #mc_sim_df['diagnosis']=diag
    mc_sim_df.head(20)

    # values formatted
    labels = df["diagnosis"]
    df = df.drop("diagnosis", axis=1)
    dfDev, dfTes = numpy.split(df, [int(.7 * len(df))])
    DDev, DTes = numpy.split(labels, [int(.7 * len(labels))])
    #DTrn = mc_sim_df['diagnosis']
    #dfTrn = mc_sim_df.drop(['diagnosis'], axis = 1)
    DTrn = df_mc['diagnosis']
    dfTrn = df_mc.drop(['diagnosis'], axis=1)

    scores = []

    # run each model and test on the dev split
    # random forest
    model = rfc()
    model = model.fit(dfTrn.values, DTrn)
    preds = model.predict(dfDev)
    hit = 0
    for i in range(len(preds)):
        if int(preds[i]) == int(DDev.iloc[i]):
            hit += 1
    scores.append(hit / len(preds))

    # knn
    model = knc()
    model = model.fit(dfTrn.values, DTrn)
    preds = model.predict(dfDev)
    hit = 0
    for i in range(len(preds)):
        if int(preds[i]) == int(DDev.iloc[i]):
            hit += 1
    scores.append(hit / len(preds))

    # svc (linear kernel)
    model = svc(kernel="linear")
    model = model.fit(dfTrn.values, DTrn)
    preds = model.predict(dfDev)
    hit = 0
    for i in range(len(preds)):
        if int(preds[i]) == int(DDev.iloc[i]):
            hit += 1
    scores.append(hit / len(preds))

    # svc (rbf kernel)
    model = svc(kernel="rbf")
    model = model.fit(dfTrn.values, DTrn)
    preds = model.predict(dfDev)
    hit = 0
    for i in range(len(preds)):
        if int(preds[i]) == int(DDev.iloc[i]):
            hit += 1
    scores.append(hit / len(preds))

    # logistic regression
    model = lgr()
    model = model.fit(dfTrn.values, DTrn)
    preds = model.predict(dfDev)
    hit = 0
    for i in range(len(preds)):
        if int(preds[i]) == int(DDev.iloc[i]):
            hit += 1
    scores.append(hit / len(preds))

    return scores
df.dropna(thresh=8, inplace=True)
df.shape  #players=1130

# set missings to 0
df.fillna(value=0, inplace=True)
df.isnull().sum()

# set explanatory and response variables
explanatory = [col for col in df.columns if col not in ['playerid', 'inducted', 'year']]
df_exp = df[explanatory]
df_res = df.inducted

# KNN
knn = knc(p=2)  #specify Euclidean distance
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_accuracy = gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

# Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
sex = 'F'
scaler = StandardScaler()
data_partial = data[data['Sex'] == sex].drop('Sex', axis=1)
# corr_matrix_f, corr_matrix_m = data_f.corr(), data_m.corr()
# plot_corr_matrices(corr_matrix_f, corr_matrix_m)

y = data_partial['EmoState']
X = scaler.fit_transform(data_partial.drop('EmoState', axis=1))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=71)

models = (('DTC', dtc()),
          ('SVM', svc(C=10)),
          ('KNN', knc(n_neighbors=10)),
          ('SGDC', sgdc()),
          ('GNBC', gnbc()),
          ('MLPC', mlpc(max_iter=1000, learning_rate='adaptive')))

results = []
names = []
seed = 13
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    y, test_size=0.3, random_state=0)
train_x.head()
train_y.head()
test_x.head()
test_y.head()

# KNN using sklearn
# Importing Knn algorithm from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier as knc

# for 3 nearest neighbours
neighbour = knc(n_neighbors=3)
# Fitting with training data
neighbour.fit(train_x, train_y)

# train accuracy
train_acc = np.mean(neighbour.predict(train_x) == train_y)
train_acc  # 95.71%

# test accuracy
test_acc = np.mean(neighbour.predict(test_x) == test_y)
test_acc  # 93.55%

# for 5 nearest neighbours
neighbour = knc(n_neighbors=5)
import sys
import sklearn
from classifier_utils import *
from sklearn.neighbors import KNeighborsClassifier as knc

if __name__ == '__main__':
    if len(sys.argv) > 3:
        infilepath, k, dist = sys.argv[1:]
        data = import_csv(infilepath)
        cf = knc(n_neighbors=int(k), metric=dist)
        stats = cross_validation(data, cf)
        print "PARAMS: K=%s , metric=%s" % (k, dist)
        print_stats(stats)
    else:
        print "Usage python %s [csv_file] [neighbors] [distance]" % sys.argv[0]
b = pd.read_csv('ft.csv', index_col=0)
b = np.array(b)
valid_data = valid_data * b.T
test_data = test_data * b.T
train_data = train_data * b.T

k = 101
nbs = 92
c = np.argsort(b, axis=0)
for k in range(1, k, 1):
    d = c[-k:-1, 0]
    print d
    d = list(d)
    d.append(c[-1, 0])
    td = train_data[:, d]
    tsd = test_data[:, d]
    vld = valid_data[:, d]
    #train_data = train_data[0:100,:]
    #train_class = train_class[0:100]
    for nb in range(91, nbs, 1):
        clf = knc(n_neighbors=nb)
        clf.fit(td, train_class)
        print
        print k, nb
        print 'scr'
        print clf.score(tsd, test_class)
sns.countplot(zoo['animal name'])

from sklearn.model_selection import train_test_split
train, test = train_test_split(zoo, test_size=0.2)
train.head()
train.shape
test.shape

from sklearn.neighbors import KNeighborsClassifier as knc
neigh = knc(n_neighbors=3)
neigh
neigh.fit(train.iloc[:, 2:17], train.iloc[:, 17])

train_acc = np.mean(neigh.predict(train.iloc[:, 2:17]) == train.iloc[:, 17])
train_acc
test_acc = np.mean(neigh.predict(test.iloc[:, 2:17]) == test.iloc[:, 17])
test_acc

neigh1 = knc(n_neighbors=5)
def knn(test_mode=False, custom_data=False):
    results = {
        'test': {'accuracy': None, 'confusion': None},
        'train': {'accuracy': [], 'confusion': []},
        'best_model': None,
        'best_acc': 0
    }

    settings = {
        'cv_iter': 100,
        'cv_score': 'accuracy',
        'n_cv': 3,
        'n_folds': 10,
        'n_samples': 2000,
    }

    if test_mode:
        settings['n_samples'] = 100

    data_path = os.path.join(Path(__file__).resolve().parents[2], 'data', 'processed')

    if custom_data:
        data = np.load(os.path.join(data_path, 'vectors.npy'))
        X = data.item()['data']
        y = data.item()['labels']
        X_train = X[:60000, :]
        X_test = X[60000:, :]
        y_train = y[:60000]
        y_test = y[60000:]
        del X, y, data
        metric = cosine
    else:
        train_data = os.path.join(data_path, 'training.pt')
        test_data = os.path.join(data_path, 'test.pt')
        X_train, y_train = convert_ds_to_np(train_data)
        X_test, y_test = convert_ds_to_np(test_data)
        metric = 'euclidean'

    X_train = X_train[:settings['n_samples'], :]
    y_train = y_train[:settings['n_samples']]
    X_test = X_test[:settings['n_samples'], :]
    y_test = y_test[:settings['n_samples']]

    # model set up using pipeline for randomized CV
    clf = knc(metric=metric, algorithm='brute')
    cv_opts = {'n_neighbors': randint(2, 10)}
    model = RandomizedSearchCV(clf, cv_opts, n_jobs=-1,
                               n_iter=settings['cv_iter'],
                               cv=settings['n_cv'],
                               scoring=settings['cv_score'])

    kf = StratifiedKFold(n_splits=settings['n_folds'], shuffle=True)

    for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
        X_trn = X_train[train_idx]
        X_vld = X_train[valid_idx]
        y_trn = y_train[train_idx]
        y_vld = y_train[valid_idx]

        model.fit(X_trn, y_trn)
        y_pred = model.predict(X_vld)
        this_acc = accuracy_score(y_pred, y_vld)
        results['train']['accuracy'].append(this_acc)
        results['train']['confusion'].append(confusion_matrix(y_pred, y_vld))
        print('[{}/{}]: this={} : best={}'.format(i + 1, settings['n_folds'],
                                                  this_acc, results['best_acc']))

        if this_acc > results['best_acc']:
            results['best_acc'] = this_acc
            results['best_model'] = copy(model)

    # get test performance with best model:
    y_pred = results['best_model'].predict(X_test)
    results['test']['accuracy'] = accuracy_score(y_pred, y_test)
    results['test']['confusion'] = confusion_matrix(y_pred, y_test)

    return results
        output.append(-1)
    elif x > -0.5 and x < 0.5:
        output.append(0)

#%%
data_ml = pd.DataFrame(data2_norm_gen)
data_ml1 = data_ml.drop(['missed_book', "date"], axis=1)

from sklearn.model_selection import train_test_split as tts
x_train1, x_test1, y_train1, y_test1 = tts(data_ml1, output)

from sklearn.neighbors import KNeighborsClassifier as knc
cla = knc()
cla.fit(x_train1, y_train1)
pred = cla.predict(x_test1)

from sklearn.metrics import accuracy_score as ac
acc = ac(y_test1, pred)

#%% Plotting hexbin for data 2 and patterns
day_section = []
for i in data2.hour:
    if i >= 0 and i < 3:
# Import required libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn import metrics
import numpy as np

# Load dataset
iris = datasets.load_iris()

# Feature and label differentiation
x = iris.data
y = iris.target

# Set initial K value
k = 3

# Checks from 3 - 23 to assess best fit K value for the model.
# Iterates through KNN model creation to find best fit.
while k < 23:
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2)  # 0.2 value changed to 0.3 to test 30% test size also
    knn = knc(n_neighbors=k)
    knn.fit(x_train, y_train)
    test_prediction = knn.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, test_prediction)
    print(k, "=", accuracy)
    k += 2

# Code run through 10 times with test_size set to 0.2 and a further 10 times with test_size set to 0.3
# Results saved to tables. See Images/k_values.PNG
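# A hedged sketch (not from the original script) of automating the manual repeats
# described in the comments above: average each k's accuracy over several random
# splits for both test sizes. The repeat count of 10 mirrors the note above.
n_repeats = 10
for test_size in (0.2, 0.3):
    for k in range(3, 23, 2):
        accs = []
        for _ in range(n_repeats):
            x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=test_size)
            model = knc(n_neighbors=k)
            model.fit(x_tr, y_tr)
            accs.append(metrics.accuracy_score(y_te, model.predict(x_te)))
        print(test_size, k, np.mean(accs))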
y = []
for line in infile:
    y += [line]
infile.close()

# Get X and Y
train_x = pd.DataFrame(data)
train_y = pd.Series(y)

##########################################################################
# kNN
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.metrics import confusion_matrix as cm

# set up and fit
neigh = knc()
neigh.fit(train_x, train_y)

# Test the training set
train_pred = neigh.predict(train_x)

# confusion matrix
confusion = cm(train_y, train_pred)
fun = lambda x: x / sum(x)
cm_perc = np.apply_along_axis(fun, 1, confusion)
# array([[0.60714286, 0.15079365, 0.15873016, 0.08333333],
#        [0.09896907, 0.78762887, 0.06597938, 0.04742268],
#        [0.21988528, 0.12810707, 0.56978967, 0.08221797],
#        [0.23360656, 0.17213115, 0.23155738, 0.36270492]])
import pandas as pd
import sklearn.cross_validation as cv
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.preprocessing import scale

df = pd.read_csv('wine.data', index_col=None)
target = df[df.columns[0]]
x = df.drop(df.columns[0], axis=1)

kf = cv.KFold(len(df.index), n_folds=5, shuffle=True, random_state=42)

unscaled_result = list()
for i in xrange(1, 50):
    cs_result = cv.cross_val_score(knc(n_neighbors=i), X=x, y=target, cv=kf)
    unscaled_result.append(cs_result.mean())
max_unscaled = max(unscaled_result)
print unscaled_result.index(max_unscaled) + 1
print max_unscaled

scaled_x = scale(X=x)
scaled_result = list()
for i in xrange(1, 50):
    cs_result = cv.cross_val_score(knc(n_neighbors=i), X=scaled_x, y=target, cv=kf)
    scaled_result.append(cs_result.mean())
max_scaled = max(scaled_result)
print scaled_result.index(max_scaled) + 1
print max_scaled
# drop 27 players where all B/P/F stats are missing
df.dropna(thresh=8, inplace=True)
df.shape  #players=1130

# set missings to 0
df.fillna(value=0, inplace=True)
df.isnull().sum()

# set explanatory and response variables
explanatory = [col for col in df.columns if col not in ['playerid', 'inducted', 'year']]
df_exp = df[explanatory]
df_res = df.inducted

# KNN
knn = knc(p=2)  #specify Euclidean distance
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_accuracy = gscv(knn, param_grid, cv=10, scoring='accuracy').fit(df_exp, df_res)
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_f1 = gscv(knn, param_grid, cv=10, scoring='f1').fit(df_exp, df_res)
param_grid = dict(n_neighbors=range(1, 30, 2))  #set up grid for results
kn_auc = gscv(knn, param_grid, cv=10, scoring='roc_auc').fit(df_exp, df_res)

# Naive Bayes
nb = mnb()
nb_accuracy = cvs(nb, df_exp, df_res, cv=10, scoring='accuracy')
nb_f1 = cvs(nb, df_exp, df_res, cv=10, scoring='f1')
nb_auc = cvs(nb, df_exp, df_res, cv=10, scoring='roc_auc')
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test, y_pred))

# applying k-fold cross validation
from sklearn.model_selection import cross_val_score as cvs
accuracies = cvs(estimator=classifier, X=x_train, y=y_train, cv=10)
print(accuracies.mean())
print(accuracies.std())

"""K-NN"""
from sklearn.neighbors import KNeighborsClassifier as knc
classifier = knc(n_neighbors=10, metric='minkowski', p=2)
classifier.fit(x_train, y_train)

# predicting the test set results
y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 5))
sns.heatmap(cm, annot=True)
plt.xlabel('Predicted')
plt.ylabel('Truth')
print(classification_report(y_test, y_pred))
valid_class = np.ravel(np.array(valid_class))
print train_data.shape
print test_data.shape
print valid_data.shape

'''
b = pd.read_csv('ft.csv',index_col=0)
b = np.array(b)
valid_data = valid_data * b.T
test_data = test_data * b.T
train_data = train_data * b.T
'''

#train_data = train_data[0:100,:]
#train_class = train_class[0:100]

svc = knc()
nb = range(31, 52, 5)
svm_parameters = {'n_neighbors': nb}
clf = gsc(svc, svm_parameters)
clf.fit(train_data, train_class)

print clf.grid_scores_
print clf.best_score_
print clf.best_estimator_
print clf.best_params_
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as knc
from sklearn.metrics import accuracy_score
from time import time

X = np.array([[2, 5], [3, 6], [1, 7], [1, 2], [4, 3],
              [6, 8], [7, 3], [6, 1], [8, 7], [9, 3]])
Y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2])

startTime = time()
clf = knc(n_neighbors=3)
clf.fit(X, Y)

pred = clf.predict([[0, 1]])
print(pred)

testX = np.array([[1, 9], [3, 1], [4, 7], [6, 5], [5, 5], [7, 9]])
testY = np.array([1, 1, 1, 2, 2, 2])
pred = clf.predict(testX)
print("Accuracy ", accuracy_score(testY, pred) * 100)
print("Time ", round(time() - startTime, 3), "sec")
    dict(zip(range(nodes_n), [[it] for it in dm.index])))
names = dict(zip(range(nodes_n), dm.index))
G = nx.relabel_nodes(G, names)

mst = nx.minimum_spanning_tree(G)
mst_json = json_graph.node_link_data(mst)
for each in mst_json['links']:
    each['type'] = 'mst'
#mst_json['links'] = []
# leave the line above commented out to also display the MST lines; uncomment it to display only the KNN lines.

# KNN part
# KNN config
nearest_num = 1  ###
M = knc(weights='distance', metric='precomputed')
M.fit(dm.values.tolist(), list(dm.index))

query_dict = {}
for _idx, name in enumerate(mst_json['nodes']):
    name = name['id']
    query_dict[name] = _idx

for _idx, name in enumerate(list(dm.index)):
    temp = M.kneighbors(np.array(dm.values.tolist()[_idx]).reshape(1, -1),
                        n_neighbors=nearest_num + 1)
    for num in range(nearest_num):
        links = {
            'source': query_dict[name],
            'target': query_dict[list(dm.index)[temp[1][0][num + 1]]],
            'weight': temp[0][0][num + 1],
    G, {_i: list(distance.columns)[_i] for _i in range(len(G.nodes()))})
# The graph above doesn't have any labels yet; we need to assign them.
mst_G = nx.mst.minimum_spanning_tree(G)  # generate a new graph which is the MST version of the original G.
for edge in mst_G.edges():
    draw_data.append(
        go.Scatter(
            x=[vals[names.index(edge[0]), 0], vals[names.index(edge[1]), 0]],
            y=[vals[names.index(edge[0]), 1], vals[names.index(edge[1]), 1]],
            mode='lines',
            showlegend=False,
            line=dict(color=MST_color, width=MST_width)))

# KNN
M = knc(n_neighbors=3, weights='distance', metric='precomputed')
M.fit(distance.values.tolist(), list(distance.index))
# As before, the model doesn't keep sample labels, so each sample is represented below by its whole distance row.
for _idx in range(len(names)):
    temp = M.kneighbors([distance.values.tolist()[_idx]], n_neighbors=2)[1]
    # n_neighbors=2 picks the closest other sample: two neighbours in total, because the point itself is always its own nearest neighbour.
    # temp holds the neighbour indices.
    # the [1:] below skips the point itself; entries run from nearest to farthest.
    for _x in temp[0][1:]:
        if type(dot_color) == list:
            current_color = dot_color[_idx]
        else:
            current_color = ['#000000']
        draw_data.append(
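# A minimal, self-contained sketch (not from the original source) of the
# precomputed-distance pattern used above, with a toy distance matrix and
# illustrative labels:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier as knc

D = np.array([[0.0, 1.0, 4.0],
              [1.0, 0.0, 3.0],
              [4.0, 3.0, 0.0]])
labels = ["a", "a", "b"]
model = knc(n_neighbors=1, metric="precomputed")
model.fit(D, labels)
# a query row must contain the distances from the query point to every training point;
# n_neighbors=2 returns the point itself plus its closest other sample
dist, idx = model.kneighbors(D[0].reshape(1, -1), n_neighbors=2)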
#XTRAINN=np.array(XTRAINN)
#
#xtr,xts,ytr,yts=train_test_split(XTRAINN,Y=YTRAINN,test_size=0.2,random_state=1)
#svc_op.fit(xtrain2,ytrain2)
#acc=[]
#acc=cvs(estimator=svc_op,X=xts,y=yts,cv=3)
#acc.mean()
#
y_stpred_svc = svc_op.predict(xtest2)

# KNN
from sklearn.neighbors import KNeighborsClassifier as knc
knn = knc()
param = [{'n_neighbors': [240, 250, 230], 'p': [2], 'metric': ['minkowski']}]
gs_knn = GridSearchCV(estimator=knn, param_grid=param, verbose=5, n_jobs=-1, cv=10)
gs_knn.fit(xtrain2, ytrain2)  # try later..... this skips fitting on kfold data
gs_knn.best_params_
gs_knn.best_score_

knnf = knc(n_neighbors=230, p=2, metric='minkowski')
knnf.fit(xtrain2, ytrain2)
# accuracy notes: n_neighbors=200 -> 59.82, 300 -> 59.881, 250 -> 59.94
#knnf.fit(xtrain2,ytrain2)