def _select(feature_file,
            feature_file_path,
            train_samples_range,
            num_replicates,
            num_test_samples,
            models,
            output_dir,
            config_file=None):
    """Helper function to do model selection.

    Used with either config_file or another set of arguments passed to
    hvc.select.

    A fixed test set is drawn once. Then, for every entry of
    ``train_samples_range`` and every replicate, a fresh training set is
    drawn and every model in ``models`` is trained on it and scored on the
    test set. Each trained model and a ``.meta`` dict are written under
    ``output_dir``; a summary dict is written once at the end.

    Parameters
    ----------
    feature_file : mapping
        Indexed here with the keys 'labels', 'songfile_IDs', 'features',
        'features_arr_column_IDs', 'feature_group_ID_dict',
        'feature_list_group_ID', 'feature_list', 'labels_to_use' and
        'neuralnet_inputs' (not every key is read for every model type).
    feature_file_path
        Only stored in the saved metadata / summary dicts.
    train_samples_range : sized iterable
        Numbers of training samples to try; ``len()`` is taken of it.
    num_replicates : int
        Number of training sets drawn for each number of samples.
    num_test_samples
        Passed to ``grab_n_samples_by_song`` to draw the test set.
    models : list of dict
        One dict per model. 'model_name' is always read; 'hyperparameters',
        'predict_proba', 'feature_group', 'feature_list_indices' and
        'neuralnet_input' are read depending on the model type.
        NOTE: a dict that has 'feature_group' but no 'feature_list_indices'
        is mutated in place ('feature_list_indices' is added).
    output_dir
        Directory under which per-model sub-directories and the summary
        file are created.
    config_file : optional
        Only stored in the saved metadata / summary dicts.

    Returns
    -------
    None
        Results are written to disk with ``joblib.dump``.

    Raises
    ------
    ValueError
        If 'feature_list_indices' is a string other than 'all'.
    KeyError
        If a keras model has no 'neuralnet_input' and the feature file has
        no neural-net input named after the model.
    """
    labels = np.asarray(feature_file['labels'])
    # call grab_n_samples this first time to get indices for test/validation set
    # and a list of song IDs from which we will draw the training set indices below
    test_IDs, train_song_ID_list = grab_n_samples_by_song(
        feature_file['songfile_IDs'],
        feature_file['labels'],
        num_test_samples,
        return_popped_songlist=True)
    test_labels = labels[test_IDs]

    # result arrays, indexed [number-of-samples index, replicate, model index]
    score_arr = np.zeros(
        (len(train_samples_range), len(range(num_replicates)), len(models)))
    avg_acc_arr = np.zeros(
        (len(train_samples_range), len(range(num_replicates)), len(models)))
    # dtype='O' because each cell holds a whole array of predicted labels
    pred_labels_arr = np.empty(
        (len(train_samples_range), len(range(num_replicates)), len(models)),
        dtype='O')
    train_IDs_arr = np.empty(
        (len(train_samples_range), len(range(num_replicates))), dtype='O')

    for num_samples_ind, num_train_samples in enumerate(train_samples_range):
        for replicate in range(num_replicates):
            print('Training models with {0} samples, replicate #{1}'.format(
                num_train_samples, replicate))
            # here we call grab_n_samples again with the train_song_ID_list
            # from above. currently each fold is a random grab without
            # anything like k-folds.
            # For testing on large datasets this is okay but in situations
            # where we're data-limited it's less than ideal, the whole point
            # is to not have to hand-label a large data set
            train_IDs = grab_n_samples_by_song(feature_file['songfile_IDs'],
                                               feature_file['labels'],
                                               num_train_samples,
                                               song_ID_list=train_song_ID_list)
            train_IDs_arr[num_samples_ind, replicate] = train_IDs
            train_labels = labels[train_IDs]
            for model_ind, model_dict in enumerate(models):
                # lazy-imports to avoid loading all of
                # scikit-learn and tensorflow if possible
                if model_dict['model_name'] == 'svm':
                    if 'SVC' not in locals():
                        from sklearn.svm import SVC
                elif model_dict['model_name'] == 'knn':
                    if 'neighbors' not in locals():
                        from sklearn import neighbors
                elif model_dict['model_name'] == 'flatwindow':
                    if 'flatwindow' not in locals():
                        from hvc.neuralnet.models.flatwindow import flatwindow
                        from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping

                # save info associated with model such as indices of training samples.
                # Note this is done outside the if/elif list for switching between
                # models.
                model_output_dir = os.path.join(
                    output_dir, determine_model_output_folder_name(model_dict))
                if not os.path.isdir(model_output_dir):
                    os.makedirs(model_output_dir)

                model_fname_str = \
                    '{0}_{1}samples_replicate{2}.model'.format(model_dict['model_name'],
                                                                 num_train_samples,
                                                                 replicate)
                model_filename = os.path.join(model_output_dir, model_fname_str)

                # if-elif that switches based on model type,
                # start with sklearn models
                if model_dict['model_name'] in model_types['sklearn']:
                    # NOTE(review): if model_dict has neither 'feature_group' nor
                    # 'feature_list_indices', feature_inds is not assigned in this
                    # iteration; a value left over from an earlier model (or a
                    # NameError on the first one) would be used below.
                    # if model_dict specifies using a certain feature group
                    if 'feature_group' in model_dict:
                        # determine if we already figured out which features belong to that feature group.
                        # Can only do that if model_dict defined for todo_list, not if model_dict defined
                        # at top level of select config file
                        if 'feature_list_indices' in model_dict:
                            feature_inds = np.in1d(
                                feature_file['features_arr_column_IDs'],
                                model_dict['feature_list_indices'])
                        else:  # have to figure out feature list indices
                            ftr_grp_ID_dict = feature_file[
                                'feature_group_ID_dict']
                            ftr_list_grp_ID = feature_file[
                                'feature_list_group_ID']
                            # figure out what they are by finding ID # corresponding to feature
                            # group or groups in ID_dict, and then finding all the indices in the
                            # feature list that have that group ID #, using ftr_list_grp_ID, a list
                            # the same length as feature list where each element indicates whether
                            # the element with the same index in the feature list belongs to a
                            # feature group and if so which one, by ID #
                            # NOTE(review): a 'feature_group' that is neither str nor list
                            # leaves ftr_list_inds unassigned for this model.
                            if type(model_dict['feature_group']) == str:
                                ftr_grp_ID = ftr_grp_ID_dict[
                                    model_dict['feature_group']]
                                # now find all the indices of features associated with the
                                # feature group for that model
                                ftr_list_inds = [
                                    ind
                                    for ind, val in enumerate(ftr_list_grp_ID)
                                    if val == ftr_grp_ID
                                ]
                            # if user specified more than one feature group
                            elif type(model_dict['feature_group']) == list:
                                ftr_list_inds = []
                                for ftr_grp in model_dict['feature_group']:
                                    ftr_grp_ID = ftr_grp_ID_dict[ftr_grp]
                                    # now find all the indices of features associated with the
                                    # feature group for that model
                                    ftr_list_inds.extend([
                                        ind for ind, val in enumerate(
                                            ftr_list_grp_ID)
                                        if val == ftr_grp_ID
                                    ])
                            # finally use ftr_list_inds to get the actual columns we need from the
                            # features array. Need to do this because multiple columns might belong to
                            # the same feature, e.g. if the feature is a spectrum
                            feature_inds = np.in1d(
                                feature_file['features_arr_column_IDs'],
                                ftr_list_inds)
                            # put feature list indices in model dict so we have it later when
                            # saving summary file
                            model_dict['feature_list_indices'] = ftr_list_inds

                    elif 'feature_list_indices' in model_dict and\
                            'feature_group' not in model_dict:
                        # if no feature group in model dict, use feature list indices
                        # Note that for neuralnet models, there will be neither
                        if type(model_dict['feature_list_indices']) is str:
                            if model_dict['feature_list_indices'] == 'all':
                                # boolean mask selecting every feature column
                                feature_inds = np.ones(
                                    (feature_file['features_arr_column_IDs'].
                                     shape[-1], )).astype(bool)
                            else:
                                raise ValueError(
                                    'received invalid string for feature_list_indices: {}'
                                    .format(
                                        model_dict['feature_list_indices']))
                        else:
                            # use 'feature_list_indices' from model_dict to get the actual columns
                            # we need from the features array. Again, need to do this because multiple
                            # columns might belong to the same feature,
                            # e.g. if the feature is a spectrum
                            feature_inds = np.in1d(
                                feature_file['features_arr_column_IDs'],
                                model_dict['feature_list_indices'])

                    if model_dict['model_name'] == 'svm':
                        print('training svm. ', end='')
                        clf = SVC(C=model_dict['hyperparameters']['C'],
                                  gamma=model_dict['hyperparameters']['gamma'],
                                  decision_function_shape='ovr',
                                  probability=model_dict['predict_proba'])

                    elif model_dict['model_name'] == 'knn':
                        print('training knn. ', end='')
                        # second positional argument is the `weights` setting
                        clf = neighbors.KNeighborsClassifier(
                            model_dict['hyperparameters']['k'], 'distance')

                    # use 'advanced indexing' to get only sample rows and only feature columns
                    features_train = feature_file['features'][
                        train_IDs[:, np.newaxis], feature_inds]
                    # scaler is fit on the training rows only and then
                    # applied unchanged to the test rows
                    scaler = StandardScaler()
                    features_train = scaler.fit_transform(features_train)
                    features_test = feature_file['features'][
                        test_IDs[:, np.newaxis], feature_inds]
                    features_test = scaler.transform(features_test)

                    print('fitting model. ', end='')
                    clf.fit(features_train, train_labels)
                    score = clf.score(features_test, test_labels)
                    print('score on test set: {:05.4f} '.format(score), end='')
                    score_arr[num_samples_ind, replicate, model_ind] = score
                    pred_labels = clf.predict(features_test)
                    pred_labels_arr[num_samples_ind, replicate,
                                    model_ind] = pred_labels
                    acc_by_label, avg_acc = get_acc_by_label(
                        test_labels, pred_labels, feature_file['labels_to_use'])
                    print(', average accuracy on test set: {:05.4f}'.format(
                        avg_acc))
                    avg_acc_arr[num_samples_ind, replicate,
                                model_ind] = avg_acc
                    joblib.dump(clf, model_filename)

                # this is the middle of the if-elif that switches based on model type
                # end sklearn, start keras models
                elif model_dict['model_name'] in model_types['keras']:
                    if 'neuralnet_input' in model_dict:
                        neuralnet_input = model_dict['neuralnet_input']
                        spects = feature_file['neuralnet_inputs'][
                            neuralnet_input]
                    else:
                        # if not specified, assume that input should be the one that
                        # corresponds to the neural net model being trained
                        neuralnet_input = model_dict['model_name']
                        try:
                            spects = feature_file['neuralnet_inputs'][
                                neuralnet_input]
                        except KeyError:
                            raise KeyError(
                                'no input specified for model {}, and '
                                'input type for that model was not found in '
                                'feature file'.format(
                                    model_dict['model_name']))

                    if 'SpectScaler' not in locals():
                        from hvc.neuralnet.utils import SpectScaler

                    # the test labels never change, so they are one-hot encoded
                    # (and the binarizer is fit) only the first time through
                    if 'test_labels_onehot' not in locals():
                        from sklearn.preprocessing import LabelBinarizer
                        label_binarizer = LabelBinarizer()
                        test_labels_onehot = label_binarizer.fit_transform(
                            test_labels)

                    # NOTE(review): test_spects is cached after the first keras
                    # model; a later model with a different 'neuralnet_input'
                    # would reuse the spectrograms of the first one.
                    if 'test_spects' not in locals():
                        # get spects for test set
                        # (the 'channels' axis for keras is added further below)
                        test_spects = spects[test_IDs, :, :]

                    train_labels_onehot = label_binarizer.transform(
                        train_labels)

                    # get spects for train set
                    # (the 'channels' axis for keras is added further below)
                    train_spects = spects[train_IDs, :, :]

                    # scale all spects by mean and std of training set
                    spect_scaler = SpectScaler()
                    # concatenate all spects then rotate so
                    # Hz bins are columns, i.e., 'features'
                    spect_scaler.fit(train_spects)
                    train_spects_scaled = spect_scaler.transform(train_spects)
                    test_spects_scaled = spect_scaler.transform(test_spects)

                    # have to add 'channels' axis for keras 2-d convolution
                    # even though these are spectrograms, don't have channels
                    # like an image would.
                    # Decided to leave it explicit here instead of hiding in a function
                    train_spects_scaled = train_spects_scaled[:, :, :,
                                                              np.newaxis]
                    test_spects_scaled = test_spects_scaled[:, :, :,
                                                            np.newaxis]

                    num_samples, num_freqbins, num_timebins, num_channels = \
                        train_spects_scaled.shape
                    num_label_classes = len(feature_file['labels_to_use'])
                    input_shape = (num_freqbins, num_timebins, num_channels)
                    flatwin = flatwindow(input_shape=input_shape,
                                         num_label_classes=num_label_classes)

                    csv_str = ''.join([
                        'flatwindow_training_',
                        '{}_samples_'.format(num_train_samples),
                        'replicate_{}'.format(replicate), '.log'
                    ])
                    csv_filename = os.path.join(model_output_dir, csv_str)
                    csv_logger = CSVLogger(csv_filename,
                                           separator=',',
                                           append=True)

                    # the checkpoint callback is what writes model_filename
                    # for keras models (best 'val_acc' only)
                    checkpoint = ModelCheckpoint(model_filename,
                                                 monitor='val_acc',
                                                 verbose=1,
                                                 save_best_only=True,
                                                 save_weights_only=False,
                                                 mode='max')
                    earlystop = EarlyStopping(monitor='val_acc',
                                              min_delta=0,
                                              patience=20,
                                              verbose=1,
                                              mode='auto')
                    callbacks_list = [csv_logger, checkpoint, earlystop]

                    # NOTE(review): the test set doubles as the keras validation
                    # set, so checkpointing / early stopping see the test data.
                    flatwin.fit(
                        train_spects_scaled,
                        train_labels_onehot,
                        validation_data=(test_spects_scaled,
                                         test_labels_onehot),
                        batch_size=model_dict['hyperparameters']['batch_size'],
                        epochs=model_dict['hyperparameters']['epochs'],
                        callbacks=callbacks_list,
                        verbose=1)

                    pred_labels = flatwin.predict(test_spects_scaled,
                                                  batch_size=32,
                                                  verbose=1)
                    # map network outputs back to the original label values
                    pred_labels = label_binarizer.inverse_transform(
                        pred_labels)

                    score = accuracy_score(test_labels, pred_labels)
                    print('score on test set: {:05.4f} '.format(score), end='')
                    score_arr[num_samples_ind, replicate, model_ind] = score
                    pred_labels_arr[num_samples_ind, replicate,
                                    model_ind] = pred_labels
                    acc_by_label, avg_acc = get_acc_by_label(
                        test_labels, pred_labels, feature_file['labels_to_use'])
                    print(', average accuracy on test set: {:05.4f}'.format(
                        avg_acc))
                    avg_acc_arr[num_samples_ind, replicate,
                                model_ind] = avg_acc

                # per-model metadata file, written for both model types
                model_meta_fname_str = \
                    '{0}_{1}samples_replicate{2}.meta'.format(model_dict['model_name'],
                                                                num_train_samples,
                                                                replicate)
                model_meta_filename = os.path.join(model_output_dir,
                                                   model_meta_fname_str)
                model_meta_output_dict = {
                    'model_filename': model_filename,
                    'config_file': config_file,
                    'feature_file': feature_file_path,
                    'test_IDs': test_IDs,
                    'train_IDs': train_IDs,
                    'model_name': model_dict['model_name'],
                    'pred_labels': pred_labels,
                    'test_labels': test_labels
                }

                if 'scaler' in locals():
                    model_meta_output_dict['scaler'] = scaler
                    # have to delete scaler
                    # so it's not still in memory next loop
                    # (e.g. because a different model that doesn't use scaler
                    # is tested in next loop)
                    del scaler
                elif 'spect_scaler' in locals():
                    # neural net models uses scaler on spectrogram
                    # instead of vanilla sklearn scaler
                    model_meta_output_dict['spect_scaler'] = spect_scaler
                    del spect_scaler

                # NOTE(review): label_binarizer is never deleted, so once a keras
                # model has run it is also stored in the metadata of every
                # later model, sklearn ones included.
                if 'label_binarizer' in locals():
                    model_meta_output_dict['label_binarizer'] = label_binarizer

                if model_dict['model_name'] in model_types['sklearn']:
                    # to be able to extract features for predictions
                    # on unlabeled data set, need list of features
                    if ((type(model_dict['feature_list_indices']) is str) and
                            (model_dict['feature_list_indices'] == 'all')):
                        model_feature_list = feature_file['feature_list']
                    else:
                        model_feature_list = [
                            feature_file['feature_list'][ind]
                            for ind in model_dict['feature_list_indices']
                        ]
                    model_meta_output_dict['feature_list'] = model_feature_list
                elif model_dict['model_name'] in model_types['keras']:
                    model_meta_output_dict['feature_list'] = [neuralnet_input]

                joblib.dump(model_meta_output_dict, model_meta_filename)

    # after looping through all samples + replicates
    output_filename = os.path.join(
        output_dir, 'summary_model_select_file_created_' + timestamp())
    # NOTE(review): 'model_dict' here is whatever the loop variable last held,
    # i.e. only the final entry of `models` — confirm whether the whole
    # `models` list was meant to be saved.
    select_summary_dict = {
        'config_file': config_file,
        'feature_file': feature_file_path,
        'train_samples_range': train_samples_range,
        'num_replicates': num_replicates,
        'model_dict': model_dict,
        'test_IDs': test_IDs,
        'train_IDs_arr': train_IDs_arr,
        'score_arr': score_arr,
        'avg_acc_arr': avg_acc_arr,
        'pred_labels_arr': pred_labels_arr,
    }
    joblib.dump(select_summary_dict, output_filename)
def trainAlgo(imageArr, labelArr, DIR_NAME, n_neighbors=3, verbose=False):
    """Train a k-NN face classifier from labelled images and pickle it.

    For every image the Caffe SSD face detector is run; only the first
    detection is considered, and only if its confidence exceeds 0.99. The
    detected region is encoded with ``face_recognition`` and the encoding
    becomes one training sample.

    Parameters
    ----------
    imageArr : sequence
        Images; each is converted with ``np.array`` before detection.
    labelArr : sequence
        Label for the image at the same index.
    DIR_NAME
        Prefix of the output file; the model is pickled to
        ``str(DIR_NAME) + "_knn.clf"``.
    n_neighbors : int or None, optional
        Number of neighbours (default 3, the previously hard-coded value).
        ``None`` picks ``round(sqrt(number of training samples))``.
    verbose : bool, optional
        Print the automatically chosen ``n_neighbors``.

    Returns
    -------
    The fitted ``KNeighborsClassifier``.

    Raises
    ------
    ValueError
        If no image produced a usable face encoding.
    """
    X_train = []
    y_labels = []
    model_save_path = str(DIR_NAME) + "_knn.clf"
    knn_algo = 'ball_tree'
    proto = "ML/deploy.prototxt.txt"
    caffmodel = "ML/res10_300x300_ssd_iter_140000.caffemodel"
    confid = 0.99
    net = cv2.dnn.readNetFromCaffe(proto, caffmodel)

    total = len(imageArr)
    for x, image in enumerate(imageArr):
        sys.stdout.write("\r" + str(x + 1) + " of " + str(total) +
                         " has been processed")
        sys.stdout.flush()
        # Best effort: one bad image must not abort the whole training run.
        try:
            imageA = np.array(image)
            (h, w) = imageA.shape[:2]
            blob = cv2.dnn.blobFromImage(cv2.resize(imageA, (300, 300)), 1.0,
                                         (300, 300), (104.0, 177.0, 123.0))
            net.setInput(blob)
            detections = net.forward()

            # Only the first detection is ever used (the original loop
            # gated every iteration on a counter that equalled 1 only once).
            if detections.shape[2] == 0:
                continue
            confidence = detections[0, 0, 0, 2]
            if confidence > confid:
                box = detections[0, 0, 0, 3:7] * np.array([w, h, w, h])
                (startX, startY, endX, endY) = box.astype("int")
                # Clamp to the image: a negative start would otherwise be
                # interpreted as an index from the end and crop the wrong area.
                startX, startY = max(0, startX), max(0, startY)
                endX, endY = min(w, endX), min(h, endY)
                roi = imageA[startY:endY, startX:endX]

                encodings = face_recognition.face_encodings(roi)
                if not encodings:
                    print("")
                    print("No face encoding found for image " + str(x + 1))
                    continue
                X_train.append(encodings[0])
                y_labels.append(labelArr[x])
        except Exception as e:
            print("")
            print(e)

    if not X_train:
        raise ValueError("No usable face was found in any training image")

    if n_neighbors is None:
        # was len(X): an undefined name
        n_neighbors = max(1, int(round(math.sqrt(len(X_train)))))
        if verbose:
            print("Chose n_neighbors automatically:", n_neighbors)

    # Create and train the KNN classifier
    knn_clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                             algorithm=knn_algo,
                                             weights='distance')
    knn_clf.fit(X_train, y_labels)

    # Save the trained KNN classifier
    with open(model_save_path, 'wb') as f:
        pickle.dump(knn_clf, f)

    print("**Training Completed**")
    return knn_clf
random_state=999)
# train, test = train_test_split(df, test_size=0.4, random_state=999)
print(type(train))
# NOTE(review): reset_index without drop=True keeps the old index as a new
# 'index' column, which then becomes a feature in the fits below — confirm
# this is intended.
train.reset_index(inplace=True)
test.reset_index(inplace=True)
# print(test.to_string())

# shuffle = False if there is a temporal dimension
cv = KFold(n_splits=27, shuffle=False)

# For each weighting scheme, sweep k = 1..29 and record the mean
# cross-validated accuracy for every k.
for i, weights in enumerate(['uniform', 'distance']):
    total_scores = []
    for n_neighbors in range(1, 30):
        fold_accuracy = []
        knn = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
        for train_fold, test_fold in cv.split(train):
            # train/test split for this fold
            f_train = train.loc[train_fold]
            f_test = train.loc[test_fold]
            # train and run the model ('attack' is the target column)
            knn.fit(X=f_train.drop(['attack'], axis=1), y=f_train['attack'])
            y_pred = knn.predict(X=f_test.drop(['attack'], axis=1))
            # evaluate the model
            acc = accuracy_score(f_test['attack'], y_pred)
            fold_accuracy.append(acc)
        # mean accuracy over the folds for this k
        total_scores.append(sum(fold_accuracy) / len(fold_accuracy))
    plt.plot(range(1, len(total_scores) + 1), total_scores,
from sklearn import neighbors
from utilities import (
    load_magic04,
    load_wine,
    scale_data,
    train_model,
    tune_hyperparameters,
    model_complexity,
    learning_curve,
)

# Pick the dataset (swap the two loaders to switch).
df, factors, response = load_wine()
# df, factors, response = load_magic04()
df_train, df_test = scale_data(df, response)

# Baseline: distance-weighted k-NN with default hyper-parameters.
classifier = neighbors.KNeighborsClassifier(weights="distance")
train_model(classifier, df_train, None, factors, response)

# Hyper-parameter search. Other candidates that were tried:
# "metric": ["minkowski","euclidean","manhattan","chebyshev"]
# "weights": ["uniform", "distance"]
search_grid = {"n_neighbors": range(1, 20), "p": range(1, 4)}
best_params = tune_hyperparameters(classifier, df_train, factors, response,
                                   search_grid)

# Model-complexity curve over n_neighbors, with p fixed at its tuned value.
complexity_estimator = neighbors.KNeighborsClassifier(weights="distance",
                                                      p=best_params["p"])
complexity_grid = {"n_neighbors": range(1, 30)}
model_complexity(complexity_estimator, df_train, factors, response,
                 complexity_grid, "n_neighbors")

# Final model with the tuned hyper-parameters, evaluated on the test split.
classifier = neighbors.KNeighborsClassifier(
    weights="distance",
    n_neighbors=best_params["n_neighbors"],
    p=best_params["p"],
)
train_model(classifier, df_train, df_test, factors, response, "Final ")
learning_curve(classifier, df_train, factors, response)
# Keep only the first two iris features so the data can be drawn in 2-D.
X = iris.data[:, :2]
print(X)
print(X.shape)
y = iris.target
print(y)
print(y.shape)

# step size in the mesh
h = .02

# Light colours for the decision regions, bold colours for the samples.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ('uniform', 'distance'):
    # Fit a k-NN classifier, then round-trip it through joblib.
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    clf.fit(X, y)
    joblib.dump(clf, 'model.pkl')
    clf = joblib.load('model.pkl')

    # Mesh over [x_min, x_max] x [y_min, y_max], padded by 1 on every side;
    # every mesh point is coloured by its predicted class.
    x_min = X[:, 0].min() - 1
    x_max = X[:, 0].max() + 1
    y_min = X[:, 1].min() - 1
    y_max = X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    # Predict per mesh point and fold the result back into the mesh shape.
    Z = clf.predict(mesh_points).reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
from matplotlib.colors import ListedColormap
from sklearn import neighbors

# mesh step size
h = .02
# optimal number of neighbors
n_neighbors = 31

# colour maps: light for the regions, bold for the samples
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

for weights in ('uniform', 'distance'):
    # ball-tree k-NN fitted on the PCA-projected data
    clf = neighbors.KNeighborsClassifier(n_neighbors,
                                         algorithm='ball_tree',
                                         weights=weights)
    clf.fit(x_pca, y)

    # mesh over [x_min, x_max] x [y_min, y_max], padded by 1 on every side
    x_min = x_pca[:, 0].min() - 1
    x_max = x_pca[:, 0].max() + 1
    y_min = x_pca[:, 1].min() - 1
    y_max = x_pca[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    # predicted class per mesh point, folded back into the mesh shape
    Z = clf.predict(mesh_points).reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
def draw_k_nearest(polarity=None,
                   subjectivity=None,
                   names_dem=None,
                   names_rep=None,
                   k_neighbors=3):
    '''
    Draw the decision regions of a distance-weighted KNN fitted on a
    polarity/subjectivity scatterplot, with the people overlaid as markers.

    :param polarity: dictionary containing the average polarity values of each person
    :param subjectivity: dictionary containing average subjectivity of each person
    :param names_dem: list of names of all democrats in dataset (class 0)
    :param names_rep: list of names of all republicans in dataset (class 1)
    :param k_neighbors: K value for KNN calculation
    :return: None, image shown on screen
    :raises AssertionError: if an argument has the wrong type or k_neighbors <= 0
    :raises KeyError: if a name is missing from polarity or subjectivity
    '''
    # Kept as asserts so callers that catch AssertionError keep working.
    assert isinstance(polarity, dict)
    assert isinstance(subjectivity, dict)
    assert isinstance(names_dem, list)
    assert isinstance(names_rep, list)
    assert all(isinstance(i, str) for i in names_dem)
    assert all(isinstance(i, str) for i in names_rep)
    assert isinstance(k_neighbors, int)
    assert k_neighbors > 0

    cmap_back = ListedColormap(['#00AAFF', '#FFAAAA'])

    dem_points = [[polarity[name], subjectivity[name]] for name in names_dem]
    rep_points = [[polarity[name], subjectivity[name]] for name in names_rep]

    # reshape(-1, 2) keeps the arrays two-dimensional even when one party
    # has no members, so the column indexing below cannot fail.
    x_dem = np.array(dem_points, dtype=float).reshape(-1, 2)
    x_rep = np.array(rep_points, dtype=float).reshape(-1, 2)
    X = np.array(dem_points + rep_points, dtype=float).reshape(-1, 2)
    y = np.array([0] * len(dem_points) + [1] * len(rep_points))

    h = .001  # mesh step size
    clf = neighbors.KNeighborsClassifier(k_neighbors, weights='distance')
    clf.fit(X, y)

    # Mesh over the data range, padded by 0.05 on every side.
    x_min, x_max = X[:, 0].min() - 0.05, X[:, 0].max() + 0.05
    y_min, y_max = X[:, 1].min() - 0.05, X[:, 1].max() + 0.05
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_back)
    # The marker colours were previously passed as single-colour colormaps
    # via cmap=, which scatter ignores when no c= data is given; pass the
    # colours directly so blue/red are actually applied.
    plt.scatter(x_dem[:, 0],
                x_dem[:, 1],
                c='b',
                marker='o',
                edgecolors='black',
                linewidths=1,
                label='Democrats')
    plt.scatter(x_rep[:, 0],
                x_rep[:, 1],
                c='#FF0000',
                marker='^',
                edgecolors='black',
                linewidths=1,
                label='Republicans')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title('Democrats vs Republicans')
    plt.xlabel('Polarity')
    plt.ylabel('Subjectivity')
    plt.legend()
    plt.show()
# Encode the neighbourhood names as integer class labels.
df['nbh'] = le.transform(df.neighbourhood_cleansed)

# TF-IDF features of the review comments.
tfid = TfidfVectorizer()
ttext = tfid.fit_transform(df['comments'])
#print top_feats_in_doc(ttext, tfid.get_feature_names(), 1, 10)
#sys.exit(0)

# Single-argument, parenthesised prints behave the same under the Python 2
# print statement and the Python 3 print function.
print("%d %d" % (ttext.shape[0], len(df['nbh'])))

X_train, X_test, y_train, y_test = \
    train_test_split(ttext, df['nbh'], test_size=0.2, random_state=1)

rs = 1
ests = [
    neighbors.KNeighborsClassifier(3),
    RandomForestClassifier(random_state=rs)
]
ests_labels = np.array(['KNeighbors', 'RandomForest'])

for i, e in enumerate(ests):
    e.fit(X_train, y_train)
    # Predict once and reuse it for both the accuracy and the report
    # (previously predict() ran twice per estimator).
    y_pred = e.predict(X_test)
    this_score = metrics.accuracy_score(y_test, y_pred)
    scorestr = "%s: Accuracy Score %0.2f" % (ests_labels[i], this_score)
    print("")
    print(scorestr)
    print("-" * len(scorestr))
    print(metrics.classification_report(y_test,
                                        y_pred,
                                        target_names=le.classes_))
import pandas as pd
import os
from sklearn import neighbors, model_selection

# Directory holding train.csv / test.csv; submission.csv is written here too.
# (Renamed from `dir`, which shadowed the builtin.)
data_dir = 'E:/'

titanic_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line.
titanic_train.info()
print(titanic_train.columns)

# Two-feature model: siblings/spouses and parents/children aboard.
X_train = titanic_train[['SibSp', 'Parch']]
y_train = titanic_train['Survived']
knn_estimator = neighbors.KNeighborsClassifier()
knn_estimator.fit(X_train, y_train)

# The cross-validated accuracy used to be computed and thrown away; report it.
cv_accuracy = model_selection.cross_val_score(knn_estimator,
                                              X_train,
                                              y_train,
                                              scoring="accuracy",
                                              cv=5).mean()
print("5-fold CV accuracy:", cv_accuracy)

titanic_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
titanic_test.info()
X_test = titanic_test[['SibSp', 'Parch']]
titanic_test['Survived'] = knn_estimator.predict(X_test)
titanic_test.to_csv(os.path.join(data_dir, 'submission.csv'),
                    columns=['PassengerId', 'Survived'],
                    index=False)
return (yp) # In[53]: accuracyModel(test_separate, simple_logistic_separate) test['predicted'] = accuracyModel(test, simple_logistic_separate) # In[54]: test.plot.scatter(x='predicted', y='target') # In[59]: classifiers = [ neighbors.KNeighborsClassifier(20, weights='distance', n_jobs=-1), linear_model.LogisticRegression(C=10e10, penalty='l1', n_jobs=-1), RandomForestClassifier(n_estimators=1000, min_samples_split=5, max_depth=None, n_jobs=-1), DecisionTreeClassifier('gini') ] label_columns = ["Classifier", "Accuracy"] label = pd.DataFrame(columns=label_columns) nFolds = 8 shuffleSplit = StratifiedShuffleSplit(n_splits=nFolds, test_size=0.25, random_state=3)
x = a[xx] indices.append(xx) cdr = dataprovider.get_CDR(x) ll = dataprovider.retrieve_full_data(x) #CrossSectionalData.show_slices([ll[:,:,50]]) if cdr == None or cdr > 1: continue feat = AlzheimerFeatures.surrounding_points_discrete_with_pos( ll, step, step_2, [dataprovider.get_gender(x)]) print(len(feat)) allfeatures1.append(regressor.predict(feat)[0:cut]) #plt.plot(regressor.predict(feat)) #plt.show() ally1.append(cdr) rbf_svc = neighbors.KNeighborsClassifier(n_neighbors=7) rbf_svc.fit(allfeatures1, ally1) ll = dataprovider.retrieve_full_data(56) error = 0 index = 0 for xx in range(len(a)): x = a[xx] cdr = dataprovider.get_CDR(x) if cdr == None or cdr > 1 or xx in indices: continue ll = dataprovider.retrieve_full_data(x) feat = AlzheimerFeatures.surrounding_points_discrete_with_pos( ll, step, step_2, [dataprovider.get_gender(x)]) predictq = regressor.predict(feat)[:cut] #plt.plot(predict)
def woolyTrain():
    '''Train two classifiers on photos labelled by precipitation class.

    1. SVC on kernel-PCA components (quantile-transformed to normal)
    2. k nearest neighbors on the raw flattened pixels

    Reads the records from 'y.dat' (pickle) and the photos from
    'Photos\\Processed', prints training/test accuracy of both models,
    pickles {'svc', 'trans', 'pca'} to 'precip model.dat' and returns
    ``(svc, neigh, pca, trans)``.

    Raises ValueError if a photo does not decode to a 3-D array.
    '''
    import pickle
    import sklearn.model_selection as model_selection
    import sklearn.decomposition as decomposition
    import sklearn.preprocessing as preprocessing
    import sklearn.neighbors as neighbors
    import sklearn.svm as svm
    # `import PIL` alone does not load the Image submodule; import it explicitly.
    import PIL.Image
    import os
    import numpy
    import scipy.stats as stats

    d = 'Photos\\Processed'
    # NOTE: pickle.load must only be used on trusted files.
    with open('y.dat', 'rb') as f:
        dc = pickle.load(f)

    c = []
    X = []
    # Standard-normal quantile at 2/3: splits the standardised value into
    # three equally likely classes (below / normal / above).
    normi = stats.norm.ppf(0.5 + (1. / 6.))
    for k in dc:
        # yi = (k['air'] - k['air_mean'])/k['air_std']
        yi = (k['precip'] - k['mean']) / k['std']
        if yi <= -normi:
            ci = 'below'
        elif yi <= normi:
            ci = 'normal'
        else:
            ci = 'above'
        # Context manager closes the file handle; the pixel data is copied
        # into the array before the image is closed.
        with PIL.Image.open(os.path.join(d, k['photo'])) as im:
            xi = numpy.array(im)
        # The original unpacked the shape into three names, which rejected
        # anything that is not height x width x channels; keep that check.
        if xi.ndim != 3:
            raise ValueError(
                'expected a 3-D image array for {!r}, got shape {}'.format(
                    k['photo'], xi.shape))
        X.append(xi.flatten())
        c.append(ci)

    #split!
    Xt, Xs, ct, cs = model_selection.train_test_split(X, c, test_size=0.10)

    #PCA. 100 components (fewer if there are fewer training samples)
    n = min(100, len(Xt))
    pca = decomposition.KernelPCA(n_components=n, kernel='rbf')
    pca.fit(Xt)
    Xtp = pca.transform(Xt)
    Xsp = pca.transform(Xs)

    #sklearn! SVC
    trans = preprocessing.QuantileTransformer(output_distribution='normal')
    Xtpn = trans.fit_transform(Xtp)
    Xspn = trans.transform(Xsp)
    svc = svm.SVC(kernel='rbf', gamma='auto', C=1., probability=True)
    svc.fit(Xtpn, ct)
    sc = [svc.score(Xtpn, ct), svc.score(Xspn, cs)]
    #perfect fit, duh
    print('SVC with PCA: {:.0%} training, {:.0%} test'.format(*sc))

    #huge file for some reason, return the best model for processing
    #dump trans, svc, ct, Xtpn, and then fit in function.
    df = {'svc': svc, 'trans': trans, 'pca': pca}
    with open('precip model.dat', 'wb') as f:
        pickle.dump(df, f)

    #sklearn! Nearest neighbor
    neigh = neighbors.KNeighborsClassifier()
    neigh.fit(Xt, ct)
    sn = [neigh.score(Xt, ct), neigh.score(Xs, cs)]
    #perfect fit, duh
    print('Nearest neighbor: {:.0%} training, {:.0%} test'.format(*sn))

    print('Returning models')
    #huge file for some reason, return the best model for processing
    return svc, neigh, pca, trans
# Report the c1 distance statistics computed earlier
# (mean, standard deviation and the resulting threshold).
print('Distance of c1 in training set:')
print('{:18s} = {:.4f}'.format('Mean', mu_c1))
print('{:18s} = {:.4f}'.format('Standard deviation', sd_c1))
print('{:18s} = {:.4f}\n'.format('Threshold', threshold_c1))

# Pass rate of the Reliability check: stage-2 survivors relative to
# stage-1 survivors. Nothing left means later stages have no input.
pass_rate = utils.get_rate(x_passed_s2, x_passed_s1)
print(f'Pass rate = {pass_rate * 100:.4f}%')
if pass_rate == 0:
    raise Exception('All samples are blocked by Reliability check')

# Stage 3 - Decidability
print('\n---------- Decidability ----------------')
# Distance-weighted k-NN fitted on the training data; it is handed to
# check_decidability together with the stage-2 samples and predictions.
model_knn = knn.KNeighborsClassifier(n_neighbors=k,
                                     n_jobs=-1,
                                     weights='distance')
model_knn.fit(x_train, y_train)
x_passed_s3, ind_passed_s3 = ad.check_decidability(x_passed_s2,
                                                   pred_passed_s2, model_knn)

# Print the stage-3 pass rate (relative to the stage-2 survivors)
pass_rate = utils.get_rate(x_passed_s3, x_passed_s2)
print(f'Pass rate = {pass_rate * 100:.4f}%')
if pass_rate == 0:
    raise Exception('All samples are blocked by Decidability check')

# Samples that passed every stage shown here
x_passed_ad = x_passed_s3
def plot_mult_decision_boundary(ax,
                                X,
                                y,
                                k,
                                scaled=True,
                                title='Title',
                                xlabel='xlabel',
                                ylabel='ylabel',
                                hard_class=True):
    """Plot the decision boundary of a kNN classifier.

    Builds and fits a sklearn kNN classifier internally.
    X must contain only 2 continuous features.

    Function modeled on sci-kit learn example.

    The plotted region is fixed to x in [45, 85] and y in [2, 4], and the
    background colours are defined for exactly two classes.

    Parameters
    ----------
    ax: Matplotlib axes object
        The plot to draw the data and boundary on
    X: numpy array
        Training data
    y: numpy array
        Target labels
    k: int
        The number of neighbors that get a vote.
    scaled: boolean, optional (default=True)
        If true scales the features, else uses features in original units
    title: string, optional (default = 'Title')
        A string for the title of the plot
    xlabel: string, optional (default = 'xlabel')
        A string for the label on the x-axis of the plot
    ylabel: string, optional (default = 'ylabel')
        A string for the label on the y-axis of the plot
    hard_class: boolean, optional (default = True)
        Use hard (deterministic) boundaries vs. soft (probabilistic) boundaries

    Returns
    -------
    None
    """
    x_mesh_step_size = 0.1
    y_mesh_step_size = 0.01

    # Hard code in colors for classes, one class in red, one in blue
    bg_colors = np.array(
        [np.array([255, 150, 150]) / 255,
         np.array([150, 150, 255]) / 255])
    cmap_light = ListedColormap(bg_colors)
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

    # Build a kNN classifier, optionally behind a feature-scaling pipeline
    clf = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    if scaled:
        clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = 45, 85
    y_min, y_max = 2, 4
    xx, yy = np.meshgrid(np.arange(x_min, x_max, x_mesh_step_size),
                         np.arange(y_min, y_max, y_mesh_step_size))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    if hard_class:
        dec_boundary = clf.predict(mesh_points).reshape(xx.shape)
        ax.pcolormesh(xx, yy, dec_boundary, cmap=cmap_light)
        # A fixed colour makes a colormap meaningless, so none is passed.
        ax.scatter(X[:, 0], X[:, 1], c='black')
    else:
        dec_boundary = clf.predict_proba(mesh_points)
        # Blend the two background colours by the class probabilities.
        colors = dec_boundary.dot(bg_colors)
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        # Take the image shape from the mesh instead of hard-coding
        # (200, 400, 3), so it cannot drift from the ranges/steps above.
        ax.imshow(colors.reshape(xx.shape + (3, )),
                  origin="lower",
                  aspect="auto",
                  extent=(x_min, x_max, y_min, y_max))

    ax.set_title(title + ", k={0}, scaled={1}".format(k, scaled))
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_xlim((x_min, x_max))
    ax.set_ylim((y_min, y_max))
#df = df.drop(df[df.label=='4u-Amantha'].index)
df.describe()

# Features are every column except the first and the last; the target is 'label'.
X = df[list(df.columns)[1:-1]]
y = df['label']

#apply preprocessing to X
#X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
# Fit on the training split only. Fitting on all of X, y (as before) puts the
# test rows into the model and inflates the accuracy reported below.
clf.fit(X_train, y_train)

y_predictions = clf.predict(X_test)

#for i in range(0,len(y_predictions)):
#print y_predictions[i], y_test.as_matrix()[i]

# Single-argument, parenthesised prints give the same output under the
# Python 2 print statement and the Python 3 print function.
print('Accuracy: %s' % clf.score(X_test, y_test))

#printing the training data size for each element
print(collections.Counter(y_train.factorize()[0]))

#draw confusion matrix
#get all labels
#le = preprocessing.LabelEncoder()
lr__C = [1]
)

# Grid search over the logistic-regression pipeline, scored by accuracy.
lr_grid_search = GridSearchCV(lr_pipe, lr_parameters, cv = cv,
                              scoring = 'accuracy')
lr_grid_search.fit(trainData[Predictors], trainData['Survived'])

# In[76]:

# k-NN pipeline: PCA components and the SelectKBest features are computed
# side by side and concatenated (FeatureUnion) before the classifier.
kn_pipe = Pipeline(steps = [('feature_union',
                             FeatureUnion([('pca', PCA()),
                                           ('select_KBest', SelectKBest())
                                           ])),
                            ('kn', neighbors.KNeighborsClassifier())
                            ])

# Parameter grid keyed as <step>__<param>; every list holds a single value,
# so the search evaluates exactly one configuration.
kn_parameters = dict(feature_union__pca__n_components = [30],
                     feature_union__pca__whiten = [True],
                     feature_union__select_KBest__k = [45],
                     kn__n_neighbors = [4],
                     kn__algorithm = ['auto'],
                     kn__leaf_size = [10],
                     kn__weights = ['uniform'],
                     kn__p = [1]
                     )

kn_grid_search = GridSearchCV(kn_pipe, kn_parameters, cv = cv,
max = scores.mean() print(x, " ", max) if max > Gmax: Gmax = max kernel = x print("the best kernel is ", kernel) # ## KNN # In[17]: from sklearn import neighbors bestk = 0 best_score = 0 for j in range(1, 60): classifier = neighbors.KNeighborsClassifier(n_neighbors=j) max = -1 for i in range(3, 20): scores = kcv(classifier, features_scaled, ans, cv=i) if scores.mean() > max: max = scores.mean() if best_score < max: best_k = j best_score = max print("best value of k ", best_k, " with mean score ", best_score) # ## Naive Bayes # In[40]: from sklearn import naive_bayes as nb
continue if i == '?': cur.append(0) else: cur.append(float(i)) except ValueError, e: print "error", e, "on line", sz print "Processing sample " + str(sz) + " = ", cur if sz < TRAINING_TUPLES: Xnow.append(cur[:-1]) # learn more about slice on SO Ynow.append(cur[-1:][0]) else: testTuple.append(cur) aknn = neighbors.KNeighborsClassifier(2, weights='distance') Xtrain = np.array(Xnow) # Create an empty numpy array Ytrain = np.array(Ynow) # Result of each sample print Xtrain, Ytrain aknn.fit(Xtrain, Ytrain) for tup in testTuple: cur = [] for i in range(12): cur.append(tup[i]) y = aknn.predict(np.array(cur)) real = tup[-1:][0] # take the last value if (y <= 1):
@author: lucas """ import numpy as np from sklearn import preprocessing, model_selection, neighbors import pandas as pd df = pd.read_csv('house-votes-84.data') df.replace('?', -9999, inplace=True) df.replace('republican', 0, inplace=True) df.replace('democrat', 1, inplace=True) df.replace('y', 1, inplace=True) df.replace('n', 1, inplace=True) #df.drop(['id'],1,inplace=True) print(df) X = np.array(df.drop(['party'], 1)) y = np.array(df['party']) X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2) clf = neighbors.KNeighborsClassifier() clf.fit(X_train, y_train) accuracy = clf.score(X_test, y_test) print(accuracy) example_measure = np.array([0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1]) example_measure = example_measure.reshape(1, -1) prediction = clf.predict(example_measure) print(prediction)
def __init__(self, data, labels, training_fraction, arguments): super().__init__(data, labels, training_fraction) # n_neighbors = int(input("Choose a number of neighbors:")) # self.model = n.KNeighborsClassifier(n_neighbors) self.model = n.KNeighborsClassifier()
testImageReconConcat = np.concatenate((testImageReconConcat, testImage), axis=1) for K in valuesOfK: output = averageFace for i in range(0, K): output = np.add(output, eigenFaces[i] * lowDimTestImage[i]) cv2.putText(output,'K='+str(K),(0,20), font, 0.5,(255,255,255)) testImageReconConcat = np.concatenate((testImageReconConcat, output), axis=1) # Display result at 2x size cv2.imshow("Train image reconstruction",cv2.resize(trainImageReconConcat, (0,0), fx=2, fy=2) ) cv2.imshow("Test image reconstruction", cv2.resize(testImageReconConcat, (0,0), fx=2, fy=2)) # classification now # lowDimImages contains array of image vectors of train images (320 such vectors) faceClassifier = neighbors.KNeighborsClassifier(n_neighbors = 3) y=[] # filling y with target class. every 8 images is 1 class, total 40 classes for i in range(0,40): faceClass = i for j in range(0,8): y.append(faceClass) faceClassifier.fit(lowDimImages, y) #classifier is now trained. we now load test images and their base truth classes groundTruth = [] # filling groundTruth with target class. evergroundTruth 2 images is 1 class, total 40 classes for i in range(0,40): faceClass = i for j in range(0,2):
Y, test_size=0.3, random_state=random.seed()) # préparation de la validation croisée ... from sklearn.cross_validation import KFold kf = KFold(len(X_train), n_folds=10, shuffle=True) scores = [] # pour sélectionner le paramètre optimal k from sklearn import neighbors for k in range(1, 30): score = 0 clf = neighbors.KNeighborsClassifier(k) for learn, test in kf: X_train_val = [X_train[i] for i in learn] Y_train_val = [Y_train[i] for i in learn] clf.fit(X_train_val, Y_train_val) X_test_val = [X_train[i] for i in test] Y_test_val = [Y_train[i] for i in test] score = score + clf.score(X_test_val, Y_test_val) scores.append(score) # valeur optimale de k : k_opt = scores.index(max(scores)) + 1 # affichage de tous les scores. On constate : # - que les scores correspondant aux petites valeurs de k (<= 5 ou 10) sont proches # - que les scores diminuent sensiblement pour des valeurs de k supérieures
# Decide whether (latitude, longitude) lies in a known crime area and, if so,
# which crime type its nearest recorded neighbours suggest.
# Expects `latitude`, `longitude`, `EllipticEnvelope`, `train_test_split` and
# `neighbors` to be in scope already.
df = pd.read_csv(
    "C:/Users/Sangameswaran/WebstormProjects/WonderWoman/PythonScripts/crime.csv"
)
df = df.drop(['crimetime'], axis=1)
# `axis` is passed by keyword; the positional form is rejected by pandas >= 2.
X = np.array(df.drop(['type'], axis=1))
y = np.array(df['type'])

# Outlier detection over the recorded crime locations: a query point flagged
# as an outlier (-1) is treated as being outside the crime-prone area.
elliptic = EllipticEnvelope(contamination=0.15)
elliptic.fit(X)
prediction = elliptic.predict([[latitude, longitude]])

if prediction == -1:
    possibility = "Safe zone"
else:
    # Human-readable names for the numeric codes stored in the 'type' column.
    crime_names = {
        0: "Sexual abuse",
        1: "Robbery",
        2: "Rape",
        3: "Homicide",
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
    clf.fit(X_train, y_train)
    val = np.array([[latitude, longitude]])
    p = clf.predict(val)
    # Fall back to "Unknown" for an unmapped code so that `possibility` is
    # always bound before it is printed (previously a NameError).
    possibility = crime_names.get(p[0], "Unknown")

print(possibility)
import numpy as np
from sklearn import preprocessing, neighbors
import pandas as pd

# `sklearn.cross_validation` was removed from scikit-learn; prefer the
# `model_selection` home of train_test_split and fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load and clean the data once: it does not change between repetitions, so
# re-reading the CSV on every iteration was wasted work.
df = pd.read_csv('breast-cancer-wisconsin.data')
# '?' marks a missing value; map it to a value far outside the feature range.
df.replace('?', -99999, inplace=True)
# `axis` is passed by keyword; the positional form is rejected by pandas >= 2.
df.drop(['id'], axis=1, inplace=True)

X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])

# Repeat the random 80/20 split 25 times and average the test accuracy.
accuracies = []
for i in range(25):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    clf = neighbors.KNeighborsClassifier(n_jobs=-1)
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)
    accuracies.append(accuracy)

print(sum(accuracies) / len(accuracies))
# print(accuracy)

# example_measures = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [4, 2, 1, 2, 2, 2, 3, 2, 1]])
# example_measures = example_measures.reshape(len(example_measures), -1)
# prediction = clf.predict(example_measures)
# print(prediction)
def compute_accuracy_nn(data_train, labels_train, data_test, labels_test, k=5):
    """Evaluate a distance-weighted k-nearest-neighbours classifier.

    Builds a ``KNeighborsClassifier`` with ``k`` neighbours and
    ``weights="distance"``, then hands it together with the train/test data
    to ``compute_accuracy_classifier`` and returns that function's result
    unchanged.
    """
    knn = neighbors.KNeighborsClassifier(n_neighbors=k, weights="distance")
    return compute_accuracy_classifier(
        knn, data_train, labels_train, data_test, labels_test
    )
# Classifying oranges, apples and pears
from sklearn import neighbors

# Features: weight (g), orange, red, green, texture (0 = smooth, 1 = wrinkled)
features = [
    [120, 0, 1, 0, 0], [110, 0, 1, 0, 0], [125, 0, 1, 0, 0],  # apples
    [150, 1, 0, 0, 1], [170, 1, 0, 0, 1], [145, 1, 0, 0, 1],  # oranges
    [80, 0, 0, 1, 0], [70, 0, 0, 1, 0], [90, 0, 0, 1, 0],  # pears
]
labels = [
    'maca', 'maca', 'maca',
    'laranja', 'laranja', 'laranja',
    'pera', 'pera', 'pera',
]

clf = neighbors.KNeighborsClassifier(3)  # number of neighbours
clf = clf.fit(features, labels)

# Single-argument print call: same output on Python 2, valid on Python 3.
print(clf.predict([[90, 0, 0, 1, 0]]))
print("Producing KFold indexes") kfold = cv.KFold(amount, n_folds=10, shuffle=True) print("Evaluating model with KFold") counter = 0 errors = numpy.zeros(len(kfold)) wrongs = [] for train_index, test_index in kfold: print(counter) trainFeatures = [features[i] for i in train_index] trainClasses = [classes[i] for i in train_index] testFeatures = [features[i] for i in test_index] testClasses = [classes[i] for i in test_index] model = neighbors.KNeighborsClassifier(n_neighbors=1) model.fit(trainFeatures, trainClasses) predictedClasses = model.predict(testFeatures) errors[counter - 1] = errorRate(testClasses, predictedClasses) for i in range(len(testClasses)): if testClasses[i] != predictedClasses[i]: wrongs.insert(0, (predictedClasses[i], testClasses[i])) print(errors[counter - 1]) counter = counter + 1 wrongDict = dict() for pred, actual in wrongs: if actual in wrongDict: wrongDict[actual].insert(0, pred) else:
# Write one space-separated line per sample (its x_mean values followed by
# its label) to `ch_dfa`, then train a k-NN classifier on (x, y) and print
# the confusion-matrix counts for labels 1 (positive) and 0 (negative).
# Expects `x_mean`, `x`, `y`, `sentence`, `ch_dfa`, `np`, `neighbors` and
# `train_test_split` to be in scope already.
for i, row in enumerate(x_mean):
    sentence.extend(str(value) for value in row)
    sentence.append(str(y[i]))
    ch_dfa.write(' '.join(sentence) + '\n')
    sentence = []
ch_dfa.flush()

TP, FP, FN, TN = 0, 0, 0, 0
x_array = np.array(x)
y_array = np.array(y)
usx = x_array
usy = y_array
x_train, x_test, y_train, y_test = train_test_split(
    usx, usy, test_size=0.2)  # test_size: proportion of train/test data

clf = neighbors.KNeighborsClassifier(algorithm='kd_tree')
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)

# Tally the outcomes pairwise; pairs whose values are not 0/1 are ignored,
# exactly as with the four independent comparisons this replaces.
for actual, predicted in zip(y_test, y_predict):
    if actual == 1 and predicted == 1:
        TP += 1
    elif actual == 0 and predicted == 1:
        FP += 1
    elif actual == 1 and predicted == 0:
        FN += 1
    elif actual == 0 and predicted == 0:
        TN += 1

# Single-argument print calls: same output on Python 2, valid on Python 3.
print('TP: ' + str(TP))
print('FP: ' + str(FP))
print('FN: ' + str(FN))
print('TN: ' + str(TN))
def algorithm_compare():
    """Fit four classifiers on the ``Factor`` records and collect their output.

    Every ``Factor`` row contributes the feature vector
    ``[organic_matter, total_nitrogen, available_P, available_K]`` and the
    label ``land_capability``. The data is split 80/20 at random and a
    decision tree, a k-NN classifier, a linear SVM and a logistic regression
    are fitted on the training part, in that order.

    Returns:
        A tuple ``(data, clf, clf_data)`` where

        * ``data`` is ``[x, y, x_train, y_train, x_test, y_test]``;
        * ``clf`` is ``[decision_tree, knn, linear_svm, logistic_regression]``;
        * ``clf_data`` holds, for each classifier in that same order, three
          consecutive entries: its predictions on ``x_train``, its
          predictions on the full ``x``, and the ``classification_report``
          text comparing ``y`` with the full-data predictions.
    """
    # Read the data in a single pass over the query results.
    rows = []
    labels = []
    for factor in Factor.objects.all():
        rows.append([
            factor.organic_matter,
            factor.total_nitrogen,
            factor.available_P,
            factor.available_K,
        ])
        labels.append(factor.land_capability)
    x = np.array(rows)
    y = np.array(labels)

    # Split into training and test data.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    # Order matters: callers index both returned lists by position.
    classifiers = [
        tree.DecisionTreeClassifier(criterion='entropy'),
        neighbors.KNeighborsClassifier(algorithm='kd_tree'),
        # C trades training precision against overfitting; too large a value
        # may overfit.
        svm.LinearSVC(C=3.5),
        linear_model.LogisticRegression(),
    ]
    for model in classifiers:
        model.fit(x_train, y_train)

    # The report assumes exactly six land-capability classes.
    class_names = ['class 1', 'class 2', 'class 3',
                   'class 4', 'class 5', 'class 6']

    # Per classifier: training-set predictions, full-data predictions, report.
    # Test-set precision/recall is not computed because it is not returned.
    clf_data = []
    for model in classifiers:
        train_answer = model.predict(x_train)
        full_answer = model.predict(x)
        report = classification_report(y, full_answer,
                                       target_names=class_names)
        clf_data.extend([train_answer, full_answer, report])

    data = [x, y, x_train, y_train, x_test, y_test]
    return data, classifiers, clf_data
# Inspect the iris data, fit a 15-neighbour classifier on the first two
# features, and list the neighbours that decide one sample query.
# Expects `iris` and `neighbors` to be in scope already.
print(f"target_names: {iris.target_names}")

# Keep only the first two feature columns.
X = iris.data[:, :2]
print(f'X: {len(X)}, {type(X)}')
print(f'X head: {(X[:5])}, {type(X)}')

y = iris.target
print(f"y: {len(y)}, {type(y)}")

# Count how many samples carry each target value (kept as a plain dict so the
# printed type is unchanged).
d = dict()
for target in y:
    d[target] = d.get(target, 0) + 1
print(f"y counts: {d}, {type(d)}")

n_neighbors = 15
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                     algorithm='kd_tree')
clf.fit(X, y)
print(clf.get_params())

results = clf.predict([[4.8, 3.3]])
print(results)

# kneighbors() returns (distances, indices); pair every neighbour index of
# the single query point with its features and target.
nearest = clf.kneighbors([[4.8, 3.3]])
nearest_neighbors = [(X[index], y[index]) for index in nearest[1][0]]
print(nearest_neighbors)
# [0]
# [(array([4.8, 3.4]), 0), (array([4.8, 3.4]), 0), (array([4.7, 3.2]), 0),
# (array([4.7, 3.2]), 0), (array([4.8, 3.1]), 0), (array([5. , 3.3]), 0),
# (array([4.9, 3.1]), 0), (array([4.6, 3.2]), 0), (array([4.9, 3.1]), 0),
# (array([5. , 3.2]), 0), (array([4.6, 3.4]), 0), (array([5. , 3.4]), 0),
# (array([5. , 3.4]), 0), (array([4.6, 3.1]), 0), (array([5. , 3.5]), 0)]