class NearestMeanClassifier(BaseClassifier):
    """Nearest-centroid classifier using the Manhattan metric.

    Labels of any sortable type (e.g. strings) are converted to integer IDs
    before they are handed to the wrapped ``NearestCentroid`` model.  The
    label -> ID mapping is learned in :meth:`train` and reused afterwards, so
    evaluation data that contains only a subset of the training classes is
    still encoded consistently with the fitted model.
    """

    def __init__(self, feature_length, num_classes):
        super().__init__(feature_length, num_classes)
        self.num_classes = num_classes
        # Sorted array of the distinct labels seen by the last train() call;
        # None until the classifier has been trained.
        self._classes = None

        # model build
        # pass shrink_threshold to NearestCentroid for a Nearest Shrunken
        # Centroid Classifier
        self.model = NearestCentroid(metric='manhattan')

    def train(self, features, labels):
        """
        Using a set of features and labels, trains the classifier and returns
        the training accuracy.

        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to train to predict
        :return: Prediction accuracy, as a float between 0 and 1
        """
        # Remember the class list so predict() encodes labels the same way.
        self._classes, labels = unique(labels, return_inverse=True)
        self.model.fit(features, labels)
        return self.model.score(features, labels)

    def get_prediction(self, features):
        """
        Return the model's predictions for the given samples.

        :param features: sample(s) to predict
        :return: prediction from the model (integer label IDs)
        """
        return self.model.predict(features)

    def predict(self, features, labels):
        """
        Using a set of features and labels, predicts the labels from the
        features, and returns the accuracy of predicted vs actual labels.

        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to test prediction accuracy on
        :return: Prediction accuracy, as a float between 0 and 1
        """
        labels = self.labels_to_categorical(labels)
        return self.model.score(features, labels)

    def labels_to_categorical(self, labels):
        """
        Convert the labels to integer IDs.

        Once the classifier has been trained, the IDs are the positions of
        the labels in the sorted list of training classes; a label that was
        not seen during training is mapped to -1 (it can never be predicted
        correctly).  Before training, the IDs are derived from ``labels``
        alone.

        :param labels: list of labels (e.g. strings)
        :return: array of integer IDs, one per label
        """
        classes = getattr(self, "_classes", None)
        if classes is None:
            _, ids = unique(labels, return_inverse=True)
            return ids

        # Encode the (few) distinct labels first, then expand per sample.
        seen, inverse = unique(labels, return_inverse=True)
        positions = classes.searchsorted(seen)
        # searchsorted returns len(classes) for values past the end; clamp so
        # the membership check below can index safely.
        positions[positions == len(classes)] = 0
        unknown = classes[positions] != seen
        positions[unknown] = -1
        return positions[inverse]
def test_kernel_sef():
    """
    Basic smoke test for KernelSEF.

    Checks the shape of the projected data, that the supervised training loss
    decreases, and that nearest-centroid accuracy on the projection improves
    after training.
    :return:
    """

    def _centroid_accuracy(embedding):
        """Validate the embedding shape and return its NearestCentroid training accuracy."""
        assert embedding.shape[0] == 100
        assert embedding.shape[1] == 12
        centroid_clf = NearestCentroid()
        centroid_clf.fit(embedding, train_labels)
        return centroid_clf.score(embedding, train_labels)

    np.random.seed(1)
    train_data = np.random.randn(100, 50)
    train_labels = np.random.randint(0, 2, 100)

    proj = KernelSEF(train_data, 50, output_dimensionality=12, kernel_type='rbf')
    proj._initialize(train_data)

    # Accuracy of the untrained projection
    acc_before = _centroid_accuracy(proj.transform(train_data, batch_size=8))

    loss = proj.fit(
        data=train_data,
        target_labels=train_labels,
        epochs=200,
        target='supervised',
        batch_size=8,
        regularizer_weight=0,
        learning_rate=0.0001,
        verbose=False,
    )

    # Ensure that loss is reducing
    assert loss[0] > loss[-1]

    # Accuracy of the trained projection
    acc_after = _centroid_accuracy(proj.transform(train_data, batch_size=8))

    assert acc_after > acc_before
def test_pickle():
    """Check that a fitted NearestCentroid scores the same after a pickle round trip."""
    import pickle

    # classification
    fitted = NearestCentroid()
    fitted.fit(iris.data, iris.target)
    expected_score = fitted.score(iris.data, iris.target)

    # Serialize and immediately restore the estimator
    restored = pickle.loads(pickle.dumps(fitted))
    assert_equal(type(restored), fitted.__class__)

    restored_score = restored.score(iris.data, iris.target)
    assert_array_equal(
        expected_score,
        restored_score,
        "Failed to generate same score"
        " after pickling (classification).",
    )
def test_pickle():
    """Check that a fitted NearestCentroid scores the same after a pickle round trip."""
    import pickle

    # classification
    fitted = NearestCentroid()
    fitted.fit(iris.data, iris.target)
    expected_score = fitted.score(iris.data, iris.target)

    # Serialize and immediately restore the estimator
    restored = pickle.loads(pickle.dumps(fitted))
    assert_equal(type(restored), fitted.__class__)

    restored_score = restored.score(iris.data, iris.target)
    assert_array_equal(
        expected_score,
        restored_score,
        "Failed to generate same score"
        " after pickling (classification).",
    )
def nc_fit(Xtrain, Xtest, Xtrain_lbls, Xtest_lbls, name, data, t0=None):
    """Fit a NearestCentroid classifier, report its test score and timing.

    Prints one tab-separated line: ``name``, elapsed seconds since ``t0``,
    the test-set accuracy and ``data``.

    :param Xtrain: training samples
    :param Xtest: test samples
    :param Xtrain_lbls: labels of the training samples
    :param Xtest_lbls: labels of the test samples
    :param name: label printed in the first column of the report line
    :param data: description printed in the last column of the report line
    :param t0: start timestamp (as returned by ``time()``) to measure the
        elapsed time from; defaults to the moment this function is called
    :return: predicted labels for ``Xtest``
    """
    # A ``t0=time()`` default would be evaluated once, when the function is
    # defined, and make every call report the time since import.
    if t0 is None:
        t0 = time()

    # Create a nearest centroid classifier and train it
    clf = NearestCentroid()
    clf.fit(Xtrain, Xtrain_lbls)

    # Create prediction for test data
    y_pred_test = clf.predict(Xtest)

    # How well does it fit
    score = clf.score(Xtest, Xtest_lbls)
    print('%-9s\t%.2fs\t%-9s\t%-9s' % (name, (time() - t0), score, data))
    return y_pred_test
class NearestCentroidClassfier(Classifier):
    """Nearest-centroid classifier built on the project's ``Classifier`` base.

    Data loading, splitting and evaluation are inherited from ``Classifier``;
    this class only provides the model and its fitting routine.
    """

    def __init__(self, train_set=None, val_set=None, data_file=None, header=0,
                 test_size=0.2, feature_col_range=None, label_col=-1,
                 features_degree=3):
        """Forward all settings to ``Classifier.__init__``.

        :param feature_col_range: two-element column range of the features;
            defaults to ``[2, 9]``
        """
        # Build a fresh list per instance instead of sharing one mutable
        # default object with the parent class.
        if feature_col_range is None:
            feature_col_range = [2, 9]
        Classifier.__init__(self, train_set, val_set, data_file, header,
                            test_size, feature_col_range, label_col,
                            features_degree, True)

    def fit(self, metric='manhattan', shrink_threshold=None):
        """Train the nearest-centroid model and evaluate it on the test split.

        :param metric: distance metric passed to the model
        :param shrink_threshold: threshold for shrinking centroids to remove
            features; ``None`` (default) disables shrinking.  Must be
            positive when given.
        """
        print('Using Nearest Centroid Classfier...')
        self.model = Model(metric=metric, shrink_threshold=shrink_threshold)
        self.model.fit(self.X_train, self.y_train)

        print('\nTrain Set Accuracy: ',
              self.model.score(self.X_train, self.y_train) * 100)

        # Predicting the Test set results
        if len(self.X_test) > 0:
            print('\nEvaluating on test set...')
            y_pred = self.predict(self.X_test)
            self.score = self.evaluate(X=self.X_test, y=self.y_test)

            # Making the Confusion Matrix
            self.cm = confusion_matrix(self.y_test, y_pred,
                                       labels=list(range(num_labels)))

    def probality(self, X=None, data_file=None, header=0,
                  feature_col_range=[1, 9]):
        """Unsupported for this classifier; prints a notice and returns None.

        All arguments are accepted for interface compatibility and ignored.
        """
        print('probality() is not supported')
        return None
def test_disambiguator_store(self):
    """Round-trip a fitted estimator through a DisambiguatorStore.

    Stores a fitted NearestCentroid together with its vocabulary and target
    names, restores everything from the file, and checks that centroids,
    target names, vocabulary and score are unchanged.
    """
    import os
    import shutil

    # Create a silly classifier that disambiguates between "stam" (tree
    # trunk) or "romp" (body trunk) as the Dutch translation of the
    # English noun "trunk"
    lempos = u"trunk/n"
    # FIXME: store_fit() should only accept unicode strings
    target_names = u"stam romp".encode("utf-8").split()
    vocab = u"boom hoofd".split()

    X = np.array([[0, 1],
                  [1, 0],
                  [0, 1],
                  [1, 0]])
    y = np.array([1, 0, 1, 0])

    estimator = NearestCentroid()
    estimator.fit(X, y)

    centroids = estimator.centroids_
    score = estimator.score(X, y)

    # Use a private temporary directory so the store file has a path that
    # does not exist yet and is reliably removed afterwards.
    tmp_dir = tempfile.mkdtemp()
    fname = os.path.join(tmp_dir, "disambiguator_store")
    try:
        # Store estimator
        f = DisambiguatorStore(fname, "w")
        try:
            f.save_estimator(NearestCentroid())
            f.save_vocab(vocab)
            f.store_fit(lempos, estimator)
            f.save_target_names(lempos, target_names)
        finally:
            f.close()

        # Restore estimator
        f2 = DisambiguatorStore(fname)
        try:
            estimator2 = f2.load_estimator()
            vocab2 = f2.load_vocab()
            f2.restore_fit(lempos, estimator2)
            target_names2 = f2.load_target_names(lempos)
            centroids2 = estimator2.centroids_
            score2 = estimator2.score(X, y)

            # Compare while the store is still open, in case restored
            # attributes are backed by the file.
            assert_array_equal(centroids, centroids2)
            assert target_names == target_names2
            assert vocab == vocab2
            assert score == score2
        finally:
            f2.close()
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
class scikit_NearestCentroid(MLAlgo):
    """MLAlgo wrapper around scikit-learn's NearestCentroid classifier.

    The data matrices passed to ``train``, ``test`` and ``cross_validate``
    hold the features in all columns but the last and the label in the last
    column.
    """

    def __init__(self):
        self.clf = NearestCentroid()
        self.className = self.__class__.__name__

    def train(self, train_data):
        """Fit the classifier on ``train_data`` and return a status message."""
        train_X = train_data[:, :-1]
        train_Y = train_data[:, -1]
        self.clf.fit(train_X, train_Y)
        print("NearestCentroid model built.")
        return self.className + " Training finished...\n"

    def test(self, test_data):
        """Print the accuracy on ``test_data`` and return a status message."""
        test_X = test_data[:, :-1]
        test_Y = test_data[:, -1]
        print("Accuracy: ", self.clf.score(test_X, test_Y))
        return self.className + " Testing finished...\n"

    def predict(self, predict_data):
        """Print the predictions for ``predict_data`` (features only) and
        return a status message."""
        print("Predictions: ", self.clf.predict(predict_data))
        return self.className + " Prediction finished...\n"

    def cross_validate(self, train_data):
        """Run 10-fold cross-validation and record the best model on MLAlgo.

        If the cross-validated accuracy beats
        ``MLAlgo.cross_validate_accuracy``, that value, ``MLAlgo.classifier``
        and ``MLAlgo.trained_instance`` are updated to this instance.

        :return: status message
        """
        X_ = train_data[:, :-1]
        Y_ = train_data[:, -1]
        predicted = cross_val_predict(self.clf, X_, Y_, cv=10)

        # Compute the accuracy once and reuse it below.
        accuracy = metrics.accuracy_score(Y_, predicted)
        print("Cross-validation accuracy: ", accuracy)

        if accuracy > MLAlgo.cross_validate_accuracy:
            MLAlgo.cross_validate_accuracy = accuracy
            MLAlgo.classifier = self.clf
            MLAlgo.trained_instance = self
        return self.className + " Cross validation finished...\n"
print("Nested Scores:\n{}".format(knnc_nested_scores)) print("Max score: {} (index {})\n".format(knnc_nested_scores.max(), np.argmax(knnc_nested_scores))) #---------------------------------------------------------------------------------------------- #Confusion Matrix KNN NC shrink_threshold = 0 # Split the data into a training set and a test set X_train, X_test, y_train, y_test = train_test_split(X_knnc, y, random_state=3) knnc_classifier = NearestCentroid(shrink_threshold=shrink_threshold).fit( X_train, y_train) print("KNN Nearest Centroid Confusion Matrix using the best parameters") print("Train score:{}".format(knnc_classifier.score(X_train, y_train))) print("Test score:{}\n".format(knnc_classifier.score(X_test, y_test))) class_names = [ 'grab', 'hit', 'massage', 'pat', 'pinch', 'poke', 'press', 'rub', 'scratch', 'slap', 'squeeze', 'stroke', 'tap', 'tickle' ] np.set_printoptions(precision=2) # Plot non-normalized confusion matrix titles_options = [("Confusion matrix, without normalization", None), ("Normalized confusion matrix", 'true')] for title, normalize in titles_options: disp = plot_confusion_matrix(knnc_classifier, X_test, y_test,
def nearest_centroid(x_train, y_train, x_test, y_test):
    """Train a NearestCentroid model and return its test accuracy.

    The accuracy is returned as a string formatted with two decimals.
    """
    from sklearn.neighbors import NearestCentroid

    model = NearestCentroid()
    model.fit(x_train, y_train)
    test_accuracy = model.score(x_test, y_test)
    return format(test_accuracy, ".2f")
# 21 features
print(x.shape)
print(y.shape)

# Split into train and test sets (pass random_state=8 to train_test_split
# for a reproducible split)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.15, shuffle=True
)

# Baseline - Nearest centroid (timing covers fitting and scoring)
tic = timeit.default_timer()
nc = NearestCentroid()
nc.fit(x_train, y_train)
print('Train-acc:', nc.score(x_train, y_train))
print('Test-acc:', nc.score(x_test, y_test))
toc = timeit.default_timer()
elapsed_time = NC_time = toc - tic
print('Elapsed time: ', elapsed_time, 'seconds')

# Gaussian Naive Bayes - a Gaussian distribution is assumed
tic = timeit.default_timer()
model = GaussianNB()
model.fit(x_train, y_train)
toc = timeit.default_timer()
print('Train-acc:', model.score(x_train, y_train))
print('Test-acc:', model.score(x_test, y_test))
# The timer is read again so the elapsed time also covers scoring
toc = timeit.default_timer()
elapsed_time = toc - tic
def test_ncm(alg_list, datasets, seed=28):
    """
    Evaluates the algorithms specified in the datasets provided.

    For every dataset and algorithm, the distance metric learner is fitted on
    each of the 10 folds and the transformed data is classified with a
    nearest-centroid classifier. Train score, test score and fitting time are
    written, together with their mean and standard deviation, to CSV and HTML
    files under ``../results``.

    Parameters
    ----------

    alg_list : list

        The list of algorithms. Each item must be a 5-tuple
        (alg, name, key, ks, cons), where 'alg' is the algorithm, 'name',
        'key' and 'ks' are parallel sequences giving, for each evaluated
        configuration, its string name, its key-name (used in the output file
        names) and its number of centroids per class, and 'cons' is the
        initialization code of the algorithm.

    datasets : list

        The list of datasets to use. Each item must be a pair (str, frac),
        where 'str' is the name of the dataset and 'frac' is the fraction of
        the dataset to take (for big datasets).

    seed : int

        Seed set through ``np.random.seed`` before each fold is evaluated.
    """
    print("* NEAREST CENTROIDS TEST STARTED")
    mms = MinMaxScaler()
    rownames = ["FOLD " + str(i + 1) for i in range(10)]
    results = {}

    for dset, f in datasets:
        print("** DATASET ", dset)

        folds, [n, d, c] = ds.reduced_dobscv10(dset, f)

        print("** SIZE ", n, " x ", d, " [", c, " classes]")

        results[dset] = {}
        norm_folds = []

        for i, (xtr, ytr, xtst, ytst) in enumerate(folds):
            print("*** NORMALIZING FOLD ", i + 1)
            # Normalizing
            xtr = mms.fit_transform(xtr)
            xtst = mms.transform(xtst)
            norm_folds.append((xtr, ytr, xtst, ytst))

        for dml, dml_name, dml_key, ks, cons in alg_list:
            print("*** EVALUATING DML ", dml_name)
            # Rows 0-9 hold the folds, rows 10 and 11 the mean and std;
            # columns are train score, test score and time.
            results[dset] = defaultdict(lambda: np.zeros([12, 3]))

            for i, (xtr, ytr, xtst, ytst) in enumerate(norm_folds):
                print("**** FOLD ", i + 1)
                np.random.seed(seed)

                try:
                    print("***** TRAINING")
                    start = time.time()     # Start timer
                    dml.fit(xtr, ytr)       # Fitting distance
                    end = time.time()       # Stop timer
                    elapsed = end - start   # Timer measurement

                    xtr2 = dml.transform()
                    xtst2 = dml.transform(xtst)

                    for namek, keyk, k in zip(dml_name, dml_key, ks):
                        print("****** TEST NCM [", k, " CTRD]")
                        if k == 1:
                            ncm = NearestCentroid()
                        else:
                            ncm = NCMC_Classifier(k)
                        ncm.fit(xtr2, ytr)

                        results[dset][keyk][i, 0] = ncm.score(xtr2, ytr)     # Train score
                        results[dset][keyk][i, 1] = ncm.score(xtst2, ytst)   # Test score
                        results[dset][keyk][i, 2] = elapsed                  # Time score
                except Exception:
                    # A failing algorithm must not abort the whole benchmark,
                    # but Ctrl-C / SystemExit must still get through.
                    print("--- ERROR IN DML ", dml_name)
                    for keyk in dml_key:
                        results[dset][keyk][i, 0] = np.nan  # Train score
                        results[dset][keyk][i, 1] = np.nan  # Test score
                        results[dset][keyk][i, 2] = np.nan  # Time score

                    traceback.print_exc()

            for keyk, namek in zip(dml_key, dml_name):
                results[dset][keyk][10, :] = np.mean(
                    results[dset][keyk][:10, :], axis=0)
                results[dset][keyk][11, :] = np.std(
                    results[dset][keyk][:10, :], axis=0)

                # Saving results
                r = pd.DataFrame(results[dset][keyk],
                                 columns=['TRAIN', 'TEST', 'TIME'],
                                 index=rownames + ["MEAN", "STD"])

                r.to_csv("../results/cv-ncm-" + keyk + "-" + dset + ".csv")
                r.to_html("../results/cv-ncm-" + keyk + "-" + dset + ".html",
                          classes=[table_css(), "kfoldtable meanstd"])

                print("RESULTS: ", dset, ", dml = ", namek)
                print(r)
y_predicted_train = model.predict(x_train)
print("Accuracy Score - train dataset:",
      metrics.accuracy_score(y_train, y_predicted_train))

print(metrics.confusion_matrix(y_test, y_pred), "\n")
# classification_report expects (y_true, y_pred); with the arguments swapped
# precision and recall are exchanged and the support counts are those of the
# predictions.
print(classification_report(y_test, y_pred))

# Nearest Centroid Classifier
nc = NearestCentroid()
nc.fit(x_train, y_train)

# Accuracy on the training data
score = nc.score(x_train, y_train)
print("Score: ", score)

# 10-fold cross-validated accuracy on the training data
cv_scores = cross_val_score(nc, x_train, y_train, cv=10)
print("CV average score: %.2f" % cv_scores.mean())

# Evaluation on the held-out test data
y_pred = nc.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

cr = classification_report(y_test, y_pred)
print(cr)

##################################################
def evaluate_ncc(train_data, train_labels, test_data, test_labels):
    """Fit a NearestCentroid on the training set and return its test accuracy."""
    classifier = NearestCentroid()
    classifier.fit(train_data, train_labels)
    return classifier.score(test_data, test_labels)
# Check which k value works best. Only a limited range of k is explored
# because of limited compute; the accuracy curve is expected to go back down
# once too many neighbours are considered.
def check_results_different_k(from_k, to_k, X_train, X_val):
    """Plot the validation accuracy of k-NN for every k in [from_k, to_k).

    Uses the module-level ``y_train`` and ``y_val`` as labels.
    """
    k_values = list(range(from_k, to_k))
    scores = []
    for n_neighbors in k_values:
        candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
        candidate.fit(X_train, y_train)
        scores.append(candidate.score(X_val, y_val))

    plt.plot(k_values, scores)
    plt.show()


# Time the sweep over k
start_time = time.time()
check_results_different_k(2, 20, X_train_image_flatten, X_val_image_flatten)
print("--- %s seconds ---" % (time.time() - start_time))

# Final k-NN model with the chosen k, evaluated on the test set
k_best_result = 100
knn = KNeighborsClassifier(n_neighbors=k_best_result)
knn.fit(X_train_image_flatten, y_train)
print(knn.score(X_test_image_flatten, y_test))

# nearest centroid
nc = NearestCentroid()
nc.fit(X_train_image_flatten, y_train)
nc.score(X_test_image_flatten, y_test)
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=44,
    shuffle=True,
)

# ----------------------------------------------------
# Applying the NearestCentroid model (the variable is called
# NearestNeighbors, but the estimator is a nearest-centroid classifier)

NearestNeighbors = NearestCentroid()
NearestNeighbors.fit(X_train, y_train)

# Calculating Details
print("NearestNeighbors Train Score is : ",
      NearestNeighbors.score(X_train, y_train))
print("NearestNeighbors Test Score is : ",
      NearestNeighbors.score(X_test, y_test))
print("NearestNeighbors Classes are : ", NearestNeighbors.classes_)
print("----------------------------------------------------")

# Calculating Prediction (only the first ten predictions are shown)
y_pred = NearestNeighbors.predict(X_test)
print("Predicted Value for NearestNeighbors is : ", y_pred[:10])

# ----------------------------------------------------
# Calculating Confusion Matrix
CM = confusion_matrix(y_test, y_pred)
print("Confusion Matrix is : \n", CM)

# drawing confusion matrix
sns.heatmap(CM, center=True)
# Mean leave-one-out accuracy obtained for each feature count that was tried
performances = []

# go through adding features $increment at a time
for attempt in range(0, train.shape[1]+1, increment):
    cnt += 1
    i += increment
    used_features.append(i)

    loo = LeaveOneOut()
    X = train[:, 0:i]  # the first i feature columns

    # Leave-one-out evaluation: every sample is the test sample once
    scores = []
    for train_index, test_index in loo.split(X):
        sample_train, sample_test = X[train_index], X[test_index]
        outcome_train, outcome_test = y[train_index], y[test_index]
        classifier.fit(sample_train, outcome_train)
        loo_score = classifier.score(sample_test, outcome_test)
        scores.append(loo_score)

    performance = mean(scores)

    # break when the performance has not improved in the last 5 iterations
    if cnt >= 6:
        last_5 = performances[-5:]
        if min(last_5) >= performance:
            break

    performances.append(performance)

# select the number of features with the highest performance:
# index of the (first) highest performing fit.  np.argmax accepts the list
# directly, so no array conversion or list-vs-scalar comparison is needed.
max_performance = np.argmax(performances)
print(knn.score(X_test_image, y_test))


# Check which k value works best. Only a limited range of k is explored
# because of limited compute; the accuracy curve is expected to go back down
# once too many neighbours are considered.
def check_results_different_k(from_k, to_k, X_train, X_val):
    """Plot the validation accuracy of k-NN for every k in [from_k, to_k).

    Uses the module-level ``y_train`` and ``y_val`` as labels.
    """
    k_values = list(range(from_k, to_k))
    scores = []
    for n_neighbors in k_values:
        candidate = KNeighborsClassifier(n_neighbors=n_neighbors)
        candidate.fit(X_train, y_train)
        scores.append(candidate.score(X_val, y_val))

    plt.plot(k_values, scores)
    plt.show()


# Time the sweep over k
start_time = time.time()
check_results_different_k(2, 20, X_train_image, X_val_image)
print("--- %s seconds ---" % (time.time() - start_time))

# Final k-NN model with the chosen k, evaluated on the test set
k_best_result = 15
knn = KNeighborsClassifier(n_neighbors=k_best_result)
knn.fit(X_train_image, y_train)
print(knn.score(X_test_image, y_test))

# nearest centroid
nc = NearestCentroid()
nc.fit(X_train_image, y_train)
nc.score(X_test_image, y_test)
def NearestCentroidClassifier(x, y):
    """Fit a NearestCentroid on (x, y) and return its accuracy on the same data."""
    model = NearestCentroid()
    model.fit(x, y)
    training_accuracy = model.score(x, y)
    return training_accuracy
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image into one feature vector.  The sample count is taken
# from the data instead of being hard-coded (60000 / 10000), so the script
# also works on a subset of the dataset.
x_train = X_train.reshape(len(X_train), -1)
x_test = X_test.reshape(len(X_test), -1)

# Scale the pixel values to [0, 1]; the scaler is fitted on the training
# set only and then applied to both sets
scaler = preprocessing.MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Nearest-centroid classifier on the scaled pixels
model = NearestCentroid()
model.fit(x_train, y_train)
print("Accuracy for the test set: ", model.score(x_test, y_test))
print("Accuracy for the train set: ", model.score(x_train, y_train))
# Standardizing the features: the scaler is fitted on the training data only
# and then applied to both sets, so no test-set statistics leak into it
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Make an instance of the Model (keep 95% of the variance)
pca = PCA(.95)

# apply PCA in order to get fewer dimensions to work with; the test set is
# projected with the components learned on the training set, so both sets
# live in the same feature space
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

# making our predictions
predictions = []

kNearestNeighbor(x_train_pca, y_train, x_test_pca, predictions, 1)

# transform the list into an array
predictions = np.asarray(predictions)

# evaluating accuracy
accuracy = accuracy_score(y_test, predictions)
# Parenthesised so the accuracy is scaled to a percentage; without the
# parentheses the formatted string itself is repeated 100 times.
print('\nThe accuracy of our classifier is %d%%' % (accuracy * 100))

# train a nearest-centroid classifier on the standardized (non-PCA) features
ncc = NearestCentroid()
ncc.fit(x_train, y_train)

# get the model accuracy
modelscore = ncc.score(x_test, y_test)
labels_process_r = labels_process[indexes]

# Extracting sets: the first test_index rows form the test set, the
# remaining rows the training set
test_index = int(test_ratio * data_full.shape[0])
test_data, train_data = data_full_r[:test_index], data_full_r[test_index:]
test_labels_f, train_labels_f = labels_full_r[:test_index], labels_full_r[test_index:]
test_labels_m, train_labels_m = labels_merged_r[:test_index], labels_merged_r[test_index:]
test_labels_p, train_labels_p = labels_process_r[:test_index], labels_process_r[test_index:]

# Normalizing sets (scaler fitted on the training set only)
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# Training & testing the model with the Full, Merged and Process labels,
# in that order
for _train_lbls, _test_lbls, _acc in (
        (train_labels_f, test_labels_f, acc_full),
        (train_labels_m, test_labels_m, acc_merged),
        (train_labels_p, test_labels_p, acc_process)):
    model.fit(train_data, _train_lbls)
    _acc.append(model.score(test_data, _test_lbls))

# Training with set A and testing with set B
model.fit(data_A, labels_half)
acc_cross_ab = model.score(data_B, labels_half)

# Training with set B
model.fit(data_B, labels_half)
def evaluate(task, represent, prepare=None, batchsize=None, invariant=False, verbose=False, params=[10**i for i in range(-2, 3)], intercept=False, n_folds=2, n_jobs=-1, random_state=0, mean_clf=False):
    '''evaluates representation method on given task
    Args:
      task: string name of task
      represent: function that transforms list of documents to a matrix with len(documents) rows
      prepare: returns aggregate information used by represent (should be limited to n-gram vocab, NOT feature counts, etc.)
      batchsize: number of documents the represent should process at a time
      invariant: representation method does not depend on the batch (unlike e.g. SIF weighted features); if False must have batchsize is None
      verbose: print progress information
      params: cross-validation parameters (regularization grid of the linear model)
      intercept: whether to fit intercept in linear model
      n_folds: number of folds to use when cross-validating the regularization parameter
      n_jobs: number of threads to run when cross-validating
      random_state: seed of the logistic-regression cross-validation
      mean_clf: use mean (nearest-centroid) classifier instead of logit; only applies to the train-test split and cross-validation tasks
    Returns:
      if accuracy task: (train acc, test acc); if regression: (Pearson r, Spearman rho); if retrieval: (acc, F1)
    Raises:
      NotImplementedError: if task is not a known task name
    '''

    assert batchsize is None or invariant, "cannot construct in batches if not invariant"

    if task in TASKMAP['train-test split']:
        (dtrain, ltrain), (dtest, ltest) = TASKMAP['train-test split'][task]()
        info = () if prepare is None else prepare(dtrain + dtest)
        root = '\rBuilding ' + task.upper() + ' Train' if verbose else ''
        Xtrain = batched_build(dtrain, represent, info, root, batchsize)
        Ytrain = np.array(ltrain)
        root = '\rBuilding ' + task.upper() + ' Test' if verbose else ''
        Xtest = batched_build(dtest, represent, info, root, batchsize)
        Ytest = np.array(ltest)
        if mean_clf:
            clf = NearestCentroid()
        else:
            clf = LogitCV(Cs=params, fit_intercept=intercept, cv=n_folds, dual=np.less(*Xtrain.shape), solver='liblinear', n_jobs=n_jobs, random_state=random_state)
        if verbose:
            write('\rCross-Validating and Fitting ' + task.upper() + 10 * ' ')
        clf.fit(Xtrain, Ytrain)
        train = 100.0 * clf.score(Xtrain, Ytrain)
        test = 100.0 * clf.score(Xtest, Ytest)

    elif task in TASKMAP['cross-validation']:
        documents, labels = TASKMAP['cross-validation'][task]()
        info = () if prepare is None else prepare(documents)
        train = 0.0
        test = 0.0
        Y = np.array(labels)
        # The folds are not shuffled, so no random_state is passed to
        # StratifiedKFold: without shuffle=True it has no effect, and recent
        # scikit-learn versions reject that combination.
        if invariant:
            # The representation is batch-independent: build it once.
            root = '\rBuilding ' + task.upper() if verbose else ''
            X = batched_build(documents, represent, info, root, batchsize)
            for i, (tr, te) in enumerate(StratifiedKFold(n_splits=10).split(X, Y)):
                if mean_clf:
                    clf = NearestCentroid()
                else:
                    if verbose:
                        write('\rCross-Validating and Fitting ' + task.upper() + ' Fold ' + str(i + 1) + 10 * ' ')
                    clf = LogitCV(Cs=params, fit_intercept=intercept, cv=n_folds, dual=np.less(*X.shape), solver='liblinear', n_jobs=n_jobs, random_state=random_state)
                clf.fit(X[tr], Y[tr])
                train += clf.score(X[tr], Y[tr])
                test += clf.score(X[te], Y[te])
        else:
            # The representation depends on the batch: rebuild it per fold.
            for i, (tr, te) in enumerate(StratifiedKFold(n_splits=10).split(documents, Y)):
                root = '\rBuilding ' + task.upper() + ' Fold ' + str(i + 1) + ' Train' if verbose else ''
                # 'idx' rather than 'i' so the fold counter is never shadowed
                Xtrain = batched_build([documents[idx] for idx in tr], represent, info, root, batchsize)
                root = '\rBuilding ' + task.upper() + ' Fold ' + str(i + 1) + ' Test' if verbose else ''
                Xtest = batched_build([documents[idx] for idx in te], represent, info, root, batchsize)
                if mean_clf:
                    clf = NearestCentroid()
                else:
                    if verbose:
                        write('\rCross-Validating and Fitting ' + task.upper() + ' Fold ' + str(i + 1) + 10 * ' ')
                    clf = LogitCV(Cs=params, fit_intercept=intercept, cv=n_folds, dual=np.less(*Xtrain.shape), solver='liblinear', n_jobs=n_jobs, random_state=random_state)
                clf.fit(Xtrain, Y[tr])
                train += clf.score(Xtrain, Y[tr])
                test += clf.score(Xtest, Y[te])
        # Sum of 10 fold accuracies in [0, 1] -> mean accuracy in percent
        train *= 10.0
        test *= 10.0

    elif task in TASKMAP['pairwise task']:
        (d1train, d2train, ltrain), (d1test, d2test, ltest) = TASKMAP['pairwise task'][task]()
        info = () if prepare is None else prepare(d1train + d2train + d1test + d2test)
        root = '\rBuilding ' + task.upper() + ' Train' if verbose else ''
        # First half of the rows: first documents; second half: second documents
        Xtrain = batched_build(d1train + d2train, represent, info, root, batchsize)
        m = Xtrain.shape[0] // 2
        if task == 'sts':
            # Cosine similarity of each pair (left at 0 for zero-norm rows)
            Ptrain = np.zeros(m)
            nz = norm(Xtrain[:m], axis=1) * norm(Xtrain[m:], axis=1) > 0.0
            Ptrain[nz] = np.sum(normalize(Xtrain[:m][nz]) * normalize(Xtrain[m:][nz]), axis=1)
        else:
            # Pair features: absolute difference and element-wise product
            Xtrain = np.hstack([abs(Xtrain[:m] - Xtrain[m:]), Xtrain[:m] * Xtrain[m:]])
        root = '\rBuilding ' + task.upper() + ' Test' if verbose else ''
        Xtest = batched_build(d1test + d2test, represent, info, root, batchsize)
        m = Xtest.shape[0] // 2
        if task == 'sts':
            Ptest = np.zeros(m)
            nz = norm(Xtest[:m], axis=1) * norm(Xtest[m:], axis=1) > 0.0
            Ptest[nz] = np.sum(normalize(Xtest[:m][nz]) * normalize(Xtest[m:][nz]), axis=1)
        else:
            Xtest = np.hstack([abs(Xtest[:m] - Xtest[m:]), Xtest[:m] * Xtest[m:]])

        if verbose:
            write('\rCross-Validating and Fitting ' + task.upper() + 10 * ' ')
        if task in {'sick_r', 'sts'}:
            if task == 'sts':
                # Nothing is fitted: the cosine similarities of all pairs
                # (train and test) are the predictions.
                Ytest = np.array([float(y) for y in ltrain + ltest])
                P = np.concatenate([Ptrain, Ptest])
            else:
                Ytrain = np.array([float(y) for y in ltrain])
                Ytest = np.array([float(y) for y in ltest])
                reg = RidgeCV(alphas=params, fit_intercept=intercept)
                reg.fit(Xtrain, Ytrain)
                P = reg.predict(Xtest)
            r = 100.0 * pearsonr(Ytest, P)[0]
            rho = 100.0 * spearmanr(Ytest, P)[0]
            if verbose:
                write('\r' + task.upper() + ': r=' + str(r) + ', rho=' + str(rho) + 10 * ' ' + '\n')
            return r, rho
        else:
            clf = LogitCV(Cs=params, fit_intercept=intercept, cv=n_folds, dual=np.less(*Xtrain.shape), solver='liblinear', n_jobs=n_jobs, random_state=random_state)
            if task == 'mrpc':
                Ytrain = np.array([int(y) for y in ltrain])
                Ytest = np.array([int(y) for y in ltest])
                clf.fit(Xtrain, Ytrain)
                acc = 100.0 * clf.score(Xtest, Ytest)
                f1 = 100.0 * f1_score(Ytest, clf.predict(Xtest))
                if verbose:
                    write('\r' + task.upper() + ': Acc=' + str(acc) + ', F1=' + str(f1) + 10 * ' ' + '\n')
                return acc, f1
            else:
                Ytrain = np.array(ltrain)
                Ytest = np.array(ltest)
                clf.fit(Xtrain, Ytrain)
                train = 100.0 * clf.score(Xtrain, Ytrain)
                test = 100.0 * clf.score(Xtest, Ytest)

    else:
        raise NotImplementedError('unknown task: ' + str(task))

    if verbose:
        write('\r' + task.upper() + ': Train Acc=' + str(train) + ', Test Acc=' + str(test) + 10 * ' ' + '\n')
    return train, test
cnt += 1 i += increment # how many features to add each time used_features.append(i) X = train[:, 0:i] loo = LeaveOneOut( ) # function to compute the indices which split the data so that each sample is used for testing once scores = [] for train_index, test_index in loo.split( X ): # this method gives the indices to use each sample as the test once X_train, X_test = X[train_index], X[test_index] #print(X_train) y_train, y_test = y[train_index], y[test_index] classifier.fit(X_train, y_train) score = classifier.score(X_test, y_test) scores.append(score) performance = mean(scores) if cnt >= 6: last_5 = performances[-5:] # break if the performance has not improved in the last 5 iterations if min(last_5) >= performance: break performances.append(performance) # find model with best performance and extract the features used f_cnt = performances.index(max(performances)) best_features = used_features[f_cnt]
w1 = n_samples / (n_classes * n_samples1)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Per-sample weight: w0 for samples labelled 0, w1 for all other samples
weights = y_train.map(lambda label: w1 if y_train is None or label != 0 else w0)

from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report

# Creating the Nearest Centroid Classifier
model = NearestCentroid()

# Training the classifier
model.fit(X_train, y_train.values.ravel())

# Weighted training accuracy (the value is computed but not stored)
model.score(X_train, y_train, sample_weight=weights)

# Printing Accuracy on Training and Test sets
print(f"Training Set Score : {model.score(X_train, y_train) * 100} %")
print(f"Test Set Score : {model.score(X_test, y_test) * 100} %")

# Printing classification report of classifier on the test set data
print(
    f"Model Classification Report : \n{classification_report(y_test, model.predict(X_test))}"
)

'''
Extra Tree Classifier
'''

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, KFold