def fit_and_predict(self):
    """Train the classifier, evaluate it on the test split and predict ``self.p_x``.

    The scaler is fitted on ``self.x_train`` only.  ``self.x_test`` and
    ``self.p_x`` are then transformed with those fitted parameters, so the
    classifier sees every data set on the scale it was trained on.  Class
    counts, timing, training score and test accuracy are printed as the
    method runs.

    Returns:
        The labels ``self.classifier`` predicts for the scaled ``self.p_x``.
    """

    def _print_summary(name, predicted):
        # Header, raw predictions, per-class counts and the share of label 1.
        as_list = list(predicted)
        distinct = set(predicted)
        print("****** %s result *******" % name)
        print("%s %s" % (predicted, len(predicted)))
        print("%s %s %s %s" % (as_list.count(0), as_list.count(1),
                               distinct, len(distinct)))
        print("%s percent is %s" % (name, float(as_list.count(1)) / len(as_list)))

    train_labels = list(self.y_train)
    print("****** train start ******")
    print("sample: %s %s" % (train_labels.count(0), train_labels.count(1)))
    print("train percent is %s" % (float(train_labels.count(1)) / len(train_labels)))

    start_at = time.time()
    # The only place the scaler is fitted: scaling statistics come from the
    # training data alone.
    x_train_features = self.scaler.fit_transform(self.x_train)
    self.classifier.fit(x_train_features, self.y_train)
    print("train done. score %s. spend %s s"
          % (self.classifier.score(x_train_features, self.y_train),
             time.time() - start_at))

    # transform (not fit_transform): re-fitting here would scale the test set
    # with different statistics than the ones the classifier was trained on.
    x_test_features = self.scaler.transform(self.x_test)
    y_test_pred = self.classifier.predict(x_test_features)
    _print_summary("test", y_test_pred)
    print(accuracy_score(self.y_test, y_test_pred))

    x_pred_features = self.scaler.transform(self.p_x)
    y_pred = self.classifier.predict(x_pred_features)
    _print_summary("predict", y_pred)
    return y_pred
def readout_sk(self, X_train, X_test, y_train, y_test, **kwargs):
    """Fit a logistic-regression readout and return its (train, test) accuracy.

    Every input is transposed (``.T``) before it is handed to scikit-learn.
    ``kwargs`` are forwarded unchanged to ``LogisticRegression``.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    from sklearn.linear_model import LogisticRegression

    samples_train, targets_train = X_train.T, y_train.T

    readout = LogisticRegression(**kwargs)
    readout.fit(samples_train, targets_train)

    train_accuracy = accuracy_score(readout.predict(samples_train), targets_train)
    test_accuracy = accuracy_score(readout.predict(X_test.T), y_test.T)
    return train_accuracy, test_accuracy
def supportvector(C, gamma = 'default'):
    """Train an RBF-kernel SVC on the module-level data and print timings and accuracy.

    Reads ``features_train``, ``labels_train``, ``features_test`` and
    ``labels_test`` from the enclosing module.

    Args:
        C: Regularisation parameter passed to ``SVC``.
        gamma: Kernel coefficient; the sentinel ``'default'`` leaves the
            scikit-learn default untouched.
    """
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    svc_options = {"kernel": "rbf", "C": C}
    if gamma != 'default':
        svc_options["gamma"] = gamma
    clf = SVC(**svc_options)

    # Fit exactly once, inside the timed section (the model used to be
    # trained twice, the first time untimed).
    t_fit = time()
    clf.fit(features_train, labels_train)
    print("training time: %s s" % round(time() - t_fit, 3))

    t_pred = time()
    pred = clf.predict(features_test)
    print("predict time: %s s" % round(time() - t_pred, 3))

    print(accuracy_score(pred, labels_test))

    # Best effort: the plot is skipped when prettyPicture is not available.
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
def main():
    """Read ``me.stdout``, parse its confusion matrix and print summary metrics.

    The file text is parsed by ``readConfusionMatrix.readText``; micro- and
    macro-averaged precision, recall and F1 plus accuracy are then computed
    with ``sklearn.metrics`` and printed.
    """
    # Context manager so the handle is closed even if read() fails.
    with open("me.stdout", "r") as handle:
        f = handle.read()
    print(f)

    (confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
    for row in confusionMatrix:
        print(row)

    # Builtin float() replaces the np.float alias, which newer NumPy removed.
    precisionMicro = float(metrics.precision_score(ytrue, ypred, average="micro"))
    recallMicro = float(metrics.recall_score(ytrue, ypred, average="micro"))
    f1Micro = float(metrics.f1_score(ytrue, ypred, average="micro"))
    f1Macro = float(metrics.f1_score(ytrue, ypred, pos_label=1, average="macro"))
    precisionMacro = float(metrics.precision_score(ytrue, ypred, average="macro"))
    recallMacro = float(metrics.recall_score(ytrue, ypred, average="macro"))
    mConf = metrics.confusion_matrix(ytrue, ypred)

    for value in (mConf, labels, len(ytrue), len(ypred), trueCount,
                  metrics.accuracy_score(ytrue, ypred),
                  precisionMicro, recallMicro, f1Micro,
                  f1Macro, precisionMacro, recallMacro):
        print(value)
def test_one_rf():
    """Train a count/tf-idf/random-forest pipeline and report train and test quality.

    Training data comes from ``sentidata_train_raw.pkl`` and test data from
    ``sentidata_test_raw.pkl`` (both via ``load_raw_data``).  The out-of-bag
    score and the test accuracy, previously computed and thrown away, are
    now printed.
    """
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    # ---- create the pipeline
    # analyzer=lambda x: x hands each sample to the vectorizer as-is
    # (presumably already tokenised -- confirm against load_raw_data).
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1, verbose=1, class_weight='balanced')),
    ])

    # ---- train
    pipeline.fit(Xtrain_raw, ytrain_raw)

    # ---- check result
    rf = pipeline.steps[-1][1]
    print("OOB score: %s" % rf.oob_score_)

    # ---- training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))

    # ---- testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print("test accuracy: %s" % accuracy_score(y_true=ytest_raw, y_pred=ytest_predict))
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
def score(self, K, y, sample_weight=None):
    """Return the mean accuracy of ``self.predict(K)`` over the entries of ``K``.

    For every index ``j`` in ``range(len(K))`` the accuracy of ``y_pred[j]``
    against ``y[j]`` is computed with ``accuracy_score``; the individual
    scores are then averaged.  (The previous docstring described an R^2
    score for ``X``, which is not what this method computes.)

    Parameters
    ----------
    K : sequence
        Input passed unchanged to ``self.predict``.  Its length determines
        how many ``(y[j], y_pred[j])`` pairs are scored.

    y : sequence
        True labels, indexed the same way as the predictions.

    sample_weight : sequence, optional
        When given, ``sample_weight[j]`` is forwarded to ``accuracy_score``
        for pair ``j``.

    Returns
    -------
    score : float
        Mean of the per-entry accuracy scores.
    """
    y_pred = self.predict(K)
    n_entries = len(K)
    if sample_weight is None:
        per_entry = [accuracy_score(y[j], y_pred[j]) for j in range(n_entries)]
    else:
        per_entry = [
            accuracy_score(y[j], y_pred[j], sample_weight=sample_weight[j])
            for j in range(n_entries)
        ]
    return np.mean(per_entry)
def ranforest(n_estimators, min_samples_split):
    """Train a random forest on the module-level data and print timings and accuracy.

    Reads ``features_train``, ``labels_train``, ``features_test`` and
    ``labels_test`` from the enclosing module.

    Args:
        n_estimators: Number of trees in the forest.
        min_samples_split: Minimum number of samples required to split a node.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    clf = RandomForestClassifier(n_estimators = n_estimators,
                                 min_samples_split = min_samples_split,
                                 bootstrap = True)

    # Fit exactly once, inside the timed section (the forest used to be
    # trained twice, the first time untimed).
    t_fit = time()
    clf.fit(features_train, labels_train)
    print("training time: %s s" % round(time() - t_fit, 3))

    t_pred = time()
    pred = clf.predict(features_test)
    print("predict time: %s s" % round(time() - t_pred, 3))

    print(accuracy_score(pred, labels_test))

    # Best effort: the plot is skipped when prettyPicture is not available.
    try:
        prettyPicture(clf, features_test, labels_test)
    except NameError:
        pass
def main():
    """Load the dataset, train a random forest and print its evaluation.

    Steps: read ``OUTPUT_PATH`` as CSV, print dataset statistics, filter the
    ``'?'`` placeholder out of column ``HEADERS[6]``, split 70/30, train via
    ``random_forest_classifier`` and print sample predictions, train/test
    accuracy and the confusion matrix.

    :return: None
    """
    # Load the csv file into a pandas dataframe
    dataset = pd.read_csv(OUTPUT_PATH)
    # Get basic statistics of the loaded dataset
    dataset_statistics(dataset)

    # Filter missing values
    dataset = handel_missing_values(dataset, HEADERS[6], '?')
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])

    # Train and test dataset size details
    for label, part in (("Train_x", train_x), ("Train_y", train_y),
                        ("Test_x", test_x), ("Test_y", test_y)):
        print("%s Shape ::  %s" % (label, part.shape))

    # Create random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y)
    print("Trained model ::  %s" % (trained_model,))
    predictions = trained_model.predict(test_x)

    # Show up to five samples; slicing avoids an IndexError on tiny test
    # sets, and the label list is materialised once instead of per row.
    actual_outcomes = list(test_y)
    for actual, predicted in zip(actual_outcomes[:5], predictions[:5]):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(actual, predicted))

    print("Train Accuracy ::  %s" % accuracy_score(train_y, trained_model.predict(train_x)))
    print("Test Accuracy ::  %s" % accuracy_score(test_y, predictions))
    print(" Confusion matrix  %s" % (confusion_matrix(test_y, predictions),))
def on_epoch_end(self, epoch, logs=None):
    """Record and plot accuracy and balanced recall for the train/dev/test splits.

    Predictions for the module-level ``X`` are rounded once and sliced at
    the module-level boundaries ``t1`` and ``t2`` into train ``[:t1]``, dev
    ``[t1:t2]`` and test ``[t2:]``.  Results are stored in row ``epoch`` of
    ``self.accuracy`` and ``self.recall`` and plotted to ``results.png``.

    Args:
        epoch: Zero-based epoch index, used as the row index of the history.
        logs: Unused; defaults to ``None`` instead of a shared mutable dict.
    """
    rounded = np.round(model.predict(X, verbose=1))
    splits = (slice(None, t1), slice(t1, t2), slice(t2, None))

    def _balanced_recall(y_true, y_hat):
        # Mean of the recall for the positive class and for class 0.
        return (recall_score(y_true, y_hat)
                + recall_score(y_true, y_hat, pos_label=0)) / 2

    train_acc, dev_acc, test_acc = [
        accuracy_score(Y[part], rounded[part]) for part in splits]
    print('Accuracy | train:', train_acc, 'dev:', dev_acc, 'test:', test_acc)

    train_recall, dev_recall, test_recall = [
        _balanced_recall(Y[part], rounded[part]) for part in splits]
    print('Recall | train:', train_recall, 'dev:', dev_recall, 'test:', test_recall)

    self.accuracy[epoch, :] = np.array([train_acc, dev_acc, test_acc])
    self.recall[epoch, :] = np.array([train_recall, dev_recall, test_recall])

    # One subplot per metric, covering every epoch recorded so far.
    plt.clf()
    for position, history, label in ((211, self.accuracy, 'Accuracy'),
                                     (212, self.recall, 'Average recall')):
        plt.subplot(position)
        lines = plt.plot(range(1, epoch+2), history[:epoch+1])
        plt.legend(iter(lines), ('train', 'dev', 'test'), loc=4)
        plt.xlabel('Epoch')
        plt.ylabel(label)
        plt.axis([1, epoch+2, 0, 1])
    plt.savefig('results.png')
def feature_scaled_nn_acc(mds, type):
    """Train a network on PCA-whitened features and print train/validation accuracy.

    The PCA projection is fitted on the training features only and then
    applied to the validation features, so both sets live in the same
    component space.

    Args:
        mds: Data set handed to ``validation_split``; both resulting parts
            must contain an ``'Up'`` column, which is used as the target.
        type: Model type forwarded to ``create_model``.

    Returns:
        One-hot encoded class predictions for the validation set.
    """
    train, validation = validation_split(mds)

    # Multiplying by 1 turns the 'Up' column into a numeric target
    # (presumably it is boolean -- confirm).
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)

    pre = PCA(n_components=19, whiten=True)
    X_train_pca = pre.fit_transform(X_train)
    # transform, not fit_transform: re-fitting on the validation data would
    # project it onto different components than the model was trained on.
    X_validation_pca = pre.transform(X_validation)

    model = create_model(X_train_pca.shape[1], type)

    # Convert to Keras format
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)

    model.fit(X_train_pca, y_train, nb_epoch=5, batch_size=16)
    # NOTE(review): purpose of this pause is not visible here; presumably it
    # lets Keras' progress output finish before the prints below -- confirm.
    time.sleep(0.1)

    # Fit and guess
    guess_train = to_categorical(model.predict_classes(X_train_pca))
    guess_validation = to_categorical(model.predict_classes(X_validation_pca))

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print("\n neural net train accuracy is {}".format(train_acc))
    print("\n neural net validation accuracy is {}".format(validation_acc))

    return guess_validation
def nn_acc(mds, type, epoch=5, batch=16):
    """Train a network on the raw features and print train/validation accuracy.

    Args:
        mds: Data set handed to ``validation_split``; both resulting parts
            must contain an ``'Up'`` column, which is used as the target.
        type: Model type forwarded to ``create_model``.
        epoch: Number of training epochs.
        batch: Mini-batch size.

    Returns:
        One-hot encoded class predictions for the validation set.
    """
    train, validation = validation_split(mds)

    # Multiplying by 1 turns the 'Up' column into a numeric target
    # (presumably it is boolean -- confirm).
    y_train = train['Up'] * 1
    X_train = train.drop('Up', axis=1)
    y_validation = validation['Up'] * 1
    X_validation = validation.drop('Up', axis=1)

    # Create model
    model = create_model(X_train.shape[1], type)

    # Convert to Keras format.  ``.values`` replaces ``as_matrix()``, which
    # newer pandas no longer provides; the targets already used ``.values``.
    X_train = X_train.values
    X_validation = X_validation.values
    y_train = to_categorical(y_train.values)
    y_validation = to_categorical(y_validation.values)

    # Fit and guess
    model.fit(X_train, y_train, nb_epoch=epoch, batch_size=batch)
    guess_train = to_categorical(model.predict_classes(X_train))
    guess_validation = to_categorical(model.predict_classes(X_validation))

    train_acc = accuracy_score(y_train, guess_train)
    validation_acc = accuracy_score(y_validation, guess_validation)
    print("\n neural net train accuracy is {}".format(train_acc))
    print("\n neural net validation accuracy is {}".format(validation_acc))

    return guess_validation
def Adaboost(TrainData,TestData):
    """Train AdaBoost on growing fractions of ``TrainData`` and print metrics.

    For each training fraction from 0.1 to 0.9 the data is split into a
    training and a validation part, the classifier is fitted, and training
    accuracy plus validation accuracy, macro precision, macro recall and
    log loss are printed.

    Args:
        TrainData: DataFrame with the feature columns listed below and a
            ``'Category'`` target column.
        TestData: Not used by the visible part of this function.

    NOTE(review): the source of this function was flattened and cut off
    inside a triple-quoted "writing to file" section; the loop extent below
    is reconstructed from the statement order -- confirm against the
    original file.
    """
    features = ['Time', 'Season', 'Hour', 'Minute', 'District']
    clf = AdaBoostClassifier(tree.DecisionTreeClassifier(), n_estimators=30)
    size = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    for fraction in size:
        train, validation = train_test_split(TrainData, train_size=fraction)
        # Re-draw until both parts contain the same number of distinct categories.
        while len(set(train['Category'])) != len(set(validation['Category'])):
            train, validation = train_test_split(TrainData, train_size=fraction)

        clf = clf.fit(train[features], train['Category'])

        predicted = np.array(clf.predict_proba(validation[features]))
        model = clf.predict(train[features])
        model1 = clf.predict(validation[features])

        # Convert the label columns once instead of once per metric.
        train_truth = train['Category'].values.tolist()
        validation_truth = validation['Category'].values.tolist()

        # accuracy
        print("Training accuracy is %s" % accuracy_score(train_truth, model))
        print("Validation accuracy is %s" % accuracy_score(validation_truth, model1))
        print("Precision is  %s" % precision_score(validation_truth, model1, average='macro'))
        print("Recall is  %s" % recall_score(validation_truth, model1, average='macro'))
        print("Log loss is %s" % log_loss(validation_truth, predicted, eps=1e-15,
                                          normalize=True, sample_weight=None))
def sub_analysis(subs, sub_data_generator):
    """Yield cross-validation and test scores for every subject in ``subs``.

    For each subject one item is pulled from ``sub_data_generator`` and
    three stratified splits (``state`` 0..2) are evaluated with a grid
    search configured from ``prep.params.model``.

    Args:
        subs: Iterable of numeric subject identifiers.
        sub_data_generator: Iterator yielding 5-tuples; items 0, 1 and 4
            are used as ``X``, ``y`` and ``pipe``.

    Yields:
        ``(cv_scores, te1_scores, te2_scores)`` per subject.  With
        ``prep.params.collapse_opacities`` set, ``te1_scores`` and
        ``te2_scores`` hold the accuracies on the high and low sets;
        otherwise ``te1_scores`` holds the plain test accuracies and
        ``te2_scores`` is ``None``.
    """
    for sub in subs:
        print("process sub: %0.0f" % sub)
        # Builtin next() works for both Python 2 and Python 3 iterators.
        X, y, _, _, pipe = next(sub_data_generator)
        cv_scores = []
        te1_scores = []
        te2_scores = []
        for state in range(3):
            # NOTE(review): in the flattened source this split looks
            # commented out, but X_tr/X_te/y_tr/y_te are used below and have
            # no other definition, so it is active here -- confirm.
            X_tr, X_te, y_tr, y_te = prep.stratified_split(X, y, state)

            # train on nmsk:
            algo, algo_params, score = prep.params.model
            clf = GridSearchCV(algo, algo_params, cv=3, scoring=score,
                               n_jobs=-1, verbose=0)
            clf.fit(prep.convert(pipe, X_tr), y_tr)

            # test on msk:
            if prep.params.collapse_opacities:
                X_te_high, y_te_high, X_te_low, y_te_low = \
                    prep.split_high_low_sets(X_te, y_te)
                y_te_high = prep.collapse_lbls(y_te_high, (3,4,5), (0,1,2))
                y_true_high, y_pred_high = \
                    y_te_high, clf.predict(prep.convert(pipe, X_te_high))
                y_true_low, y_pred_low = \
                    y_te_low, clf.predict(prep.convert(pipe, X_te_low))
                cv_scores.append(clf.best_score_)
                te1_scores.append(accuracy_score(y_true_high, y_pred_high))
                te2_scores.append(accuracy_score(y_true_low, y_pred_low))
            else:
                y_true, y_pred = y_te, clf.predict(prep.convert(pipe, X_te))
                cv_scores.append(clf.best_score_)
                te1_scores.append(accuracy_score(y_true, y_pred))
                te2_scores = None
        yield cv_scores, te1_scores, te2_scores
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short = False):
    """Print accuracy and, unless ``short``, confusion matrices and reports.

    ``model.predict`` is called once per data set and the predictions are
    reused for every metric.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        short: When true, only the accuracy line is printed.
    """
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    accuracy_train = metrics.accuracy_score(y_train, pred_train)
    accuracy_test = metrics.accuracy_score(y_test, pred_test)
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))

    if short:
        return

    # confusion matrix
    # rows: actual group
    # columns: predicted group
    print('Confusion_matrix (training data):')
    print(metrics.confusion_matrix(y_train, pred_train))

    print('Confusion_matrix (test data):')
    print(metrics.confusion_matrix(y_test, pred_test))

    # precision = tp / (tp + fp)
    # recall = tp / (tp + fn) (= sensitivity)
    # F1 = 2 * (precision * recall) / (precision + recall)
    print('\nPrecision - recall (training data):')
    print(metrics.classification_report(y_train, pred_train))

    print('\nPrecision - recall (test data):')
    print(metrics.classification_report(y_test, pred_test))
def testIrisStreaming(self):
    """Fit from endless generators and check iterator vs. array prediction.

    The classifier is trained on generators that cycle through the iris
    data and targets forever.  Accuracy on the full array must exceed 0.5,
    and predicting from a one-pass generator must give the same score.
    """
    iris = datasets.load_iris()

    def _endless(values):
        # Repeats ``values`` from the start every time they are exhausted.
        while True:
            for value in values:
                yield value

    def _single_pass(values):
        for value in values:
            yield value

    classifier = skflow.TensorFlowLinearClassifier(n_classes=3, steps=100)
    classifier.fit(_endless(iris.data), _endless(iris.target))

    array_predictions = classifier.predict(iris.data)
    score1 = accuracy_score(iris.target, array_predictions)
    streamed_predictions = classifier.predict(_single_pass(iris.data))
    score2 = accuracy_score(iris.target, streamed_predictions)

    self.assertGreater(score1, 0.5, "Failed with score = {0}".format(score1))
    mismatch = ("Scores from {0} iterator doesn't match score {1} "
                "from full data.").format(score2, score1)
    self.assertEqual(score2, score1, mismatch)
def main(unused_args):
    """Train a linear and a convolutional classifier on MNIST and print accuracies.

    Args:
        unused_args: Ignored.
    """
    # Download and load the MNIST dataset.
    mnist = learn.datasets.load_dataset('mnist')
    train_images = mnist.train.images

    def _test_accuracy(estimator):
        # Accuracy of ``estimator`` on the MNIST test images.
        return metrics.accuracy_score(
            mnist.test.labels, list(estimator.predict(mnist.test.images)))

    # Linear classifier.
    feature_columns = learn.infer_real_valued_columns_from_input(train_images)
    linear = learn.LinearClassifier(feature_columns=feature_columns, n_classes=10)
    linear.fit(train_images, mnist.train.labels.astype(np.int32),
               batch_size=100, steps=1000)
    print('Accuracy: {0:f}'.format(_test_accuracy(linear)))

    # Convolutional network.
    conv_net = learn.Estimator(model_fn=conv_model)
    conv_net.fit(train_images, mnist.train.labels, batch_size=100, steps=20000)
    print('Accuracy: {0:f}'.format(_test_accuracy(conv_net)))
def main():
    """Train a bigram sentiment classifier and print its test accuracy.

    ``sys.argv[1]`` is the training CSV and ``sys.argv[2]`` the test CSV.
    Tweets are cleaned, turned into bigram counts, a classifier is chosen by
    ``do_K_fold_cross_validation`` and its accuracy on the test tweets is
    printed.
    """
    train_path, test_path = sys.argv[1], sys.argv[2]

    # Each frame holds the tweets and their sentiment.
    train_frame = pre_process_input_data(train_path)
    test_frame = pre_process_input_data(test_path)

    bigram_vectorizer = CountVectorizer(ngram_range=(2,2), token_pattern=r'\b\w+\b',
                                        min_df=1, lowercase=True)

    tweets_array, sentiments_array = get_tweest_and_sentiments(train_frame)
    print("size of tweets array is %s and sentiment array is %s " % (tweets_array.size, sentiments_array.size))

    test_tweets, test_sentiments = get_tweest_and_sentiments(test_frame)
    test_sentiments = test_sentiments.flatten()
    print("size of test tweets array is %s and test sentiment array is %s " % (test_tweets.size, test_sentiments.size))

    parsed_train_tweets = clean_data_to_feed_classifier(tweets_array)
    parsed_test_tweets = clean_data_to_feed_classifier(test_tweets)

    train_features = bigram_vectorizer.fit_transform(parsed_train_tweets)
    print(train_features.size)
    bigram_vectorizer.build_analyzer()  # return value is not used
    print("done 1")

    test_features = bigram_vectorizer.transform(parsed_test_tweets)
    print("done 2")

    svm_candidate = LinearSVC()
    nb_candidate = MultinomialNB()
    print("done 2")

    trained_classifier = do_K_fold_cross_validation(
        svm_candidate, nb_candidate, train_features, sentiments_array.flatten())
    print("done 3")

    output = trained_classifier.predict(test_features)
    print(accuracy_score(test_sentiments, output))
def do(self, n_pts):
    """Grid-search the RBF sampler's gamma and score the best model on the test set.

    Collects ``n_pts`` points, searches ``gamma`` over 2**-15 .. 2**6 for an
    ``RBFSampler`` + ``HyperSolver`` pipeline, saves a plot of validation
    score against gamma and evaluates the best estimator on
    ``self.Xt`` / ``self.Yt``.

    Returns:
        Tuple ``(best cross-validation score, test accuracy)``.
    """
    X, y = self.collect_pts(n_pts)
    print('done collecting points')

    rbf_solver = pipeline.Pipeline([
        ("mapper", RBFSampler(n_components=n_pts, random_state=1)),
        ("solver", HyperSolver(p=self.POS, n=self.NEG)),
    ])

    gamma_range = np.logspace(-15, 6, 22, base=2)
    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=1)
    grid = GridSearchCV(rbf_solver, param_grid={'mapper__gamma': gamma_range},
                        cv=cv, n_jobs=8)
    grid.fit(X, y)

    # Item 1 of every grid_scores_ entry is plotted, one per gamma.
    scores = np.array([entry[1] for entry in grid.grid_scores_])
    scores = scores.reshape(len(gamma_range))

    plt.figure(figsize=(8, 6))
    plt.plot(gamma_range, scores)
    plt.xlabel('gamma')
    plt.ylabel('score')
    plt.title('Validation accuracy (RTiX, %s)' % os.path.basename(self.name))
    plt.savefig(self.name + '-SLViF-grid-npts=%d.pdf' % n_pts)

    # final train
    g = grid.best_params_['mapper__gamma']
    print('best parameters are g=%f' % g)

    y_pred = grid.best_estimator_.predict(self.Xt)
    test_accuracy = sm.accuracy_score(self.Yt, y_pred)
    print('SCORE: %f' % test_accuracy)
    return grid.best_score_, test_accuracy
def detect_anomalies():
    """Fit a linear one-class SVM on the encoded training set and print metrics.

    Loads the encoded train/test arrays from ``resources/files``, fits the
    detector on the training array and prints train/test accuracy,
    precision, the confusion matrix and the AUC against the module-level
    ``Y_train`` / ``Y_test`` labels.
    """
    encoded_X_train = np.load("resources/files/encoded_X_train.npy")
    encoded_X_test = np.load("resources/files/encoded_X_test.npy")
    for encoded in (encoded_X_train, encoded_X_test):
        print(encoded.shape)

    clf = svm.OneClassSVM(nu=0.1, kernel="linear")
    clf.fit(encoded_X_train)

    y_pred_train = clf.predict(encoded_X_train)
    y_pred_test = clf.predict(encoded_X_test)
    # Prediction for 100 constant rows filled with 4; not reported below.
    y_pred_outliers = clf.predict(np.full((100, hidden_dimensions[1]), 4))

    print("Train Accuracy: %f" % (accuracy_score(Y_train, y_pred_train)))
    print("Test Accuracy: %f" % (accuracy_score(Y_test, y_pred_test)))
    print("Precision: %f" % (precision_score(Y_test, y_pred_test, pos_label=1)))

    print("Confusion Matrix: (Anomalies, Normal)")
    print(confusion_matrix(Y_test, y_pred_test, labels=[-1, 1]))

    fpr, tpr, _ = metrics.roc_curve(Y_test, y_pred_test, pos_label=1)
    print("AUC: %f" % metrics.auc(fpr, tpr))
def grid_search(self):
    """Grid-search ``C`` for a linear SVC and score the best model on all splits.

    ``C`` is searched over 2**-5 .. 2**15 with five stratified shuffle
    splits of ``self.X_ex`` / ``self.y_ex``.

    Returns:
        A ``Result`` holding the run name, the model label ``'Linear'``, the
        number of training samples and the train, validation and test
        accuracies of the best estimator.
    """
    C_range = np.logspace(-5, 15, 21, base=2)
    param_grid = dict(C=C_range)
    cv = StratifiedShuffleSplit(self.y_ex, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(LinearSVC(dual=False, max_iter=10000),
                        param_grid=param_grid, cv=cv, n_jobs=1, verbose=0)

    logger.info('start grid search for Linear')
    grid.fit(self.X_ex, self.y_ex)
    logger.info('end grid search for Linear')

    # The unused ``scores`` list (read from grid.grid_scores_) was dropped.

    # final train
    best_svc = grid.best_estimator_
    pred_train = best_svc.predict(self.X_ex)
    pred_val = best_svc.predict(self.val_x)
    pred_test = best_svc.predict(self.test_x)

    return Result(self.name + ' (X)', 'Linear', len(self.X_ex),
                  sm.accuracy_score(self.y_ex, pred_train),
                  sm.accuracy_score(self.val_y, pred_val),
                  sm.accuracy_score(self.test_y, pred_test))
def getAccuracy(labels,predictedLabels, positive_label):
    """Print the hand-computed and the scikit-learn accuracy; return the latter.

    The hand-computed value is (true positives + true negatives) divided by
    the number of examples, using ``getTP`` and ``getTN`` with
    ``positive_label``.
    """
    totalExamples = labels.shape[0]
    correct = (getTP(labels, predictedLabels, positive_label)
               + getTN(labels, predictedLabels, positive_label))
    accuracy = correct / float(totalExamples)

    builtin_accuracy = metrics.accuracy_score(labels, predictedLabels)
    print("Built-in accuracy = {}".format(builtin_accuracy))
    print("Accuracy = {}".format(accuracy))
    return builtin_accuracy
def train(self):
    """Vectorise the text, append the extra features and fit a logistic regression.

    The ``ids_string`` column is vectorised with a ``CountVectorizer``
    (stored as ``self.cv``) or a ``TfidfVectorizer`` (stored as
    ``self.tfidf``) depending on ``opt['bow_tfidf']``.  The dense result is
    concatenated with the columns named in ``self.dataset.features``, the
    model is stored as ``self.lr`` and the train/test accuracies are printed.
    """
    frame_train = self.dataset.df_train
    frame_test = self.dataset.df_test
    X_train = frame_train['ids_string'].values
    y_train = frame_train['label'].values
    X_test = frame_test['ids_string'].values
    y_test = frame_test['label'].values

    print(datetime.datetime.now(), 'Vectorizing')
    if opt['bow_tfidf'] == False:
        vectorizer = self.cv = CountVectorizer(
            ngram_range=opt['bow_ngram_range'], min_df=opt['bow_min_df'])
    else:
        vectorizer = self.tfidf = TfidfVectorizer(
            ngram_range=opt['bow_ngram_range'], min_df=opt['bow_min_df'])
    # Vocabulary is learned from the training text and reused for the test text.
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # TODO: combine sparsely (original note: use sparse.vstack) instead of densifying
    extra_columns = self.dataset.features
    X_train = np.concatenate((X_train.todense(), frame_train[extra_columns].values), axis=1)
    X_test = np.concatenate((X_test.todense(), frame_test[extra_columns].values), axis=1)

    print(datetime.datetime.now(), 'Traing')
    self.lr = LogisticRegression()
    self.lr.fit(X_train, y_train)

    accuracy_train = accuracy_score(y_train, self.lr.predict(X_train))
    accuracy_test = accuracy_score(y_test, self.lr.predict(X_test))
    print(datetime.datetime.now(), (accuracy_train, accuracy_test))
def testsvm2():
    """Run 10-fold cross-validation of an RBF SVC over the combined data set.

    The scaled train and test parts are stacked into one data set.  For
    every fold the accuracy and confusion matrix are printed and the matrix
    is plotted; the mean accuracy over all folds is printed at the end.

    NOTE(review): the source was flattened; the plotting call is assumed to
    belong to the fold loop -- confirm against the original file.
    """
    genre_list = ["classical", "jazz", "country", "pop", "rock", "metal"]

    Xtrain, ytrain, Xtest, ytest = getSplitData()
    Xtrain, Xtest = getScaledData(Xtrain, Xtest)

    X = np.vstack((Xtrain, Xtest))
    y = np.hstack((ytrain, ytest))

    accuracy1 = []
    # Fold over however many samples were loaded instead of a fixed 600.
    kf = KFold(X.shape[0], n_folds=10)
    for train, test in kf:
        clf = SVC(C = 1, gamma = 0.125, kernel = 'rbf')
        clf.fit(X[train], y[train])
        pred = clf.predict(X[test])

        fold_accuracy = accuracy_score(y[test], pred)
        print("svm classification accuracy:  %s" % fold_accuracy)
        cm = confusion_matrix(y[test], pred)
        print(cm)
        plot_confusion_matrix(cm, genre_list)
        accuracy1.append(fold_accuracy)

    print(np.mean(accuracy1))
def main(unused_argv):
    """Compare Gaussian naive Bayes with a DNN classifier on word-id features.

    Documents from ``load_data`` are split 60/40, mapped to word-id
    sequences of length ``MAX_DOCUMENT_LENGTH`` and fed to both models; each
    test accuracy is printed.

    Args:
        unused_argv: Ignored.
    """
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.4, random_state=0)

    # Vocabulary is built from the training documents only.
    vp = learn.preprocessing.VocabularyProcessor(
        max_document_length=MAX_DOCUMENT_LENGTH, min_frequency=1)
    x_train = np.array(list(vp.fit_transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    print('Total words: %d' % len(vp.vocabulary_))

    # Naive Bayes baseline.
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    nb_score = metrics.accuracy_score(y_test, gnb.predict(x_test))
    print('NB Accuracy: {0:f}'.format(nb_score))

    # Deep network.
    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x_train)
    classifier = tf.contrib.learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[500,10], n_classes=2)
    classifier.fit(x_train, y_train, steps=5000, batch_size=10)
    dnn_predictions = list(classifier.predict(x_test, as_iterable=True))
    dnn_score = metrics.accuracy_score(y_test, dnn_predictions)
    print('DNN Accuracy: {0:f}'.format(dnn_score))
def getScores(y, yPredTrain, yTest, yPredTest):
    """Collect F1, accuracy, ROC AUC and confusion matrices for train and test.

    Args:
        y: True training labels.
        yPredTrain: Predicted training labels.
        yTest: True test labels.
        yPredTest: Predicted test labels.

    Returns:
        Dict with the ``*Train`` / ``*Test`` metrics plus ``'random'``: the
        share of the larger of "label 1" and "not label 1" in ``y``.
    """
    scores = {
        'f1Train': f1_score(y, yPredTrain),
        'f1Test': f1_score(yTest, yPredTest),
        'accTrain': accuracy_score(y, yPredTrain),
        'accTest': accuracy_score(yTest, yPredTest),
        'rocTrain': roc_auc_score(y, yPredTrain),
        'rocTest': roc_auc_score(yTest, yPredTest),
        'cMatrixTrain': confusion_matrix(y, yPredTrain),
        'cMatrixTest': confusion_matrix(yTest, yPredTest),
    }

    positives = np.where(y == 1)[0]
    positive_share = float(len(positives)) / len(y)
    scores['random'] = max(positive_share, 1 - positive_share)
    return scores
def main():
    """Train a small random forest to predict ``gname`` and print its accuracy.

    Rows whose ``gname`` is ``'Unknown'`` form the unlabeled set; the
    remaining rows are split 70/30 into training and evaluation data after
    feature selection via ``selectRelFeatures``.
    """
    # NOTE: read_pickle executes pickled code; only load trusted files.
    df_gtd_imp = pd.read_pickle('df_gtd_imp.pkl')

    # ``axis`` is passed by keyword; the positional form is rejected by
    # newer pandas.
    df_test = df_gtd_imp.query("gname == 'Unknown'")
    df_test = df_test.drop('gname', axis=1)

    df_train = df_gtd_imp.query("gname != 'Unknown'")
    df_train_x = pd.DataFrame(df_train)
    df_train_x = df_train_x.drop('gname', axis=1)
    df_train_y = df_train.gname

    print(df_train_x.shape)
    print(df_train_y.shape)
    print(df_test.shape)

    df_train_x, df_test = selectRelFeatures(df_train_x, df_train_y, df_test)

    print('training model')
    X_train, X_test, y_train, y_test = \
        train_test_split(df_train_x, df_train_y, test_size=0.3, random_state=0)

    clf_rf = RandomForestClassifier(n_estimators=5)
    clf_rf.fit(X_train, y_train)
    y_pred = clf_rf.predict(X_test)
    print(accuracy_score(y_test, y_pred))
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Fit ``estimator`` once on a 70/30 split and print its evaluation.

    The estimator is wrapped in a pipeline (top-20% feature selection with
    ``scoring``, standard scaling, estimator) and trained one-vs-rest.
    Training/testing accuracy, the classification report and the confusion
    matrix are printed.
    '''
    # 30% reserved for validation
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    steps = [
        # feature selection since we have a small sample space
        ('featureselector', SelectPercentile(scoring, percentile=20)),
        ('scaler', StandardScaler()),
        ('estimator', estimator),
    ]
    clfer = OneVsRestClassifier(Pipeline(steps)).fit(x_train, y_train)

    train_accuracy = metrics.accuracy_score(y_train, clfer.predict(x_train))
    print("%% Accuracy on training set: %2.3f" % train_accuracy)

    y_predict_test = clfer.predict(x_test)
    test_accuracy = metrics.accuracy_score(y_test, y_predict_test)
    print("\n%% Accuracy on testing set: %2.3f" % test_accuracy)

    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predict_test))

    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_predict_test))
def testNetwork(self,X, Y):
    """Evaluate ``self.nn`` on ``X``/``Y``.

    Stores ``1 - accuracy`` in ``self.testError`` and prints the accuracy.
    """
    accuracy = accuracy_score(Y, self.nn.predict(X))
    self.testError = 1 - accuracy
    print(accuracy)
def fitMdl(nFitObs = 50):
    """Fit a logistic regression on the first ``nFitObs`` training images.

    Images are flattened to one row per observation.  Coefficient extremes
    per label, accuracies and confusion matrices for the fit, validation and
    new ("test") sets are printed.

    Args:
        nFitObs: Number of leading training observations to fit on.

    Returns:
        ``(mdl, (accuracy_train, accuracy_valid, accuracy_test))``.
    """
    fit_labels = glbObsTrnRsp[0:nFitObs]

    mdl = linear_model.LogisticRegression(verbose = 1)
    mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs,:,:],
                       (nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])),
            fit_labels)
    print(mdl.get_params())
    print(mdl.coef_.shape)

    print(' coeff stats:')
    for lblIx, label in enumerate(dspLabels):
        coeffs = mdl.coef_[lblIx,:]
        # divmod turns the flat index into (row, col) with integer division
        # on both Python 2 and Python 3.
        min_row, min_col = divmod(coeffs.argmin(), glbImgSz)
        max_row, max_col = divmod(coeffs.argmax(), glbImgSz)
        print(' label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' %
              (label,
               min_row, min_col, coeffs.min(),
               max_row, max_col, coeffs.max()))

    train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs,:,:],
                                               (nFitObs, glbImgSz ** 2)))
    accuracy_train = metrics.accuracy_score(train_pred_labels, fit_labels)
    print(' accuracy train:%0.4f' % (accuracy_train))
    print(metrics.confusion_matrix(fit_labels, train_pred_labels))

    valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr,
                                               (glbObsVldFtr.shape[0], glbImgSz ** 2)))
    accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
    print(' accuracy valid:%0.4f' % (accuracy_valid))
    print(metrics.confusion_matrix(glbObsVldRsp, valid_pred_labels))

    test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr,
                                              (glbObsNewFtr.shape[0], glbImgSz ** 2)))
    accuracy_test = metrics.accuracy_score(test_pred_labels, glbObsNewRsp)
    print(' accuracy test:%0.4f' % (accuracy_test))
    test_conf = pd.DataFrame(metrics.confusion_matrix(glbObsNewRsp, test_pred_labels),
                             index = dspLabels, columns = dspLabels)
    print(test_conf)

    return(mdl, (accuracy_train, accuracy_valid, accuracy_test))
def main():
    """Plot validation accuracy of a linear SVC over ``C`` with and without scaling.

    ``C`` runs over 10**-12 .. 10**5.  For the scaled variant a single
    ``StandardScaler`` is fitted on the training data and applied to both
    training and validation data, so the two sets share one scale (the
    validation set used to be standardised with its own statistics).
    """
    indata = np.load(inputs)
    training_data = indata['data_training']
    training_labels = indata['label_training']
    validation_data = indata['data_val']
    validation_labels = indata['label_val']

    scaler = preprocessing.StandardScaler().fit(training_data)
    training_scaled = scaler.transform(training_data)
    validation_scaled = scaler.transform(validation_data)

    def _validation_accuracy(c, train_features, validation_features):
        # Fit a fresh LinearSVC with penalty ``c`` and score it on validation data.
        lin_clf = svm.LinearSVC(C=c)
        lin_clf.fit(train_features, training_labels)
        predictions = lin_clf.predict(validation_features)
        return metrics.accuracy_score(validation_labels, predictions)

    ts = range(-12,6)
    cs = [pow(10, t) for t in ts]
    accuracy_results = []
    accuracy_results_scaled = []
    for c in cs:
        accuracy_results.append(
            _validation_accuracy(c, training_data, validation_data))
        accuracy_results_scaled.append(
            _validation_accuracy(c, training_scaled, validation_scaled))

    positions = range(len(cs))
    plt.plot(positions, accuracy_results, label='un-scaled')
    plt.plot(positions, accuracy_results_scaled, label='scaled')
    plt.xticks(positions, cs, size='small')
    plt.legend()
    plt.show()

    print(accuracy_results)
    print(accuracy_results_scaled)
# Model 2: decision tree on the preprocessed drive data, with SMOTE
# oversampling applied to the training part only.
print("Model 2 prediction started")
data_train = pd.read_csv('model2.csv')  # file generated by the preprocessing step
missing_vals = data_train.isnull().sum().sort_values(ascending=False)
print(missing_vals)
print(data_train.shape[0])
print(data_train['failure'].value_counts())

Y = data_train['failure']
# Remove the target and the identifier from the feature frame.
serial = data_train['serial_number']
data_train.drop('failure', axis=1, inplace=True)
data_train.drop('serial_number', axis=1, inplace=True)

# Split first, so that the held-out half contains only real observations.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_train, Y, test_size=0.5, random_state=3)

from imblearn.over_sampling import SMOTE
# Oversample the training half only.  Applying SMOTE before the split puts
# synthetic samples, interpolated from the whole data set, into the test
# half and inflates the reported accuracy.  data_train and Y therefore stay
# un-resampled.
smt = SMOTE()
# fit_resample is the current imbalanced-learn name; fit_sample is kept as a
# fallback for older releases.
_resample = getattr(smt, 'fit_resample', None) or smt.fit_sample
x_train, y_train = _resample(x_train, y_train)

from sklearn import tree
# Decision tree classifier
clf = tree.DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)  # predicted values

from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))  # accuracy on the held-out half
# Hold out 20% for validation, compare five classifiers by cross-validated
# ROC AUC on the rest, then evaluate k-nearest-neighbours on the hold-out.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.2, random_state=seed)

results = []
names = []
models = [('LR', LogisticRegression()),
          ('LDA', LinearDiscriminantAnalysis()),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('SVM', SVC())]

# No random_state: without shuffle=True it never influenced the folds, and
# newer scikit-learn raises an error when it is set without shuffling.
# The same splitter is reused for every model.
kfold = StratifiedKFold(n_splits=10)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='roc_auc')
    results.append(cv_results)
    names.append(name)
    final_results = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(final_results)

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(" ")
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# k-nearest-neighbours on mining.csv: column 0 is the target, columns 1-9
# are the features.
le = preprocessing.LabelEncoder()
data = pd.read_csv("mining.csv")
print(data.head())

# Replace every text column by integer codes (the encoder is re-fitted per column).
for c in data.columns.values:
    if data[c].dtypes == 'object':
        data[c] = le.fit_transform(data[c].values)

# ``.values`` replaces ``as_matrix()``, which newer pandas no longer provides.
data = data.values
X = data[:, 1:10]
y = data[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
yp = knn.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); with the arguments swapped the
# confusion matrix comes out transposed.
print(accuracy_score(y_test, yp))
print(confusion_matrix(y_test, yp))
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB  # used below; was never imported
import pandas as pd

# Directory holding the four CSV files (features/labels for train and test).
DATA_DIR = 'C:/Users/user/Documents/hitachi pentaho/'

# Read the CSV files for both training and testing.
ftrain = pd.read_csv(DATA_DIR + 'train.csv')
ltrain = pd.read_csv(DATA_DIR + 'trainVar.csv')
ftest = pd.read_csv(DATA_DIR + 'test.csv')
ltest = pd.read_csv(DATA_DIR + 'testVar.csv')

# Training data: x holds all feature rows, y the 'molecules' labels.
x = ftrain.values.tolist()
y = list(ltrain['molecules'])

# Test data: p holds all feature rows, q the expected classification.
p = ftest.values.tolist()
q = list(ltest['molecules'])

clf = GaussianNB()
clf.fit(x, y)
output = clf.predict(p)
print(output)
print(accuracy_score(q, output))

# In[ ]:
# Baseline: the accuracy obtained by always predicting class 0.
n_negative = Y_test[Y_test == 0].shape[0]
n_total = Y_test.shape[0]
baseline_acc = round(n_negative / n_total, 3)
print(f'The baseline accuracy of a naive model is {baseline_acc}')


# #### Random Forest Model
# I also build Random Forests in order to predict if a datapoint is a pick-up location or not. RF outperforms the baseline approach significantly.
#
# Random forest, like its name implies, consists of a large number of individual decision trees that operate as an ensemble. Each individual tree in the random forest spits out a class prediction and the class with the most votes becomes our model's prediction.

# In[62]:


clf = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=0)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc = round(accuracy_score(Y_test, Y_pred), 3)
print(f'The accuracy is {acc}')


# ##### Feature Importance
# We can also inspect and interpret the trained Random Forest classifier by analyzing the importance of each feature. Coordinates are the most important features to classify a pick-up point whereas the day feature does not help the classifier.

# In[63]:


importances = clf.feature_importances_
# Spread of each feature's importance across the individual trees.
per_tree_importances = [estimator.feature_importances_
                        for estimator in clf.estimators_]
std = np.std(per_tree_importances, axis=0)
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
# Drop the header row.
del csv[0]

# Shuffle the rows.
random.shuffle(csv)

# Split into training and test data at a 2:1 ratio.
total_len = len(csv)
train_len = int(total_len * 2 / 3)
train_rows = csv[:train_len]
test_rows = csv[train_len:]

# The first four fields of a row are the features, the fifth is the label.
train_data = [row[0:4] for row in train_rows]
train_label = [row[4] for row in train_rows]
test_data = [row[0:4] for row in test_rows]
test_label = [row[4] for row in test_rows]

# Train the classifier and predict the test rows.
clf = svm.SVC(gamma='auto')
clf.fit(train_data, train_label)
pre = clf.predict(test_data)

# Compute and report the accuracy.
ac_score = metrics.accuracy_score(test_label, pre)
print("전체 데이터 수: %d" %total_len)
print("학습 전용 데이터 수: %d" %train_len)
print("테스트 데이터 수: %d" %(len(test_data)))
print("정답률 =", ac_score)
def _test(metric_device):
    """Check multilabel ``Accuracy`` against scikit-learn on ``metric_device``.

    Three scenarios are exercised on the same metric instance: a single
    update, a single update after ``reset()`` (also checking that a second
    ``compute()`` returns the same value), and batched updates.
    ``rank`` and ``device`` are read from the enclosing scope.
    """
    metric_device = torch.device(metric_device)
    acc = Accuracy(is_multilabel=True, device=metric_device)

    def make_inputs(shape):
        # Re-seed so every scenario is reproducible per rank.
        torch.manual_seed(10 + rank)
        preds = torch.randint(0, 2, size=shape, device=device).long()
        targets = torch.randint(0, 2, size=shape, device=device).long()
        return preds, targets

    def check_device():
        assert (
            acc._num_correct.device == metric_device
        ), f"{type(acc._num_correct.device)}:{acc._num_correct.device} vs {type(metric_device)}:{metric_device}"

    def gather_as_numpy(preds, targets):
        # Gather predictions and targets from all processes, then flatten:
        # (N, C, H, W, ...) -> (N * H * W ..., C)
        preds = idist.all_gather(preds)
        targets = idist.all_gather(targets)
        return to_numpy_multilabel(preds.cpu()), to_numpy_multilabel(targets.cpu())

    def check_compute(n_before, np_targets, np_preds):
        res = acc.compute()
        assert n_before * idist.get_world_size() == acc._num_examples
        assert isinstance(res, float)
        assert accuracy_score(np_targets, np_preds) == pytest.approx(res)

    # Scenario 1: single update.
    y_pred, y = make_inputs((4, 5, 8, 10))
    acc.update((y_pred, y))
    check_device()
    np_y_pred, np_y = gather_as_numpy(y_pred, y)
    assert acc._type == "multilabel"
    n = acc._num_examples
    check_compute(n, np_y, np_y_pred)

    # Scenario 2: single update after reset, with a different shape.
    acc.reset()
    y_pred, y = make_inputs((4, 7, 10, 8))
    acc.update((y_pred, y))
    check_device()
    np_y_pred, np_y = gather_as_numpy(y_pred, y)
    assert acc._type == "multilabel"
    n = acc._num_examples
    check_compute(n, np_y, np_y_pred)
    # check that result is not changed
    check_compute(n, np_y, np_y_pred)

    # Scenario 3: batched updates.
    acc.reset()
    y_pred, y = make_inputs((80, 5, 8, 10))
    batch_size = 16
    n_iters = y.shape[0] // batch_size + 1
    for step in range(n_iters):
        lo = step * batch_size
        hi = lo + batch_size
        acc.update((y_pred[lo:hi], y[lo:hi]))
    check_device()
    np_y_pred, np_y = gather_as_numpy(y_pred, y)
    assert acc._type == "multilabel"
    n = acc._num_examples
    check_compute(n, np_y, np_y_pred)
# One-hot encode the features of the test samples.
# The month must come from the *test* frame and the concat must use the
# test-side `number_t`; the original mixed in `train['EVENT_DATE']` and the
# training-side `number`, combining rows from two different data sets.
MONTH_t = pd.to_datetime(test['EVENT_DATE']).dt.month
MONTH_t = pd.get_dummies(MONTH_t)
number_t = pd.get_dummies(test['事件数'])
SHENFEN_t = pd.get_dummies(test['ADMIN1'])
test_set = pd.concat([MONTH_t, number_t, SHENFEN_t], axis=1)

# Features are every column except the label 'crime_type'.
x = train_set.loc[:, train_set.columns != 'crime_type']
y = train_set['crime_type']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=1)

# Bernoulli naive Bayes: compare predictions with the ground truth and
# print the accuracy.
model = BernoulliNB()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("伯努利model accuracy: ", metrics.accuracy_score(y_test, y_pred))

# Logistic regression.
model_LR = LogisticRegression(C=0.1)
model_LR.fit(x_train, y_train)
y_pred = model_LR.predict(x_test)
print("逻辑回归model accuracy: ", metrics.accuracy_score(y_test, y_pred))

# Random forest.
model_RF = RandomForestClassifier()
model_RF.fit(x_train, y_train)
y_pred = model_RF.predict(x_test)
print("随机森林model accuracy: ", metrics.accuracy_score(y_test, y_pred))
# Image data machine learning
from sklearn import model_selection, svm, metrics


def load_csv(fname):
    """Read a label-first CSV of images.

    Each line holds an integer label followed by integer pixel values; the
    pixels are scaled by 1/256. Lines with fewer than two fields are skipped.
    Returns a dict with the keys "labels" and "images".
    """
    labels = []
    images = []
    with open(fname, "r") as f:
        for line in f:
            cols = line.split(",")
            if len(cols) < 2:
                continue
            label, pixels = cols[0], cols[1:]
            labels.append(int(label))
            images.append([int(value) / 256 for value in pixels])
    return {"labels": labels, "images": images}


data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

# Train on the training set and predict the test set.
clf = svm.SVC()
clf.fit(data["images"], data["labels"])
predict = clf.predict(test["images"])

# Check the results.
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("정답률 =", ac_score)
print("리포트 =")
print(cl_report)
# Exhaustive 5-fold search over `tuned_parameters`, optimising `score`.
clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5,
                   scoring=f"{score}")
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()

print("Grid scores on development set:")
print()
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for mean, std, params in zip(means, stds, cv_results['params']):
    # Mean score with a +/- two-standard-deviation band.
    print(f"{mean:.3f} (+/-{std * 2:.3f}) for {params!r}")
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()

y_true = y_test
y_pred = clf.predict(X_test)
print(classification_report(y_true, y_pred))
print("Detailed confusion matrix:")
print(confusion_matrix(y_true, y_pred))
print("Accuracy Score: \n")
print(accuracy_score(y_true, y_pred))
print()
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, Y_data, test_size=0.3, train_size=0.7, random_state=100)

# Estimating the tree model - such as the ensemble comparison
RFC_Model = RandomForestClassifier(max_depth=20, random_state=100,
                                   n_estimators=15)
RFC_Model.fit(X_train, y_train.ravel())
# print(RFC_Model.feature_importances_)

# Persist the model for the web API. The context manager guarantees the file
# is flushed and closed (the bare open() leaked the handle).
with open('model.pkl', 'wb') as model_file:
    pk.dump(RFC_Model, model_file)

# Predicting the output data
y_RFC_pred = RFC_Model.predict(X_test)

# Performance measure (percent). The value used to be computed and discarded.
test_accuracy = accuracy_score(y_test, y_RFC_pred) * 100
print("test accuracy:", test_accuracy)

# The same figure derived from the confusion matrix: correct predictions are
# on the diagonal. trace()/sum() works for any number of classes, not only
# the 2x2 case.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_RFC_pred)
acc = 100 * cm.trace() / cm.sum()
print(acc)

# Accuracy on the training data (percent), to compare against the test score.
y_RFC_pred_train = RFC_Model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_RFC_pred_train) * 100
print("train accuracy:", train_accuracy)
# import pandas
import pandas as pd

col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi',
             'pedigree', 'age', 'label']

# load dataset (the file has no header row)
pima = pd.read_csv("diabetess.txt", header=None, names=col_names)
print(pima.head())

# split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp',
                'pedigree']
X = pima[feature_cols]  # Features
y = pima.label          # Target variable

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

from sklearn.linear_model import LogisticRegression

# Instantiate the model. Only the non-default settings are passed: the
# original spelled out every default, including multi_class='warn', a
# placeholder that newer scikit-learn releases reject as an invalid value.
logreg = LogisticRegression(solver='liblinear', random_state=45)

# fit the model with data
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
def run():
    """Fine-tune the BERT sentiment classifier and keep the best checkpoint.

    Reads ``config.TRAINING_FILE``, maps the ``sentiment`` column to 1
    ("positive") / 0 (anything else), holds out a stratified 10% for
    validation, trains for ``config.EPOCHS`` epochs and writes the model state
    to ``config.MODEL_PATH`` whenever the validation accuracy improves.
    """
    # Read the dataset and binarise the target.
    dfx = pd.read_csv(config.TRAINING_FILE).fillna("none")
    dfx.sentiment = dfx.sentiment.apply(
        lambda x: 1 if x == "positive" else 0
    )

    # Stratified split into training and validation sets.
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.1,
        random_state=42,
        stratify=dfx.sentiment.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = dataset.BERTDataset(
        review=df_train.review.values,
        target=df_train.sentiment.values,
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=4,
    )

    valid_dataset = dataset.BERTDataset(
        review=df_valid.review.values,
        target=df_valid.sentiment.values,
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: biases and LayerNorm parameters are excluded from
    # weight decay, everything else is decayed. The key must be spelled
    # 'weight_decay' -- the previous 'weigt_decay' was stored on the group but
    # never read by the optimizer, so the intended decay was not applied.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    best_accuracy = 0
    for _ in range(config.EPOCHS):
        # One pass over the training data (engine.py).
        engine.train_fn(train_data_loader, model, optimizer, device, scheduler)

        # Evaluate on the validation data and threshold the outputs at 0.5.
        outputs, targets = engine.eval_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5

        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Accuracy Score = {accuracy}")

        # Checkpoint only when the validation accuracy improves.
        if accuracy > best_accuracy:
            torch.save(model.state_dict(), config.MODEL_PATH)
            best_accuracy = accuracy
import numpy as np
import pandas as pd
from sklearn import datasets

# iris.data has no header row; without header=None the first sample would be
# consumed as column names and silently dropped from the data.
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
    header=None)
X = data.iloc[:, :-1].values  # the four measurements
y = data.iloc[:, 4].values    # the species label

# sklearn.cross_validation was removed; train_test_split lives in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Splitting the data for training (70%) and testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("The training data is\n", X_train)
print("The testing data is\n", X_test)
print("The expected result is\n", y_test)

# By using LogisticRegression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

# Training the dataset using Logistic Regression Model
clf.fit(X_train, y_train)

# Prediction
prediction = clf.predict(X_test)
print("The prediction by the machine is\n", prediction)

from sklearn.metrics import accuracy_score

# Finding the accuracy of the model
a = accuracy_score(y_test, prediction)
print("The accuracy of the model is:", a)
def forward(self, inputs, mask=None):
    """Apply the two graph-convolution layers with dropout in between.

    ``inputs`` is handed to ``gcn_1`` unchanged; ``gcn_2`` receives the
    dropped-out output of the first layer followed by ``inputs[1:]``.
    ``mask`` is forwarded to both layers. Returns the output of ``gcn_2``.
    """
    output = self.gcn_1(inputs, mask=mask)
    output = self.dropout(output)
    output = self.gcn_2([output] + inputs[1:], mask=mask)
    return output


if __name__ == "__main__":
    model = GraphClassifier(A[0].shape[0], HIDDEN, output_dimension, BASES, DO, len(A))
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=L2)
    criterion = nn.CrossEntropyLoss()
    # Dense all-zero matrix with the same shape as A[0], used as the first
    # model input.
    X = sparse.csr_matrix(A[0].shape).todense()

    for epoch in range(NB_EPOCH):
        t = time.time()
        # Forward pass over the whole graph; the loss is taken on the
        # training indices only.
        output = model([X] + A)
        # Targets are stored one-hot; argmax recovers the class index.
        gold = y_train[idx_train].argmax(dim=-1)
        loss = criterion(output[idx_train], gold)
        score = accuracy_score(output[idx_train].argmax(dim=-1), gold)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print("train_accuracy:", score, "loss:,", loss.item(), "time:", time.time() - t)

    # NOTE(review): the test metrics reuse `output` from the last training
    # forward pass rather than a fresh evaluation-mode pass -- confirm this is
    # intended. The placement of this section after the loop is also an
    # assumption; the original indentation was not preserved.
    test_gold = y_test[idx_test].argmax(dim=-1)
    test_output = output[idx_test]
    test_score = accuracy_score(test_output.argmax(dim=-1), test_gold)
    test_loss = criterion(test_output, test_gold)
    print("test_accuracy:", test_score, "loss:", test_loss.item())
Change_W,Change_B = BackPropagation(MATY,MATY_o,Netj,Oj,WEIGHTS) # print(len(WEIGHTS)) # print(len(Change_W)) for i in range(0,len(WEIGHTS)): WEIGHTS[i] -= alpha*Change_W[len(WEIGHTS)-i-1] BIAS[i] -= alpha*Change_B[len(WEIGHTS)-i-1] # print(batch) # break; print(epoch) end3 = time.time() y_pred = prediction(MATX1,WEIGHTS,BIAS) accuracy = accuracy_score(y_test, y_pred) print(accuracy) confusion_matrix = confusion_matrix(y_test,y_pred) print(confusion_matrix) end4 = time.time() print("reading time" , end2-start) print("training time" , end3-end2) print("testing time" , end4-end3)
# In[ ]:


for clf in classifiers:
    print("=" * 30)
    clf_name = clf.__class__.__name__
    print(clf_name)

    clf.fit(x_train, y_train)

    # Training accuracy
    y_train_pred = clf.predict(x_train)
    train_acc = accuracy_score(y_train, y_train_pred)

    # Validation accuracy
    y_valid_pred = clf.predict(x_valid)
    valid_acc = accuracy_score(y_valid, y_valid_pred)
    print("Validation Accuracy: {:.4%}".format(valid_acc))

    # Record one row per classifier. DataFrame.append was removed in
    # pandas 2.0; pd.concat is the supported equivalent.
    log_entry = pd.DataFrame([[clf_name, train_acc, valid_acc]],
                             columns=log_cols)
    log = pd.concat([log, log_entry])


# In[ ]:


# Horizontal bar chart of validation accuracy, one bar per classifier.
log.sort_values('Validation Accuracy', ascending=True).plot.barh(
    x='Classifier', y='Validation Accuracy', figsize=(16, 7))
print(X.head(n=20))
print(Y.head())

# Split the features and the target into training and testing sets.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Fit an SVC and then a 2-nearest-neighbour classifier; for each one print
# accuracy/precision/recall/F1, record them in score_p and show the
# confusion matrix.
clf = SVC()
neigh = KNeighborsClassifier(n_neighbors=2)
for estimator in (clf, neigh):
    estimator.fit(X_train1, y_train1)
    predicted = estimator.predict(X_test1)
    scores = [
        metrics.accuracy_score(y_test1, predicted),
        metrics.precision_score(y_test1, predicted),
        metrics.recall_score(y_test1, predicted),
        metrics.f1_score(y_test1, predicted),
    ]
    print("Accuracy = {}\nPrecision = {}\nRecall = {}\nF1 Score = {}".format(*scores))
    score_p.append(scores)
    print(confusion_matrix(y_test1, predicted))
# Importing the required libraries.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report


# In[106]:


# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)

# Accuracy as a percentage, rounded to two decimals.
mnb_score = round(accuracy_score(y_test, predmnb) * 100, 2)
mnb_report = classification_report(y_test, predmnb)

print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test, predmnb))
print("Score:", mnb_score)
print("Classification Report:", mnb_report)


# **The performance score of the Naive Bayes classifier is 86.06. Since this is a high score, I will treat this model as my baseline.**

# # 5.4.2 Random Forest Classifier
# There is no correlation between our feature (text) and target (review_stars), and this is the reason for choosing a Random Forest Classifier.
# The vital thing for a Random Forest Classifier to make an accurate class prediction is that the trees of the forest and, more importantly, their predictions are uncorrelated (or at least have low correlations with each other).
#
# Random forests are an ensemble learning method for classification. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees.

# In[107]:
def _category_char(file_name, init_string='-'):
    """Return the character of *file_name* that encodes its contaminant category.

    The search window starts one character after the first *init_string* (or,
    if absent, the first '_'; or index 5 if neither occurs) and is 11
    characters wide. The right-most 'W', 'S' or 'T' inside the window is
    returned. When none is found the index is -1, so the last character of
    the name is returned.
    """
    anchor = file_name.find(init_string)
    if anchor == -1:
        anchor = file_name.find('_')
    if anchor == -1:
        print(anchor)
        anchor = 5
    lo = anchor + 1
    hi = anchor + 12
    position = max(file_name.rfind(ch, lo, hi) for ch in ('W', 'S', 'T'))
    return file_name[position]


def _transient_bounds(series, threshold, min_gap=200):
    """Return (start, end) positions of the transient section of *series*.

    *start* is the position of the first value above *threshold*; *end* is the
    first position more than *min_gap* samples after *start* whose value is
    below *threshold*. Either bound is 0 when no such value exists.
    """
    start = 0
    end = 0
    for pos, value in enumerate(series):
        if value > threshold:
            start = pos
            break
    for offset, value in enumerate(series[start:]):
        if value < threshold and offset > min_gap:
            end = start + offset
            break
    return start, end


def _window_stats(series, start, end):
    """Return (pre-mean, transient-mean, post-mean, width) for the bounds."""
    width = end - start
    return (
        series.iloc[:start].mean(),
        series.iloc[start:end].mean(),
        series.iloc[end:].mean(),
        width if width > 0 else 0,
    )


def _feature_table(rolling, columns, echo=False):
    """Build one row of transient features per column of *rolling*.

    *columns* names the output columns in the order: file, pre-transient
    mean, transient mean, post-transient mean, transient width. The threshold
    for each column is its own 70th percentile.
    """
    rows = []
    for name in rolling.columns:
        if echo:
            print(name)  # file name
        series = rolling[name]
        start, end = _transient_bounds(series, series.quantile(0.7))
        rows.append((name,) + _window_stats(series, start, end))
    return pd.DataFrame(rows, columns=columns)


def main():
    """Streamlit entry point with three pages selected from the sidebar.

    * "Home" shows an uploaded image.
    * "Model Training" builds transient features from the rolling standard
      deviation of 'Pressure_tmp' and 'Density' for every CSV in the batch
      directory, trains a random forest on SMOTE-balanced data and pickles it
      to 'finalized_model1.pkl'.
    * "Contaminant Prediction" computes the same features for one uploaded
      CSV and reports the pickled model's prediction.
    """
    global start_dens
    st.title("Multi Variate Contamination in Fuel")
    menu = ["Home", "Model Training", "Contaminant Prediction"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        st.subheader("Home")
        image_file = st.file_uploader("Upload Image",
                                      type=['png', 'jpeg', 'jpg'])
        if image_file is not None:
            file_details = {
                "Filename": image_file.name,
                "FileType": image_file.type,
                "FileSize": image_file.size
            }
            st.write(file_details)
            img = load_image(image_file)
            st.image(img, width=250)

    if choice == "Model Training":
        if st.button("Start Train"):
            data_dir = 'batch31-38_with_target_diff_prx_2000tampered/'
            allResults = glob.glob(data_dir + '*.csv', recursive=True)
            allResults = sorted(allResults,
                                key=lambda path: path.split("/")[-1])
            fileList = directorySearch(data_dir, '.csv')

            # One row per file: name, category character, path, window id.
            # .at[] replaces the chained `df[col][i] = v` assignment, which
            # stops writing through under pandas copy-on-write.
            final_filelist = pd.DataFrame(
                index=range(len(fileList)),
                columns=['file', 'Target', 'file_dir', 'window_id'])
            for i, fileName in enumerate(fileList):
                final_filelist.at[i, 'file'] = fileName
                final_filelist.at[i, 'Target'] = _category_char(fileName)
                final_filelist.at[i, 'file_dir'] = allResults[i]
                final_filelist.at[i, 'window_id'] = i + 1

            # Keep files whose name (before the first '_') splits on '-' into
            # 2-4 parts and whose last part does not contain 'A'.
            kept_files = []
            for path in final_filelist['file']:
                tail = os.path.split(path)[1]
                parts = re.split('-', re.split('_', tail)[0])
                print(parts)
                if len(parts) in (2, 3, 4) and 'A' not in parts[-1]:
                    kept_files.append(tail)

            # Rolling (window 300) standard deviation of pressure and density,
            # one column per file. Each CSV is read once for both signals.
            pressure_std = pd.DataFrame()
            density_std = pd.DataFrame()
            for name in kept_files:
                frame = pd.read_csv(data_dir + name)
                pressure_std[name] = frame['Pressure_tmp'].rolling(300).std()
                density_std[name] = frame['Density'].rolling(300).std()

            # Pressure features plus the overall std/max of each column.
            df9 = _feature_table(pressure_std, [
                'file', 'pre-trans_mean', 'trans_mean', 'post-trans_mean',
                'transient_width'
            ])
            df5 = pressure_std.describe().transpose().reset_index()
            df10 = pd.merge(df9,
                            df5[['index', 'std', 'max']],
                            left_on='file',
                            right_on='index',
                            how='left')
            del df10['index']
            df10 = df10.set_index('file')
            df11 = pd.merge(df10,
                            final_filelist[['file', 'Target']],
                            left_on='file',
                            right_on='file',
                            how='left')
            df11 = df11.set_index('file')
            df11 = df11.astype({
                'pre-trans_mean': 'float64',
                'trans_mean': 'float64',
                'post-trans_mean': 'float64',
                'transient_width': 'float64'
            })

            # Density features.
            df13 = _feature_table(density_std, [
                'file', 'pre-trans_mean-density', 'trans_mean-density',
                'post-trans_mean-density', 'transient_width-density'
            ], echo=True)
            df13 = df13.astype({
                'pre-trans_mean-density': 'float64',
                'trans_mean-density': 'float64',
                'post-trans_mean-density': 'float64',
                'transient_width-density': 'float64'
            })

            df11.drop(['std'], axis=1, inplace=True)
            # .copy() so the fillna below writes to an independent frame.
            df14 = df13[[
                'file', 'pre-trans_mean-density', 'post-trans_mean-density'
            ]].copy()
            df14['pre-trans_mean-density'] = df14[
                'pre-trans_mean-density'].fillna(0)
            df11.dropna(inplace=True)

            # Integer-encode the category characters.
            le = preprocessing.LabelEncoder()
            df11['Target'] = le.fit_transform(df11['Target'])

            df15 = df11.merge(df14, how='inner', on='file')
            del df15['file']
            df15 = df15[[
                'pre-trans_mean', 'trans_mean', 'post-trans_mean',
                'transient_width', 'max', 'pre-trans_mean-density',
                'post-trans_mean-density', 'Target'
            ]]
            st.write(df15)

            # The last column is the target, everything before it a feature.
            features = df15.columns.tolist()
            feature = features[:-1]
            target = features[-1]
            x = df15.loc[:, feature].values
            y = df15.loc[:, target].values
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=98)

            # Oversample the training split only (it was resampled twice
            # before, with the first result thrown away).
            ost = SMOTE()
            os_data_X, os_data_y = ost.fit_resample(x_train, y_train)
            clf_rf_bal = RandomForestClassifier(n_estimators=10,
                                                random_state=99)
            clf_rf_bal = clf_rf_bal.fit(os_data_X, os_data_y)

            # Predict once and reuse the result for both reports.
            y_pred_bal = clf_rf_bal.predict(x_test)
            print('balanced classification report')
            cls_rpt = classification_report(y_test, y_pred_bal)
            st.write(f'classification report : {cls_rpt}')
            bal_ac = accuracy_score(y_test, y_pred_bal)
            st.write(f'accuracy score : {bal_ac}')

            filename = 'finalized_model1.pkl'
            with open(os.path.join(os.getcwd(), filename), 'wb') as model_file:
                pickle.dump(clf_rf_bal, model_file)

    if choice == "Contaminant Prediction":
        st.subheader("Dataset")
        data_file = st.file_uploader("Upload CSV", type=['csv'])
        if st.button("Process") and data_file is not None:
            file_details = {
                "Filename": data_file.name,
                "FileType": data_file.type,
                "FileSize": data_file.size
            }
            st.write(file_details)
            df = pd.read_csv(data_file)
            st.dataframe(df)

            # Category implied by the file name itself.
            fileName = data_file.name
            strCat = _category_char(fileName)
            st.write(f'FileName:{fileName}')
            contaminant_names = {
                'S': 'Type of Contaminant: Sludge',
                'W': 'Type of Contaminant: Water',
                'T': 'Type of Contaminant: Sludge+Water',
            }
            if strCat in contaminant_names:
                st.write('Contaminant Exists')
                st.write(contaminant_names[strCat])
            else:
                st.write('No Contaminant')

            # Pressure: rolling std, its maximum, threshold and mask.
            df4 = pd.DataFrame()
            df4['roll_std'] = df['Pressure_tmp'].rolling(300).std()
            a = df4['roll_std']
            maxx = a.max()
            st.write(a)
            b = a.quantile(0.7)  # threshold: 70th percentile
            st.write(b)
            x = a > b  # values greater than the threshold
            st.write(x)

            # The bounds now default to 0 when no crossing is found; they
            # used to be left unbound, which raised at the first use below.
            start, end = _transient_bounds(a, b)
            file = data_file.name
            (pre_trans_mean, trans_mean, post_trans_mean,
             transient_width) = _window_stats(a, start, end)

            # Density: same procedure.
            df12 = pd.DataFrame()
            df12['roll_std_den'] = df['Density'].rolling(300).std()
            p = df12['roll_std_den']
            q = p.quantile(0.7)  # threshold: 70th percentile
            start_dens, end_dens = _transient_bounds(p, q)
            (pre_trans_mean_dens, trans_mean_dens, post_trans_mean_dens,
             transient_width_dens) = _window_stats(p, start_dens, end_dens)

            zz = {
                'file': file,
                'pre_trans_mean': pre_trans_mean,
                'trans_mean': trans_mean,
                'post_trans_mean': post_trans_mean,
                'pre_trans_mean_dens': pre_trans_mean_dens,
                'trans_mean_dens': trans_mean_dens,
                'post_trans_mean_dens': post_trans_mean_dens
            }
            st.write(zz)

            # Load the model written by the training page. pickle must only
            # ever be used on this locally produced file, never on uploads.
            with open('finalized_model1.pkl', 'rb') as model_file:
                loaded_model = pickle.load(model_file)
            # Feature order must match the training columns.
            result = loaded_model.predict([[
                pre_trans_mean, trans_mean, post_trans_mean, transient_width,
                maxx, pre_trans_mean_dens, post_trans_mean_dens
            ]])
            # NOTE(review): these integer codes come from the LabelEncoder
            # fitted during training, which numbers classes in sorted order of
            # the category characters actually present -- confirm that
            # 0/1/2/3 really correspond to the labels shown here.
            if result == 0:
                st.write('Predicted Contaminant: Sludge')
            if result == 1:
                st.write('Predicted Contaminant: Water')
            if result == 2:
                st.write('Predicted Contaminant: Water+Sludge')
            if result == 3:
                st.write('No Contaminant')
def scorer_01loss(estimator, X, y):
    """Return one minus the accuracy of ``estimator``'s predictions for ``X``.

    ``y`` holds the reference labels the predictions are compared against.
    """
    return 1 - accuracy_score(y, estimator.predict(X))
# Inspect the feature frame and give its columns readable names.
print(x)
x.columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
colormap = n.array(['red', 'lime', 'black'])  # one colour per target value
y = p.DataFrame(iris.target)
print(y)
y.columns = ['Target']

# Ground truth: sepal and petal measurements coloured by the real target.
plt.figure(figsize=(14, 7))
plt.scatter(x.sepallength, x.sepalwidth, c=colormap[y.Target], s=40)
plt.title("Sepal data before model")
plt.show()

plt.figure(figsize=(14, 7))
plt.scatter(x.petallength, x.petalwidth, c=colormap[y.Target])
plt.title("petal data before model")
plt.show()

# Three clusters, matching the three colours and target values used above.
model = KMeans(n_clusters=3)
model.fit(x)
centroids = model.cluster_centers_  # centroid of each cluster
print("centroids", centroids)
labels = model.labels_  # cluster id assigned to each row
print(labels)

plt.figure(figsize=(14, 7))
plt.scatter(x.petallength, x.petalwidth, c=colormap[labels])
plt.title("petal data after model")
plt.show()

# Cluster ids are arbitrary, so a fixed permutation cannot be relied on.
# Relabel every cluster with the most frequent true target among its members
# before scoring.
true_targets = y.Target.values
cluster_to_target = n.array([
    n.bincount(true_targets[labels == cluster], minlength=3).argmax()
    for cluster in range(model.n_clusters)
])
pred_y = cluster_to_target[labels].astype(n.int64)
print(pred_y)
print(so.accuracy_score(y, pred_y))
print(so.confusion_matrix(y, pred_y))
"""confusion matrix"""
results_for_conf_matrix = model.predict(x_test)
results_for_conf_matrix
# For every prediction row keep the index of its largest entry.
results = [list(scores).index(max(scores)) for scores in results_for_conf_matrix]
cm = confusion_matrix(y_test, results)
cm
class_names = ['Cloudy', 'Rain', 'Sun_shine', 'Sunrise']
"""accuracy: 89.7"""
from sklearn.metrics import accuracy_score
accuracy_score(y_test, results)
"""confusion_matrix_without_normalisation"""


def plot_confusion_matrix(cm, classes, normalize=False, title=None, cmap=plt.cm.Blues):
    """Pick a default title for a confusion-matrix plot when none is given.

    NOTE(review): only the title selection is present here; the code that
    would compute and draw the matrix is not part of this snippet.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')
    #Compute confusion matrix
def judge_model(x_train, x_test, y_train, y_test, model):
    """Print ``model``'s accuracy on the training split, then on the test split.

    A separator line and a 'Baseline Performance' heading are printed first.
    """
    print('-' * 20)
    print('Baseline Performance')
    for features, targets in ((x_train, y_train), (x_test, y_test)):
        print('-> Acc:', accuracy_score(targets, model.predict(features)))
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)

# Persist the trained classifier to disk.
joblib.dump(classifier, 'knn_model.sav')

# some time later...

# load the model from disk
loaded_model = joblib.load('knn_model.sav')
print('Accuracy of loaded model')
result = loaded_model.score(X_test, y_test)
print(result)

# y_pred is expected to hold the predictions computed earlier in the script.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(y_pred)

print('Accuracy Score: ', accuracy_score(y_pred, y_test))

f1_metric = f1_score(y_test, y_pred, average='macro')
print("f1 score macro:", f1_metric)

f1_metric_micro = f1_score(y_test, y_pred, average='micro')
print("f1 score micro:", f1_metric_micro)

# Read one flower's measurements from the user to feed to the loaded model.
input_sepal_length = float(input("Enter sepal length: "))
input_sepal_width = float(input("Enter sepal width:"))
input_petal_length = float(input("Enter petal Length: "))
input_petal_width = float(input("Enter petal width: "))

# NOTE(review): the original statement was cut off after `predict([[`. The
# four measurements read above are assumed to be the intended feature row, in
# the order they were requested -- confirm against the training column order.
output = loaded_model.predict([[
    input_sepal_length,
    input_sepal_width,
    input_petal_length,
    input_petal_width,
]])
def scorer(estimator, X, y):
    """Return the accuracy of ``estimator``'s predictions for ``X`` against ``y``."""
    return accuracy_score(y, estimator.predict(X))
def get_accu(model, data, y_gt):
    """Return the accuracy of ``model`` on ``data`` with a 0.5 decision threshold.

    The second column of ``model.predict_proba(data)`` is compared with 0.5
    (presumably the positive-class probability -- confirm against the model)
    and the resulting boolean predictions are scored against ``y_gt``.
    """
    second_column_scores = model.predict_proba(data)[:, 1]
    return accuracy_score(y_gt, second_column_scores > 0.5)
from sklearn.metrics import accuracy_score, confusion_matrix

# Hold out 20% of the tweets for evaluation; the fixed seed makes the split
# repeatable.
train_data, test_data, train_labels, test_labels = train_test_split(
    all_tweets,
    labels,
    test_size=0.2,
    random_state=1,
)
print(len(train_data))
print(len(test_data))

# Build the count vectors: learn the vocabulary on the training tweets only,
# then reuse it for the held-out tweets.
counter = CountVectorizer()
train_counts = counter.fit_transform(train_data)
test_counts = counter.transform(test_data)
print(train_data[3])
print(train_counts[3])

# Train the Naive Bayes classifier and predict the held-out tweets.
classifier = MultinomialNB()
classifier.fit(train_counts, train_labels)
predictions = classifier.predict(test_counts)

# Evaluate with the accuracy score and the confusion matrix.
# Original author's note: accuracy lies around 67.8%, which is ok, but not great.
print(accuracy_score(test_labels, predictions))
print(confusion_matrix(test_labels, predictions))

# Try the classifier on a hand-written tweet.
tweet = 'Pierre, the baquette is tasty'
tweet_counts = counter.transform([tweet])
print(classifier.predict(tweet_counts))
# Imported before its first use in the k sweep below.
from sklearn.neighbors import KNeighborsClassifier

# In[]
#trainn = train.drop(['Name','Age', 'Ticket', 'Fare', 'Cabin'], axis = 1)

# In[]
# Feature columns and target, split into train and test parts with a fixed
# seed.
X = traincopy[['Pclass', 'Sex', 'SibSp', 'Parch', 'Age']]
Y = traincopy['Survived']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

# In[]
# Sweep k from 1 to 20 and report the test accuracy for each value. The loop
# variable is driven by range() alone; no manual increment is needed.
for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    pred = knn.predict(X_test)
    print("Accuracy is", accuracy_score(Y_test, pred) * 100, "for k = ", k)

# In[]
knn = KNeighborsClassifier(n_neighbors=8)

# In[]
knn.fit(X_train, Y_train)
# Notebook-style expression: the score is only shown when this is the last
# line of a notebook cell.
knn.score(X_test, Y_test)

# In[]
pred = knn.predict(testcopy)
target = dataset['Class']

# Keep rows 0-176 and the columns at positions 1 and 12.
# NOTE(review): confirm that `target` has the same number of rows as this
# slice, otherwise the final accuracy_score call cannot compare them.
df = df.iloc[0:177, [1, 12]]

# Standardise the two columns, then project them onto two principal components.
sc = StandardScaler()
df = sc.fit_transform(df)
pca = PCA(n_components=2)
pca_x = pca.fit_transform(df)
pca_df = pd.DataFrame(pca_x, columns=['comp1', 'comp2'])

# Cluster the projected points into three groups with a fixed seed.
KModel = KMeans(n_clusters=3, random_state=2)
KModel.fit_predict(pca_df)
KModel.labels_  # notebook-style display of the cluster ids

colormap = np.array(['Red', 'Blue', 'Green'])
z = plt.scatter(pca_df['comp1'], pca_df['comp2'], c=colormap[KModel.labels_])
KModel.labels_

# NOTE(review): this scores the raw cluster ids against the class column; the
# ids are presumably not aligned with the class values -- confirm.
accuracy_score(target, KModel.labels_)