def evaluate_resampling(X, y, X_test, y_test, clf=None): """For evaluating various resampling methods""" if clf is None: clf = RandomForestClassifier(n_estimators=400, random_state=5, n_jobs=-1) probas = cross_val_predict( clf, X, y, cv=StratifiedKFold(n_splits=3), n_jobs=-1, method="predict_proba", verbose=2, ) pred_indices = np.argmax(probas, axis=1) classes = np.unique(y) preds = classes[pred_indices] print("Cross validation on training data: ") print("Log loss: {}".format(log_loss(y, probas))) print("Accuracy: {}".format(balanced_accuracy_score(y, preds))) print("F1 score: {}".format(f1_score(y, preds, average="micro"))) print("Validation on testing data: ") clf.fit(X, y) ytest = clf.predict(X_test) yprobas_test = clf.predict_proba(X_test) print("Log loss: {}".format(log_loss(y_test, yprobas_test))) print("Accuracy: {}".format(balanced_accuracy_score(y_test, ytest))) print("F1 score: {}".format(f1_score(y_test, ytest, average="micro")))
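# Usage sketch for evaluate_resampling (not from the original source): the synthetic
# dataset and the LogisticRegression alternative are illustrative assumptions; only the
# function signature above is taken as given.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=600, n_classes=3, n_informative=6,
                                   random_state=5)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, stratify=y_all, random_state=5)
evaluate_resampling(X_tr, y_tr, X_te, y_te)  # default RandomForest
evaluate_resampling(X_tr, y_tr, X_te, y_te,
                    clf=LogisticRegression(max_iter=1000))  # compare another classifier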
def all_metrics(num, model, train, test, target, target_test):
    """Calculate and print metric scores for a fitted model (num is unused here)."""
    ytrain = model.predict(train)
    yprobas = model.predict_proba(train)
    ytest = model.predict(test)
    yprobas_test = model.predict_proba(test)
    logloss_train = log_loss(target, yprobas)
    logloss_test = log_loss(target_test, yprobas_test)
    print("Training Log Loss: ", logloss_train)
    print("Testing Log Loss: ", logloss_test)
    # balanced_accuracy_score is used, so these are balanced accuracies (in %)
    acc_train = round(balanced_accuracy_score(target, ytrain) * 100, 3)
    acc_test = round(balanced_accuracy_score(target_test, ytest) * 100, 3)
    print("Training Balanced Accuracy: ", acc_train)
    print("Testing Balanced Accuracy: ", acc_test)
    f1score_train = f1_score(target, ytrain, average="micro")
    f1score_test = f1_score(target_test, ytest, average="micro")
    print("Training f1 Score: ", f1score_train)
    print("Testing f1 Score: ", f1score_test)
def predict_using_random_model(x_test, y_test, x_cv, y_cv):
    # We need to generate 9 numbers whose sum is 1; one solution is to generate
    # 9 random numbers and divide each of them by their sum.
    # ref: https://stackoverflow.com/a/18662466/4084039
    test_data_len = x_test.shape[0]
    cv_data_len = x_cv.shape[0]

    # we create an output array that has exactly the same size as the CV data
    cv_predicted_y = np.zeros((cv_data_len, 9))
    for i in range(cv_data_len):
        rand_probs = np.random.rand(1, 9)
        cv_predicted_y[i] = (rand_probs / rand_probs.sum())[0]
    print(
        "Log loss on Cross Validation Data using Random Model",
        log_loss(y_cv, cv_predicted_y, eps=1e-15),
    )

    # Test-set error: an output array that has exactly the same size as the test data
    test_predicted_y = np.zeros((test_data_len, 9))
    for i in range(test_data_len):
        rand_probs = np.random.rand(1, 9)
        test_predicted_y[i] = (rand_probs / rand_probs.sum())[0]
    print(
        "Log loss on Test Data using Random Model",
        log_loss(y_test, test_predicted_y, eps=1e-15),
    )

    predicted_y = np.argmax(test_predicted_y, axis=1)
    plot_confusion_matrix(y_test, predicted_y + 1)
    return predicted_y, y_test
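# A vectorized sketch of the same row-normalised random baseline (illustrative, not
# part of the original function): each row of `random_probs` sums to 1.
import numpy as np

n_rows, n_classes = 100, 9  # illustrative sizes
rand = np.random.rand(n_rows, n_classes)
random_probs = rand / rand.sum(axis=1, keepdims=True)
assert np.allclose(random_probs.sum(axis=1), 1.0)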
def text_only_model(result, y):
    # keep only words that occur more than 3 times
    text_vectorizer = CountVectorizer(min_df=4)
    x = text_vectorizer.fit_transform(result['Text'])
    Dataset = normalize(x, axis=0)
    X_tr, X_cv, y_tr, y_cv = cross_validation.train_test_split(Dataset, y, test_size=0.3)
    X_tr, X_test, y_tr, y_test = cross_validation.train_test_split(
        X_tr, y_tr, test_size=0.3)

    tunes_para = [10 ** x for x in range(-5, 1)]
    cv_array_loss = []
    # tune alpha over these values
    for i in tunes_para:
        print("for alpha =", i)
        clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2',
                            loss='log', random_state=42)
        clf.fit(X_tr, y_tr)
        clf2 = CalibratedClassifierCV(clf, method="sigmoid")
        clf2.fit(X_tr, y_tr)
        clf2_probs = clf2.predict_proba(X_test)
        cv_array_loss.append(
            log_loss(y_test, clf2_probs, labels=clf.classes_, eps=1e-15))
        # to avoid rounding error while multiplying probabilities we use log-probability estimates
        print("Log Loss :", log_loss(y_test, clf2_probs))
def train_boost(booster, seed, oversampling=-1.0, use_tfidf=False, enable_cv=False,
                use_alldata=False, num_trees=-1):
    train, y, features = prepare_train()

    if use_tfidf:
        print('Using raw tf-idf sparse matrix ... ')
        features = 'auto'
        train_sparse = sparse.csr_matrix(train.values)
        # tfidf_sparse = load_sparse_csr('tfidf_stem_train.npz')
        bm25_sparse = load_sparse_csr('bm25_train.npz')
        # bm25_sparse = bm25_sparse[404290 - 50000:, :]
        # train = sparse.hstack([train_sparse, tfidf_sparse])
        # common_words = load_sparse_csr('train_tfidf_commonwords.npz')
        # symmdif = load_sparse_csr('train_tfidf_symmdiff.npz')
        train = sparse.hstack([train_sparse, bm25_sparse])
        del train_sparse, bm25_sparse
        print('Train shape: ', train.shape)

    if enable_cv:
        train, y = shuffle(train, y)
        booster.cv(train, y)
        exit()

    if use_alldata:
        print('Using all data to fit classifier ... ')
        assert num_trees > 0
        results = booster.fit_all(train, y, num_trees, features)
    else:
        print('Using train/dev split to fit classifier ... ')
        X_train, X_eval, y_train, y_eval = train_test_split(
            train, y, stratify=y, test_size=0.20, random_state=seed)
        if oversampling > 0:
            print('Oversampling X_train, X_eval datasets ... ')
            X_train, y_train = oversample_sparse(X_train, y_train, p=oversampling)
            X_eval, y_eval = oversample_sparse(X_eval, y_eval, p=oversampling)
        results = booster.fit(X_train, X_eval, y_train, y_eval, features)
        y_pred = booster.predict(X_eval)
        print(log_loss(y_eval, y_pred))
        print(y_pred)

    # free the training data before returning
    del train, y
    return results
def logreg(train, test, cv, y_tr, y_cv):
    # train/cv feature matrices plus their labels; `test` is accepted for
    # interface symmetry but is not used below
    tunes_para = [10 ** x for x in range(-5, 1)]
    cv_array_loss = []
    # tune alpha over these values
    for i in tunes_para:
        print("for alpha =", i)
        clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2',
                            loss='log', random_state=42)
        clf.fit(train, y_tr)
        clf2 = CalibratedClassifierCV(clf, method="sigmoid")
        clf2.fit(train, y_tr)
        clf2_probs = clf2.predict_proba(cv)
        cv_array_loss.append(
            log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15))
        # to avoid rounding error while multiplying probabilities we use log-probability estimates
        print("Log Loss :", log_loss(y_cv, clf2_probs))
def get_sgd_lr_model_cross_grid_search():
    # logistic regression with hyper-parameter tuning
    alpha = [10 ** x for x in range(-5, 6)]  # hyperparam for SGD classifier.
    f1_score_error_array = []
    for i in alpha:
        clf = SGDClassifier(alpha=i, loss='log', penalty='l2', random_state=42)
        lr_clf = OneVsRestClassifier(clf)
        lr_clf.fit(xtrain_tfidf, ytrain)
        y_pred_new = lr_clf.predict(xval_tfidf)
        f1_score_error_array.append(f1_score(yval, y_pred_new, average="micro"))
        print('For values of alpha = ', i, "The f1_score is:",
              f1_score(yval, y_pred_new, average="micro"))
        # log loss expects probability estimates rather than hard label predictions
        print('For values of alpha = ', i, "The log-loss is:",
              log_loss(yval, lr_clf.predict_proba(xval_tfidf), eps=1e-15))

    fig, ax = plt.subplots()
    ax.plot(alpha, f1_score_error_array, c='g')
    for i, txt in enumerate(np.round(f1_score_error_array, 3)):
        ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], f1_score_error_array[i]))
    plt.grid()
    plt.title("Cross Validation Error for each alpha")
    plt.xlabel("Alpha i's")
    plt.ylabel("Error measure")
    plt.show()

    best_alpha = int(np.argmax(f1_score_error_array))
    clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
    lr_clf = OneVsRestClassifier(clf)
    lr_clf.fit(xtrain_tfidf, ytrain)
    y_pred_new = lr_clf.predict(xval_tfidf)
    # list.append returns None, so compute the final score directly
    final_f1_score = f1_score(yval, y_pred_new, average="micro")
    print('f1_score from SGDClassifier model after CV grid search is : {}'.format(final_f1_score))
    return lr_clf, final_f1_score
def get_scores(y_true,y_pred): brier_score = brier_score_loss(y_true,y_pred) log_score = log_loss(y_true,y_pred) roc_score = roc_auc_score(y_true, y_pred) pr_score = average_precision_score(y_true,y_pred) r2score = r2_score(y_true,y_pred) return math.sqrt(brier_score),log_score,roc_score,pr_score,r2score
def get_scores(shots): y_true = [shot.result for shot in shots] y_pred = [shot.pred for shot in shots] brier_score = brier_score_loss(y_true,y_pred) log_score = log_loss(y_true,y_pred) roc_score = roc_auc_score(y_true, y_pred) pr_score = average_precision_score(y_true,y_pred) r2score = r2_score(y_true,y_pred) return math.sqrt(brier_score),log_score,roc_score,pr_score,r2score
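# Usage sketch for get_scores: `Shot` here is a hypothetical stand-in for whatever
# object the caller uses; only the .result / .pred attributes read above are assumed.
from collections import namedtuple

Shot = namedtuple("Shot", ["result", "pred"])
shots = [Shot(1, 0.8), Shot(0, 0.3), Shot(1, 0.6), Shot(0, 0.1)]
rmse_brier, log_score, roc_score, pr_score, r2 = get_scores(shots)
print(rmse_brier, log_score, roc_score, pr_score, r2)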
def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    pred_y = sig_clf.predict(test_x)
    print("log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))
    # this value is a fraction of the test set, not a raw count
    print("Fraction of mis-classified points :",
          np.count_nonzero((pred_y - test_y)) / test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)
def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    clf.fit(train_x, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    pred_y = sig_clf.predict(test_x)

    # for calculating log_loss we provide the array of probabilities for each class
    print("Log loss :", log_loss(test_y, sig_clf.predict_proba(test_x)))

    # calculating the fraction of data points that are misclassified
    print("Fraction of mis-classified points :",
          np.count_nonzero((pred_y - test_y)) / test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)
def variation_only_model(X_tr, X_cv, y_tr, y_cv, X_test, y_test): #Let us do one hot convert of this(for test, train, cv) gene_vectorizer = CountVectorizer() train_Variation_feature_onehotCoding = gene_vectorizer.fit_transform( X_tr['Variation']) test_Variation_feature_onehotCoding = gene_vectorizer.transform( X_test['Variation']) cv_Variation_feature_onehotCoding = gene_vectorizer.transform( X_cv['Variation']) tunes_para = [10**x for x in range(-5, 1)] cv_array_loss = [] # want to tune for alpha in these code for i in tunes_para: print("for alpha =", i) clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42) clf.fit(train_Variation_feature_onehotCoding, y_tr) clf2 = CalibratedClassifierCV(clf, method="sigmoid") clf2.fit(train_Variation_feature_onehotCoding, y_tr) clf2_probs = clf2.predict_proba(cv_Variation_feature_onehotCoding) cv_array_loss.append( log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15)) # to avoid rounding error while multiplying probabilites we use log-probability estimates print("Log Loss :", log_loss(y_cv, clf2_probs)) clf = SGDClassifier(class_weight='balanced', alpha=0.001, penalty='l2', loss='log', random_state=42) clf.fit(train_Variation_feature_onehotCoding, y_tr) clf2 = CalibratedClassifierCV(clf, method="sigmoid") clf2.fit(train_Variation_feature_onehotCoding, y_tr) clf2_probs = clf2.predict_proba(cv_Variation_feature_onehotCoding) cv_array_loss.append( log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15)) # to avoid rounding error while multiplying probabilites we use log-probability estimates print("Log Loss :", log_loss(y_cv, clf2_probs))
def evaluate_features(X, y, X_test, y_test, clf=None,
                      kfold=StratifiedKFold(n_splits=3)):
    """Evaluate features on training and testing data; also compare model
    performance by specifying clf."""
    if clf is None:
        clf = RandomForestClassifier(n_estimators=400, random_state=5, n_jobs=-1)
    probas = cross_val_predict(
        clf,
        X,
        y,
        cv=kfold,
        n_jobs=-1,
        method="predict_proba",
        verbose=2,
    )
    pred_indices = np.argmax(probas, axis=1)
    classes = np.unique(y)
    preds = classes[pred_indices]
    print("Cross validation on training data: ")
    print("Log loss: {}".format(log_loss(y, probas)))
    print("Accuracy: {}".format(balanced_accuracy_score(y, preds)))
    print("F1 score: {}".format(f1_score(y, preds, average="micro")))
    print("Validation on testing data: ")
    clf.fit(X, y)
    ytest = clf.predict(X_test)
    yprobas_test = clf.predict_proba(X_test)
    print("Log loss: {}".format(log_loss(y_test, yprobas_test)))
    print("Accuracy: {}".format(balanced_accuracy_score(y_test, ytest)))
    print("F1 score: {}".format(f1_score(y_test, ytest, average="micro")))
def mea_metrics_calc(num, model, train, test, target, target_test): """ For the calculation and storage of accuracy and log loss """ global mea_all ytrain = model.predict(train) yprobas = model.predict_proba(train) ytest = model.predict(test) yprobas_test = model.predict_proba(test) print("target = ", target[:5]) print("ytrain = ", ytrain[:5]) print("target_test =", target_test[:5]) print("ytest =", ytest[:5]) num_mea = 0 for x in metrics_now: if x == 1: # log loss mea_train = log_loss(target, yprobas) mea_test = log_loss(target_test, yprobas_test) elif x == 2: # accuracy mea_train = round(balanced_accuracy_score(target, ytrain) * 100, 3) mea_test = round( balanced_accuracy_score(target_test, ytest) * 100, 3) elif x == 3: # f1 score mea_train = f1_score(target, ytrain, average="micro") mea_test = f1_score(target_test, ytest, average="micro") print("Measure of", metrics_all[x], "for train =", mea_train) print("Measure of", metrics_all[x], "for test =", mea_test) mea_all[num_mea].append(mea_train) # train mea_all[num_mea + 1].append(mea_test) # test num_mea += 2 return plot_confusion_matrix(model, target_test, ytest)
def find_exclude(self, n_splits=5):
    if not self.model_dict or not self.data_dict:
        print('Stopped: no models or data')
        return None
    for c in self.countries:
        self.model_dict[c].load_data(data=self.data_dict[c], balance=self.balances[c])
        exclude_list = []
        finish = False
        logloss_dict = {}
        while not finish:
            self.model_dict[c].set_exclude_list(exclude_list)
            self.model_dict[c].train()
            exclude_list_prev = exclude_list.copy()
            columns = [
                x for x in self.model_dict[c].get_train().columns
                if x not in exclude_list_prev
            ]
            exclude_list = [
                x for (x, y) in zip(
                    columns, self.model_dict[c].get_feature_importances())
                if y == 0
            ]
            if not exclude_list:
                finish = True
            exclude_list = exclude_list_prev + exclude_list
            logloss_iter = []
            splits = self.model_dict[c].data.get_train_valid(
                n_splits=n_splits, balance=self.balances[c])
            for i in range(0, n_splits):
                self.model_dict[c].set_random_seed(i)
                train, valid = splits[i]
                self.model_dict[c].set_exclude_list(exclude_list)
                self.model_dict[c].train(train[0], train[1])
                pred = self.model_dict[c].predict(valid[0])
                logloss_iter.append(
                    log_loss(valid[1].astype(int), pred['poor']))
            logloss = np.mean(logloss_iter)
            logloss_dict[logloss] = exclude_list
            print('logloss: {0} exclude length: {1}'.format(
                logloss, len(exclude_list)))
        self.exclude_dict[c] = logloss_dict[np.min(
            list(logloss_dict.keys()))]
        print('Country: {0} exclude length: {1}'.format(
            c, len(self.exclude_dict.get(c))))
    return logloss_dict
def Baseline_model(X_tr, X_cv, y_tr, y_cv, X_test, y_test):
    # We need a list of size 9 that sums to 1 for every data point
    test_data_len = X_test.shape[0]
    cv_data_len = X_cv.shape[0]

    # we create an output array that has exactly the same size as the CV data
    cv_predicted = np.zeros((cv_data_len, 9))
    for i in range(cv_data_len):
        rand_probs = np.random.rand(1, 9)
        # array of size 9 that sums to 1; entries lie between 0 and 1
        cv_predicted[i] = (rand_probs / sum(sum(rand_probs)))[0]
    print("Log Loss at cross-validation step is",
          log_loss(y_cv, cv_predicted, eps=1e-15))

    # we create an output array that has exactly the same size as the test data
    test_predicted = np.zeros((test_data_len, 9))
    for i in range(test_data_len):
        rand_probs = np.random.rand(1, 9)
        # array of size 9 that sums to 1; entries lie between 0 and 1
        test_predicted[i] = (rand_probs / sum(sum(rand_probs)))[0]
    print("Log Loss at test step is",
          log_loss(y_test, test_predicted, eps=1e-15))
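# Reference point for the random baselines above (illustrative check, not from the
# original code): predicting a uniform 1/9 for every class gives log loss ln(9) ≈ 2.197.
import numpy as np
from sklearn.metrics import log_loss

n = 1000
uniform = np.full((n, 9), 1.0 / 9)
y_any = np.random.randint(1, 10, size=n)  # labels 1..9, as in the notebook
print(log_loss(y_any, uniform, labels=list(range(1, 10))))  # ≈ 2.197 == np.log(9)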
def evaluate(config, model, valid_loader, test=False): model.eval() loss_fn = nn.CrossEntropyLoss( weight=torch.tensor(config.class_wts, dtype=torch.float).to(config.device) ) total_ts_labels = np.array([], dtype=int) total_ts_preds = np.empty(shape=(0, 9), dtype=int) avg_ts_loss = tnt.meter.AverageValueMeter() with torch.no_grad(): for i, batch in enumerate(valid_loader): batch = [r.to(config.device) for r in batch] x_batch, y_batch = batch y_pred = model(x_batch) loss = loss_fn(y_pred, y_batch) avg_ts_loss.add(loss.item()) y_pred = F.softmax(y_pred, dim=1).detach().cpu().numpy() # (batch_size, 9) total_ts_labels = np.append( total_ts_labels, y_batch.cpu().numpy() ) # (batch_size, 1) total_ts_preds = np.append( total_ts_preds, y_pred, axis=0 ) # (batch_size, 9) encoded_ts_labels = pd.get_dummies(total_ts_labels) # (N, 9) # Accuracy val_acc = balanced_accuracy_score( total_ts_labels, total_ts_preds.argmax(axis=1) ) # argmax -> numeric labels (batch_size, 1) # log loss val_log_loss = log_loss(encoded_ts_labels, total_ts_preds) # f1 score val_f1score = f1_score( total_ts_labels, total_ts_preds.argmax(axis=1), average="micro" ) if test == True: return ( total_ts_labels, total_ts_preds, encoded_ts_labels, avg_ts_loss.value()[0], val_acc, val_log_loss, val_f1score, ) else: return avg_ts_loss.value()[0], val_acc, val_log_loss, val_f1score
def do_model(model, X_train, y_train, X_test, y_test, class_weight=None): if class_weight == 'balanced': sample_weight = unbalanced_sample_weight(y_train) else: sample_weight = None model.fit(X_train, y_train, sample_weight=sample_weight) predict_proba = model.predict_proba(X_test) proba = [x[1] for x in predict_proba] if class_weight == 'balanced': sample_weight = unbalanced_sample_weight(y_test) else: sample_weight = None loss = log_loss(y_test, proba, sample_weight=sample_weight) logger.debug('loss is %f', loss) return model, loss
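# unbalanced_sample_weight is not shown in this snippet; a plausible stand-in (an
# assumption, not the author's helper) that weights samples inversely to class frequency:
from sklearn.utils.class_weight import compute_sample_weight

def unbalanced_sample_weight(y):
    # equivalent to class_weight='balanced', applied per sample
    return compute_sample_weight(class_weight="balanced", y=y)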
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score

def print_metrics(y_true, y_pred):
    print('auc:', roc_auc_score(y_true, y_pred))
    print('accuracy:', accuracy_score(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    # print('confusion matrix:')
    # print('report:', classification_report(y_true, y_pred))
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    print('sensitivity: {}'.format(sensitivity))
    specificity = tn / (tn + fp)
    print('specificity: {}'.format(specificity))
    print('precision: {}'.format(tp / (tp + fp)))
    total_acc = (tp + tn) / (tp + tn + fp + fn)
    random_acc = (((tn + fp) * (tn + fn) + (fn + tp) * (fp + tp))
                  / (tp + tn + fp + fn) ** 2)
    kappa = (total_acc - random_acc) / (1 - random_acc)
    print('Cohen\'s kappa: {}'.format(kappa))
    youdens = sensitivity - (1 - specificity)
    print('Youden\'s index: {}'.format(youdens))
    print('log loss:', log_loss(y_true, y_pred))
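# Sanity check for the hand-rolled Cohen's kappa above (illustrative, not from the
# source): the total/random-accuracy formula matches sklearn's cohen_kappa_score.
from sklearn.metrics import cohen_kappa_score

y_true_toy = [0, 0, 1, 1, 1, 0]
y_pred_toy = [0, 1, 1, 1, 0, 0]
print(cohen_kappa_score(y_true_toy, y_pred_toy))  # 0.333..., same as (total_acc - random_acc) / (1 - random_acc)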
from scipy import sparse

train_q = sparse.hstack([train_vec_1, df3])
train_y = df['is_duplicate']
# ========================================
from tqdm.auto import tqdm
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import SGDClassifier

# best_alpha = np.argmin(log_error_array)
best_alpha = 1
clf = SGDClassifier(alpha=best_alpha, penalty='l2', loss='log', random_state=42)
clf.fit(train_q, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_q, train_y)
predict_y = sig_clf.predict_proba(train_q)
print('For values of best alpha = ', best_alpha, "The train log loss is:",
      log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predicted_y = np.argmax(predict_y, axis=1)
print(accuracy_score(train_y, predicted_y))
# =================================================
import joblib
joblib.dump(tf_idf_vect, 'tf_idf_vect.pkl')
joblib.dump(sig_clf, 'sig_clf.pkl')
optimizer=None) gp_fix.fit(X[:train_size], y[:train_size]) gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)) gp_opt.fit(X[:train_size], y[:train_size]) print("Log Marginal Likelihood (initial): %.3f" % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)) print("Log Marginal Likelihood (optimized): %.3f" % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)) print("Accuracy: %.3f (initial) %.3f (optimized)" % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))) print("Log-loss: %.3f (initial) %.3f (optimized)" % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))) # Plot posteriors plt.figure(0) plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data", edgecolors=(0, 0, 0)) plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data", edgecolors=(0, 0, 0)) X_ = np.linspace(0, 5, 100) plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r', label="Initial kernel: %s" % gp_fix.kernel_) plt.plot(X_, gp_opt.predict_proba(X_[:, np.newaxis])[:, 1], 'b', label="Optimized kernel: %s" % gp_opt.kernel_) plt.xlabel("Feature")
for clf_id, clf in enumerate(base_classifiers):
    print("Training base classifier #{0} -- {1}".format(
        clf_id, clf.__class__.__name__))

    dataset_blend_test_j = np.zeros((XTest.shape[0], N_FOLDS))
    for fold_id, (train_indexes, predict_indexes) in enumerate(splits):
        print("Fold", fold_id)
        # Fit on train part
        clf.fit(XTrain[train_indexes], YTrain[train_indexes])
        # Predict on the rest of data
        y_pred = clf.predict(XTrain[predict_indexes])
        df_blend_train[predict_indexes, clf_id] = y_pred
        lloss = log_loss(YTrain[predict_indexes], y_pred)
        oof_loglosses[clf_id, fold_id] = lloss
        print('LogLoss: ', lloss)
        # Predict on entire test set
        dataset_blend_test_j[:, fold_id] = clf.predict(XTest)
    # Average predictions for test set
    df_blend_test[:, clf_id] = dataset_blend_test_j.mean(1)

print("Out of fold logloss-es:\n", oof_loglosses)
np.save('lgbstacking_train_82features.csv', df_blend_train)
np.save('lgbstacking_test_82features.csv', df_blend_test)

# print "\nBlending ..."
def report_log_loss(train_x, train_y, test_x, test_y, clf): clf.fit(train_x, train_y) sig_clf = CalibratedClassifierCV(clf, method="sigmoid") sig_clf.fit(train_x, train_y) sig_clf_probs = sig_clf.predict_proba(test_x) return log_loss(test_y, sig_clf_probs, eps=1e-15)
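# Usage sketch for report_log_loss, following the calibrate-then-score pattern used
# elsewhere in this file; the one-hot matrices and labels are assumed to be the ones
# built in the surrounding notebook.
from sklearn.naive_bayes import MultinomialNB

cv_logloss = report_log_loss(train_x_onehotCoding, train_y,
                             cv_x_onehotCoding, cv_y, MultinomialNB(alpha=0.1))
print("CV log loss:", cv_logloss)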
# class_weight=None, warm_start=False, average=False, n_iter=None) # some of methods # fit(X, y[, coef_init, intercept_init, …]) Fit linear model with Stochastic Gradient Descent. # predict(X) Predict class labels for samples in X. ############################################################################### log_error_array = [] for i in alpha: clf = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42) clf.fit(X_train, y_train) sig_clf = CalibratedClassifierCV(clf, method="sigmoid") sig_clf.fit(X_train, y_train) predict_y = sig_clf.predict_proba(X_test) log_error_array.append( log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15)) print('For values of alpha = ', i, "The log loss is:", log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15)) ############################################################################### fig, ax = plt.subplots() ax.plot(alpha, log_error_array, c='g') for i, txt in enumerate(np.round(log_error_array, 3)): ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], log_error_array[i])) plt.grid() plt.title("Cross Validation Error for each alpha") plt.xlabel("Alpha i's") plt.ylabel("Error measure") plt.show()
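# The usual next step after this sweep (a sketch mirroring the best-alpha refit pattern
# used elsewhere in this file with SGD + CalibratedClassifierCV, not part of this block):
best_alpha = np.argmin(log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l1', loss='hinge', random_state=42)
clf.fit(X_train, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(X_train, y_train)
print('For best alpha =', alpha[best_alpha], 'the test log loss is:',
      log_loss(y_test, sig_clf.predict_proba(X_test), labels=clf.classes_, eps=1e-15))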
cv_x_responseCoding.shape) # ## Naive Bayes # http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000] cv_log_error_array = [] for i in alpha: print("for alpha =", i) clf = MultinomialNB(alpha=i) clf.fit(train_x_onehotCoding, train_y) sig_clf = CalibratedClassifierCV(clf, method="sigmoid") sig_clf.fit(train_x_onehotCoding, train_y) sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding) cv_log_error_array.append( log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15)) # to avoid rounding error while multiplying probabilites we use log-probability estimates print("Log Loss :", log_loss(cv_y, sig_clf_probs)) fig, ax = plt.subplots() ax.plot(np.log10(alpha), cv_log_error_array, c='g') for i, txt in enumerate(np.round(cv_log_error_array, 3)): ax.annotate((alpha[i], str(txt)), (np.log10(alpha[i]), cv_log_error_array[i])) plt.grid() plt.xticks(np.log10(alpha)) plt.title("Cross Validation Error for each alpha") plt.xlabel("Alpha i's") plt.ylabel("Error measure") plt.show()
alpha = [0.000001, 0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000] cv_log_error_array = [] for i in alpha: print("for alpha = ", i) clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='hinge', random_state=42) clf.fit(train_df, y_train) sig_clf = CalibratedClassifierCV(clf, method='sigmoid') sig_clf.fit(train_df, y_train) sig_clf_probs = sig_clf.predict_proba(cv_df) cv_log_error_array.append( log_loss(y_cv, sig_clf_probs, labels=clf.classes_, eps=1e-15)) print("Log Loss : ", log_loss(y_cv, sig_clf_probs)) best_alpha = np.argmin(cv_log_error_array) print("The best alpha : ", alpha[best_alpha]) clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42) clf.fit(train_df, y_train) sig_clf = CalibratedClassifierCV(clf, method='sigmoid') sig_clf.fit(train_df, y_train) sig_clf_probs = sig_clf.predict_proba(train_df) print("For best alpha, the training log Loss : ", log_loss(y_train, sig_clf_probs))
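# Sketch of the matching held-out check (not in the original block): the CV log loss
# for the refitted best-alpha model, which is the quantity the loop above minimised.
sig_clf_probs_cv = sig_clf.predict_proba(cv_df)
print("For best alpha, the cross-validation log loss : ",
      log_loss(y_cv, sig_clf_probs_cv, labels=clf.classes_, eps=1e-15))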
print("One hot encoding features :") print("(number of data points * number of features) in train data = ", train_x_onehotCoding.shape) print("(number of data points * number of features) in test data = ", test_x_onehotCoding.shape) print("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding.shape) alpha=[0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000] cv_log_error_array=[] for i in alpha: print("for alpha =", i) clf=MultinomialNB(alpha=i) clf.fit(train_x_onehotCoding,train_y) sig_clf=CalibratedClassifierCV(clf,method="sigmoid") sig_clf.fit(train_x_onehotCoding,train_y) sig_clf_probs=sig_clf.predict_proba(cv_x_onehotCoding) cv_log_error_array.append(log_loss(cv_y,sig_clf_probs,labels=clf.classes_,eps=1e-15)) print("Log Loss :",log_loss(cv_y,sig_clf_probs)) fig,ax=plt.subplots() ax.plot(np.log10(alpha),cv_log_error_array,c='g') for i, txt in enumerate(np.round(cv_log_error_array,3)): ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i])) plt.grid() plt.xticks(np.log10(alpha)) plt.title("Cross Validation Error for each alpha") plt.xlabel("Alpha i's") plt.ylabel("Error measure") plt.show() best_alpha=np.argmin(cv_log_error_array) clf=MultinomialNB(alpha=alpha[best_alpha])
def logloss(act, pred, class_weight=None): if class_weight == 'balanced': sample_weight = unbalanced_sample_weight(act) else: sample_weight = None return log_loss(act, pred, sample_weight=sample_weight)
plt.ylabel('Number of data points')
plt.title('Distribution')
plt.grid()
plt.show()
"""
# check the class distribution in all sets (train, test, cv), plot it and find the percentages
test_data_len = test_df.shape[0]
cv_data_len = cv_df.shape[0]

# create a random model for applying log loss
cv_predicted_y = np.zeros((cv_data_len, 9))
for i in range(cv_data_len):
    rand_probs = np.random.rand(1, 9)
    cv_predicted_y[i] = (rand_probs / sum(sum(rand_probs)))[0]
print("log loss on cv data using random model",
      log_loss(y_cv, cv_predicted_y, eps=1e-15))

test_predicted_y = np.zeros((test_data_len, 9))
for i in range(test_data_len):
    rand_probs = np.random.rand(1, 9)
    test_predicted_y[i] = (rand_probs / sum(sum(rand_probs)))[0]
print("log loss on test data using random model",
      log_loss(y_test, test_predicted_y, eps=1e-15))

predicted_y = np.argmax(test_predicted_y, axis=1)
print(predicted_y)
predicted_y = predicted_y + 1
C = confusion_matrix(y_test, predicted_y)

labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
inter_cat = df_full['code'].astype('category')
inter_cat = pd.get_dummies(inter_cat)
p1 = pd.DataFrame(inter_cat.iloc[split[0]], index=X.index.values)
result = pd.concat([X, p1], axis=1, ignore_index=True)
p2 = pd.DataFrame(inter_cat.iloc[split[1]], index=X_test.index.values)
result1 = pd.concat([X_test, p2], axis=1, ignore_index=True)

# Train logistic regression (the liblinear solver supports the l1 penalty)
logreg = linear_model.LogisticRegression(C=1e5, penalty='l1', solver='liblinear',
                                         multi_class='ovr')
logreg.fit(result, y)
clf_probs = logreg.predict_proba(result1)
print("Logistic score", logreg.score(result1, y_true), log_loss(y_true, clf_probs))

# add to test dataframes
predicted = logreg.predict(result)
df['logreg'] = predicted
predicted = logreg.predict(result1)
dft['logreg'] = predicted
'''
# SVM
# train
logreg = svm.LinearSVC()
logreg.fit(result, y)
print("SVM score", logreg.score(result1, y_true))
# add to test dataframes
optimizer=None) gp_fix.fit(X[:train_size], y[:train_size]) gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)) gp_opt.fit(X[:train_size], y[:train_size]) print("Log Marginal Likelihood (initial): %.3f" % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)) print("Log Marginal Likelihood (optimized): %.3f" % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)) print("Accuracy: %.3f (initial) %.3f (optimized)" % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])), accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])))) print("Log-loss: %.3f (initial) %.3f (optimized)" % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]), log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1]))) # Plot posteriors plt.figure() plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data", edgecolors=(0, 0, 0)) plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data", edgecolors=(0, 0, 0))
def text_only_model(result,y): ext_vectorizer = CountVectorizer(min_df=4)# In Feature we choose only those words with greater then 3 times occurence x=text_vectorizer.fit_transform(Result['Text']) X_tr, X_cv, y_tr, y_cv = cross_validation.train_test_split(Dataset, y, test_size=0.3) X_tr, X_test, y_tr, y_test = cross_validation.train_test_split(X_tr, y_tr, test_size=0.3) Dataset=normalize(x,axis=0) tunes_para=[10 ** x for x in range(-5, 1)] cv_array_loss=[] # want to tune for alpha in these code for i in tunes_para: print("for alpha =", i) clf = SGDClassifier(class_weight='balanced',alpha=i,penalty='l2',loss='log',random_state=42) clf.fit(X_tr,y_tr) clf2 = CalibratedClassifierCV(clf, method="sigmoid") clf2.fit(X_tr,y_tr) clf2_probs = clf2.predict_proba(X_test) cv_array_loss.append(log_loss(y_test, clf2_probs, labels=clf.classes_, eps=1e-15)) # to avoid rounding error while multiplying probabilites we use log-probability estimates print("Log Loss :",log_loss(y_test, clf2_probs)) ################################################################ def Combine_features(X_tr, X_cv, y_tr, y_cv,X_tr, X_test, y_tr, y_test,result): #Let us do one hot convert of this(for test, train, cv) gene_vectorizer = CountVectorizer() train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Gene']) test_gene_feature_onehotCoding = gene_vectorizer.transform(X_test['Gene']) cv_gene_feature_onehotCoding = gene_vectorizer.transform(X_cv['Gene']) #Let us do one hot convert of this(for test, train, cv) gene_vectorizer = CountVectorizer() train_Variation_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Variation']) test_Variation_feature_onehotCoding = gene_vectorizer.transform(X_test['Variation']) cv_Variation_feature_onehotCoding = gene_vectorizer.transform(X_cv['Variation']) text_vectorizer = CountVectorizer(min_df=4)# In Feature we choose only those words with greater then 3 times occurence x=text_vectorizer.fit_transform(Result['Text']) Dataset=normalize(x,axis=0) X_tr, X_cv, y_tr, y_cv = cross_validation.train_test_split(Dataset, y, test_size=0.3) X_tr, X_test, y_tr, y_test = cross_validation.train_test_split(X_tr, y_tr, test_size=0.3) # Lets print the shape of all the three Features print(train_gene_feature_onehotCoding.shape) print(test_gene_feature_onehotCoding.shape) print(cv_gene_feature_onehotCoding.shape) # Lets print the shape of all the three Features print(train_Variation_feature_onehotCoding.shape) print(test_Variation_feature_onehotCoding.shape) print(cv_Variation_feature_onehotCoding.shape) print(X_tr.shape) print(X_test.shape) print(X_cv.shape) X_tr=pd.DataFrame(X_tr.todense()) X_test=pd.DataFrame(X_test.todense()) X_cv=pd.DataFrame(X_cv.todense()) train_Variation_feature_onehotCoding=pd.DataFrame(train_Variation_feature_onehotCoding.todense()) test_Variation_feature_onehotCoding=pd.DataFrame(test_Variation_feature_onehotCoding.todense()) cv_Variation_feature_onehotCoding=pd.DataFrame(cv_Variation_feature_onehotCoding.todense()) train_gene_feature_onehotCoding=pd.DataFrame(train_gene_feature_onehotCoding.todense()) test_gene_feature_onehotCoding=pd.DataFrame(test_gene_feature_onehotCoding.todense()) cv_gene_feature_onehotCoding=pd.DataFrame(cv_gene_feature_onehotCoding.todense()) train = X_tr.join(train_gene_feature_onehotCoding,lsuffix="_X_tr",rsuffix="_train_gene_feature_onehotCoding") train = train.join(train_Variation_feature_onehotCoding,lsuffix="_train",rsuffix="_train_Variation_feature_onehotCoding") print(train.shape) test = 
X_test.join(test_gene_feature_onehotCoding,lsuffix="_X_test",rsuffix="_test_gene_feature_onehotCoding") test = test.join(test_Variation_feature_onehotCoding,lsuffix="_test",rsuffix="_test_Variation_feature_onehotCoding") print(test.shape) cv = X_cv.join(test_gene_feature_onehotCoding,lsuffix="_X_cv",rsuffix="_cv_gene_feature_onehotCoding") cv = cv.join(test_Variation_feature_onehotCoding,lsuffix="_cv",rsuffix="_cv_Variation_feature_onehotCoding") print(cv.shape) # Before appliing model lets remove all nan value features=train.columns pd.options.mode.chained_assignment = None for i in features: print("Done") train[i].fillna(0, inplace=True) features=test.columns pd.options.mode.chained_assignment = None for i in features: test[i].fillna(0, inplace=True) features=cv.columns pd.options.mode.chained_assignment = None for i in features: cv[i].fillna(0, inplace=True) return train,test,cv ########################################################## def logreg(train,test,cv): tunes_para=[10 ** x for x in range(-5, 1)] cv_array_loss=[] # want to tune for alpha in these code for i in tunes_para: print("for alpha =", i) clf = SGDClassifier(class_weight='balanced',alpha=i,penalty='l2',loss='log',random_state=42) clf.fit(train,y_tr) clf2 = CalibratedClassifierCV(clf, method="sigmoid") clf2.fit(train,y_tr) clf2_probs = clf2.predict_proba(cv) cv_array_loss.append(log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15)) # to avoid rounding error while multiplying probabilites we use log-probability estimates print("Log Loss :",log_loss(y_cv, clf2_probs)) ########################################################### ################################################################ if __name__=="__main__": main() nltk.download('stopwords') stop=set(stopwords.words('english')) #function to clean the word # Lets see all the stop words print(stop) # loading stop words from nltk library stop_words = set(stopwords.words('english')) def nlp_preprocessing(total_text, index, column): if type(total_text) is not int: string = "" # replace every special char with space total_text = re.sub('[^a-zA-Z0-9\n]', ' ', str(total_text)) # replace multiple spaces with single space total_text = re.sub('\s+',' ', str(total_text)) # converting all the chars into lower-case. total_text = total_text.lower() for word in total_text.split(): # if the word is a not a stop word then retain that word from the data if not word in stop_words: string += word + " " data_text[column][index] = string #text processing stage. 
start_time = time.clock() for index, row in data_text.iterrows(): nlp_preprocessing(row['Text'], index, 'Text') print('Time took for preprocessing the text :',time.clock() - start_time, "seconds") #merging both gene_variations and text data based on ID data = pd.read_csv("training_variants.csv") result = pd.merge(data, data_text,on='ID', how='left') result.head() # Lets split the data into train and test Result=result y=result["Class"].values X_tr, X_cv, y_tr, y_cv = cross_validation.train_test_split(result, y, test_size=0.3) X_tr, X_test, y_tr, y_test = cross_validation.train_test_split(X_tr, y_tr, test_size=0.3) #Let us do one hot convert of this(for test, train, cv) gene_vectorizer = CountVectorizer() train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Gene']) test_gene_feature_onehotCoding = gene_vectorizer.transform(X_test['Gene']) cv_gene_feature_onehotCoding = gene_vectorizer.transform(X_cv['Gene']) #Let us do one hot convert of this(for test, train, cv) gene_vectorizer = CountVectorizer() train_Variation_feature_onehotCoding = gene_vectorizer.fit_transform(X_tr['Variation']) test_Variation_feature_onehotCoding = gene_vectorizer.transform(X_test['Variation']) cv_Variation_feature_onehotCoding = gene_vectorizer.transform(X_cv['Variation']) text_vectorizer = CountVectorizer(min_df=4)# In Feature we choose only those words with greater then 3 times occurence x=text_vectorizer.fit_transform(Result['Text']) Dataset=normalize(x,axis=0) X_tr, X_cv, y_tr, y_cv = cross_validation.train_test_split(Dataset, y, test_size=0.3) X_tr, X_test, y_tr, y_test = cross_validation.train_test_split(X_tr, y_tr, test_size=0.3) # Lets print the shape of all the three Features print(train_gene_feature_onehotCoding.shape) print(test_gene_feature_onehotCoding.shape) print(cv_gene_feature_onehotCoding.shape) # Lets print the shape of all the three Features print(train_Variation_feature_onehotCoding.shape) print(test_Variation_feature_onehotCoding.shape) print(cv_Variation_feature_onehotCoding.shape) print(X_tr.shape) print(X_test.shape) print(X_cv.shape) X_tr=pd.DataFrame(X_tr.todense()) X_test=pd.DataFrame(X_test.todense()) X_cv=pd.DataFrame(X_cv.todense()) train_Variation_feature_onehotCoding=pd.DataFrame(train_Variation_feature_onehotCoding.todense()) test_Variation_feature_onehotCoding=pd.DataFrame(test_Variation_feature_onehotCoding.todense()) cv_Variation_feature_onehotCoding=pd.DataFrame(cv_Variation_feature_onehotCoding.todense()) train_gene_feature_onehotCoding=pd.DataFrame(train_gene_feature_onehotCoding.todense()) test_gene_feature_onehotCoding=pd.DataFrame(test_gene_feature_onehotCoding.todense()) cv_gene_feature_onehotCoding=pd.DataFrame(cv_gene_feature_onehotCoding.todense()) train = X_tr.join(train_gene_feature_onehotCoding,lsuffix="_X_tr",rsuffix="_train_gene_feature_onehotCoding") train = train.join(train_Variation_feature_onehotCoding,lsuffix="_train",rsuffix="_train_Variation_feature_onehotCoding") print(train.shape) test = X_test.join(test_gene_feature_onehotCoding,lsuffix="_X_test",rsuffix="_test_gene_feature_onehotCoding") test = test.join(test_Variation_feature_onehotCoding,lsuffix="_test",rsuffix="_test_Variation_feature_onehotCoding") print(test.shape) cv = X_cv.join(test_gene_feature_onehotCoding,lsuffix="_X_cv",rsuffix="_cv_gene_feature_onehotCoding") cv = cv.join(test_Variation_feature_onehotCoding,lsuffix="_cv",rsuffix="_cv_Variation_feature_onehotCoding") print(cv.shape) features=train.columns pd.options.mode.chained_assignment = None for i in features: 
print("Done") train[i].fillna(0, inplace=True) features=test.columns pd.options.mode.chained_assignment = None for i in features: test[i].fillna(0, inplace=True) features=cv.columns pd.options.mode.chained_assignment = None for i in features: cv[i].fillna(0, inplace=True) print(cv.shape) train.to_csv("rtrain.csv") test.to_csv("rtest.csv") cv.to_csv("rcv.csv") tunes_para=[10 ** x for x in range(-5, 1)] cv_array_loss=[] # want to tune for alpha in these code for i in tunes_para: print("for alpha =", i) clf = SGDClassifier(class_weight='balanced',alpha=i,penalty='l2',loss='log',random_state=42) clf.fit(train,y_tr) clf2 = CalibratedClassifierCV(clf, method="sigmoid") clf2.fit(train,y_tr) clf2_probs = clf2.predict_proba(cv) cv_array_loss.append(log_loss(y_cv, clf2_probs, labels=clf.classes_, eps=1e-15)) # to avoid rounding error while multiplying probabilites we use log-probability estimates print("Log Loss :",log_loss(y_cv, clf2_probs))
import numpy as np
from sklearn.metrics import hamming_loss, log_loss

# binary / multi-class labels: hamming_loss is the fraction of mismatched positions
y_pred = [1, 2, 3, 4]
y_true = [1, 2, 3, 4]   # identical -> 0.0
y_true = [2, 2, 3, 4]   # one mismatch -> 0.25
y_true = [5, 6, 7, 8]   # all mismatched -> 1.0
hamming_loss(y_true, y_pred)
hamming_loss(list("ABFD"), list("ABCD"))

# multi-label indicator format
hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))

y_true = [0, 0, 1, 1]
y_pred = [[.9, .1], [.8, .2], [.3, .7], [.01, .99]]  # [Pr(0), Pr(1)]
log_loss(y_true, y_pred)
"""
Receiver operating characteristic (ROC) Curve

roc_curve?
roc_curve(y_true, y_score, pos_label=None, sample_weight=None, drop_intermediate=True)

Note: this implementation is restricted to the binary classification task.

y_true : array, shape = [n_samples]
    True binary labels in range {0, 1} or {-1, 1}. If labels are not binary,
    pos_label should be explicitly given.

y_score : array, shape = [n_samples]
    Target scores, can either be probability estimates of the positive class,
    confidence values, or non-thresholded measure of decisions
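# Worked check of the log_loss example above (y_true = [0, 0, 1, 1] with the listed
# [Pr(0), Pr(1)] rows): log loss is the mean negative log-probability of the true class.
import numpy as np

manual = -np.mean(np.log([0.9, 0.8, 0.7, 0.99]))
print(manual)  # ≈ 0.1738, matching log_loss(y_true, y_pred)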
def h2o_log_loss(y_actual, y_predict, eps=1e-15, normalize=True, sample_weight=None, y_type=None): """Log loss, aka logistic loss or cross-entropy loss. This is the loss function used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative log-likelihood of the true labels given a probabilistic classifier's predictions. The log loss is only defined for two or more labels. For a single sample with true label yt in {0,1} and estimated probability yp that yt = 1, the log loss is -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp)) This method is adapted from the ``sklearn.metrics.classification.log_loss`` function for use with ``H2OFrame``s in skutil. Parameters ---------- y_actual : ``H2OFrame``, shape=(n_samples,) The one-dimensional ground truth y_predict : ``H2OFrame``, shape=(n_samples, [n_classes]) The predicted labels. Can represent a matrix. If ``y_predict.shape = (n_samples,)`` the probabilities provided are assumed to be that of the positive class. The labels in ``y_predict`` are assumed to be ordered ordinally. eps : float, optional (default=1e-15) Log loss is undefined for p=0 or p=1, so probabilities are clipped to max(eps, min(1 - eps, p)). normalize : bool, optional (default=True) If true, return the mean loss per sample. Otherwise, return the sum of the per-sample losses. sample_weight : H2OFrame or float, optional (default=None) A frame of sample weights of matching dims with y_actual and y_predict. y_type : string, optional (default=None) The type of the column. If None, will be determined. Returns ------- loss : float Notes ----- The logarithm used is the natural logarithm (base-e). """ # SKIP THESE FOR NOW, SINCE VALIDATED IN SKLEARN PORTION # y_type, y_actual, y_predict = _check_targets(y_actual, y_predict, y_type) # _err_for_continuous(y_type) # this is restricted to classification tasks if sample_weight is not None: if isinstance(sample_weight, H2OFrame): _, _, sample_weight = _check_targets(y_actual, sample_weight, 'unknown') # we don't care about y_type here sample_weight = h2o_col_to_numpy(sample_weight) # else we just duck type it later # todo: do this better someday y_actual = h2o_col_to_numpy(y_actual) # this is supposed to be a ONE-dim vector y_predict = y_predict.as_data_frame(use_pandas=True).as_matrix() # this might be 2-dim # if it's a column, make it a vector. if len(y_predict.shape) == 2 and y_predict.shape[1] == 1: y_predict = y_predict.T[0] return log_loss(y_actual, y_predict, eps=eps, normalize=normalize, sample_weight=sample_weight)