def try_params(n_iterations, params):
    n_estimators = int(round(n_iterations * trees_per_iteration))
    print("n_estimators:", n_estimators)
    pprint(params)

    classifier = params['classifier']
    del params['classifier']

    # instantiate the classifier named in params; eval() assumes the class
    # name is trusted and already imported at module level
    clf = eval("{}(n_estimators=n_estimators, verbose=0, n_jobs=-1, "
               "**params)".format(classifier))
    clf.fit(x_train, y_train)

    p = clf.predict_proba(x_train)[:, 1]
    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))
    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    #
    p = clf.predict_proba(x_test)[:, 1]
    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))
    print("# testing | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    return {'loss': ll, 'log_loss': ll, 'auc': auc}

def score(self, submission_text: str):
    # each list is seeded with one positive and one negative, presumably so
    # AUC stays defined even if a split ends up single-class
    public_preds, private_preds, public_actuals, private_actuals = \
        [1, 0], [1, 0], [1, 0], [1, 0]
    tmp_file = StringIO(submission_text)
    csv_reader = csv.DictReader(tmp_file)

    public_event_ids_pkl_name = '{}/public_validation_event_ids.pkl'.format(
        cur_dir_path)
    with open(public_event_ids_pkl_name, 'rb') as public_validation_event_ids_file:
        public_validation_event_ids = pickle.load(
            public_validation_event_ids_file)

    for row in csv_reader:
        if row['event_id'] in public_validation_event_ids:
            public_preds.append(float(row['conversion_probability']))
        else:
            private_preds.append(float(row['conversion_probability']))

    with open('{}/all_validation_labels.txt'.format(cur_dir_path), 'r') as all_validation_labels_file:
        for line in all_validation_labels_file:
            event_id, event_label = line.rstrip().split(' ')
            if event_id in public_validation_event_ids:
                public_actuals.append(float(event_label))
            else:
                private_actuals.append(float(event_label))

    public_score = AUC(public_actuals, public_preds)
    private_score = AUC(private_actuals, private_preds)
    return public_score, private_score, None

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_stop=100, retrain=True):
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    model = AutoLGB(objective='binary', metric='auc', n_random_col=0)
    model.tune(pd.DataFrame(X), pd.Series(y))
    params = model.params
    n_est = model.n_best
    logging.info(f'params: {params}')
    logging.info(f'n_best: {n_est}')

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    n_bests = []
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        trn_lgb = lgb.Dataset(X[i_trn], label=y[i_trn])
        val_lgb = lgb.Dataset(X[i_val], label=y[i_val])

        logging.info('Training with early stopping')
        clf = lgb.train(params, trn_lgb, n_est, val_lgb,
                        early_stopping_rounds=n_stop, verbose_eval=100)
        n_best = clf.best_iteration
        n_bests.append(n_best)
        logging.info('best iteration={}'.format(n_best))

        p[i_val] = clf.predict(X[i_val])
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))
        p_tst += clf.predict(X_tst) / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  C, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='svc_{}_{}.log'.format(C, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = SVC(C=C, class_weight='auto', random_state=2015, probability=True)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training CV #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.4f}'.format(
            AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.4f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, lrate=.1, l1=.0, l2=.0, n_fold=5):
    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-4]
    algo_name = 'xgl_{}_{}_{}_{}'.format(n_est, lrate, l1, l2)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    param = {'eta': lrate,
             'objective': 'binary:logistic',
             'colsample_bytree': .7,
             'subsample': .5,
             'eval_metric': 'auc',
             'seed': 2015,
             'booster': 'gblinear',
             'alpha': l1,
             'lambda': l2}

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        dtrain = xgb.DMatrix(X[i_trn], label=y[i_trn])
        dvalid = xgb.DMatrix(X[i_val], label=y[i_val])
        watchlist = [(dvalid, 'eval'), (dtrain, 'train')]

        clf = xgb.train(param, dtrain, n_est, watchlist)
        p_val[i_val] = clf.predict(dvalid)

        logging.info('AUC TRN = {:.6f}'.format(AUC(y[i_trn], clf.predict(dtrain))))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    dtrain = xgb.DMatrix(X, label=y)
    dtest = xgb.DMatrix(X_tst)  # use the features loaded above so train and test share the same parsing
    watchlist = [(dtrain, 'train')]

    clf = xgb.train(param, dtrain, n_est, watchlist)
    p_tst = clf.predict(dtest)

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, batch_size=1024, retrain=True):
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)
    dims = X.shape[1]
    logging.info('{} dims'.format(dims))

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros_like(y)
    p_tst = np.zeros((X_tst.shape[0],))
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        clf = nn_model(dims)
        clf.fit_generator(generator=batch_generator(X[i_trn], y[i_trn], batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X[i_trn].shape[0],
                          verbose=0)
        p[i_val] = clf.predict_generator(generator=batch_generatorp(X[i_val], batch_size, False),
                                         val_samples=X[i_val].shape[0])[:, 0]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                           val_samples=X_tst.shape[0])[:, 0] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        clf = nn_model(dims)
        # samples_per_epoch is required by the Keras 1.x fit_generator API
        clf.fit_generator(generator=batch_generator(X, y, batch_size, True),
                          nb_epoch=n_est,
                          samples_per_epoch=X.shape[0])
        p_tst = clf.predict_generator(generator=batch_generatorp(X_tst, batch_size, False),
                                      val_samples=X_tst.shape[0])[:, 0]

    logging.info('Saving normalized test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')

def eval_pred(y_true, y_pred, eval_type=eval_type):
    # the default eval_type is taken from a module-level variable of the same name
    if eval_type == 'logloss':  # add new eval_types here
        loss = ll(y_true, y_pred)
        print("logloss: ", loss)
        return loss
    elif eval_type == 'auc':
        loss = AUC(y_true, y_pred)
        print("AUC: ", loss)
        return loss
    elif eval_type == 'rmse':
        loss = np.sqrt(mean_squared_error(y_true, y_pred))
        print("rmse: ", loss)
        return loss

def trainModels():
    # SVM looks much better in validation
    print("training SVM...")

    # although one needs to choose these hyperparams
    C = 20           # 173
    gamma = 0.001    # 1.31e-5
    shrinking = True
    probability = True
    verbose = True

    svc = SVC(C=C, gamma=gamma, shrinking=shrinking, probability=probability,
              verbose=verbose)
    svc.fit(x_train, y_train)

    p = svc.predict_proba(x_test)

    # spot-check a few individual rows (sliced to keep the input 2-D)
    print(x_test[12])
    print(svc.predict_proba(x_test[12:13]))
    print(svc.predict_proba(x_test[13:14]))
    print(svc.predict_proba(x_test[14:15]))

    auc = AUC(y_test, p[:, 1])
    print("SVM AUC", auc)

    print("training random forest...")

    n_trees = 100
    max_features = int(round(sqrt(x_train.shape[1]) * 2))  # try more features at each split
    max_features = 'auto'
    verbose = 1
    n_jobs = 1

    rf = RF(n_estimators=n_trees, max_features=max_features, verbose=verbose,
            n_jobs=n_jobs)
    rf.fit(x_train, y_train)

    p = rf.predict_proba(x_test)

    print(x_test[12])
    print(rf.predict_proba(x_test[12:13]))
    print(rf.predict_proba(x_test[13:14]))
    print(rf.predict_proba(x_test[14:15]))

    auc = AUC(y_test, p[:, 1])
    print("RF AUC", auc)

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est=100, depth=4, lrate=.1, n_fold=5, n_bag=50, subrow=.5,
                  subcol=.8):
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='xg_bag{}_{}_{}_{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, lrate, subrow, subcol,
                            feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    xg = xgb.XGBClassifier(max_depth=depth, learning_rate=lrate,
                           n_estimators=n_est, colsample_bytree=.8,
                           subsample=.5, nthread=4)
    clf = BG(xg, n_estimators=n_bag, max_samples=subrow, max_features=subcol)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')

def calculate_statistics(y_pred, y_true):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print('Bayes model results:')
    print('True Positives: ' + str(tp))
    print('True Negatives: ' + str(tn))
    print('False Positives: ' + str(fp))
    print('False Negatives: ' + str(fn))

    fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
    auc_score = AUC(fpr, tpr)
    accuracy_pct = float(tp + tn) / (tp + tn + fp + fn) * 100
    print('AUC ' + str(auc_score))
    print('Accuracy: ' + str(accuracy_pct) + '%')
    return tn, fp, fn, tp, auc_score, str(accuracy_pct)

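# A brief usage sketch for calculate_statistics above. Note that AUC here is
# applied to (fpr, tpr), so it is assumed to be sklearn.metrics.auc rather
# than roc_auc_score; the toy labels below are illustrative only.
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.metrics import auc as AUC

y_true = [0, 0, 1, 1, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0]  # hard 0/1 predictions, as the function expects
tn, fp, fn, tp, auc_score, acc_pct = calculate_statistics(y_pred, y_true)
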
def SuperVisedMakeModel(self, model_name, train_f, train_t, test_f, test_t,
                        is_valid=0, do_cv=2):
    """Training Model"""
    t1 = time.time()
    self._ModelSetting(model_name)
    print('------------------------------------')
    print('Model : %15s' % model_name)
    print('Parameters : %15s' % self.model_p)
    self.clf.fit(train_f, train_t)
    acc1 = float(self.clf.score(train_f, train_t))
    # self.clf.fit(test_f, test_t)
    acc2 = float(self.clf.score(test_f, test_t))
    t2 = time.time()
    print('Training time: %7s seconds' % str(np.round(t2 - t1, 0)))

    """Predicting"""
    train_pred1 = self.clf.predict(train_f)
    # train_pred2 = self.clf.predict_proba(train_f)
    test_pred1 = self.clf.predict(test_f)
    # test_pred2 = self.clf.predict_proba(test_f)
    t3 = time.time()
    print('Predicting time: %7s seconds' % str(np.round(t3 - t2, 0)))

    """CV"""
    if do_cv > 0:
        cv_scores = CV.cross_val_score(self.clf, train_f, train_t, cv=do_cv,
                                       scoring='accuracy')
        # cv_scores2 = CV.cross_val_score(self.clf, train_f, train_t, cv=2, scoring='roc_auc', n_jobs=-1)
        # cv_scores = CV.cross_val_score(self.clf, train_f, train_t, cv=5, scoring='log_loss', n_jobs=-1)
        t4 = time.time()
        print('Cross Validation time: %7s seconds' % str(np.round(t4 - t3, 0)))
        print('CV Accuracy : %.9f (+/- %0.5f)' % (cv_scores.mean(),
                                                  cv_scores.std() * 2))
        print(cv_scores)
        self.models_dict["%s_score" % model_name] = cv_scores.mean()

    """Scoring"""
    print('Training Accuracy: %.9f' % acc1)
    print('Training AUC : %.9f' % AUC(train_t, train_pred1))
    # print('Training LogLoss : %.9f' % LOGLOSS(train_t, train_pred2, eps=1e-15))
    if is_valid:
        print('Valid Accuracy : %.9f' % acc2)
        print('Valid AUC : %.9f' % AUC(test_t, test_pred1))

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  retrain=True):
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(predict_test_file))[0])[0]

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    logging.info('Loading CV Ids')
    cv = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

    p = np.zeros(X.shape[0])
    p_tst = np.zeros(X_tst.shape[0])
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        logging.info('Training model #{}'.format(i))
        glm = linear_model.LogisticRegression(solver='lbfgs', max_iter=2020,
                                              fit_intercept=True,
                                              penalty='none', verbose=0)
        glm.fit(X[i_trn], y[i_trn])
        p[i_val] = glm.predict_proba(X[i_val])[:, 1]
        logging.info('CV #{}: {:.4f}'.format(i, AUC(y[i_val], p[i_val])))

        if not retrain:
            p_tst += glm.predict_proba(X_tst)[:, 1] / N_FOLD

    logging.info('CV: {:.4f}'.format(AUC(y, p)))
    logging.info('Saving validation predictions...')
    np.savetxt(predict_valid_file, p, fmt='%.6f', delimiter=',')

    if retrain:
        logging.info('Retraining with 100% training data')
        glm = linear_model.LogisticRegression(random_state=1, solver='lbfgs',
                                              max_iter=2020, fit_intercept=True,
                                              penalty='none', verbose=0)
        glm = glm.fit(X, y)
        p_tst = glm.predict_proba(X_tst)[:, 1]

    logging.info('Saving test predictions...')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')

def testNB(self):
    self.clf = GaussianNB()
    self.clf.fit(self.X_train, self.y)
    self.NB_p = self.clf.predict_proba(self.X_test)
    self.aucNB = AUC(self.y_test.values, self.NB_p[:, 1])
    print(self.aucNB)
    return self.NB_p

def drift_detector(S, T, threshold=0.75):
    T = pd.DataFrame(T)
    S = pd.DataFrame(S)
    # add a slack variable in_target which is 1 for old (source) rows and 0
    # for new (target) rows
    T['in_target'] = 0  # in target set
    S['in_target'] = 1  # in source set
    # combine source and target with the new slack variable
    ST = pd.concat([T, S], ignore_index=True, axis=0)
    labels = ST['in_target'].values
    ST = ST.drop('in_target', axis=1).values

    # You can use any classifier for this step. We advise a simple one, since
    # the goal is to see whether source and target differ, not to classify
    # them as well as possible.
    clf = LogisticRegression(solver='liblinear')
    predictions = np.zeros(labels.shape)

    # Divide ST into two equal chunks, train LR on one chunk and classify the
    # other, then calculate AUC between the original labels (in_target) and
    # the predicted probabilities.
    skf = StratifiedKFold(n_splits=2, shuffle=True)
    for train_idx, test_idx in skf.split(ST, labels):
        X_train, X_test = ST[train_idx], ST[test_idx]
        y_train, y_test = labels[train_idx], labels[test_idx]
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:, 1]
        predictions[test_idx] = probs

    auc_score = AUC(labels, predictions)

    # signal drift if AUC is larger than the threshold
    return auc_score > threshold

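# A minimal check of drift_detector above on synthetic data; the imports and
# the mean shift of 0.8 are illustrative assumptions, not from the source.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score as AUC
from sklearn.model_selection import StratifiedKFold

rng = np.random.default_rng(0)
source = rng.normal(loc=0.0, scale=1.0, size=(500, 4))  # "old" data
target = rng.normal(loc=0.8, scale=1.0, size=(500, 4))  # shifted "new" data

print(drift_detector(source, target))         # expected: True, distributions differ
print(drift_detector(source, source.copy()))  # expected: False, no drift
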
def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5):
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='rf_{}_{}_{}.log'.format(n_est, depth,
                                                          feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    clf = RF(n_estimators=n_est, max_depth=depth, random_state=2015)
    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    logging.info('Cross validation...')
    p_val = np.zeros_like(y)
    for i_trn, i_val in cv:
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    logging.info('AUC = {:.4f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')

def on_epoch_end(self, epoch, logs={}):
    if epoch % self.interval == 0:
        y_pred = self.model.predict_proba(self.X_val, verbose=0)
        score = AUC(self.y_val, y_pred)
        # logging.info("interval evaluation - epoch: {:d} - score: {:.6f}".format(epoch, score))
        print("interval evaluation - epoch: {:d} - score: {:.6f}".format(
            epoch, score))

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_est, depth, n_fold=5, n_bag=50):
    feature_name = os.path.basename(train_file)[:-4]
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='et_bag{}_{}_{}_{}.log'.format(
                            n_bag, n_est, depth, feature_name))

    logging.info('Loading training and test data...')
    X, y = load_data(train_file)
    X_tst, _ = load_data(test_file)

    et = ET(n_estimators=n_est, max_depth=depth, random_state=2015,
            class_weight='auto', bootstrap=True)
    clf = BG(et, n_estimators=n_bag, max_samples=.8, max_features=.9)

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        clf.fit(X[i_trn], y[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]
        logging.info('AUC TRN = {:.6f}'.format(
            AUC(y[i_trn], clf.predict_proba(X[i_trn])[:, 1])))
        logging.info('AUC VAL = {:.6f}'.format(AUC(y[i_val], p_val[i_val])))

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))

    logging.info('Retraining with 100% data...')
    clf.fit(X, y)
    p_tst = clf.predict_proba(X_tst)[:, 1]

    logging.info('Saving predictions...')
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')
    np.savetxt(predict_test_file, p_tst, fmt='%.6f')

def train_predict(train_file, test_file, predict_valid_file, predict_test_file,
                  n_iter=100, dim=4, lrate=.1, n_fold=5):
    dir_feature = os.path.dirname(train_file)
    dir_val = os.path.dirname(predict_valid_file)

    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'libfm_{}_{}_{}'.format(n_iter, dim, lrate)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info('Loading training data')
    X, y = load_data(train_file)
    n_tst = sum(1 for line in open(test_file))

    cv = StratifiedKFold(y, n_folds=n_fold, shuffle=True, random_state=2015)

    p_val = np.zeros_like(y)
    for i, (i_trn, i_val) in enumerate(cv, 1):
        logging.info('Training model #{}...'.format(i))
        valid_train_file = os.path.join(dir_feature,
                                        '{}.trn{}.sps'.format(feature_name, i))
        valid_test_file = os.path.join(dir_feature,
                                       '{}.val{}.sps'.format(feature_name, i))
        valid_predict_file = os.path.join(dir_val,
                                          '{}.val{}.yht'.format(model_name, i))

        # if there are no CV training or validation files, generate them first
        if (not os.path.isfile(valid_train_file) or
                not os.path.isfile(valid_test_file)):
            dump_svmlight_file(X[i_trn], y[i_trn], valid_train_file,
                               zero_based=False)
            dump_svmlight_file(X[i_val], y[i_val], valid_test_file,
                               zero_based=False)

        subprocess.call(["libFM",
                         "-task", "c",
                         '-dim', '1,1,{}'.format(dim),
                         '-init_stdev', str(lrate),
                         '-iter', str(n_iter),
                         '-train', valid_train_file,
                         '-test', valid_test_file,
                         '-out', valid_predict_file])

        p_val[i_val] = np.loadtxt(valid_predict_file)
        os.remove(valid_predict_file)

    logging.info('AUC = {:.6f}'.format(AUC(y, p_val)))
    np.savetxt(predict_valid_file, p_val, fmt='%.6f')

    logging.info('Retraining with 100% data...')
    subprocess.call(["libFM",
                     "-task", "c",
                     '-dim', '1,1,{}'.format(dim),
                     '-init_stdev', str(lrate),
                     '-iter', str(n_iter),
                     '-train', train_file,
                     '-test', test_file,
                     '-out', predict_test_file])

def testRF(self):
    forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1)
    self.forest = forest.fit(self.X_train, self.y)
    self.rf_p = self.forest.predict_proba(self.X_test)
    self.aucRF = AUC(self.y_test.values, self.rf_p[:, 1])
    print(self.aucRF)
    return self.rf_p

def train_and_eval_auc(train_x, train_y, test_x, test_y, model=LR()):
    model.fit(train_x, train_y)
    p = model.predict_proba(test_x)
    print(p)

    # hack: take the positive-class column if predict_proba returned two columns
    p = p[:, 1] if p.shape[1] > 1 else p[:, 0]

    auc = AUC(test_y, p)
    print("AUC:", auc)

def train_and_evaluate(y_train, x_train, y_val, x_val, alg):
    alg.fit(x_train, y_train)
    p = alg.predict_proba(x_val)
    p_bin = alg.predict(x_val)
    acc = accuracy(y_val, p_bin)
    auc = AUC(y_val, p[:, 1])
    return (auc, acc)

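# A small end-to-end check of the five-argument train_and_evaluate variant
# above; the synthetic split and the aliases LR = LogisticRegression,
# accuracy = accuracy_score, AUC = roc_auc_score are assumptions in line with
# the other snippets in this section.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import roc_auc_score as AUC
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
x_tr, x_va, y_tr, y_va = train_test_split(X_demo, y_demo, test_size=0.2,
                                          random_state=0)
auc, acc = train_and_evaluate(y_tr, x_tr, y_va, x_va, LR(max_iter=1000))
print('AUC={:.4f}, accuracy={:.4f}'.format(auc, acc))
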
def crossValidate(clf, x, y, folds=5, runs=5):
    '''
    Function for doing K-Fold cross validation.

    clf = classifier
    x = training data, numpy NDarray
    y = labels, numpy 1D array
    folds = number of partitions to be made for the training data
    runs = number of times to repeat the cross validation process, each time
           with a different random partition.

    folds=5 and runs=10 will do a 5-fold cross validation 10 times on the
    dataset and calculate the AUC deviation across these 50 instances.
    '''
    ypred = np.zeros((len(y), runs))
    fold_auc = np.zeros((runs, folds))
    score = np.zeros(runs)
    r = 0
    for run in range(runs):
        i = 0
        # some random seeding to make each run unique
        x, y = shuffle(x, y, random_state=19 * (run + 3))
        kf = KFold(len(y), n_folds=folds, random_state=18 * (run + 93))
        print('Cross Validating...')
        for train_ind, test_ind in kf:
            print('CV Fold ' + str(i + 1) + ' out of ' + str(folds))
            xtrain, ytrain = x[train_ind, :], y[train_ind]
            xtest, ytest = x[test_ind, :], y[test_ind]
            clf.fit(xtrain, ytrain)
            # a = 100 * clf.feature_importances_
            # print(["%0.3f" % f for f in a])
            fold_pred = clf.predict_proba(xtest)[:, 1]
            fold_pred[xtest[:, 1] < 23] = 0  # domain rule: zero out predictions where feature 1 < 23
            fold_auc[r, i] = AUC(ytest, fold_pred)
            ypred[test_ind, r] = fold_pred
            i = i + 1
        score[r] = AUC(y, ypred[:, r])
        r = r + 1

    print('Fold AUC: ' + str(fold_auc))
    print('Mean: ' + str(np.mean(fold_auc)))
    print('Deviation: ' + str(np.std(fold_auc)))
    print('\nOverall AUC: ' + str(score))
    print('Mean: ' + str(np.mean(score)))
    print('Deviation: ' + str(np.std(score)))
    return score

def findBetterValidation(df, dft):
    traindf = df
    dft['Y'] = -1
    testdf = dft

    # label train rows 0 and test rows 1, then see how well a classifier can
    # tell them apart (adversarial validation)
    traindf['target'] = 0
    testdf['target'] = 1
    datadf = pd.concat((traindf, testdf))
    datadf = datadf.iloc[np.random.permutation(len(datadf))]
    datadf.reset_index(drop=True, inplace=True)

    x = datadf.drop(['target', 'Y'], axis=1)
    y = datadf.target

    n_estimators = 100
    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=16,
                                 random_state=0)
    scores = cross_val_score(clf, x, y, scoring='roc_auc', cv=5)
    print('old val scores', scores)

    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=16,
                                 random_state=0)
    predictions = np.zeros(y.shape)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5678)
    cv.get_n_splits(x, y)
    for f, (train_i, test_i) in enumerate(cv.split(x, y)):
        x_train = x.iloc[train_i]
        x_test = x.iloc[test_i]
        y_train = y.iloc[train_i]
        y_test = y.iloc[test_i]
        clf.fit(x_train, y_train)
        p = clf.predict_proba(x_test)[:, 1]
        auc = AUC(y_test, p)
        print("# AUC: {:.2%}\n".format(auc))
        print('p', p)
        predictions[test_i] = np.abs(p - 0.5)

    x['p'] = predictions
    x['target'] = datadf.target.copy()
    x['Y'] = datadf.Y.copy()
    index = predictions.argsort()
    train_sorted = x.iloc[index]
    vallen = int(len(train_sorted) * 0.7)

    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=16,
                                 random_state=0)
    scores = cross_val_score(clf,
                             train_sorted.drop(['target', 'Y'], axis=1).iloc[:vallen],
                             train_sorted.target.iloc[:vallen],
                             scoring='roc_auc', cv=5)
    print('new val scores', scores)

    train_sorted = train_sorted[train_sorted.target == 0]
    return train_sorted.drop(['target'], axis=1)

def train_and_evaluate(y_train, x_train, y_val, x_val):
    lr = LR()
    lr.fit(x_train, y_train)
    p = lr.predict_proba(x_val)
    p_bin = lr.predict(x_val)
    acc = accuracy(y_val, p_bin)
    auc = AUC(y_val, p[:, 1])
    return (auc, acc)

def train_and_eval_sklearn_classifier(clf, data):
    x_train = data['x_train']
    y_train = data['y_train']
    x_test = data['x_test']
    y_test = data['y_test']

    clf.fit(x_train, y_train)

    try:
        p = clf.predict_proba(x_train)[:, 1]  # sklearn convention
    except IndexError:
        p = clf.predict_proba(x_train)

    ll = log_loss(y_train, p)
    auc = AUC(y_train, p)
    acc = accuracy(y_train, np.round(p))
    print("\n# training | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    #
    try:
        p = clf.predict_proba(x_test)[:, 1]  # sklearn convention
    except IndexError:
        p = clf.predict_proba(x_test)

    ll = log_loss(y_test, p)
    auc = AUC(y_test, p)
    acc = accuracy(y_test, np.round(p))
    print("# testing | log loss: {:.2%}, AUC: {:.2%}, accuracy: {:.2%}".format(ll, auc, acc))

    # return {'loss': 1 - auc, 'log_loss': ll, 'auc': auc}
    return {'loss': ll, 'log_loss': ll, 'auc': auc}

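# A possible invocation of train_and_eval_sklearn_classifier above; the data
# dict layout follows the function body, while the classifier, split, and
# metric aliases (log_loss, AUC = roc_auc_score, accuracy = accuracy_score)
# are assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import accuracy_score as accuracy
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=600, random_state=1)
x_tr, x_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.25,
                                          random_state=1)
data = {'x_train': x_tr, 'y_train': y_tr, 'x_test': x_te, 'y_test': y_te}
result = train_and_eval_sklearn_classifier(
    RandomForestClassifier(n_estimators=100), data)
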
def evaluate(data, model, ver, cuda_flag, time_fn):
    from sklearn.metrics import roc_auc_score as AUC
    from sklearn.metrics import average_precision_score as AP

    correct_list = []
    score_list = []
    for (X, y) in data:
        outputs, targets = calculate(X, y, model, ver, cuda_flag, time_fn)
        correct_list.extend(y)
        score_list.extend(outputs.data.tolist())

    auc = AUC(correct_list, score_list)
    ap = AP(correct_list, score_list)
    return correct_list, score_list, auc, ap

def test_metric(testset):
    frame_labels = []
    frame_preds = []
    video_labels = []
    video_preds = []
    for i in testset:
        frame_preds += i[2]
        frame_labels += [i[1]] * len(i[2])
        video_preds.append(i[3])
        video_labels.append(i[1])

    video_thres, video_acc = acc_eval(video_labels, video_preds)
    frame_thres, frame_acc = acc_eval(frame_labels, frame_preds)
    video_auc = AUC(video_labels, video_preds)
    frame_auc = AUC(frame_labels, frame_preds)

    rs = {
        'video_acc': video_acc,
        'video_threshold': video_thres,
        'video_auc': video_auc,
        'frame_acc': frame_acc,
        'frame_threshold': frame_thres,
        'frame_auc': frame_auc
    }
    return rs

def eval_pred(y_true, y_pred, eval_type):
    if eval_type == 'logloss':
        loss = ll(y_true, y_pred)
        print("logloss: ", loss)
        return loss
    elif eval_type == 'auc':
        loss = AUC(y_true, y_pred, multi_class='ovo')
        print("AUC: ", loss)
        return loss
    elif eval_type == 'rmse':
        loss = np.sqrt(mean_squared_error(y_true, y_pred))
        print("rmse: ", loss)
        return loss

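# A short usage sketch for the eval_pred dispatcher directly above, assuming
# the module-level aliases ll = log_loss and AUC = roc_auc_score used
# throughout this section; the toy arrays are illustrative.
import numpy as np
from sklearn.metrics import log_loss as ll
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score as AUC

y_true = np.array([0, 1, 1, 0, 1])
y_prob = np.array([0.1, 0.8, 0.6, 0.3, 0.9])

eval_pred(y_true, y_prob, 'logloss')
eval_pred(y_true, y_prob, 'auc')   # for binary targets sklearn uses standard binary AUC
eval_pred(y_true, y_prob, 'rmse')
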
def eval_pred(y_true, y_pred, eval_type):
    if eval_type == 'logloss':  # add new eval_types here
        loss = ll(y_true, y_pred)
        print("logloss: ", loss)
        return loss
    elif eval_type == 'auc':
        loss = AUC(y_true, y_pred)
        print("AUC: ", loss)
        return loss
    elif eval_type == 'rmse':
        loss = np.sqrt(mean_squared_error(y_true, y_pred))
        print("rmse: ", loss)
        return loss

def metrics(label_list, pred_list, pos_prob_list):
    metric_dict = dict()
    for m in config['metric']:
        if m == 'fbs':
            metric_dict[m] = FBS(label_list, pred_list, 1)
        elif m == 'acc':
            metric_dict[m] = ACC(label_list, pred_list)
        elif m == 'auc':
            metric_dict[m] = AUC(label_list, pos_prob_list)
        else:
            # a bare `raise` outside an except block has nothing to re-raise,
            # so raise an explicit error instead
            raise NotImplementedError('No such metric: {}. Implement it.'.format(m))
    return metric_dict
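
# A minimal driver for metrics above. FBS, ACC, AUC, and config are not shown
# in the source; plausible stand-ins are wired up here as assumptions, with
# FBS wrapped so the positional beta in the call above maps onto
# fbeta_score's keyword argument.
from sklearn.metrics import accuracy_score as ACC
from sklearn.metrics import fbeta_score
from sklearn.metrics import roc_auc_score as AUC

def FBS(y_true, y_pred, beta):
    return fbeta_score(y_true, y_pred, beta=beta)

config = {'metric': ['fbs', 'acc', 'auc']}
print(metrics([0, 1, 1, 0], [0, 1, 0, 0], [0.2, 0.9, 0.4, 0.1]))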