import numpy as np
from sklearn.model_selection import train_test_split


def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
    if verbose > 0:
        print("[Start]", params)

    thresholds, scores, decisions = [], [], []

    for i in range(n_repeat):
        if verbose > 0:
            print("Fold", i)

        # Random half split, seeded per repeat for reproducibility
        X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(
            X, y, w, train_size=0.5, random_state=i)

        X_train = np.asfortranarray(X_train, dtype=np.float32)
        w_train = rescale(w_train)
        w_train = rebalance(y_train, w_train)

        clf = Classifier(**params)

        try:
            clf = clf.fit(X_train, y_train, sample_weight=w_train)
        except TypeError:
            # Classifier does not accept sample weights
            clf = clf.fit(X_train, y_train)

        threshold, score, d = find_threshold(clf, X_valid, y_valid, w_valid)
        print(params, i, threshold, score)

        thresholds.append(threshold)
        scores.append(score)
        decisions.append(d)

    if verbose > 0:
        print("[End]", params, np.mean(thresholds), np.mean(scores))

    return (np.mean(scores), np.mean(thresholds), params,
            thresholds, scores, decisions)
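# The helpers rescale, rebalance, and find_threshold are defined elsewhere.
# Below is a minimal sketch of what find_threshold might look like in this
# setting: scan candidate cuts over the classifier's validation scores and
# return the cut that maximizes a weight-aware metric. The weighted-accuracy
# objective and the percentile grid are assumptions, not the original code.
import numpy as np

def find_threshold(clf, X_valid, y_valid, w_valid, n_cuts=100):
    d = clf.predict_proba(X_valid)[:, 1]  # decision scores on the validation half
    best_threshold, best_score = 0.5, -np.inf
    for cut in np.percentile(d, np.linspace(1, 99, n_cuts)):
        y_pred = (d >= cut).astype(int)
        score = np.sum(w_valid * (y_pred == y_valid)) / np.sum(w_valid)
        if score > best_score:
            best_threshold, best_score = cut, score
    return best_threshold, best_score, d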
def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
    if verbose > 0:
        print("[Start]", params)

    thresholds, scores = [], []

    for i in range(n_repeat):
        if verbose > 0:
            print("Fold", i)

        # Hold out half of the data, then augment it with the stacked
        # per-fold predictions of the first-level models
        _, X_fold, _, y_fold, _, w_fold = train_test_split(
            X, y, w, train_size=0.5, random_state=i)
        X_pred = load_predictions("stack/*-fold%d.npy" % i)
        X_fold = np.hstack((X_fold, X_pred))

        X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(
            X_fold, y_fold, w_fold, train_size=0.33, random_state=i)

        X_train = np.asfortranarray(X_train, dtype=np.float32)
        w_train = rescale(w_train)
        w_train = rebalance(y_train, w_train)

        clf = Classifier(**params)

        try:
            clf = clf.fit(X_train, y_train, sample_weight=w_train)
        except TypeError:
            # Classifier does not accept sample weights
            clf = clf.fit(X_train, y_train)

        threshold, score, _ = find_threshold(clf, X_valid, y_valid, w_valid)

        thresholds.append(threshold)
        scores.append(score)

    if verbose > 0:
        print("[End]", params, np.mean(thresholds), np.mean(scores))

    return (np.mean(scores), np.mean(thresholds), params, thresholds, scores)
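# This stacked variant additionally assumes a load_predictions helper. A
# plausible sketch, assuming each first-level model saved one .npy column of
# predictions per fold under stack/:
import glob
import numpy as np

def load_predictions(pattern):
    files = sorted(glob.glob(pattern))
    # One column per first-level model, stacked side by side
    return np.hstack([np.load(f).reshape(-1, 1) for f in files])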
def train_for_threshold(self, features, target='label', num=35000):
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    X_train, y_train = train_df[features].values, train_df[target].values.astype('uint8')
    X_eval, y_eval = val_df[features].values, val_df[target].values.astype('uint8')

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    lgb_model = lgb.train(self.params, lgb_train,
                          num_boost_round=10000,
                          valid_sets=[lgb_train, lgb_eval],
                          valid_names=['train', 'valid'],
                          early_stopping_rounds=100,
                          verbose_eval=1000)

    y_pred = lgb_model.predict(X_eval)

    ## Ground-truth entities of the validation set, plus the predicted
    ## probabilities and corresponding words, sorted consistently
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)

    ## Threshold found by the search
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

    return self.threshold
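# The find_threshold helper shared by this method and the XGBoost/CatBoost
# variants below is defined elsewhere. A minimal sketch, assuming it scans
# probability cuts and keeps the one with the best F1 between the predicted
# word set and the ground-truth entities (treating entities as a flat set is
# a simplification):
import numpy as np

def find_threshold(gt_ent, pred_words, pred_proba):
    gt = set(gt_ent)
    best_cut, best_f1 = 0.5, -1.0
    for cut in np.arange(0.1, 0.9, 0.01):
        pred = {w for w, p in zip(pred_words, pred_proba) if p >= cut}
        tp = len(pred & gt)
        precision = tp / max(len(pred), 1)
        recall = tp / max(len(gt), 1)
        f1 = 2 * precision * recall / max(precision + recall, 1e-12)
        if f1 > best_f1:
            best_cut, best_f1 = cut, f1
    return best_cut, best_f1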
def train_for_threshold(self, features, target='label', num=35000):
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    X_train, y_train = train_df[features].values, train_df[target].values.astype('uint8')
    X_eval, y_eval = val_df[features].values, val_df[target].values.astype('uint8')

    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_eval = xgb.DMatrix(X_eval, y_eval)

    xgb_model = xgb.train(self.params, xgb_train,
                          num_boost_round=1000,
                          evals=[(xgb_train, 'train'), (xgb_eval, 'eval')],
                          early_stopping_rounds=100,
                          verbose_eval=100)

    y_pred = xgb_model.predict(xgb_eval)

    ## Ground-truth entities of the validation set, plus the predicted
    ## probabilities and corresponding words, sorted consistently
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)

    ## Threshold found by the search
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

    return self.threshold
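# Hypothetical usage, identical for the LightGBM, XGBoost, and CatBoost
# variants (the ThresholdTrainer class name and FEATURES list are assumptions):
# trainer = ThresholdTrainer(train_df, params)
# best_threshold = trainer.train_for_threshold(features=FEATURES, num=35000)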
def valid(dataloader, model, criterion, args):
    model.eval()

    with torch.no_grad():
        valid_loss = 0.0
        preds, valid_labels = [], []

        for i, (images, target) in enumerate(dataloader):
            bs, num_tiles, c, h, w = images.size()
            images = images.to(args.device)
            target = target.to(args.device)

            # Dihedral TTA: all 8 flip/transpose combinations of each image
            # (transposing assumes square tiles)
            images = torch.stack([
                images,
                images.flip(-1),
                images.flip(-2),
                images.flip(-1, -2),
                images.transpose(-1, -2),
                images.transpose(-1, -2).flip(-1),
                images.transpose(-1, -2).flip(-2),
                images.transpose(-1, -2).flip(-1, -2)
            ], 1)
            images = images.view(-1, num_tiles, c, h, w)

            # Average the 8 TTA outputs per sample
            output = model(images).view(bs, 8, -1).mean(1).view(-1)
            loss = criterion(output, target.float())

            preds.append(output.detach().cpu().numpy())
            valid_labels.append(target.detach().cpu().numpy())
            valid_loss += loss.item() / len(dataloader)

    preds = np.concatenate(preds)
    valid_labels = np.concatenate(valid_labels)

    # Search the ordinal cut points, bin the regression outputs into ISUP
    # grades, and score with quadratic weighted kappa
    threshold = utils.find_threshold(y_true=valid_labels, y_pred=preds)
    isup_preds = pd.cut(preds,
                        [-np.inf] + list(np.sort(threshold)) + [np.inf],
                        labels=[0, 1, 2, 3, 4, 5])
    score = utils.fast_qwk(isup_preds, valid_labels)
    cm = confusion_matrix(valid_labels, isup_preds)

    return valid_loss, score, cm, threshold
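# A sketch of what utils.find_threshold could be here: optimize the five
# ordinal cut points that maximize quadratic weighted kappa between the
# binned regression outputs and the ISUP labels. Using scipy's Nelder-Mead
# and sklearn's cohen_kappa_score in place of utils.fast_qwk is an assumption.
import numpy as np
import scipy.optimize as opt
from sklearn.metrics import cohen_kappa_score

def find_threshold(y_true, y_pred):
    def neg_qwk(cuts):
        binned = np.digitize(y_pred, np.sort(cuts))
        return -cohen_kappa_score(y_true, binned, weights='quadratic')
    initial = np.array([0.5, 1.5, 2.5, 3.5, 4.5])  # natural grade boundaries
    return opt.minimize(neg_qwk, initial, method='nelder-mead').x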
def train_for_threshold(self, features, target='label', num=35000):
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    X_train, y_train = train_df[features].values, train_df[target].values.astype('uint8')
    X_eval, y_eval = val_df[features].values, val_df[target].values.astype('uint8')

    cat_train = Pool(X_train, y_train)
    cat_eval = Pool(X_eval, y_eval)

    cat_model = catboost.train(cat_train, self.params,
                               iterations=10000,
                               eval_set=cat_eval,
                               early_stopping_rounds=200,
                               verbose=500)

    y_pred = cat_model.predict(cat_eval, prediction_type='Probability')[:, 1]

    ## Ground-truth entities of the validation set, plus the predicted
    ## probabilities and corresponding words, sorted consistently
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)

    ## Threshold found by the search
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

    return self.threshold
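# All three boosting variants also rely on a sort_val helper defined
# elsewhere. A guess at its shape, assuming the validation frame has 'word'
# and 'label' columns (both names are assumptions): collect the ground-truth
# entity words and order the predictions by descending probability.
import numpy as np

def sort_val(val_df, y_pred):
    gt_ent = val_df.loc[val_df['label'] == 1, 'word'].tolist()
    order = np.argsort(-y_pred)
    return gt_ent, val_df['word'].values[order], y_pred[order]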
######## 1.1
fs = 160          # Sampling frequency, given by the data
resolution = 100  # Model resolution (s.t. each bin has 1Hz of width)
freq = 10         # Frequency of interest
density = 0.2     # Desired graph density

### PDC
# Fitting PDC models
eo_pdc = ut.fit_model(eo, fs, resolution, "pdc", freq)
ec_pdc = ut.fit_model(ec, fs, resolution, "pdc", freq)

# Adjacency matrices for 20% density networks
ut.adjacency_matrix(eo_pdc, ut.find_threshold(eo_pdc, density), "eo_pdc_20")
ut.adjacency_matrix(ec_pdc, ut.find_threshold(ec_pdc, density), "ec_pdc_20")

######## 1.2
# Fitting DTF models
eo_dtf = ut.fit_model(eo, fs, resolution, "dtf", freq)
ec_dtf = ut.fit_model(ec, fs, resolution, "dtf", freq)

# Adjacency matrices for 20% density networks
ut.adjacency_matrix(eo_dtf, ut.find_threshold(eo_dtf, density), "eo_dtf_20")
ut.adjacency_matrix(ec_dtf, ut.find_threshold(ec_dtf, density), "ec_dtf_20")
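# Here ut.find_threshold maps a desired density to an edge-weight cut. A
# minimal sketch, assuming ut.fit_model returns a square connectivity matrix
# and that the threshold keeps the strongest `density` fraction of
# off-diagonal connections:
import numpy as np

def find_threshold(conn, density):
    off_diag = conn[~np.eye(conn.shape[0], dtype=bool)]
    return np.quantile(off_diag, 1.0 - density)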
logging.info("Start training") history = model.fit_generator( datagen.flow(train_X, train_y, batch_size=config['train']['batch_size']), epochs=config['train']['epochs'], callbacks=cbs, steps_per_epoch=len(train_X) / config['train']['batch_size'], class_weight=class_weights, validation_data=(valid_X, valid_y)) logging.info("Finish training") model.save_weights(os.path.join(model_path, 'weights.h5')) logging.info("Finish saving model") # Find patch-based threshold from validation data valid_probs = model.predict_proba(valid_X) patch_threshold = find_threshold(valid_probs, valid_y) print("Patch threshold: {}".format(patch_threshold)) logging.info("Patch threshold: {}".format(patch_threshold)) # Fine patient-based threshold from training and validation data train_probs = model.predict_proba(train_X) train_pred = predict_binary(train_probs, patch_threshold) valid_pred = predict_binary(valid_probs, patch_threshold) patient_y = [] patient_probs = [] pre_idx = 0 cur_idx = 0 for patient in train_idx: cur_idx = cur_idx + patient[2] pred_y = train_pred[pre_idx:cur_idx] true_y = train_y[pre_idx:cur_idx]
model = eval(config['model']['name'])(config['dataset']['input_dim'])
model_name = filename.split('/')[-1].split('.')[0]

if trans == "trans":
    model.load_weights(
        os.path.join('../models', model_name + '_trans', 'weights.h5'))
    weights, biases = model.layers[0].get_weights()
else:
    model.load_weights(os.path.join('../models', model_name, 'weights.h5'))
    weights, biases = model.layers[0].get_weights()

model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.Adam(lr=config['optimizer']['lr'],
                                              amsgrad=True),
              metrics=['accuracy'])

valid_probs = model.predict_proba(valid_X)
patch_threshold = find_threshold(valid_probs, valid_y)

# Test data
probs = model.predict_proba(test_X)

# Plot ROC
fpr, tpr, threshold = metrics.roc_curve(test_y, probs)
roc_auc = metrics.auc(fpr, tpr)

# Draw figures
plt.figure()
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
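# Sketches of the two helpers assumed in the last two snippets. Writing
# find_threshold as a Youden's J search on the validation ROC curve is an
# assumption; the original may optimize a different criterion. predict_binary
# simply applies the cut.
import numpy as np
from sklearn import metrics

def find_threshold(probs, labels):
    fpr, tpr, thresholds = metrics.roc_curve(labels, probs)
    return thresholds[np.argmax(tpr - fpr)]  # maximize TPR - FPR

def predict_binary(probs, threshold):
    return (probs >= threshold).astype(int)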
print('Updated baseline training complete in {:.4f}s'.format(time_updated_baseline))
temp_time_updated_based.append(time_updated_baseline)

# print(clf.predict(utils.normalization(ud_data)))
# print(ud_label.squeeze())
scoreD = utils.test(clf, utils.normalization(ud_data), ud_label.squeeze())
print("Updated model score: {}".format(scoreD))
# print("a)", scoreD, scoreA)
temp_c0u.append(scoreD)

''' e) '''
# print(accs)
weight = (len(accs) + 1) / 2
proba_result_all = clf.predict_proba(utils.normalization(ud_data)) * weight
threshold = utils.find_threshold(accs)
# threshold = 0.4
# threshold = utils.find_threshold(accs) - np.std(accs)
# print(accs)
# A large value means the median is much larger than the mean, i.e. most
# members score well (or a few score extremely well); in that case be
# stricter and use the median. A small value means the median is close to
# the mean, i.e. most members are mediocre; then be more lenient and use
# the mean. From printed runs, it looks like the median should be used when
# this value is above 0.8 and the mean when it is below 0.8.
# print("249: ", np.median(accs) - np.mean(accs))
# print("250: ", np.median(accs) / np.mean(accs))
# print(utils.find_threshold(accs))
# print(np.mean(accs))
# print(np.std(accs))
# print(utils.find_threshold(accs) - np.std(accs))
print("Threshold: ", threshold)

for i in range(len(clf_sources)):
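# A sketch of utils.find_threshold consistent with the heuristic discussed
# in the comments above: fall back to the median when it clearly exceeds the
# mean of the member accuracies, otherwise use the mean. The 0.8 cut-off is
# taken from the comment; comparing the median/mean ratio (rather than the
# difference) is an assumption.
import numpy as np

def find_threshold(accs):
    accs = np.asarray(accs)
    if np.median(accs) / np.mean(accs) > 0.8:
        return np.median(accs)
    return np.mean(accs)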