Python find_threshold Examples, utils.find_threshold Python Examples

Example #1

0

Show file

File: grid.py Project: glouppe/kaggle-higgs

def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
    if verbose > 0:
        print "[Start]", params

    thresholds, scores, decisions = [], [], []

    for i in range(n_repeat):
        if verbose > 0:
            print "Fold", i

        X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(X, y, w, train_size=0.5, random_state=i)
        X_train = np.asfortranarray(X_train, dtype=np.float32)

        w_train = rescale(w_train)
        w_train = rebalance(y_train, w_train)

        clf = Classifier(**params)
        try:
            clf = clf.fit(X_train, y_train, sample_weight=w_train)
        except:
            clf = clf.fit(X_train, y_train)

        threshold, score, d = find_threshold(clf, X_valid, y_valid, w_valid)
        print params, i, threshold, score

        thresholds.append(threshold)
        scores.append(score)
        decisions.append(d)

    if verbose > 0:
        print "[End]", params, np.mean(thresholds), np.mean(scores)

    return (np.mean(scores), np.mean(thresholds), params, thresholds, scores, decisions)

Example #2

0

Show file

File: stack.py Project: glouppe/kaggle-higgs

def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
    if verbose > 0:
        print "[Start]", params

    thresholds, scores = [], []

    for i in range(n_repeat):
        if verbose > 0:
            print "Fold", i

        _, X_fold, _, y_fold, _, w_fold = train_test_split(X, y, w, train_size=0.5, random_state=i)
        X_pred = load_predictions("stack/*-fold%d.npy" % i)
        X_fold = np.hstack((X_fold, X_pred))

        X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(X_fold, y_fold, w_fold, train_size=0.33, random_state=i)
        X_train = np.asfortranarray(X_train, dtype=np.float32)

        w_train = rescale(w_train)
        w_train = rebalance(y_train, w_train)

        clf = Classifier(**params)
        try:
            clf = clf.fit(X_train, y_train, sample_weight=w_train)
        except:
            clf = clf.fit(X_train, y_train)

        threshold, score, _ = find_threshold(clf, X_valid, y_valid, w_valid)

        thresholds.append(threshold)
        scores.append(score)

    if verbose > 0:
        print "[End]", params, np.mean(thresholds), np.mean(scores)

    return (np.mean(scores), np.mean(thresholds), params, thresholds, scores)

Example #3

0

Show file

    def train_for_threshold(self, features, target='label', num=35000):
        """Fit a LightGBM model on an ID-based split and search for a threshold.

        Rows of ``self.train_`` with ``ID < num`` form the training part and
        rows with ``ID >= num`` the validation part.  The validation
        predictions go through ``sort_val`` and ``find_threshold``; the first
        value returned by ``find_threshold`` is stored in ``self.threshold``
        and returned.
        """
        frame = self.train_
        fit_part = frame[frame.ID < num]
        holdout_part = frame[frame.ID >= num]

        fit_inputs = fit_part[features].values
        fit_labels = fit_part[target].values.astype('uint8')
        holdout_inputs = holdout_part[features].values
        holdout_labels = holdout_part[target].values.astype('uint8')

        train_set = lgb.Dataset(fit_inputs, fit_labels)
        valid_set = lgb.Dataset(holdout_inputs, holdout_labels,
                                reference=train_set)

        booster = lgb.train(
            self.params,
            train_set,
            num_boost_round=10000,
            valid_sets=[train_set, valid_set],
            valid_names=['train', 'valid'],
            early_stopping_rounds=100,
            verbose_eval=1000,
        )
        holdout_scores = booster.predict(holdout_inputs)

        # Ground-truth entities of the validation set, plus the predicted
        # probabilities and their corresponding words in sorted order.
        gt_ent, pred_words, pred_proba = sort_val(holdout_part, holdout_scores)

        # Threshold produced by the search; the second result is unused.
        found_threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
        self.threshold = found_threshold

        return self.threshold

Example #4

0

Show file

File: xgb_train.py Project: zhushaoquan/sohu_2019

    def train_for_threshold(self, features, target='label', num=35000):
        """Fit an XGBoost model on an ID-based split and search for a threshold.

        Rows of ``self.train_`` with ``ID < num`` form the training part and
        rows with ``ID >= num`` the validation part.  The validation
        predictions go through ``sort_val`` and ``find_threshold``; the first
        value returned by ``find_threshold`` is stored in ``self.threshold``
        and returned.
        """
        frame = self.train_
        fit_part = frame[frame.ID < num]
        holdout_part = frame[frame.ID >= num]

        fit_inputs = fit_part[features].values
        fit_labels = fit_part[target].values.astype('uint8')
        holdout_inputs = holdout_part[features].values
        holdout_labels = holdout_part[target].values.astype('uint8')

        train_matrix = xgb.DMatrix(fit_inputs, fit_labels)
        eval_matrix = xgb.DMatrix(holdout_inputs, holdout_labels)

        # Both matrices are watched during boosting.
        watchlist = [(train_matrix, 'train'), (eval_matrix, 'eval')]
        booster = xgb.train(
            self.params,
            train_matrix,
            num_boost_round=1000,
            evals=watchlist,
            early_stopping_rounds=100,
            verbose_eval=100,
        )
        holdout_scores = booster.predict(eval_matrix)

        # Ground-truth entities of the validation set, plus the predicted
        # probabilities and their corresponding words in sorted order.
        gt_ent, pred_words, pred_proba = sort_val(holdout_part, holdout_scores)

        # Threshold produced by the search; the second result is unused.
        found_threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
        self.threshold = found_threshold

        return self.threshold

Example #5

0

Show file

File: train.py Project: lzj1769/PANDA

def valid(dataloader, model, criterion, args):
    """Run one validation pass with 8-view dihedral test-time augmentation.

    Each batch is expanded into eight views (the input, its three flips, and
    the same four applied to the transpose over the last two axes); the model
    outputs for the eight views are averaged per sample.  After the loop the
    thresholds from ``utils.find_threshold`` are used to bin the averaged
    outputs into labels 0-5, which are scored with ``utils.fast_qwk``.

    Args:
        dataloader: Iterable of ``(images, target)`` batches; ``images`` is
            unpacked as ``(bs, num_tiles, c, h, w)``.
        model: Network to evaluate; switched to eval mode here.
        criterion: Loss called as ``criterion(output, target.float())``.
        args: Object providing ``device``.

    Returns:
        Tuple ``(valid_loss, score, cm, threshold)``: the loss averaged over
        batches, the ``fast_qwk`` score, the confusion matrix and the
        thresholds.
    """
    model.eval()

    with torch.no_grad():
        valid_loss = 0.0
        batch_outputs, batch_targets = [], []

        for images, target in dataloader:
            bs, num_tiles, c, h, w = images.size()

            images = images.to(args.device)
            target = target.to(args.device)

            # dihedral TTA: plain and transposed input, each with no flip,
            # flip on the last axis, flip on the second-to-last axis, and both.
            views = []
            for base in (images, images.transpose(-1, -2)):
                views.extend(
                    [base, base.flip(-1), base.flip(-2), base.flip(-1, -2)])
            augmented = torch.stack(views, 1).view(-1, num_tiles, c, h, w)

            # Average the eight per-view outputs of every sample.
            raw = model(augmented)
            output = raw.view(bs, 8, -1).mean(1).view(-1)
            loss = criterion(output, target.float())

            batch_outputs.append(output.detach().cpu().numpy())
            batch_targets.append(target.detach().cpu().numpy())
            valid_loss += loss.item() / len(dataloader)

        preds = np.concatenate(batch_outputs)
        valid_labels = np.concatenate(batch_targets)

        threshold = utils.find_threshold(y_true=valid_labels, y_pred=preds)

        # Sorted thresholds, padded with +/- infinity, are the bin edges.
        bin_edges = [-np.inf, *np.sort(threshold), np.inf]
        isup_preds = pd.cut(preds, bin_edges, labels=list(range(6)))

        score = utils.fast_qwk(isup_preds, valid_labels)
        cm = confusion_matrix(valid_labels, isup_preds)

        return valid_loss, score, cm, threshold

Example #6

0

Show file

    def train_for_threshold(self, features, target='label', num=35000):
        """Fit a CatBoost model on an ID-based split and search for a threshold.

        Rows of ``self.train_`` with ``ID < num`` form the training part and
        rows with ``ID >= num`` the validation part.  Column 1 of the
        validation probabilities goes through ``sort_val`` and
        ``find_threshold``; the first value returned by ``find_threshold`` is
        stored in ``self.threshold`` and returned.
        """
        frame = self.train_
        fit_part = frame[frame.ID < num]
        holdout_part = frame[frame.ID >= num]

        fit_inputs = fit_part[features].values
        fit_labels = fit_part[target].values.astype('uint8')
        holdout_inputs = holdout_part[features].values
        holdout_labels = holdout_part[target].values.astype('uint8')

        train_pool = Pool(fit_inputs, fit_labels)
        eval_pool = Pool(holdout_inputs, holdout_labels)

        fitted = catboost.train(
            train_pool,
            self.params,
            iterations=10000,
            eval_set=eval_pool,
            early_stopping_rounds=200,
            verbose=500,
        )
        probabilities = fitted.predict(eval_pool, prediction_type='Probability')
        holdout_scores = probabilities[:, 1]

        # Ground-truth entities of the validation set, plus the predicted
        # probabilities and their corresponding words in sorted order.
        gt_ent, pred_words, pred_proba = sort_val(holdout_part, holdout_scores)

        # Threshold produced by the search; the second result is unused.
        found_threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
        self.threshold = found_threshold

        return self.threshold

Example #7

0

Show file

File: stack.py Project: glouppe/kaggle-higgs

def _parallel_eval(Classifier, params, X, y, w, n_repeat=5, verbose=1):
    if verbose > 0:
        print "[Start]", params

    thresholds, scores = [], []

    for i in range(n_repeat):
        if verbose > 0:
            print "Fold", i

        _, X_fold, _, y_fold, _, w_fold = train_test_split(X,
                                                           y,
                                                           w,
                                                           train_size=0.5,
                                                           random_state=i)
        X_pred = load_predictions("stack/*-fold%d.npy" % i)
        X_fold = np.hstack((X_fold, X_pred))

        X_train, X_valid, y_train, y_valid, w_train, w_valid = train_test_split(
            X_fold, y_fold, w_fold, train_size=0.33, random_state=i)
        X_train = np.asfortranarray(X_train, dtype=np.float32)

        w_train = rescale(w_train)
        w_train = rebalance(y_train, w_train)

        clf = Classifier(**params)
        try:
            clf = clf.fit(X_train, y_train, sample_weight=w_train)
        except:
            clf = clf.fit(X_train, y_train)

        threshold, score, _ = find_threshold(clf, X_valid, y_valid, w_valid)

        thresholds.append(threshold)
        scores.append(score)

    if verbose > 0:
        print "[End]", params, np.mean(thresholds), np.mean(scores)

    return (np.mean(scores), np.mean(thresholds), params, thresholds, scores)

Example #8

0

Show file

######## 1.1

# Shared settings for every model fitted below.
fs = 160          # Sampling frequency, given by the data
resolution = 100  # Resolution of the model (s.t. each bin has 1Hz of width)
freq = 10         # Frequency of interest
density = 0.2     # Desired density of the graph


### PDC

# Fit the PDC models for eo and ec.
eo_pdc = ut.fit_model(eo, fs, resolution, "pdc", freq)
ec_pdc = ut.fit_model(ec, fs, resolution, "pdc", freq)

# Adjacency matrices for 20% density networks: look up the threshold for
# each model, then hand it over together with the output label.
eo_pdc_threshold = ut.find_threshold(eo_pdc, density)
ut.adjacency_matrix(eo_pdc, eo_pdc_threshold, "eo_pdc_20")
ec_pdc_threshold = ut.find_threshold(ec_pdc, density)
ut.adjacency_matrix(ec_pdc, ec_pdc_threshold, "ec_pdc_20")



######## 1.2


# Fit the DTF models for eo and ec.
eo_dtf = ut.fit_model(eo, fs, resolution, "dtf", freq)
ec_dtf = ut.fit_model(ec, fs, resolution, "dtf", freq)

# Adjacency matrices for 20% density networks, same procedure as for PDC.
eo_dtf_threshold = ut.find_threshold(eo_dtf, density)
ut.adjacency_matrix(eo_dtf, eo_dtf_threshold, "eo_dtf_20")
ec_dtf_threshold = ut.find_threshold(ec_dtf, density)
ut.adjacency_matrix(ec_dtf, ec_dtf_threshold, "ec_dtf_20")

Example #9

0

Show file

# Train the model on batches produced by datagen.flow and validate on
# (valid_X, valid_y).
logging.info("Start training")
history = model.fit_generator(
    datagen.flow(train_X, train_y, batch_size=config['train']['batch_size']),
    epochs=config['train']['epochs'],
    callbacks=cbs,
    # NOTE(review): with Python 3 true division this value is a float --
    # confirm the Keras version in use accepts a non-integer step count.
    steps_per_epoch=len(train_X) / config['train']['batch_size'],
    class_weight=class_weights,
    validation_data=(valid_X, valid_y))
logging.info("Finish training")

# Persist the trained weights under model_path.
model.save_weights(os.path.join(model_path, 'weights.h5'))
logging.info("Finish saving model")

# Find patch-based threshold from validation data
valid_probs = model.predict_proba(valid_X)
patch_threshold = find_threshold(valid_probs, valid_y)
# The threshold is reported both on stdout and in the log.
print("Patch threshold: {}".format(patch_threshold))
logging.info("Patch threshold: {}".format(patch_threshold))

# Find patient-based threshold from training and validation data
train_probs = model.predict_proba(train_X)
# Binarise the probabilities with the patch threshold found above.
train_pred = predict_binary(train_probs, patch_threshold)
valid_pred = predict_binary(valid_probs, patch_threshold)
patient_y = []
patient_probs = []
# [pre_idx, cur_idx) is the slice of training rows for the current patient.
pre_idx = 0
cur_idx = 0
for patient in train_idx:
    # patient[2] is added to the running end index -- presumably the number
    # of patches belonging to this patient; confirm against train_idx.
    cur_idx = cur_idx + patient[2]
    pred_y = train_pred[pre_idx:cur_idx]
    true_y = train_y[pre_idx:cur_idx]

Example #10

0

Show file

    # Build the model named in the config.
    # NOTE(review): eval() executes whatever string the config holds; if the
    # config can come from an untrusted source, replace this with an explicit
    # name-to-constructor lookup.
    model = eval(config['model']['name'])(config['dataset']['input_dim'])
    # Last path component of filename, up to its first '.'.
    model_name = filename.split('/')[-1].split('.')[0]
    # Weights come from '<model_name>_trans' when trans == "trans",
    # otherwise from '<model_name>'.
    if trans == "trans":
        model.load_weights(
            os.path.join('../models', model_name + '_trans', 'weights.h5'))
        weights, biases = model.layers[0].get_weights()
    else:
        model.load_weights(os.path.join('../models', model_name, 'weights.h5'))
        weights, biases = model.layers[0].get_weights()

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer=keras.optimizers.Adam(lr=config['optimizer']['lr'],
                                                  amsgrad=True),
                  metrics=['accuracy'])
    # Patch-level threshold computed from the validation predictions.
    valid_probs = model.predict_proba(valid_X)
    patch_threshold = find_threshold(valid_probs, valid_y)

    # Test data
    probs = model.predict_proba(test_X)

    # plot roc
    # NOTE: `threshold` here is the third value returned by roc_curve and is
    # distinct from patch_threshold above.
    fpr, tpr, threshold = metrics.roc_curve(test_y, probs)
    roc_auc = metrics.auc(fpr, tpr)

    # Draw figures
    plt.figure()
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])

Example #11

0

Show file

    # Report and record the training time of the updated baseline.
    print('Updated baseline training complete in {:.4f}s'.format(time_updated_baseline))
    temp_time_updated_based.append(time_updated_baseline)
    # print(clf.predict(utils.normalization(ud_data)))
    # print(ud_label.squeeze())
    # Score the updated model on the normalized ud_data.
    scoreD = utils.test(clf, utils.normalization(ud_data), ud_label.squeeze())
    print("Updated model score: {}".format(scoreD))
    # print("a)", scoreD, scoreA)
    temp_c0u.append(scoreD)

    '''
    e)
    '''
    # print(accs)
    # The updated model's probabilities are scaled by (len(accs) + 1) / 2.
    weight = (len(accs) + 1) / 2
    proba_result_all = clf.predict_proba(utils.normalization(ud_data)) * weight
    threshold = utils.find_threshold(accs)
    # threshold = 0.4
    # threshold = utils.find_threshold(accs) - np.std(accs)
    # print(accs)

    # A large value means the median is much larger than the mean, i.e. most are very good or a few are exceptionally good; in that case be stricter and use the median.
    # A small value means the median is close to the mean, i.e. most are not very good; in that case be more lenient and use the mean.
    # From printing the values, it seems that the median should be used when this difference is above 0.8 and the mean when it is below 0.8.
    # print("249: ", np.median(accs) - np.mean(accs))
    # print("250: ", np.median(accs)/np.mean(accs)) 
    # print(utils.find_threshold(accs))
    # print(np.mean(accs))
    # print(np.std(accs))
    # print(utils.find_threshold(accs) - np.std(accs))
    print("Threshold: ", threshold)
    for i in range(len(clf_sources)):