Code Example #1
File: learner.py Project: marcoal/Influence-Networks
def auc_metrics(self, y, ypreds):
    self.log("Features Used: {}".format(self.featurizer.get_feature_names()))
    auc = roc_auc_score(y, ypreds)
    self.log("\tROC AUC: {}".format(auc))
    prc = average_precision_score(y, ypreds, average="weighted")
    self.log("\tPrecision-Recall AUC: {}".format(prc))
    return auc, prc
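As a quick reference for the two metrics used above, a minimal self-contained sketch with made-up labels and scores:

from sklearn.metrics import average_precision_score, roc_auc_score

y = [0, 0, 0, 0, 0, 0, 1, 1]                        # imbalanced toy labels
ypreds = [0.1, 0.2, 0.1, 0.3, 0.4, 0.2, 0.8, 0.35]  # toy scores

print(roc_auc_score(y, ypreds))                                # ~0.92
print(average_precision_score(y, ypreds, average="weighted"))  # ~0.83, PR AUC is typically lower on imbalanced data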
Code Example #2
File: analyzeModel.py Project: TomDecroos/thesis
import math
from sklearn.metrics import (average_precision_score, brier_score_loss,
                             log_loss, r2_score, roc_auc_score)

def get_scores(y_true, y_pred):
    brier_score = brier_score_loss(y_true, y_pred)
    log_score = log_loss(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    pr_score = average_precision_score(y_true, y_pred)
    r2score = r2_score(y_true, y_pred)
    return math.sqrt(brier_score), log_score, roc_score, pr_score, r2score
Code Example #3
import numpy as np
from sklearn.metrics import roc_auc_score

def compute_roc_auc(data_gt, data_pd, classes, full=True):
    roc_auc = []
    for i in range(classes):
        roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))
    print("Full AUC", roc_auc)
    roc_auc = np.mean(roc_auc)
    return roc_auc
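A usage sketch for the helper above with synthetic multilabel arrays (the shapes and the RNG are illustrative):

import numpy as np

rng = np.random.default_rng(0)
n_samples, n_classes = 100, 5
data_gt = rng.integers(0, 2, size=(n_samples, n_classes))  # one 0/1 label column per class
data_pd = rng.random((n_samples, n_classes))               # one score column per class

print(compute_roc_auc(data_gt, data_pd, n_classes))  # mean per-class AUC, near 0.5 for random scores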
Code Example #4
File: analyze_model.py Project: TomDecroos/thesis
def get_scores(shots):
    y_true = [shot.result for shot in shots]
    y_pred = [shot.pred for shot in shots]
    brier_score = brier_score_loss(y_true, y_pred)
    log_score = log_loss(y_true, y_pred)
    roc_score = roc_auc_score(y_true, y_pred)
    pr_score = average_precision_score(y_true, y_pred)
    r2score = r2_score(y_true, y_pred)
    return math.sqrt(brier_score), log_score, roc_score, pr_score, r2score
Code Example #5
File: arc_val.py Project: panky8070/ConvArc
def arc_val(epoch,
            epoch_fn,
            opt,
            val_loader,
            discriminator,
            logger,
            optimizer=None,
            loss_fn=None,
            fcn=None,
            coAttn=None):

    global best_validation_loss, best_auc, saving_threshold

    # freeze the weights from the ARC and set it to eval.
    if not (discriminator is None):
        for param in discriminator.parameters():
            param.requires_grad = False
        discriminator.eval()
        if opt.cuda:
            discriminator.cuda()

    # freeze the weights from the fcn and set it to eval.
    if opt.apply_wrn:
        for param in fcn.parameters():
            param.requires_grad = False
        fcn.eval()
        if opt.cuda:
            fcn.cuda()

    # freeze the weights of the coAttn module and set it to eval.
    if opt.use_coAttn:
        for param in coAttn.parameters():
            param.requires_grad = False
        coAttn.eval()
        if opt.cuda:
            coAttn.cuda()

    val_epoch = 0
    val_auc_epoch = []
    val_loss_epoch = []
    start_time = datetime.now()
    while val_epoch < opt.val_num_batches:

        val_loader.dataset.set_path_tmp_epoch_iteration(epoch=epoch,
                                                        iteration=val_epoch)

        if opt.apply_wrn:
            val_auc, val_loss = epoch_fn(opt=opt,
                                         loss_fn=loss_fn,
                                         discriminator=discriminator,
                                         data_loader=val_loader,
                                         fcn=fcn,
                                         coAttn=coAttn)
        else:
            val_auc, val_loss = epoch_fn(opt=opt,
                                         loss_fn=loss_fn,
                                         discriminator=discriminator,
                                         data_loader=val_loader,
                                         coAttn=coAttn)

        if isinstance(val_auc, tuple):
            features = [item for sublist in val_auc[0] for item in sublist]
            labels = [item for sublist in val_auc[1] for item in sublist]
            val_auc = ranking.roc_auc_score(labels,
                                            features,
                                            average=None,
                                            sample_weight=None)

        val_auc_epoch.append(val_auc)
        val_loss_epoch.append(val_loss)

        # remove data repetition
        val_loader.dataset.remove_path_tmp_epoch(epoch=epoch,
                                                 iteration=val_epoch)

        val_epoch += 1

    time_elapsed = datetime.now() - start_time
    val_auc_std_epoch = np.std(val_auc_epoch)
    val_auc_epoch = np.mean(val_auc_epoch)
    val_loss_epoch = np.mean(val_loss_epoch)
    print ("====" * 20, "\n", "[" + multiprocessing.current_process().name + "]" + \
                             "epoch: ", epoch, ", validation loss: ", val_loss_epoch \
        , ", validation auc: ", val_auc_epoch, ", validation auc_std: ", val_auc_std_epoch, ", time: ", \
        time_elapsed.seconds, "s:", time_elapsed.microseconds / 1000, "ms\n", "====" * 20)
    logger.log_value('arc_val_loss', val_loss_epoch)
    logger.log_value('arc_val_auc', val_auc_epoch)
    logger.log_value('arc_val_auc_std', val_auc_std_epoch)

    is_model_saved = False
    #if best_validation_loss > (saving_threshold * val_loss_epoch):
    if best_auc < (saving_threshold * val_auc_epoch):
        print(
            "[{}] Significantly improved validation loss from {} --> {}. AUC from {} --> {}. Saving..."
            .format(multiprocessing.current_process().name,
                    best_validation_loss, val_loss_epoch, best_auc,
                    val_auc_epoch))
        # save the fcn model
        if opt.apply_wrn:
            torch.save(fcn.state_dict(), opt.wrn_save)
        # Save the ARC discriminator
        if not (discriminator is None):
            torch.save(discriminator.state_dict(), opt.arc_save)
        # Save the Co-attn model
        if opt.use_coAttn:
            torch.save(coAttn.state_dict(), opt.coattn_save)
        # Save optimizer
        torch.save(optimizer.state_dict(), opt.arc_optimizer_path)
        # Acc-loss values
        best_validation_loss = val_loss_epoch
        best_auc = val_auc_epoch
        is_model_saved = True

    # remove the data from the epoch
    val_loader.dataset.remove_path_tmp_epoch(epoch=epoch)

    return val_auc_epoch, val_auc_std_epoch, val_loss_epoch, is_model_saved
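The tuple branch above collects per-batch score and label lists and flattens them before one global AUC call; `ranking` is the legacy `sklearn.metrics.ranking` module, so the pattern in isolation looks like this (the batch contents are made up):

from sklearn.metrics import roc_auc_score  # modern equivalent of ranking.roc_auc_score

batch_scores = [[0.9, 0.2], [0.7, 0.4, 0.1]]  # hypothetical per-batch model scores
batch_labels = [[1, 0], [1, 0, 0]]            # matching per-batch labels

features = [item for sublist in batch_scores for item in sublist]
labels = [item for sublist in batch_labels for item in sublist]
print(roc_auc_score(labels, features))  # 1.0: every positive outranks every negative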
Code Example #6
    if i == 0:
        test_pred_labels = test_pred
        test_pred_labels = test_pred_labels.reshape(len(test_pred_labels), 1)
    else:
        print(test_pred_labels.shape, 'test_pred_labels')
        test_pred = test_pred.reshape(len(test_pred), 1)
        print(test_pred.shape, 'test_pred')

        test_pred_labels = torch.cat((test_pred_labels, test_pred), 1)

outAUROC = []   
#test_labels = test_labels.detach().cpu().clone().numpy()
#test_pred_labels = test_pred_labels.detach().cpu().clone().numpy()
for i in range(nnClassCount):
    try:
        outAUROC.append(roc_auc_score(test_labels[:, i], test_pred_labels[:, i]))

    except ValueError:
        pass
aurocMean = np.array(outAUROC).mean()  # mean over the classes that could be scored
print(aurocMean, 'aurocMean')
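The try/except above exists because roc_auc_score raises a ValueError whenever a class column contains only one label value in the ground truth. A minimal reproduction of that failure mode:

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.zeros(10)         # only the negative class is present
y_score = np.random.rand(10)
try:
    roc_auc_score(y_true, y_score)
except ValueError as err:
    print(err)  # only one class present in y_true; ROC AUC is not defined

Note that silently skipping such columns means the mean above is taken over fewer than nnClassCount classes.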

def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
Code Example #7
File: arc_test.py Project: panky8070/ConvArc
def arc_test(epoch, epoch_fn, opt, test_loader, discriminator, logger):

    # Reload the last validation ARC model and freeze its weights.
    print("[%s] ... loading last validation model" %
          multiprocessing.current_process().name)
    if not (discriminator is None):
        discriminator.load_state_dict(torch.load(opt.arc_save))
        for param in discriminator.parameters():
            param.requires_grad = False
        discriminator.eval()
        if opt.cuda:
            discriminator.cuda()

    if opt.apply_wrn:
        # Convert the opt params to dict.
        optDict = dict([(key, value) for key, value in opt._get_kwargs()])
        fcn = ConvCNNFactory.createCNN(opt.wrn_name_type, optDict)
        if torch.cuda.is_available():
            fcn.load_state_dict(torch.load(opt.wrn_load))
        else:
            fcn.load_state_dict(
                torch.load(opt.wrn_load, map_location=torch.device('cpu')))
        for param in fcn.parameters():
            param.requires_grad = False
        if opt.cuda:
            fcn.cuda()
        fcn.eval()

    # Load the Co-Attn module
    coAttn = None
    if opt.use_coAttn:
        coAttn = CoAttn(size=opt.coAttn_size,
                        num_filters=opt.arc_nchannels,
                        typeActivation=opt.coAttn_type,
                        p=opt.coAttn_p)
        if torch.cuda.is_available():
            coAttn.load_state_dict(torch.load(opt.coattn_load))
        else:
            coAttn.load_state_dict(
                torch.load(opt.coattn_load, map_location=torch.device('cpu')))
        if opt.cuda:
            coAttn.cuda()

    # TEST of FCN and ARC models
    start_time = datetime.now()
    print('[%s] ... testing' % multiprocessing.current_process().name)
    test_epoch = 0
    test_auc_epoch = []
    while test_epoch < opt.test_num_batches:

        test_loader.dataset.set_path_tmp_epoch_iteration(epoch=epoch,
                                                         iteration=test_epoch)

        if opt.apply_wrn:
            test_auc, test_loss = epoch_fn(opt=opt,
                                           loss_fn=None,
                                           discriminator=discriminator,
                                           data_loader=test_loader,
                                           fcn=fcn,
                                           coAttn=coAttn)
        else:
            test_auc, test_loss = epoch_fn(opt=opt,
                                           loss_fn=None,
                                           discriminator=discriminator,
                                           data_loader=test_loader,
                                           coAttn=coAttn)

        if isinstance(test_auc, tuple):
            features = [item for sublist in test_auc[0] for item in sublist]
            labels = [item for sublist in test_auc[1] for item in sublist]
            test_auc = ranking.roc_auc_score(labels,
                                             features,
                                             average=None,
                                             sample_weight=None)

        test_auc_epoch.append(np.mean(test_auc))

        test_loader.dataset.remove_path_tmp_epoch(epoch=epoch,
                                                  iteration=test_epoch)

        test_epoch += 1

    test_loader.dataset.remove_path_tmp_epoch(epoch=epoch)

    time_elapsed = datetime.now() - start_time
    test_auc_std_epoch = np.std(test_auc_epoch)
    test_auc_epoch = np.mean(test_auc_epoch)
    print ("====" * 20, "\n", "[" + multiprocessing.current_process().name + "]" +\
                             "epoch: ", epoch, ", test ARC auc: ", test_auc_epoch, ", test ARC auc_std: ", test_auc_std_epoch, ", time: ", \
        time_elapsed.seconds, "s:", time_elapsed.microseconds / 1000, "ms\n", "====" * 20)
    logger.log_value('arc_test_auc', test_auc_epoch)
    logger.log_value('arc_test_auc_std', test_auc_std_epoch)
    return test_auc_epoch, test_auc_std_epoch
Code Example #8
# df.to_csv('/deep/group/RareXpert/valid_logistic.csv',index=True,header=True)

probs_df = pd.read_csv(Path('/deep/group/RareXpert/cam_val.csv'))
probs = torch.tensor(probs_df.values)
probs = probs[:, 1:]

print(probs.shape)

clf_gini = DecisionTreeClassifier(criterion="gini",
                                  random_state=None,
                                  max_depth=32,
                                  min_samples_leaf=5)
clf_gini.fit(probs[:200, :], labels[:200])
test_pred = torch.from_numpy(clf_gini.predict(probs[200:, :]))
score = roc_auc_score(labels[200:], test_pred)
print(score)
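One caveat: clf_gini.predict returns hard 0/1 labels, so the AUC above evaluates a single operating point. Scoring with class probabilities usually gives a more informative ranking metric; a sketch under the same (assumed) probs/labels split:

# Hypothetical variant: score with probabilities instead of hard labels.
test_proba = clf_gini.predict_proba(probs[200:, :])[:, 1]
print(roc_auc_score(labels[200:], test_proba))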

seen_set = set([0, 1, 4, 7])

probs_df = probs_df.assign(Unseen=[0] * len(probs_df))

for i in range(len(probs_df)):
    label = torch.FloatTensor(labels_df.iloc[i])
    for j in range(10):
        if label[j] == 1:
            if j not in seen_set:
                probs_df.at[i, 'Unseen'] = 1

cams_df = pd.read_csv('/deep/group/RareXpert/cam_val.csv')
val_df = pd.read_csv('/deep/group/RareXpert/valid_absolute.csv')
Code Example #9
File: baselineTriplets.py Project: panky8070/ConvArc
def do_epoch(epoch, repetitions, opt, data_loader, fcn, logger, optimizer=None):
    
    acc_epoch = []
    loss_epoch = []
    all_probs = []
    all_labels = []
    auc_epoch = []
    n_repetitions = 0
    while n_repetitions < repetitions:
        acc_batch = []
        loss_batch = []

        data_loader.dataset.set_path_tmp_epoch_iteration(epoch,n_repetitions)

        for batch_idx, (data, info) in enumerate(data_loader):
            if opt.cuda:
                data = data.cuda()
            if optimizer:
                inputs = Variable(data, requires_grad=True)
            else:
                inputs = Variable(data, requires_grad=False)

            feats_p = fcn.forward_features(inputs[:, 0, :, :, :]) # positive
            feats_a = fcn.forward_features(inputs[:, 1, :, :, :]) # anchor
            feats_n = fcn.forward_features(inputs[:, 2, :, :, :]) # negative

            # E1, E2, E3 = model(anchor_img, pos_img, neg_img)
            # dist_E1_E2 = F.pairwise_distance(E1, E2, 2)
            # dist_E1_E3 = F.pairwise_distance(E1, E3, 2)

            # target = torch.FloatTensor(dist_E1_E2.size()).fill_(-1)
            # if args.cuda:
            #     target = target.cuda()
            # target = Variable(target)
            
            # #Calculate loss
            # loss_triplet = criterion(dist_E1_E2, dist_E1_E3, target)
            # loss_embedd = E1.norm(2) + E2.norm(2) + E3.norm(2)
            # loss = loss_triplet + 0.001*loss_embedd
            # total_loss += loss

            feats_p = feats_p / (feats_p.norm(p=2, dim=1, keepdim=True) + 1e-12).expand_as(feats_p)
            feats_a = feats_a / (feats_a.norm(p=2, dim=1, keepdim=True) + 1e-12).expand_as(feats_a)
            feats_n = feats_n / (feats_n.norm(p=2, dim=1, keepdim=True) + 1e-12).expand_as(feats_n)

            # Do the classification for the positive and the negative
            logsoft_feats_p = torch.nn.LogSoftmax(dim=1)(fcn.forward_classifier(feats_p))
            logsoft_feats_n = torch.nn.LogSoftmax(dim=1)(fcn.forward_classifier(feats_n))

            #dists_p = torch.sqrt(torch.sum((feats_p - feats_a) ** 2, 1))  # euclidean distance
            #dists_n = torch.sqrt(torch.sum((feats_n - feats_a) ** 2, 1))  # euclidean distance
            dists_p = F.pairwise_distance(feats_a, feats_p, 2) # PairwiseDistance
            dists_n = F.pairwise_distance(feats_a, feats_n, 2) # PairwiseDistance

            # MarginRankingLoss convention: y = -1 asks the second input (dists_n)
            # to be ranked higher, i.e. dist_n should be larger than dist_p.
            targets = torch.FloatTensor(len(dists_p)).fill_(-1)
            targets = Variable(targets)
            if opt.cuda:
                targets = targets.cuda()

            # LOSS 1 - Contrastive loss
            margin = 0.2
            loss_fn_1 = torch.nn.MarginRankingLoss(margin=margin)
            if opt.cuda:
                loss_fn_1 = loss_fn_1.cuda()
            loss1 = loss_fn_1(dists_p, dists_n, targets)

            loss_fn_2 = torch.nn.NLLLoss()
            if opt.cuda:
                loss_fn_2 = loss_fn_2.cuda()
            # LOSS 2 - Involve the classifier
            targets_p = torch.FloatTensor(len(logsoft_feats_p)).fill_(1)
            targets_n = torch.FloatTensor(len(logsoft_feats_n)).fill_(0)
            targets = torch.stack((targets_p,targets_n)).view(-1)
            targets = Variable(targets)
            if opt.cuda:
                targets = targets.cuda()
            logsoft_feats = torch.stack((logsoft_feats_p,logsoft_feats_n)).view(-1,2)
            loss2 = loss_fn_2(logsoft_feats, targets.long())
            
            # Total loss
            loss = loss1 + loss2

            loss_batch.append(loss.item())
            
            if optimizer:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            #prediction = (dists_n - dists_p - margin).cpu().data
            #prediction = prediction.view(prediction.numel())
            #prediction = (prediction > 0).float()
            #batch_acc = prediction.sum()*1.0/prediction.numel()

            # calculate distances. Similar samples should be close, dissimilar should be large.
            probs = torch.exp(logsoft_feats)
            max_index = probs.max(dim = 1)[1]
            acc = (max_index == targets.long()).sum().float()/len(targets)
            #acc_batch.append(acc.item())

            all_probs.append(probs.cpu().data.numpy()[:,1])
            all_labels.append(targets.long().data.cpu().numpy())

        auc = ranking.roc_auc_score([item for sublist in all_labels for item in sublist],
                                      [item for sublist in all_probs for item in sublist], average=None, sample_weight=None)
        auc_epoch.append(auc)
        #acc_epoch.append(np.mean(acc_batch))
        loss_epoch.append(np.mean(loss_batch))
        # remove data repetition
        data_loader.dataset.remove_path_tmp_epoch(epoch,n_repetitions)
        # next repetition
        n_repetitions += 1
    
    # remove data epoch
    data_loader.dataset.remove_path_tmp_epoch(epoch)

    auc_std_epoch = np.std(auc_epoch)
    auc_epoch = np.mean(auc_epoch)
    #acc_epoch = np.mean(acc_epoch)
    loss_epoch = np.mean(loss_epoch)

    #return acc_epoch, loss_epoch
    return auc_epoch, auc_std_epoch, loss_epoch
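The sign convention of MarginRankingLoss trips people up: with target y = 1 the loss pushes its first argument to be ranked higher (larger), so making anchor-positive distances smaller than anchor-negative ones needs y = -1 (the commented-out reference code above uses fill_(-1) for the same reason). A minimal check with made-up distances:

import torch

loss_fn = torch.nn.MarginRankingLoss(margin=0.2)
dists_p = torch.tensor([0.1])  # anchor-positive distance (want small)
dists_n = torch.tensor([0.9])  # anchor-negative distance (want large)

print(loss_fn(dists_p, dists_n, torch.tensor([-1.0])))  # tensor(0.): desired ordering, no loss
print(loss_fn(dists_p, dists_n, torch.tensor([1.0])))   # tensor(1.): wrong target sign is penalized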
Code Example #10
def gbdt_lr_train(libsvmFileName):

    # Load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)

    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)

    # Fit the model
    gbdt.fit(X_train, y_train)

    # Predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)  # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # Encode the original features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # Predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the raw and encoded features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
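The function above assumes load_svmlight_file, train_test_split, GradientBoostingClassifier, LogisticRegression, OneHotEncoder, numpy as np, and scipy.sparse.hstack are already imported. A hedged driver that fabricates a small libsvm file just to exercise it (the file name and sizes are arbitrary):

from sklearn.datasets import make_classification, dump_svmlight_file

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
dump_svmlight_file(X, y, 'toy.libsvm')  # write the toy data in libsvm format
gbdt_lr_train('toy.libsvm')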
Code Example #11
    shotprobs = list()
    is_shot = list()  # assumed: collected alongside shotprobs; missing from the original excerpt
    for matchid in matchids:
        predictionsfile = '../../data/results/' + str(matchid) + postfix
        mp = np.loadtxt(predictionsfile)
        if dom:
            shotprobs.append(mp[:,3])
            is_shot.append([1 if x == 1 else 0 for x in mp[:,5]])
        else:
            shotprobs.append(mp[:,4])
            is_shot.append([1 if x == -1 else 0 for x in mp[:,5]])
        
    return shotprobs,is_shot
    #all_shotprobs = [pred for pred in s for s in shotprob]

dom = False
preds,ys = load_all_matches("_dtw",dom)
preds_n = load_all_matches("_naive",dom)[0]

scores = [roc_auc_score(y,p) for p,y in zip(preds,ys)]
#plt.hist(scores)

allpred = list([x for z in preds for x in z])
allpred_n = list([x for z in preds_n for x in z])
ally = list([x for z in ys for x in z])
print(len(ally)/69)
print(sum(ally)/69)
#print(set(ally))
plot_roc_curves(ally, [allpred, allpred_n], ["Our model with DTW", "Naive baseline"])
#plot_a_fuckload_of_roc_curves(ys, [preds, preds_n], ["Our model with DTW", "Naive baseline"])
#plot_rocauc_hist(ys, [preds, preds_n], ["Our model with DTW", "Naive baseline"])
print(kstest_roc_auc(ys, [preds, preds_n]))
Code Example #12
File: auc.py Project: ISSE-AILab/KGZNet
CLASS_NAMES = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
    'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
    'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia'
]
style = [
    'r-', 'g-', 'b-', 'y-', 'k-', 'c-', 'm-', 'r--', 'g--', 'b--', 'y--',
    'k--', 'c--', 'm--'
]
print(pred1.shape)
print(gt.shape)
print(float(gt[38, 0]))
average_roc = 0.0
plt.figure(figsize=(12, 5))
for i in range(14):
    roc_value = roc_auc_score(gt[:, i], pred1[:, i], sample_weight=None)
    print(CLASS_NAMES[i], ':', roc_value)
    average_roc += roc_value
    fpr, tpr, thresholds = roc_curve(gt[:, i], pred1[:, i])
    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, style[i], label=CLASS_NAMES[i])
print('average_roc: ', average_roc / 14)
plt.title('KGZNet-(DenseNet-121)')
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.legend()
plt.show()
# plt.savefig("1.pdf")
for i in range(14):
    roc_value = roc_auc_score(gt[:, i], pred2[:, i], sample_weight=None)
    print(CLASS_NAMES[i], ':', roc_value)
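Accumulating average_roc / 14 by hand is the unweighted macro average; assuming every column of gt contains both classes, sklearn computes the same number in one call:

from sklearn.metrics import roc_auc_score

macro_auc = roc_auc_score(gt, pred1, average='macro')  # should match average_roc / 14
print('average_roc:', macro_auc)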
Code Example #13
def gbdt_lr_train(libsvmFileName):

    # Load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # X_all_dense = X_all.todense()
    print(type(X_all))
    # print(type(X_all_dense[0]))
    # print(y_all)
    # print("===")

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    # print(X_train)
    # print(y_train)

    # Define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)

    # Fit the model
    gbdt.fit(X_train, y_train)

    # Predict and evaluate AUC
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)
    # print(y_pred_gbdt)
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)  # gbdt auc: 0.96455

    # Train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)  # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)  # 0.93455

    # Encode the original features as GBDT leaf indices
    # X_train_leaves = gbdt.apply(X_train)
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22,:])  # print 22 rows, all columns
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22,:])

    # Define the LR model
    lr = LogisticRegression()
    # Train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # Predict and evaluate AUC
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # Define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # Combine the raw and encoded features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("Number of combined features:", X_train_ext.shape)
    # Train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # Predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
Code Example #14
def arc_train(epoch,
              epoch_fn,
              opt,
              train_loader,
              discriminator,
              logger,
              optimizer=None,
              loss_fn=None,
              fcn=None,
              coAttn=None):

    start_time = datetime.now()

    # enable gradients and put the ARC discriminator in training mode.
    if not (discriminator is None):
        for param in discriminator.parameters():
            param.requires_grad = True
        discriminator.train(mode=True)
        if opt.cuda:
            discriminator.cuda()

    # enable gradients and put the fcn in training mode.
    if opt.apply_wrn:
        for param in fcn.parameters():
            param.requires_grad = True
        fcn.train()
        if opt.cuda:
            fcn.cuda()

    # enable gradients for the co-attention module
    if opt.use_coAttn:
        for param in coAttn.parameters():
            param.requires_grad = True
        coAttn.train()
        if opt.cuda:
            coAttn.cuda()

    train_loader.dataset.set_path_tmp_epoch_iteration(epoch=epoch, iteration=0)

    if opt.apply_wrn:
        train_auc_epoch, train_loss_epoch = epoch_fn(
            opt=opt,
            loss_fn=loss_fn,
            discriminator=discriminator,
            data_loader=train_loader,
            optimizer=optimizer,
            fcn=fcn,
            coAttn=coAttn)
    else:
        train_auc_epoch, train_loss_epoch = epoch_fn(
            opt=opt,
            loss_fn=loss_fn,
            discriminator=discriminator,
            data_loader=train_loader,
            optimizer=optimizer,
            coAttn=coAttn)

    if isinstance(train_auc_epoch, tuple):
        features = [item for sublist in train_auc_epoch[0] for item in sublist]
        labels = [item for sublist in train_auc_epoch[1] for item in sublist]
        train_auc_epoch = ranking.roc_auc_score(labels,
                                                features,
                                                average=None,
                                                sample_weight=None)

    time_elapsed = datetime.now() - start_time
    train_auc_std_epoch = 1.0
    train_loss_epoch = np.mean(train_loss_epoch)
    print(
        "[%s] epoch: %d, train loss: %f, train auc: %.2f, time: %02ds:%02dms" %
        (multiprocessing.current_process().name, epoch,
         np.round(train_loss_epoch, 6), np.round(train_auc_epoch, 6),
         time_elapsed.seconds, time_elapsed.microseconds / 1000))
    logger.log_value('arc_train_loss', train_loss_epoch)
    logger.log_value('arc_train_auc', train_auc_epoch)

    assert not np.isnan(train_loss_epoch), 'ERROR. Found NaN in train_ARC.'

    # Remove data from the epoch
    train_loader.dataset.remove_path_tmp_epoch(epoch=epoch, iteration=0)
    train_loader.dataset.remove_path_tmp_epoch(epoch=epoch)

    # Reduce learning rate when a metric has stopped improving
    logger.log_value('train_lr', [
        param_group['lr'] for param_group in optimizer.param_groups
    ][0])
    return train_auc_epoch, train_auc_std_epoch, train_loss_epoch
Code Example #15
        lg.info("Validation..." + str(countdown))
        f_ev = model.test_on_batch([x_test_batch_r, x_test_batch_c], y_test_batch)

        lg.info(str(f_ev))

        test_loss += f_ev[0]
        test_loss_avg = test_loss / test_step

        test_acc += f_ev[1]
        test_acc_avg = test_acc / test_step
        test_step += 1

        try:
            lg.info("Prediction...")
            pred = model.predict([x_test_batch_r, x_test_batch_c])
            roc_auc = roc_auc_score(y_test_batch, pred[:, 0])
            lg.info(str(pred.shape) + " ROC AUC " + str(roc_auc) + " Accuracy " + str(test_acc_avg))
            roc_auc_acc += roc_auc
            roc_auc_avg = roc_auc_acc / test_step
            lg.info(str(pred.shape) + " ROC AUC avg " + str(roc_auc_avg))

        except Exception as e:
            lg.exception("Can't predict")
            lg.error("Can't predict, " + str(e))
        countdown -= 1

        if countdown == 0:
            break

    stop = datetime.datetime.now()
    e_elap = stop - start
Code Example #16
File: p6.py Project: amazor/testhub
yHat = np.empty((0,96))  # assumed: collects y_test rows like tHat below; missing from the original excerpt
tHat = np.empty((0,96))
tPCAHat = np.empty((0,96))

kf = KFold(n_splits=numSplits, shuffle = True, random_state = 69)
for train_index, test_index in kf.split(x):
    x_train, x_test = x[train_index], x[test_index]
    xPCA_train, xPCA_test = xPCA[train_index], xPCA[test_index]
    y_train, y_test = y[train_index], y[test_index]
    T = compositeClassifier.fit(x_train, y_train).decision_function(x_test)
    TPCA = compositeClassifier.fit(xPCA_train, y_train).decision_function(xPCA_test)
    yHat = np.append(yHat, y_test, axis=0)
    tHat = np.append(tHat, T, axis=0)
    tPCAHat = np.append(tPCAHat, TPCA, axis=0)

fpr,tpr, _ = roc_curve(yHat.ravel(), tHat.ravel())
roc_auc = roc_auc_score(yHat.ravel(), tHat.ravel())

precision, recall, _ = precision_recall_curve(yHat.ravel(), tHat.ravel())
pr_auc = average_precision_score(yHat, tHat, average="micro")

# for problem 6
fprPCA,tprPCA, _ = roc_curve(yHat.ravel(), tPCAHat.ravel())
roc_aucPCA = roc_auc_score(yHat.ravel(), tPCAHat.ravel())

precisionPCA, recallPCA, _ = precision_recall_curve(yHat.ravel(), tPCAHat.ravel())
pr_aucPCA = average_precision_score(yHat, tPCAHat, average="micro")
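Pooling out-of-fold decision scores and scoring them once, as above, is what sklearn's cross_val_predict automates; a compact sketch of the equivalent (x, y, numSplits and compositeClassifier are the names assumed from the excerpt):

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import roc_auc_score

kf = KFold(n_splits=numSplits, shuffle=True, random_state=69)
scores = cross_val_predict(compositeClassifier, x, y, cv=kf, method='decision_function')
print(roc_auc_score(y.ravel(), scores.ravel()))

One caveat of this pattern: pooling decision scores across folds implicitly assumes the score scales are comparable between the fold-specific models.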




Code Example #17
 ]
 number_of_zeros = len(
     [value for value in real_values_binary if value == 0])
 number_of_ones = len(
     [value for value in real_values_binary if value == 1])
 majority_label = 0 if number_of_zeros > number_of_ones else 1
 majority_baseline = [
     majority_label for i in range(len(real_values))
 ]
 acc = accuracy_score(real_values_binary, pred_values_binary)
 metrics = precision_recall_fscore_support(real_values_binary,
                                           pred_values_binary)
 f1 = f1_score(real_values_binary,
               pred_values_binary,
               average="weighted")
 roc = roc_auc_score(real_values_binary, predicted_values)
 acc_maj = accuracy_score(real_values_binary, majority_baseline)
 metrics_maj = precision_recall_fscore_support(
     real_values_binary, majority_baseline)
 f1_maj = f1_score(real_values_binary,
                   majority_baseline,
                   average="weighted")
 roc_maj = roc_auc_score(real_values_binary, majority_baseline)
 b, c = compute_correct_predictions(majority_baseline,
                                    pred_values_binary,
                                    real_values_binary)
 p_value = mcnemar_midp(b, c)
 report_file.write(
     str(acc) + ";" + str(metrics[0][0]) + ";" +
     str(metrics[0][1]) + ";" + str(metrics[1][0]) + ";" +
     str(metrics[1][1]) + ";" + str(metrics[2][0]) + ";" +
Code Example #18
def perform(X_train_src, X_test_src, y_train, y_test):
    tmp_folder_name = './tmp'
    if not os.path.exists(tmp_folder_name):
        os.makedirs(tmp_folder_name)

    in_file = open('stat.pkl', 'rb')
    stat_dict = cPickle.load(in_file)
    in_file.close()

    X_train_gbdt = feat_eng(X_train_src, stat_dict)
    X_test_gbdt = feat_eng(X_test_src, stat_dict)

    X_train_gbdt_sp = sparse.csr_matrix(X_train_gbdt)
    X_test_gbdt_sp = sparse.csr_matrix(X_test_gbdt)

    model_gbdt = GradientBoostingClassifier(n_estimators=30,
                                            learning_rate=0.05,
                                            max_depth=7,
                                            verbose=0,
                                            max_features=0.6)

    model_gbdt.fit(X_train_gbdt_sp, y_train)

    gbdt_y_pred = model_gbdt.predict(X_test_gbdt)
    gbdt_y_predprob = model_gbdt.predict_proba(X_test_gbdt)[:, 1]

    gbdt_acc = accuracy_score(y_test, gbdt_y_pred)
    gbdt_auc = roc_auc_score(y_test, gbdt_y_predprob)
    gbdt_loss = log_loss(y_test, gbdt_y_predprob)

    print('gbdt accuracy : %.3g' % gbdt_acc)
    print('gbdt auc: %.3f' % gbdt_auc)
    print('gbdt loss: %.3f' % gbdt_loss)

    gbdt_train_code = model_gbdt.apply(X_train_gbdt_sp)[:, :, 0]
    gbdt_test_code = model_gbdt.apply(X_test_gbdt_sp)[:, :, 0]

    X_train_src = X_train_src.astype(str).tolist()
    X_test_src = X_test_src.astype(str).tolist()

    ffmfeatures_train = gen_ffm_feature(X_train_src, gbdt_train_code,
                                        stat_dict)
    ffmfeatures_test = gen_ffm_feature(X_test_src, gbdt_test_code, stat_dict)

    dump_ffm_features(tmp_folder_name + '/tmp_tr.ffm', ffmfeatures_train,
                      y_train)
    dump_ffm_features(tmp_folder_name + '/tmp_te.ffm', ffmfeatures_test,
                      y_test)

    ffm_model = xl.create_ffm()
    ffm_model.setTrain(tmp_folder_name + "/tmp_tr.ffm")
    ffm_model.setValidate(tmp_folder_name + "/tmp_te.ffm")

    param = {
        'task': 'binary',
        'lr': 0.2,
        'lambda': 0.002,
        'opt': 'sgd',
        'epoch': 10
    }

    ffm_model.fit(param, tmp_folder_name + "/ffm_model.out")

    # ffm_model.cv(param)

    ffm_model.setTest(tmp_folder_name + "/tmp_te.ffm")
    ffm_model.setSigmoid()
    y_test_pred = ffm_model.predict(tmp_folder_name + "/ffm_model.out",
                                    tmp_folder_name + "/ffm_output.txt")

    y_test_pred = read_pred(tmp_folder_name + "/ffm_output.txt")

    # test_loss = log_loss(y_test, y_pred_test)
    # print('shw test loss: ', test_loss)

    clean_tmp_files(tmp_folder_name)

    tmp = y_test_pred.reshape(y_test_pred.shape[0], 1)
    y_test_pred = np.hstack((1 - tmp, tmp))

    return y_test_pred
Code Example #19
                classifier_row['classifier'], classifier_row['fold'])
            classifier = joblib.load(open(classifier_fname, 'rb'))
            try:
                predicted = classifier.predict(data_test)
                metrics = dict()

                metrics['fname'] = classifier_row['fname']
                metrics['classifier'] = classifier_row['classifier']
                metrics['accuracy'] = accuracy_score(classes_test, predicted)
                tn, fp, fn, tp = confusion_matrix(
                    classes_test, predicted).ravel()
                metrics['tp_rate'] = tp / (tp + fn)
                metrics['kappa'] = cohen_kappa_score(classes_test, predicted)
                metrics['auc'] = roc_auc_score(classes_test,
                                               predicted,
                                               average='weighted')
                metrics['fscore'] = f1_score(classes_test,
                                             predicted,
                                             average='weighted')
                metrics['macro_f'] = f1_score(classes_test,
                                              predicted,
                                              average='macro')

                # metrics['fscore'] = f1_score(classes_test, predicted, average='weighted')
                # metrics['precision'] = precision_score(classes_test, predicted, average='weighted')
                # metrics['recall'] = recall_score(classes_test, predicted, average='weighted')
                # metrics['auc'] = roc_auc_score(classes_test, predicted, average='weighted')
                #
                # metrics['micro_f'] = f1_score(classes_test, predicted, average='micro')
                # metrics['micro_p'] = precision_score(classes_test, predicted, average='micro')
Code Example #20
    total_positive = sum(labels)
    total_negative = len(labels) - total_positive

    acc = 0
    neg_remain = total_negative
    i = 0
    while i < len(labels):
        if out[i][0] == 1:
            acc += neg_remain
        else:
            neg_remain -= 1

        i += 1

    return float(acc) / float(total_positive * total_negative)


if __name__ == '__main__':
    import numpy as np
    # note: `ranking` is the old private sklearn module;
    # `from sklearn.metrics import roc_auc_score` is the modern equivalent.
    from sklearn.metrics import ranking
    total = 50000
    p = np.random.rand(total)
    l = np.random.randint(0, 2, total)
    print(ranking.roc_auc_score(l, p))

    sub_p = np.split(p, 5)
    sub_l = np.split(l, 5)
    sub_auc = [ranking.roc_auc_score(sub_l[x], sub_p[x]) for x in range(5)]
    print(sum(sub_auc) / 5.0, sub_auc)
#    print(ranking.roc_auc_score(l, p))
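The counting loop in the function above is the Mann-Whitney U statistic divided by n_pos * n_neg, which is exactly ROC AUC (assuming out is sorted by descending score, with out[i][0] the label). An equivalent vectorized form via rank sums, which also handles ties through average ranks:

import numpy as np
from scipy.stats import rankdata

def auc_from_ranks(labels, scores):
    labels = np.asarray(labels)
    ranks = rankdata(scores)  # ascending ranks, ties get the average rank
    n_pos = labels.sum()
    n_neg = len(labels) - n_pos
    # rank-sum of the positives, shifted by its minimum, normalized by pair count
    return (ranks[labels == 1].sum() - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)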
Code Example #21
File: show_results.py Project: panky8070/ConvArc
def show_results(y_test, prob_test, name, show=True, output_folder='', maxFNR=0.03, thresh = None):
    auc = ranking.roc_auc_score(y_test, prob_test, average=None, sample_weight=None)

    fpr, tpr, thresholds = ranking.roc_curve(y_test, prob_test, pos_label=1, sample_weight=None)
    fnr = 1 - tpr

    eer = min(zip(fpr, fnr, thresholds), key=lambda x: abs(x[0] - x[1]))

    idx_fnr = np.where(fnr<maxFNR)[0][0]
    if thresh == None:
        target_fnr = thresholds[idx_fnr]
    else:
        target_fnr = thresh
    y_pred = [float(score>=target_fnr) for score in prob_test]

    #fig = plt.figure()

    # show ROC
    if show:
        plt.figure(221)
        plt.plot(fpr, tpr, linewidth=2)
        plt.ylim(0, 1)
        plt.xlim(0, 1)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.title(name + ' - ROC curve, AUC = %f' % (auc))

        # show FPR-FNR vs threshold curves
        plt.figure(222)
        fnr_line, = plt.plot(thresholds, fnr * 100, linewidth=2, color='blue')
        fpr_line, = plt.plot(thresholds, fpr * 100, linewidth=2, color='red', linestyle='--')
        plt.legend([fnr_line, fpr_line], ['False Negative Rate (FNR)', 'False Positive Rate (FPR)'])
        plt.ylim(0, 100.001)
        plt.xlim(np.min(prob_test), np.max(prob_test))
        plt.title(name + ' - EER = %0.1f%% at t=%0.2f' % (100 * (eer[0] + eer[1]) / 2, eer[2]))
        plt.show()

    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    print ('AUC = %.2f' % (auc))
    print ('Confusion matrix (absolute frequency) at threshold = %.2f' % (target_fnr))
    print ('+---------------+------------+------------+')
    print ('|               |          TRUTH          |')
    print ('+---------------+------------+------------+')
    print ('|   PREDICTED   |  LEGIT(1)  |  FAKE (0)  |')
    print ('+---------------+------------+------------+')
    print ('|    LEGIT (1)  |%12d|%12d|' % (tp, fp))
    print ('+---------------+------------+------------+')
    print ('|     FAKE (0)  |%12d|%12d|' % (fn, tn))
    print ('+---------------+------------+------------+')

    print ('Confusion matrix (relative to |LEGIT| and |FAKE|) at threshold = %.2f' % (target_fnr))
    print ('+---------------+------------+------------+')
    print ('|               |          TRUTH          |')
    print ('+---------------+------------+------------+')
    print ('|   PREDICTED   |  LEGIT(1)  |  FAKE (0)  |')
    print ('+---------------+------------+------------+')
    print ('|    LEGIT (1)  |%11.1f%%|%11.1f%%|' % (tp*100.0/(tp+fn), fp*100.0/(fp+tn)))
    print ('+---------------+------------+------------+')
    print ('|     FAKE (0)  |%11.1f%%|%11.1f%%|' % (fn*100.0/(tp+fn), tn*100.0/(fp+tn)))
    print ('+---------------+------------+------------+')

    return y_pred, target_fnr
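A hedged usage example for show_results with synthetic, well-separated scores (all names below are placeholders):

import numpy as np

rng = np.random.RandomState(0)
y_test = np.concatenate([np.ones(500), np.zeros(500)])
prob_test = np.clip(np.concatenate([rng.normal(0.7, 0.15, 500),
                                    rng.normal(0.3, 0.15, 500)]), 0, 1)

y_pred, threshold = show_results(y_test, prob_test, 'demo', show=False)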
Code Example #22
def compute_auroc(true, score):
    true = true.cpu().numpy()
    score = score.cpu().numpy()
    return roc_auc_score(true, score)
Code Example #23
def h2o_auc_score(y_actual, y_predict, average="macro", sample_weight=None, y_type=None):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule.
    This implementation is restricted to the binary classification task
    or multilabel classification task in label indicator format.

    NOTE: using H2OFrames, this would require moving the predict vector locally for
    each task in the average binary score task. It's more efficient simply to bring both
    vectors local, and then use the sklearn score. That's what we'll do for now.

    Parameters
    ----------
    y_actual : ``H2OFrame``, shape=(n_samples,)
        The one-dimensional ground truth

    y_predict : ``H2OFrame``, shape=(n_samples,)
        The one-dimensional predicted labels

    average : string, optional (default='macro')
        One of [None, 'micro', 'macro' (default), 'samples', 'weighted'].
        If ``None``, the scores for each class are returned. Otherwise,
        this determines the type of averaging performed on the data:

        ``'micro'``:
            Calculate metrics globally by considering each element of the label
            indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

    sample_weight : H2OFrame or float, optional (default=None)
        A frame of sample weights of matching dims with
        y_actual and y_predict.

    y_type : string, optional (default=None)
        The type of the column. If None, will be determined.


    Returns
    -------
    auc : float
    """
    # SKIP THESE FOR NOW, SINCE VALIDATED IN SKLEARN PORTION
    # y_type, y_actual, y_predict = _check_targets(y_actual, y_predict, y_type)
    # _err_for_continuous(y_type)  # this is restricted to classification tasks

    if sample_weight is not None:
        if isinstance(sample_weight, H2OFrame):
            _, _, sample_weight = _check_targets(y_actual, sample_weight, 'unknown')  #  we don't care about y_type here
            sample_weight = h2o_col_to_numpy(sample_weight)
        # else we just duck type it later

    # todo: do this better someday
    y_actual = h2o_col_to_numpy(y_actual)
    y_predict = h2o_col_to_numpy(y_predict)

    return roc_auc_score(y_actual, y_predict, average=average, sample_weight=sample_weight)
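A usage sketch, assuming a running H2O cluster and that the helpers this function relies on are importable; the frames below are illustrative:

import h2o
import pandas as pd

h2o.init()
y_actual = h2o.H2OFrame(pd.DataFrame({'y': [0, 1, 1, 0, 1]}))
y_predict = h2o.H2OFrame(pd.DataFrame({'p': [0.2, 0.8, 0.6, 0.4, 0.9]}))
print(h2o_auc_score(y_actual, y_predict))  # 1.0 for this perfectly separable toy data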