コード例 #1
0
ファイル: decision_tree.py プロジェクト: softlang/wikionto
def classifier_score(id_to_a_test, classifier, selector, X_test, y_test):
    """Evaluate `classifier` on the selected test features and report metrics.

    Parameters
    ----------
    id_to_a_test : sequence mapping row index -> article identifier; used to
        print which positives were missed (false negatives).
    classifier : fitted sklearn-style estimator with predict()/score().
    selector : fitted feature selector with transform().
    X_test, y_test : test features and string labels ('1' positive, '0' negative).

    Returns
    -------
    dict with confusion counts (TP/TN/FP/FN) and standard metrics.
    """
    X_test = selector.transform(X_test)
    y_test_predicted = classifier.predict(X_test)

    tp = tn = fp = fn = 0
    for x in range(len(y_test)):
        # The four outcomes are mutually exclusive, so use elif instead of
        # re-testing every condition per sample.
        if y_test[x] == '1' and y_test_predicted[x] == '1':
            tp += 1
        elif y_test[x] == '1' and y_test_predicted[x] == '0':
            fn += 1
            print("FN: " + id_to_a_test[x])
        elif y_test[x] == '0' and y_test_predicted[x] == '0':
            tn += 1
        elif y_test[x] == '0' and y_test_predicted[x] == '1':
            fp += 1
            # print("FP: " + id_to_a_test[x])

    # Guard against ZeroDivisionError when one class is absent from y_test.
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (fp + tn) if (fp + tn) else 0.0
    print("Recall: " + str(recall))
    print("Specificity: " + str(specificity))

    return {"TP": tp, "TN": tn, "FP": fp, "FN": fn,
            "Balanced_Accuracy": balanced_accuracy_score(y_test, y_test_predicted),
            "F_Measure": f1_score(y_test, y_test_predicted, pos_label='1'),
            "Recall": recall_score(y_test, y_test_predicted, pos_label='1'),
            "Negative-Recall": specificity,
            "Precision": precision_score(y_test, y_test_predicted, pos_label='1'),
            "Accuracy": classifier.score(X_test, y_test)}
コード例 #2
0
 def build_models_and_plot(self, X, y, model_dict, path, isplot, model_type, scoring_metric):
     '''Cross-validate each model, plot ROC curves, save models and collect scores.

         inputs:
             X: training data features, numpy array or Pandas dataframe
             y: training data labels, numpy array or Pandas dataframe
             model_dict: a dictionary of the form {name : model()}, where 'name' is a string
                         and 'model()' is a sci-kit-learn model object.
             path: directory used by the plotting/saving helpers
             isplot: unused here; kept for interface compatibility
             model_type: tag appended to each saved model's name
             scoring_metric: "balanced_accuracy", or anything else for macro F1
         returns:
             (mean_aucs, mean_f1): dicts keyed by model name with the best
             per-fold AUC and the mean cross-validated score respectively.
     '''
     n_folds = 5
     random_state = np.random.RandomState(0)  # NOTE(review): unused — kept for compatibility
     mean_aucs = {}
     mean_f1 = {}
     # Run classifier with cross-validation and plot ROC curves
     for (name, model) in model_dict.items():
         mean_aucs_list = []
         tprs = []
         aucs = []
         print("Model: ", name)
         cv = StratifiedKFold(n_splits=n_folds)
         i = 0
         scores = model_selection.cross_val_score(model, X, y, cv=n_folds, n_jobs=-1, scoring=scoring_metric)
         print("Scores: ", scores)
         plt.figure(figsize=(6, 4), dpi=150, facecolor='w', edgecolor='k')
         for train, test in cv.split(X, y):
             # Fit once per fold. The original called model.fit() twice (once
             # for predict_proba, once for predict), doubling the work and
             # risking the two fits disagreeing for stochastic estimators.
             fitted = model.fit(X[train], y[train])
             probas_ = fitted.predict_proba(X[test])
             y_pred = fitted.predict(X[test])
             if scoring_metric == "balanced_accuracy":
                 sm = balanced_accuracy_score(y[test], y_pred)
             else:
                 sm = f1_score(y[test], y_pred, average='macro')
             mean_aucs_list.append(self.auc_plotting(train, test, i, probas_, n_folds,
                                                    name, X, y, path, model_type, tprs, aucs))
             i += 1
         self.save_model(name + "_" + model_type, path, model)
         mean_aucs[name] = [np.max(mean_aucs_list)]
         mean_f1[name] = [np.mean(scores)]
     return mean_aucs, mean_f1
コード例 #3
0
     multi_true = regression_label_list[0].astype(np.int32)
     multi_true = np.squeeze(multi_true)
 elif multi_predictions_list != []:
     multi_pred = np.array(multi_predictions_list)
     for i in range(multi_pred.shape[1]):
         multi_pred[0, i] = np.argmax(np.bincount(multi_pred[:, i]))
     multi_pred = multi_pred[0]
     multi_pred = np.squeeze(multi_pred)
     multi_true = multi_label_list[0].astype(np.int32)
     multi_true = np.squeeze(multi_true)
 # change the standard 5-class predictions into the standard binary predictions
 binary_pred = (multi_pred >= 2).astype(np.int32)
 binary_true = (multi_true >= 2).astype(np.int32)
 # calculate the accuracy, balanced accuracy score, confusion matrix, precision, recall and F1 score of the binary classification
 binary_accuracy = metrics.accuracy_score(binary_true, binary_pred)
 binary_balanced_accuracy = metrics.balanced_accuracy_score(binary_true, binary_pred)
 binary_confusion_matrix = metrics.confusion_matrix(binary_true, binary_pred)
 precision = metrics.precision_score(binary_true, binary_pred)
 recall = metrics.recall_score(binary_true, binary_pred)
 f1_score = metrics.f1_score(binary_true, binary_pred)
 # output the result of the binary classification
 print('Binary-accuracy:\n{}'.format(binary_accuracy))
 print('Balanced Binary-accuracy:\n{}'.format(binary_balanced_accuracy))
 print('Binary-confusion matrix:\n{}'.format(binary_confusion_matrix))
 print('Precision:\n{}'.format(precision))
 print('Recall:\n{}'.format(recall))
 print('F1 score:\n{}'.format(f1_score))
 # calculate the accuracy, balanced accuracy score and confusion matrix of the 5-class classification
 multi_accuracy = metrics.accuracy_score(multi_true, multi_pred)
 multi_balanced_accuracy = metrics.balanced_accuracy_score(multi_true, multi_pred)
 multi_confusion_matrix = metrics.confusion_matrix(multi_true, multi_pred)
コード例 #4
0
#### svm

# Best: 0.724737 using {'probability': True, 'kernel': 'linear', 'gamma': 0.0001, 'degree': 3, 'C': 0.1}

svc = svm.SVC(random_state=8)

# Hyperparameter search space for the randomized search.
C = [0.001, 0.1, 1, 10]
gamma = [0.0001, 0.001, 0.01, 0.1]
degree = [1, 2, 3, 4]
kernel = ['linear', 'rbf', 'poly']
probability = [True]

random_grid = {'C': C, 'kernel': kernel, 'gamma': gamma, 'degree': degree, 'probability': probability}

random_search = RandomizedSearchCV(estimator=svc, param_distributions=random_grid, n_iter=50, scoring='accuracy', cv=3,
                                   verbose=1, random_state=8)

random_search.fit(x_train, y_train)

print("Best: %f using %s" % (random_search.best_score_, random_search.best_params_))

# Train the final model with the tuned hyperparameters. The original rebuilt
# a default SVC() here, silently discarding the search results above.
svc = svm.SVC(random_state=8, **random_search.best_params_)

# NOTE(review): the search uses x_train/y_train but the final fit uses
# x_data_train/y_data_train — confirm the two training sets are intentionally
# different.
svc.fit(x_data_train, y_data_train)

svc_pred = svc.predict(x_test)

# Weighted averaging accounts for class imbalance in the test set.
print(balanced_accuracy_score(y_test, svc_pred))
print(precision_score(y_test, svc_pred, average='weighted'))
print(recall_score(y_test, svc_pred, average='weighted'))
print(f1_score(y_test, svc_pred, average='weighted'))
コード例 #5
0
import torch
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from imblearn.datasets import make_imbalance
import numpy as np
from clstm import cnn,cnn_THS,X,Y
import sklearn
from sklearn.model_selection import train_test_split

# Network / training hyper-parameters.
vocab_size = 100
embedding_dim = 100
hidden_dim = 256
output_dim = 32
n_layers = 2
bidirectional = True
DROPOUT = 0.5
class_num = 2
batch_size = 32
dropout = 0.5

model = cnn(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
            bidirectional, class_num, batch_size, output_dim, dropout)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, shuffle=True)

# Online training: feed one sample at a time and evaluate on the full test
# set every 1000 samples.
for idx, _ in enumerate(X_train):
    model.partial_fit(np.asarray([X_train[idx, :]]), np.asarray([y_train[idx]]))
    if idx % 1000 == 0:
        predictions = model.predict(X_test)
        print("Online Accuracy: {}".format(balanced_accuracy_score(y_test, predictions)))
コード例 #6
0
def test(args, io):
    """Evaluate a trained point-cloud classifier on the ModelNet40 test split.

    Builds the architecture selected by ``args.model``, optionally restores
    weights from ``args.model_path``, runs inference over the (unshuffled)
    test loader, and reports overall accuracy, balanced (per-class average)
    accuracy and per-class precision through the ``io`` logger.
    """
    test_loader = DataLoader(ModelNet40(partition='test', num_points=args.num_points),
                             batch_size=args.test_batch_size, shuffle=False, drop_last=False)

    device = torch.device("cuda" if args.cuda else "cpu")

    # Instantiate the requested architecture.
    if args.model == 'pointnet':
        model = PointNet(args).to(device)
    elif args.model == 'dgcnn':
        model = DGCNN(args).to(device)
    elif args.model == 'TransformerBaseline':
        model = DGCNN_Transformer(args).to(device)
    elif args.model == 'TemporalTransformer':
        model = DGCNN_TemporalTransformer(args).to(device)
    elif args.model == 'TemporalTransformer_v2':
        model = DGCNN_TemporalTransformer_v2(args).to(device)
    elif args.model == 'TemporalTransformer_v3':
        model = DGCNN_TemporalTransformer_v3(args).to(device)
    elif args.model == 'pi':
        model = pi_DGCNN(args).to(device)
    elif args.model == 'pi2':
        model = pi_DGCNN_v2(args).to(device)
    elif args.model == 'pipoint':
        model = pipoint_DGCNN(args).to(device)
    else:
        raise Exception("Not implemented")

    print(model)
    print('Number of model parameters: {}'.format(sum(p.data.nelement() for p in model.parameters())))

    model = nn.DataParallel(model)

    # Restore weights; accepts either a wrapped checkpoint dict
    # ({'epoch': ..., 'state_dict': ...}) or a bare state_dict.
    if args.model_path:
        if os.path.isfile(args.model_path):
            print("=> loading checkpoint '{}'".format(args.model_path))
            checkpoint = torch.load(args.model_path)
            print(checkpoint)

            if 'epoch' in checkpoint:
                args.start_epoch = checkpoint['epoch']

            if 'state_dict' in checkpoint:
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            else:
                model.load_state_dict(checkpoint, strict=False)

            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.model_path, args.start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(args.model_path))

    model = model.eval()
    test_true = []
    test_pred = []

    batch_time = AverageMeter()

    end = time.time()
    for i, (data, label) in enumerate(test_loader):

        data, label = data.to(device), label.to(device).squeeze()
        data = data.permute(0, 2, 1)  # channels-last -> channels-first
        batch_size = data.size()[0]

        # The 'pi*' variants also return attention maps, the transformer
        # variants return logits only, and the rest return (logits, degree).
        if args.model in ["pi", "pipoint", "pi2"]:
            logits, atts = model(data)
        elif args.model in ["TransformerBaseline", "TemporalTransformer", "TemporalTransformer_v2"]:
            logits = model(data)
        else:
            logits, degree = model(data)

        preds = logits.max(dim=1)[1]
        test_true.append(label.cpu().numpy())
        test_pred.append(preds.detach().cpu().numpy())
        batch_time.update(time.time() - end)
        end = time.time()

        if i % 10 == 0:
            print('Test {}, Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(i, batch_time=batch_time))

    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)
    test_acc = metrics.accuracy_score(test_true, test_pred)
    avg_per_class_acc = metrics.balanced_accuracy_score(test_true, test_pred)

    per_class_acc = metrics.precision_score(test_true, test_pred, average=None)

    outstr = 'Test :: test acc: %.6f, test avg acc: %.6f' % (test_acc, avg_per_class_acc)
    # BUG FIX: the original used %-interpolation on a '{}' template
    # ('Test per class acc: {}' % per_class_acc), which raises TypeError.
    outstr_2 = 'Test per class acc: {}'.format(per_class_acc)
    io.cprint(outstr)
    io.cprint(outstr_2)

    if args.model in ["pi", "pipoint", "pi2"]:
        for j in range(4):
            io.cprint('Att {} : {}'.format(j, atts[j].mean().item()))
コード例 #7
0
# Windows paths for the pickled ensemble predictions of run #5.
pred_file = "D:\\Repos\\SJUMalwareEnsembleResearch\\Models\\Stacking\\Classic\\all_predictions5.sav"
pred_file2 = "D:\\Repos\\SJUMalwareEnsembleResearch\\Models\\Stacking\\Classic\\y_pred5.sav"

# NOTE(review): Y_pred is loaded from pred_file2 and immediately dumped back
# to the same file — a round-trip no-op unless Y_pred is reassigned in code
# outside this view. all_predictions is also defined earlier in the file.
Y_pred = pickle.load(open(pred_file2, 'rb'))
pickle.dump(all_predictions, open(pred_file, 'wb'))
pickle.dump(Y_pred, open(pred_file2, 'wb'))

# Metrics — notebook style: the bare metric expressions below rely on the
# REPL echoing their value; in a plain script these results are discarded.

# Overall accuracy.
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, Y_pred)

# Balanced accuracy (mean per-class recall), robust to class imbalance.
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(Y_test, Y_pred)

# Support-weighted precision / recall / F1.
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(Y_test, Y_pred, average='weighted')

# Per-class sample counts, sorted by label.
from collections import Counter
from sklearn import metrics
mapping = Counter(Y_test)
#print(Counter(y_test))
mapping = dict(sorted(mapping.items()))
#--- 259.12324500083923 seconds ---

label_map = {
    "0": "ADLOAD",
    "1": "AGENT",
コード例 #8
0
# Load tuning configuration and set up the Comet hyper-parameter optimizer.
parameters = open("parameters.yml")
yamlparameters = yaml.load(parameters, Loader=yaml.FullLoader)
opt = Optimizer(config, api_key=yamlparameters["comet_api_key"], project_name="newhitmaskGBDTclassifier", auto_metric_logging=True)

X_train, X_test, y_train, y_test = get_features(yamlparameters["DataDir"])


for experiment in opt.get_experiments():
    # Pull this trial's sampled hyper-parameters from Comet.
    tuned = {key: experiment.get_parameter(key)
             for key in ("learning_rate", "gamma", "n_estimators", "max_depth",
                         "min_child_weight", "subsample", "alpha")}
    model = xgboost.XGBClassifier(booster="gbtree", verbosity=1,
                                  objective="binary:logistic",
                                  tree_method="exact", n_jobs=4, **tuned)

    model.fit(X_train, y_train)

    y_predict = model.predict(X_test)

    # Balanced accuracy on hard predictions, ROC AUC on positive-class scores.
    binary_accuracy = metrics.balanced_accuracy_score(y_test, y_predict)
    y_predict_prob = model.predict_proba(X_test)[:, 1]
    auc = metrics.roc_auc_score(y_test, y_predict_prob)
    print("AUC:", auc)
    print("ACC:", binary_accuracy)
    experiment.log_metric("ROC", auc)

    experiment.log_metric("Binary_Accuracy", binary_accuracy)
コード例 #9
0
    random.choice(['M', 'S']) for i in range(len(y_test))
])

# Train the two classifiers being compared.
classifier_rf = RandomForestClassifier(n_estimators=5)
classifier_rf.fit(X_train, y_train)
puntos_predichos_rf = classifier_rf.predict(X_test)

classifier_svm = SVC()
classifier_svm.fit(valores_pixeles, clase)
puntos_predichos_svm = classifier_svm.predict(X_test)

# Observo metricas
# ==================================================================================
separator = "---------------------------------"

print(separator)
print("Accuracy_score")
print(separator)
print('RANDOM', accuracy_score(y_test, puntos_predichos_aleatorio))
print('RF:', accuracy_score(y_test, puntos_predichos_rf))
print('SVM:', accuracy_score(y_test, puntos_predichos_svm))

print()
print(separator)
print("Balanced Accuracy_score")
print(separator)
print('RANDOM', balanced_accuracy_score(y_test, puntos_predichos_aleatorio))
print('RF:', balanced_accuracy_score(y_test, puntos_predichos_rf))
print('SVM:', balanced_accuracy_score(y_test, puntos_predichos_svm))

# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; newer code
# should use ConfusionMatrixDisplay.from_estimator instead.
plot_confusion_matrix(classifier_svm, X_test, y_test)
plt.title("CM SVM")
plt.show()
コード例 #10
0
# %% [markdown]
# With the dummy classifier, which always predicts the negative class `'not
# donated'`, we obtain an accuracy score of 76%. Therefore, it means that this
# classifier, without learning anything from the data `data`, is capable of
# predicting as accurately as our logistic regression model.
#
# The problem illustrated above is also known as the class imbalance problem.
# When the classes are imbalanced, accuracy should not be used. In this case,
# one should either use the precision and recall as presented above or the
# balanced accuracy score instead of accuracy.

# %%
from sklearn.metrics import balanced_accuracy_score

# Balanced accuracy = average per-class recall, so it is not inflated by the
# majority class the way plain accuracy is.
balanced_accuracy = balanced_accuracy_score(target_test, target_predicted)
print("Balanced accuracy: " + format(balanced_accuracy, ".3f"))
# %% [markdown]
# The balanced accuracy is equivalent to accuracy in the context of balanced
# classes. It is defined as the average recall obtained on each class.
#
# ## Evaluation and different probability thresholds
#
# All statistics that we presented up to now rely on `classifier.predict` which
# outputs the most likely label. We haven't made use of the probability
# associated with this prediction, which gives the confidence of the
# classifier in this prediction. By default, the prediction of a classifier
# corresponds to a threshold of 0.5 probability in a binary classification
# problem. We can quickly check this relationship with the classifier that
# we trained.
コード例 #11
0
ファイル: train.py プロジェクト: SCUT-CCNL/HiGCN
                loss_train = loss(output[train_idx], y[train_idx])
                loss_train.backward()
                optimizer.step()
                model.eval()
                with torch.no_grad():
                    output = model(x, A, gene_A)
                    loss_test = loss(output[test_idx], y[test_idx])
                    print(
                        'Epoch: {:04d}'.format(i),
                        'train_loss: {:.4f}'.format(loss_train.item()),
                        'train_roc_auc: {:.4f}'.format(
                            metrics.roc_auc_score(yy[train_idx].cpu(),
                                                  output[train_idx].cpu())),
                        'train_balance_acc_train: {:.4f}'.format(
                            metrics.balanced_accuracy_score(
                                y[train_idx].cpu(),
                                output[train_idx].argmax(dim=1).cpu())),
                        'test_roc_auc: {:.4f}'.format(
                            metrics.roc_auc_score(yy[test_idx].cpu(),
                                                  output[test_idx].cpu())),
                        'test_balance_acc_test: {:.4f}'.format(
                            metrics.balanced_accuracy_score(
                                y[test_idx].cpu(),
                                output[test_idx].argmax(dim=1).cpu())))

            #         loss_train_all.append(loss_train.item())
            #         train_bacc_all.append(metrics.balanced_accuracy_score(y[train_idx].cpu(), output[train_idx].argmax(dim=1).cpu()))
            #         loss_test_all.append(loss_test.item())
            #         test_bacc_all.append(metrics.balanced_accuracy_score(y[test_idx].cpu(), output[test_idx].argmax(dim=1).cpu()))
            # plot_train_test_loss(loss_train_all, train_bacc_all, loss_test_all, test_bacc_all)
            # print('confusion_matrix: ', confusion_matrix(y[test_idx].cpu(), output[test_idx].argmax(dim=1).cpu()))
コード例 #12
0
def train(args, io):
    """Train a GSNET / NewGSNET point-cloud classifier and checkpoint the best model.

    Trains for ``args.epochs`` epochs on either ModelNet40 or a ShapeNet
    directory tree, logging per-epoch train/test accuracy and balanced
    accuracy through ``io`` and ``logfile``, and saving the weights whenever
    test accuracy reaches a new maximum.
    """
    if args.dataset == 'modelnet40':
        train_loader = DataLoader(ModelNet40(partition='train',
                                             num_points=args.num_points),
                                  num_workers=8,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  drop_last=True)
        # NOTE(review): shuffle=True on the test loader is unusual for
        # evaluation; the metrics below are order-independent, so results
        # are unaffected.
        test_loader = DataLoader(ModelNet40(partition='test',
                                            num_points=args.num_points),
                                 num_workers=8,
                                 batch_size=args.test_batch_size,
                                 shuffle=True,
                                 drop_last=False)
    else:
        # ShapeNet branch: load every cloud under ./data/shapenet/0*; the
        # folder iteration order assigns the integer class label j.
        pcd = np.empty((57449, 2048, 3), dtype=np.float32)
        label = np.empty((57449, ), dtype=np.int64)
        i = 0
        j = 0
        for fd in glob.glob('./data/shapenet/0*'):
            for f in os.listdir(fd):
                pct = o3d.io.read_point_cloud(os.path.join(fd, f))
                pcd[i] = np.asarray(pct.points)
                label[i] = j
                i = i + 1
            j = j + 1

        # Hard-coded expectation: exactly 57449 clouds across 57 classes.
        if i != 57449 or j != 57:
            raise ValueError('data stat doesn\'t match')

        pcd_train, pcd_test, label_train, label_test = train_test_split(
            pcd, label, stratify=label, random_state=2215)

        train_loader = DataLoader(shapeNetTrain(p=pcd_train,
                                                label=label_train,
                                                num_points=args.num_points),
                                  num_workers=8,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  drop_last=True)

        test_loader = DataLoader(shapeNetTest(p=pcd_test,
                                              label=label_test,
                                              num_points=args.num_points),
                                 num_workers=8,
                                 batch_size=args.test_batch_size,
                                 shuffle=True,
                                 drop_last=False)

    device = torch.device("cuda" if args.cuda else "cpu")

    # Build the requested architecture; ShapeNet uses 57 output classes.
    if args.model == 'GSNET':
        if args.dataset == 'shapenet':
            model = GSNET(args, output_channels=57).to(device)
        else:
            model = GSNET(args).to(device)
    elif args.model == 'NewGSNET':
        print('NewGSNET!')
        if args.dataset == 'shapenet':
            model = NewGSNET(args, output_channels=57).to(device)
        else:
            model = NewGSNET(args).to(device)
    else:
        raise Exception("Not implemented")
    print(str(model))
    print(sum(p.numel() for p in model.parameters() if p.requires_grad))
    logfile.write(str(model))

    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    if args.use_sgd:
        print("Use SGD")
        logfile.write("Use SGD")
        # SGD path scales the base learning rate by 100x.
        opt = optim.SGD(model.parameters(),
                        lr=args.lr * 100,
                        momentum=args.momentum,
                        weight_decay=1e-4)
    else:
        print("Use Adam")
        logfile.write("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=0.00001)

    criterion = cal_loss

    best_test_acc = 0
    for epoch in range(args.epochs):
        # NOTE(review): PyTorch >= 1.1 recommends calling scheduler.step()
        # after the epoch's optimizer steps; calling it at the top of the
        # loop shifts the LR schedule forward by one epoch.
        scheduler.step()
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        train_pred = []
        train_true = []
        for data, label in train_loader:
            data, label = data.to(device), label.to(device).squeeze()
            data = data.permute(0, 2, 1)  # channels-last -> channels-first
            batch_size = data.size()[0]
            opt.zero_grad()
            logits = model(data)
            loss = criterion(logits, label)
            loss.backward()
            opt.step()
            preds = logits.max(dim=1)[1]
            count += batch_size
            # Weight the running loss by batch size so the epoch average
            # is exact even with a smaller final batch.
            train_loss += loss.item() * batch_size
            train_true.append(label.cpu().numpy())
            train_pred.append(preds.detach().cpu().numpy())
        train_true = np.concatenate(train_true)
        train_pred = np.concatenate(train_pred)
        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f' % (
            epoch, train_loss * 1.0 / count,
            metrics.accuracy_score(train_true, train_pred),
            metrics.balanced_accuracy_score(train_true, train_pred))
        io.cprint(outstr)
        logfile.write(outstr)
        ####################
        # Test
        ####################
        # NOTE(review): evaluation runs without torch.no_grad(), so
        # activations/gradients buffers are still built — correct but wasteful.
        test_loss = 0.0
        count = 0.0
        model.eval()
        test_pred = []
        test_true = []
        for data, label in test_loader:
            data, label = data.to(device), label.to(device).squeeze()
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            logits = model(data)
            loss = criterion(logits, label)
            preds = logits.max(dim=1)[1]
            count += batch_size
            test_loss += loss.item() * batch_size
            test_true.append(label.cpu().numpy())
            test_pred.append(preds.detach().cpu().numpy())
        test_true = np.concatenate(test_true)
        test_pred = np.concatenate(test_pred)
        test_acc = metrics.accuracy_score(test_true, test_pred)
        # Balanced accuracy == mean per-class recall ("avg acc" in the log).
        avg_per_class_acc = metrics.balanced_accuracy_score(
            test_true, test_pred)
        outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f' % (
            epoch, test_loss * 1.0 / count, test_acc, avg_per_class_acc)
        io.cprint(outstr)
        logfile.write(outstr)
        # Checkpoint on plain accuracy (not balanced accuracy).
        if test_acc >= best_test_acc:
            best_test_acc = test_acc
            io.cprint('Max Acc:%.6f' % best_test_acc)
            logfile.write('\nMax Acc:%.6f' % best_test_acc)
            torch.save(model.state_dict(),
                       'checkpoints/%s/models/model.t7' % args.exp_name)
コード例 #13
0
def test(args, io):
    """Evaluate a trained GSNET/NewGSNET model and dump misclassified clouds.

    Restores weights from ``args.model_path``, runs inference over the test
    loader, writes the first ~25 misclassified point clouds to
    ./errors/gsnet/ as ``<count>-<predicted>-<true>.ply`` files, and logs
    overall and balanced accuracy through ``io`` and ``logfile``.
    """

    if args.dataset == 'modelnet40':
        test_loader = DataLoader(ModelNet40(partition='test',
                                            num_points=args.num_points),
                                 num_workers=8,
                                 batch_size=args.test_batch_size,
                                 shuffle=True,
                                 drop_last=False)
    else:
        # ShapeNet branch: same loading scheme as train(), but with a
        # different split seed (1152 here vs 2215 in train()), so this test
        # split does NOT match the one used during training.
        pcd = np.empty((57449, 2048, 3), dtype=np.float32)
        label = np.empty((57449, ), dtype=np.int64)
        i = 0
        j = 0
        for fd in glob.glob('./data/shapenet/0*'):
            for f in os.listdir(fd):
                pct = o3d.io.read_point_cloud(os.path.join(fd, f))
                pcd[i] = np.asarray(pct.points)
                label[i] = j
                i = i + 1
            j = j + 1

        # Hard-coded expectation: exactly 57449 clouds across 57 classes.
        if i != 57449 or j != 57:
            raise ValueError('data stat doesn\'t match')

        pcd_train, pcd_test, label_train, label_test = train_test_split(
            pcd, label, stratify=label, random_state=1152)

        test_loader = DataLoader(shapeNetTest(p=pcd_test,
                                              label=label_test,
                                              num_points=args.num_points),
                                 num_workers=8,
                                 batch_size=args.test_batch_size,
                                 shuffle=True,
                                 drop_last=False)

    device = torch.device("cuda" if args.cuda else "cpu")

    #Try to load models
    # if args.dataset == 'modelnet40':
    #     model = GSNET(args).to(device)
    # else:
    #     model = GSNET(args,output_channels=57).to(device)

    # Build the requested architecture (57 output classes for ShapeNet).
    if args.model == 'GSNET':
        if args.dataset == 'shapenet':
            model = GSNET(args, output_channels=57).to(device)
        else:
            model = GSNET(args).to(device)
    elif args.model == 'NewGSNET':
        print('NewGSNET!')
        if args.dataset == 'shapenet':
            model = NewGSNET(args, output_channels=57).to(device)
        else:
            model = NewGSNET(args).to(device)
    else:
        raise Exception("Not implemented")

    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(args.model_path))
    model = model.eval()
    test_acc = 0.0
    count = 0.0
    test_true = []
    test_pred = []
    cnt = 0  # number of misclassified clouds written out so far
    for data, label in test_loader:

        data, label = data.to(device), label.to(device).squeeze()
        data = data.permute(0, 2, 1)  # channels-last -> channels-first
        batch_size = data.size()[0]
        logits = model(data)
        preds = logits.max(dim=1)[1]
        # Dump misclassified samples as .ply until roughly 25 are saved
        # (the guard is checked per batch, so a large batch can overshoot).
        if cnt <= 25:
            for index, inq in enumerate(
                    preds.detach().cpu().numpy() != label.cpu().numpy()):

                if inq:
                    # NOTE(review): this rebinds `pcd`, shadowing the ShapeNet
                    # data array loaded above (harmless here, already consumed).
                    pcd = o3d.geometry.PointCloud()
                    # print(data.cpu().permute(0,2,1).numpy()[index].shape)
                    pcd.points = o3d.utility.Vector3dVector(
                        (data.cpu().permute(0, 2, 1).numpy())[index])
                    # Filename encodes sequence number, predicted class name
                    # and true class name (via the folderTclass mapping).
                    o3d.io.write_point_cloud(
                        "./errors/gsnet/{:d}-{:s}-{:s}.ply".format(
                            cnt, folderTclass[(
                                preds.detach().cpu().numpy())[index]],
                            folderTclass[(label.cpu().numpy())[index]]), pcd)
                    cnt = cnt + 1
        test_true.append(label.cpu().numpy())
        test_pred.append(preds.detach().cpu().numpy())
    test_true = np.concatenate(test_true)
    test_pred = np.concatenate(test_pred)
    # Total number of misclassified samples.
    print(np.sum(test_true != test_pred))
    test_acc = metrics.accuracy_score(test_true, test_pred)
    avg_per_class_acc = metrics.balanced_accuracy_score(test_true, test_pred)
    outstr = 'Test :: test acc: %.6f, test avg acc: %.6f' % (test_acc,
                                                             avg_per_class_acc)
    io.cprint(outstr)
    logfile.write(outstr)
コード例 #14
0
def evaluate_model(est, x_tr, y_tr, x_ho, y_ho, **kwargs):
    """Score a fitted estimator on the training and hold-out sets.

    Returns a dict with the balanced accuracy on the training data
    ('score-tr') and on the hold-out data ('score-ho').
    """
    results = {}
    for key, features, labels in (('score-tr', x_tr, y_tr),
                                  ('score-ho', x_ho, y_ho)):
        results[key] = balanced_accuracy_score(labels, est.predict(features))
    return results
コード例 #15
0
def log_model_metrics(y_actual: np.ndarray, y_predicted: np.ndarray) -> None:
    """Compute evaluation metrics and log them to the active MLflow run.

    Logs "accuracy" (chance-adjusted balanced accuracy: 0 = random guessing,
    1 = perfect) and "f1" (support-weighted F1 score).
    """
    # adjusted=True rescales balanced accuracy so chance-level scores 0.
    accuracy = balanced_accuracy_score(y_actual, y_predicted, adjusted=True)
    f1 = f1_score(y_actual, y_predicted, average="weighted")
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1", f1)
コード例 #16
0
                       random_state=22,
                       kernel='linear',
                       C=best_params[dict_dataset[dataset] + 'SVM.' +
                                     str(i + 1) + '.' + str(j + 1)]['C'],
                       probability=True)

            np.random.set_state(state)
            gkf = GroupKFold(n_splits=5).split(X_train, Y[train_idx],
                                               cluster_in[train_idx])
            for k, (train, test) in enumerate(gkf):
                clf.fit(X_train[train], Y[train_idx][train])
                acc.append(
                    accuracy_score(Y[train_idx][test],
                                   clf.predict(X_train[test])))
                bal_acc.append(
                    balanced_accuracy_score(Y[train_idx][test],
                                            clf.predict(X_train[test])))
                rec.append(
                    recall_score(Y[train_idx][test],
                                 clf.predict(X_train[test]),
                                 pos_label=1))
                pre.append(
                    precision_score(Y[train_idx][test],
                                    clf.predict(X_train[test]),
                                    pos_label=1))
                rocauc.append(
                    roc_auc_score(Y[train_idx][test],
                                  clf.predict_proba(X_train[test])[:, 1]))
                precision, recall, _ = precision_recall_curve(
                    Y[train_idx][test],
                    clf.predict_proba(X_train[test])[:, 1])
                aupr.append(auc(recall, precision))
コード例 #17
0
                    C=C,
                    seed=2 * 722019 + seed,
                    fn=fn,
                    device=device
                )  # 2*722019+seed is just to have a large odd number for seeding that is recommended for generating random numbers, fixed for reproducibility
                [X_tr_nmf, X_val_nmf, X_te_nmf, H] = m.fit_transform()

                chkpt = torch.load(fn)
                m.load_state_dict(chkpt['state_dict'])
                # best_iter = chkpt['epoch']
                accval = chkpt['best_val_acc']
                m.eval()

                y_tr_pred = m.predict(X_tr_nmf, pts_tr)
                y_te_pred = m.predict(X_te_nmf, pts_te)
                acctr = balanced_accuracy_score(y_train, y_tr_pred)
                accte = accuracy_score(y_test, y_te_pred)
                balaccte = balanced_accuracy_score(y_test, y_te_pred)
                preciste = precision_score(y_test, y_te_pred)
                recallte = recall_score(y_test, y_te_pred)
                f1te = f1_score(y_test, y_te_pred)

                w2 = np.square(
                    m.state_dict()['fc.weight'].cpu().numpy()).sum(axis=None)
                b2 = np.square(
                    m.state_dict()['fc.bias'].cpu().numpy()).sum(axis=None)
                w = 1 / pd.Series(y_train).value_counts(
                    normalize=True).sort_index().to_numpy()
                vce = chkpt['celoss']

                # err = np.vstack((X_train, X_val, X_test)) - np.vstack((X_tr_nmf, X_val_nmf, X_te_nmf)) @ H
コード例 #18
0
ファイル: sub_itr.py プロジェクト: ascillitoe/defragTrees
def _classification_error(y_true, y_pred, scoring):
    """Score a classification prediction according to `scoring`.

    'standard' and 'balanced' are reported as errors (1 - accuracy and
    1 - balanced accuracy respectively); 'f1' is the F1 score itself,
    matching the original convention of this benchmark.

    Raises
    ------
    ValueError
        If `scoring` is not one of 'standard', 'balanced' or 'f1'.
    """
    if scoring == 'balanced':
        from sklearn.metrics import balanced_accuracy_score
        return 1.0 - balanced_accuracy_score(y_true, y_pred)
    elif scoring == 'standard':
        from sklearn.metrics import accuracy_score
        return 1.0 - accuracy_score(y_true, y_pred)
    elif scoring == 'f1':
        from sklearn.metrics import f1_score
        return f1_score(y_true, y_pred)
    raise ValueError('unknown scoring mode: %s' % (scoring,))


def run(prefix, Kmax, restart, trial, modeltype='regression', rftype='R', treenum=100, depth=20, maxitr=1000, tol=1e-6, njobs=1, smear_num=100, verbose=True, scoring='standard'):
    """Run the defragTrees benchmark over `trial` cross-validation folds.

    For each fold: build a random forest (via R scripts when rftype=='R',
    via scikit-learn when rftype=='SL'), score it on the test split, then
    fit and evaluate the rule-based surrogates (DefragModel, inTrees,
    NHarvest, BATree, DTree) and save per-method result CSVs under
    './result/result_<prefix>_itr/result_<fold>/'.

    Parameters
    ----------
    prefix : str -- dataset name used for file paths.
    Kmax : int -- maximum number of rules for DefragModel.
    restart : int -- number of FAB restarts for DefragModel.
    trial : int -- number of CV folds to process.
    modeltype : 'regression' or 'classification'.
    rftype : 'R' (R randomForest) or 'SL' (scikit-learn forest).
    treenum, depth : forest size and max depth (depth used for 'SL' only).
    maxitr, tol, njobs : DefragModel fitting options.
    smear_num : unused here (BATree below uses a fixed smear_num=50).
    verbose : print progress messages.
    scoring : 'standard', 'balanced' or 'f1' classification score mode.
    """
    dirname = './result/result_%s_itr' % (prefix,)
    for t in range(trial):
        if verbose:
            print('\n***********************************')
            print('Fold: %02d/%02d' % (t + 1, trial))
            print('***********************************')

        # load the pre-generated train/test CSVs for this fold
        dirname2 = '%s/result_%02d' % (dirname, t)
        trfile = '%s/%s_train_%02d.csv' % (dirname2, prefix, t)
        tefile = '%s/%s_test_%02d.csv' % (dirname2, prefix, t)
        Ztr = pd.read_csv(trfile, delimiter=',', header=None).values
        Xtr = Ztr[:, :-1]
        ytr = Ztr[:, -1]
        Zte = pd.read_csv(tefile, delimiter=',', header=None).values
        Xte = Zte[:, :-1]
        yte = Zte[:, -1]

        if (rftype == 'R'):
            # build an R random forest via an external Rscript call, then
            # parse its trees and its saved test predictions
            if modeltype == 'regression':
                os.system('Rscript ./baselines/buildRegForest.R %s %s %s %d 0' % (trfile, tefile, dirname2, treenum))
            elif modeltype == 'classification':
                if verbose: print('Running buildClfForest.R')
                os.system('Rscript ./baselines/buildClfForest.R %s %s %s %d 0' % (trfile, tefile, dirname2, treenum))
            if verbose: print('Running DefragModel.parseRtrees')
            splitter = DefragModel.parseRtrees('%s/forest/' % (dirname2,))
            zfile = '%s/pred_test.csv' % (dirname2,)
            zte = pd.read_csv(zfile, delimiter=' ', header=None).values[:, -1]
            if modeltype == 'regression':
                score = np.mean((yte - zte)**2)
            elif modeltype == 'classification':
                score = _classification_error(yte, zte, scoring)
            cover = 1.0
            coll = 1.0

        elif (rftype == 'SL'):
            if modeltype == 'regression':
                forest = RandomForestRegressor(n_estimators=treenum, n_jobs=njobs)
                forest.fit(Xtr, ytr)
                score = mean_squared_error(yte, forest.predict(Xte))
            elif modeltype == 'classification':
                forest = RandomForestClassifier(n_estimators=treenum, n_jobs=njobs, max_depth=depth)
                forest.fit(Xtr, ytr)
                # BUG FIX: the original scored an undefined `zte` here; the
                # sklearn forest's own test predictions must be used instead.
                zte = forest.predict(Xte)
                score = _classification_error(yte, zte, scoring)
            cover = 1.0
            coll = 1.0

            # parse sklearn tree ensembles into the array of (feature index, threshold)
            splitter = DefragModel.parseSLtrees(forest)

            # Write sklearn random forests to file (in the same format as the .R ones)
            # inTrees, NHarvest, BATrees and DTree2 can then be performed on this RF without any modification to packages.
            DefragModel.save_SLtrees('%s/forest/' % (dirname2,), forest)

        print('RF Test Score = %.2f' % (score))
        print('RF Test Coverage = %.2f' % (cover))
        print('RF Overlap = %.2f' % (coll))
        np.savetxt('%s/res_rf_%02d.csv' % (dirname2, t), np.array([score, cover, coll, -1]), delimiter=',')

        # defragTrees surrogate
        if verbose: print('Defragging model')
        mdl = DefragModel(modeltype=modeltype, restart=restart, maxitr=maxitr, tol=tol, seed=restart*t, njobs=njobs, score=scoring)
        if verbose: print('Fitting defrag')
        mdl.fit(Xtr, ytr, splitter, Kmax, fittype='FAB')
        joblib.dump(mdl, '%s/%s_defrag_%02d.mdl' % (dirname2, prefix, t), compress=9)
        score, cover, coll = mdl.evaluate(Xte, yte)
        print('Defrag Test Score = %.2f' % (score))
        print('Defrag Test Coverage = %.2f' % (cover))
        print('Defrag Overlap = %.2f' % (coll))
        np.savetxt('%s/res_defrag_%02d.csv' % (dirname2, t), np.array([score, cover, coll, len(mdl.rule_)]), delimiter=',')

        if (rftype == 'R'):
            # inTrees (only available for the R-built forest)
            if verbose: print('Fitting inTree model')
            mdl2 = inTreeModel(modeltype=modeltype, score=scoring)
            mdl2.fit(Xtr, ytr, '%s/inTrees.txt' % (dirname2,))
            joblib.dump(mdl2, '%s/%s_inTrees_%02d.mdl' % (dirname2, prefix, t), compress=9)
            score, cover, coll = mdl2.evaluate(Xte, yte)
            print('inTrees Test Score = %.2f' % (score))
            print('inTrees Test Coverage = %.2f' % (cover))
            print('inTrees Overlap = %.2f' % (coll))
            np.savetxt('%s/res_inTrees_%02d.csv' % (dirname2, t), np.array([score, cover, coll, len(mdl2.rule_)]), delimiter=',')

            # NHarvest (only available for the R-built forest)
            if verbose: print('Fitting NHarvest model')
            mdl3 = NHarvestModel(modeltype=modeltype, score=scoring)
            mdl3.fit(Xtr, ytr, '%s/nodeHarvest.txt' % (dirname2,))
            joblib.dump(mdl3, '%s/%s_nodeHarvest_%02d.mdl' % (dirname2, prefix, t), compress=9)
            score, cover, coll = mdl3.evaluate(Xte, yte)
            # re-score from the saved nodeHarvest test predictions
            zfile = '%s/pred_test_nh.csv' % (dirname2,)
            zte = pd.read_csv(zfile, delimiter=' ', header=None).values[:, -1]
            if modeltype == 'regression':
                score = np.mean((yte - zte)**2)
            elif modeltype == 'classification':
                # BUG FIX: the original compared the float `score` against the
                # scoring-mode strings, so this branch never executed;
                # `scoring` is the intended variable.
                score = _classification_error(yte, zte, scoring)
            print('NHarvest Test Score = %.2f' % (score))
            print('NHarvest Test Coverage = %.2f' % (cover))
            print('NHarvest Overlap = %.2f' % (coll))
            np.savetxt('%s/res_nodeHarvest_%02d.csv' % (dirname2, t), np.array([score, cover, coll, len(mdl3.rule_)]), delimiter=',')

        # BATree surrogate
        if verbose: print('Fitting BATree model')
        mdl4 = BTreeModel(modeltype=modeltype, njobs=njobs, seed=t, smear_num=50, score=scoring)
        mdl4.fit(Xtr, ytr, '%s/forest/' % (dirname2,))
        joblib.dump(mdl4, '%s/%s_BATree_%02d.mdl' % (dirname2, prefix, t), compress=9)
        score, cover, coll = mdl4.evaluate(Xte, yte)
        print('BATree Test Score = %.2f' % (score))
        print('BATree Test Coverage = %.2f' % (cover))
        print('BATree Overlap = %.2f' % (coll))
        np.savetxt('%s/res_BATree_%02d.csv' % (dirname2, t), np.array([score, cover, coll, len(mdl4.rule_)]), delimiter=',')
        BATree_depth = mdl4.tree.max_depth_

        # Decision tree baseline, fixed depth 2
        if verbose: print('Fitting Dtree - depth=2')
        mdl5 = DTreeModel(modeltype=modeltype, max_depth=[2], score=scoring)
        mdl5.fit(Xtr, ytr)
        joblib.dump(mdl5, '%s/%s_DTree2_%02d.mdl' % (dirname2, prefix, t), compress=9)
        score, cover, coll = mdl5.evaluate(Xte, yte)
        print('DTree2 Test Score = %.2f' % (score))
        print('DTree2 Test Coverage = %.2f' % (cover))
        print('DTree2 Overlap = %.2f' % (coll))
        np.savetxt('%s/res_DTree2_%02d.csv' % (dirname2, t), np.array([score, cover, coll, len(mdl5.rule_)]), delimiter=',')

        # Decision tree baseline matched to the BATree depth
        if verbose: print('Fitting Dtree to match BATree depth = ', BATree_depth)
        mdl5 = DTreeModel(modeltype=modeltype, max_depth=[BATree_depth], score=scoring)
        mdl5.fit(Xtr, ytr)
        joblib.dump(mdl5, '%s/%s_DTreeBA_%02d.mdl' % (dirname2, prefix, t), compress=9)
        score, cover, coll = mdl5.evaluate(Xte, yte)
        print('DTreeBA Test Score = %.2f' % (score))
        print('DTreeBA Test Coverage = %.2f' % (cover))
        print('DTreeBA Overlap = %.2f' % (coll))
        np.savetxt('%s/res_DTreeBA_%02d.csv' % (dirname2, t), np.array([score, cover, coll, len(mdl5.rule_)]), delimiter=',')

    # summary plots and CSV across all folds
    plot_summarize(prefix, trial, rftype)
    summary2csv(prefix, trial, rftype)
コード例 #19
0
def SM_test(summaryMeasure,
            k,
            data,
            label,
            data_test,
            label_test,
            batch_size,
            is_mmens=False):
    """Evaluate the best saved CNN of CV fold `k` on the held-out test set.

    Loads '<summaryMeasure>_BEST_model_CV<k+1>.pt' from MODEL_STORE_PATH,
    runs it over the test DataLoader and accumulates labels/predictions
    for metric computation.

    NOTE(review): relies on module-level globals defined elsewhere in this
    file: train_split, val_split, device, MODEL_STORE_PATH, CNN_model,
    load_data.

    Returns
    -------
    (balanced_accuracy, weighted_f1) by default; when `is_mmens` is True,
    the flattened (true_labels, predictions) arrays instead, for
    multi-measure ensembling.
    """
    train_split_k = train_split[k]
    val_split_k = val_split[k]

    # loss used only to report the test loss (no training happens here)
    criterion = nn.BCEWithLogitsLoss()

    # load data; train data is also needed for mean/sd normalization
    testData = load_data(
        split='test',
        train_split=train_split_k,
        val_split=val_split_k,
        summaryMeasure=summaryMeasure,
        data=data,
        label=label,
        data_test=data_test,
        label_test=label_test)

    test_loader = DataLoader(dataset=testData,
                             batch_size=batch_size,
                             shuffle=True)

    # set and load best model for this fold
    model = CNN_model()
    model.load_state_dict(
        torch.load(MODEL_STORE_PATH + '/' + summaryMeasure + '_BEST_model' +
                   '_CV' + str(k + 1) + '.pt'))

    model.to(device)
    # TEST EVALUATION
    model.eval()

    total_ep = 0.0
    correct_ep = 0.0
    loss_ep = 0.0
    current_ep_acc = 0.0
    current_ep_loss = 0.0
    total_step_v = len(test_loader)

    # accumulators for metric computation over the whole test set
    F1_labels = []
    F1_pred = []

    for i, (images, labels) in enumerate(test_loader):

        if device.type == "cuda":
            images = images.to(device)
            labels = labels.to(device)
        labels = labels.reshape(len(labels), 1).type(torch.cuda.FloatTensor)

        outputs_v = model(images)  # the forward uses the entire batch together
        # logits > 0 correspond to sigmoid probability > 0.5
        predicted = outputs_v.data > 0.0

        # collect true labels and predictions for metric calculation
        if i == 0:
            F1_labels = labels.int().cpu().numpy()
            F1_pred = predicted.int().cpu().numpy()
        else:
            F1_labels = np.concatenate((F1_labels, labels.int().cpu().numpy()))
            F1_pred = np.concatenate((F1_pred, predicted.int().cpu().numpy()))

        loss_v = criterion(outputs_v, labels)

        # the last batch may have a different number of samples
        total_v = labels.size(0)
        correct_v = (predicted == labels).sum().item()

        total_ep += total_v  # total number of test samples seen so far
        correct_ep += correct_v  # total number of correct answers so far
        loss_ep += (loss_v.item() * len(labels))

        current_ep_acc = (correct_ep / total_ep) * 100
        current_ep_loss = loss_ep / total_ep

    acc = accuracy_score(F1_labels, F1_pred)
    bal_acc = balanced_accuracy_score(F1_labels, F1_pred)
    F1_Score = f1_score(F1_labels, F1_pred,
                        average='weighted')  #, zero_division=1)
    tn, fp, fn, tp = confusion_matrix(F1_labels, F1_pred).ravel()

    print('CV {}, sumMeasure: {} Test Accuracy of the model is: {}, F1: {}%'.
          format(k + 1, summaryMeasure, current_ep_acc, F1_Score))

    if is_mmens == True:
        return F1_labels.reshape(-1), F1_pred.reshape(-1)
    else:
        # BUG FIX: the original returned the `f1_score` function object
        # instead of the computed weighted F1 value.
        return bal_acc, F1_Score
コード例 #20
0
    )
    print(
        f"\nBondad del modelo de Regresión Logística con características estandarizadas para el modelo {nombre}"
    )
    # Para el cálculo de errores, hacemos una predicción sobre el conjunto Test separado inicialmente
    # y calculamos la precisión balanceada. Para expresarlo en términos del error, hacemos 1 - accuracy
    y_pred = clf_rl.predict(X_train)
    print(f"Ein = {1-balanced_accuracy_score(y_train[:, i], y_pred)}")
    y_pred = clf_rl.predict(X_test)
    print(f"Etest = {1-balanced_accuracy_score(y_test[:, i], y_pred)}")
    print(f"Ecv = {1-clf_rl.best_score_}")

    # Almacenamos este modelo como el mejor estimador, de cara a compararlo con los otros a utilizar
    best_estimator = clf_rl
    best_name = "Regresión Logística"
    best_score = balanced_accuracy_score(y_test[:, i], y_pred)
    stop()

    # Apoyándonos en la función de Scikit-Learn, graficamos la matriz de confusión resultante para el conjunto Test.
    plot_confusion_matrix(clf_rl.best_estimator_,
                          X_test,
                          y_test[:, i],
                          display_labels=labels[i],
                          values_format='d')
    plt.title(
        f"Matriz de confusión para el caso de {nombre}\n usando Regresión Logística"
    )
    plt.ylabel(f"Clase verdadera")
    plt.xlabel(f"Clase predicha")
    plt.show()
コード例 #21
0
def SM_train_val_kfoldCV(summaryMeasure, data, label, batch_size, num_epochs,
                         n_split):
    """Train the CNN with k-fold cross-validation for one summary measure.

    For each of the `n_split` folds: trains a fresh CNN for `num_epochs`
    epochs, evaluates on the fold's validation split after every epoch,
    and saves the model state whenever validation accuracy improves, to
    '<summaryMeasure>_BEST_model_CV<k+1>.pt' under MODEL_STORE_PATH.

    NOTE(review): relies on module-level globals defined elsewhere in this
    file: train_split, val_split, device, MODEL_STORE_PATH, CNN_model,
    load_data.
    """
    # CROSSVALIDATION LOOP
    for k in np.arange(n_split):
        train_split_k = train_split[k]
        val_split_k = val_split[k]

        # reset the model before each new CV fold
        model = CNN_model()
        model.to(device)
        # Loss and optimizer
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.SGD(model.parameters(),
                              momentum=0.9,
                              lr=0.001,
                              weight_decay=1e-3)

        # load data for this fold
        trainData = load_data(split='train',
                              train_split=train_split_k,
                              val_split=val_split_k,
                              summaryMeasure=summaryMeasure,
                              data=data,
                              label=label)
        valData = load_data(split='val',
                            train_split=train_split_k,
                            val_split=val_split_k,
                            summaryMeasure=summaryMeasure,
                            data=data,
                            label=label)

        train_loader = DataLoader(dataset=trainData,
                                  batch_size=batch_size,
                                  shuffle=True)
        val_loader = DataLoader(dataset=valData,
                                batch_size=batch_size,
                                shuffle=True)

        # START TRAINING FOR THIS CV

        # per-epoch history
        epoch_loss_train = []
        epoch_acc_train = []

        epoch_loss_val = []
        epoch_balacc_val = []
        epoch_F1_Score = []
        best_acc = None

        # TRAIN THE MODEL
        for epoch in range(num_epochs):
            total_ep = 0.0
            correct_ep = 0.0
            loss_ep = 0.0

            model.train()

            for i, (images, labels) in enumerate(train_loader):

                if device.type == "cuda":
                    images = images.to(device)
                    labels = labels.to(device)
                labels = labels.reshape(len(labels),
                                        1).type(torch.cuda.FloatTensor)

                # Run the forward pass
                outputs = model(images)
                loss = criterion(outputs, labels)

                # Backprop and perform optimisation
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Track the accuracy
                total = labels.size(0)
                # logits > 0 correspond to sigmoid probability > 0.5
                predicted = outputs.data > 0.0
                correct = (predicted == labels).sum().item()

                total_ep += total
                correct_ep += correct
                loss_ep += (loss.item() * len(labels))

            epoch_acc_train.append((correct_ep / total_ep))
            epoch_loss_train.append(loss_ep / total_ep)

            # Test the model on validation data
            model.eval()

            total_ep = 0.0
            correct_ep = 0.0
            loss_ep = 0.0

            # accumulators for metric computation over the validation set
            F1_labels = []
            F1_pred = []

            for i, (images, labels) in enumerate(val_loader):

                if device.type == "cuda":
                    images = images.to(device)
                    labels = labels.to(device)
                labels = labels.reshape(len(labels),
                                        1).type(torch.cuda.FloatTensor)

                outputs_v = model(
                    images)  # the forward uses the entire batch together
                predicted = outputs_v.data > 0.0

                # collect true labels and predictions for metric calculation
                if i == 0:
                    F1_labels = labels.int().cpu().numpy()
                    F1_pred = predicted.int().cpu().numpy()
                else:
                    F1_labels = np.concatenate(
                        (F1_labels, labels.int().cpu().numpy()))
                    F1_pred = np.concatenate(
                        (F1_pred, predicted.int().cpu().numpy()))

                loss_v = criterion(outputs_v, labels)

                total_v = labels.size(0)
                correct_v = (predicted == labels).sum().item()

                total_ep += total_v
                correct_ep += correct_v
                loss_ep += (loss_v.item() * len(labels))

            # calculate validation metrics for this epoch
            current_ep_loss = loss_ep / total_ep

            current_ep_acc = (correct_ep / total_ep)
            acc = accuracy_score(F1_labels, F1_pred)
            balacc = balanced_accuracy_score(F1_labels, F1_pred)

            F1_Score = f1_score(F1_labels, F1_pred, average='weighted')
            tn, fp, fn, tp = confusion_matrix(F1_labels, F1_pred).ravel()

            # checkpoint the model whenever validation accuracy improves
            if not best_acc or best_acc < current_ep_acc:
                best_acc = current_ep_acc
                torch.save(
                    model.state_dict(),
                    MODEL_STORE_PATH + '/' + summaryMeasure + '_BEST_model' +
                    '_CV' + str(k + 1) + '.pt')

            # collect metrics across all epochs
            epoch_loss_val.append((loss_ep / total_ep))
            epoch_balacc_val.append(balacc)

        # BUG FIX: the original printed `acc` (the *last* epoch's accuracy)
        # although the message claims the best one; report best_acc instead.
        print('{}, CV_{}, best accuracy = {}'.format(summaryMeasure, k + 1,
                                                     best_acc))
コード例 #22
0
def eval_whitebox_classifier(R,
                             g,
                             EX,
                             StdX,
                             NormV,
                             x0,
                             label_x0,
                             bb_classifier,
                             wb_name,
                             precision_recalls=False):
    """Evaluate a linear white-box surrogate `g` against a black-box classifier.

    Scores fidelity/prescriptivity of the surrogate in the neighborhoods of
    the instance x0 and of its projection x1 onto the surrogate's decision
    boundary, storing all metrics as attributes of `R` prefixed with
    `wb_name`.

    Parameters
    ----------
    R : result record object; metrics are written onto its attributes
        (R.prob_x0 is assumed to be already set by the caller).
    g : fitted linear model exposing .coef_, .intercept_ and .predict.
    EX, StdX : feature means / standard deviations used to scale instances
        into the surrogate's model space.
    NormV : (T, d) array of neighborhood perturbations in scaled space.
    x0, label_x0 : the explained instance (original space) and its class (0/1).
    bb_classifier : callable mapping an array of instances to rows of class
        probabilities [p(class 0), p(class 1)].
    wb_name : prefix used when renaming the stored metric attributes.
    precision_recalls : if True, also store precision/recall at the boundary.

    Returns
    -------
    (x1, sx1) : the boundary point in original and scaled space.
    """
    # scale x0 into the model space (guarding against zero-std features)
    sx0 = np.divide((x0 - EX), StdX, where=np.logical_not(np.isclose(StdX, 0)))
    # compute the p-score of sx0
    sx0_w = np.dot(sx0, g.coef_)
    p_score = sx0_w + g.intercept_

    if linalg.norm(g.coef_) < 1.0e-5 or (
            abs(sx0_w) < 1.0e-5
    ):  # degenerate surrogate: no usable direction toward the boundary
        N_sx0_w = np.zeros(len(x0))
        R.wb_plane_dist_x0 = 0.0
    else:
        N_sx0_w = np.multiply(sx0, (0.5 - p_score) / sx0_w)
        R.wb_plane_dist_x0 = p_score / linalg.norm(g.coef_)

    # get the boundary point x1 (scaled and original space)
    sx1 = sx0 + N_sx0_w
    x1 = (sx1 * StdX) + EX

    prob_x1 = bb_classifier([x1])[0]
    R.wb_class_x1 = 1 if prob_x1[1] > prob_x1[0] else 0
    R.wb_prob_x1_F = prob_x1[0]
    R.wb_prob_x1_T = prob_x1[1]
    R.wb_prob_x1_c0 = prob_x1[label_x0]

    # discrepancy between surrogate and black-box at x0 and at the boundary
    R.wb_local_discr = g.predict([sx0])[0] - R.prob_x0
    R.wb_boundary_discr = g.predict([sx1])[0] - prob_x1[0]

    # build the (scaled) neighborhood of x0
    SNX0 = np.tile(sx0,
                   (NormV.shape[0], 1))  # repeat T times the scaled x0 row
    SNX0 = SNX0 + NormV
    NX0 = (SNX0 * StdX) + EX

    # build the (scaled) neighborhood of x1
    SNX1 = np.tile(sx1,
                   (NormV.shape[0], 1))  # repeat T times the scaled x1 row
    SNX1 = SNX1 + NormV
    NX1 = (SNX1 * StdX) + EX

    # predict the instance classes using the Black-Box and the White-Box classifiers
    BBY0, WBY0 = bb_classifier(NX0)[:, 0], g.predict(SNX0)
    BBY1, WBY1 = bb_classifier(NX1)[:, 0], g.predict(SNX1)
    if label_x0 == 1:
        WBY0, WBY1 = 1 - WBY0, 1 - WBY1
    BBCLS0, WBCLS0 = BBY0 > 0.5, WBY0 > 0.5
    BBCLS1, WBCLS1 = BBY1 > 0.5, WBY1 > 0.5

    R.wb_x1_change_score = np.mean(BBCLS1 != label_x0)
    R.wb_avg_bb_nx0 = np.mean(BBY0)
    R.wb_avg_bb_nx1 = np.mean(BBY1)
    R.wb_ratio_x0 = np.mean(BBCLS0)
    R.wb_ratio_x1 = np.mean(BBCLS1)
    R.wb_ratio_wb_x0 = np.mean(WBCLS0)
    R.wb_ratio_wb_x1 = np.mean(WBCLS1)

    try:
        R.wb_fidelity = accuracy_score(BBCLS0, WBCLS0)
        R.wb_prescriptivity = accuracy_score(BBCLS1, WBCLS1)
        R.wb_bal_fidelity = balanced_accuracy_score(BBCLS0, WBCLS0)
        R.wb_bal_prescriptivity = balanced_accuracy_score(BBCLS1, WBCLS1)

        R.wb_fidelity_f1 = f1_score(BBCLS0, WBCLS0)
        R.wb_prescriptivity_f1 = f1_score(BBCLS1, WBCLS1)

        if precision_recalls:
            R.wb_precision_x1 = precision_score(BBCLS1, WBCLS1)
            R.wb_recall_x1 = recall_score(BBCLS1, WBCLS1)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; degenerate metric inputs raise ordinary
        # exceptions, so Exception is the right scope. Zero out the metrics.
        R.wb_bal_fidelity, R.wb_bal_prescriptivity = 0, 0
        R.wb_fidelity, R.wb_prescriptivity = 0, 0
        R.wb_fidelity_f1, R.wb_prescriptivity_f1 = 0, 0
        if precision_recalls:
            # keep the record complete even on failure
            R.wb_precision_x1, R.wb_recall_x1 = 0, 0

    # rename R keys (wb_* -> wb_name_*)
    for key in copy.copy(list(R.__dict__.keys())):
        if key.startswith("wb_"):
            R.__dict__[wb_name + key[2:]] = R.__dict__.pop(key)

    return (x1, sx1)
コード例 #23
0
ファイル: main.py プロジェクト: ehorel/Explainable-Clusters
                     amsgrad=False)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['categorical_accuracy'])

    model.fit(x=X_train,
              y=Y_train_trans,
              batch_size=batch_size,
              epochs=nr_epochs,
              validation_data=(X_val, Y_val_trans),
              callbacks=[early_stop, reduce_lr],
              verbose=1)

    Y_val_predicted = np.argmax(model.predict(X_val), axis=1)
    nn_val_acc = accuracy_score(Y_val, Y_val_predicted)
    nn_val_bal_acc = balanced_accuracy_score(Y_val, Y_val_predicted)

    print('Neural network accuracy on val set: {0} \n'.format(
        np.round(nn_val_acc, 2)))
    print('Neural network bal acc on val set: {0} \n'.format(
        np.round(nn_val_bal_acc, 2)))

    Y_test_predicted = np.argmax(model.predict(X_test), axis=1)
    nn_test_acc = accuracy_score(Y_test, Y_test_predicted)
    nn_test_bal_acc = balanced_accuracy_score(Y_test, Y_test_predicted)

    print('Neural network acc on test set: {0} \n'.format(
        np.round(nn_test_acc, 2)))
    print('Neural network bal acc on test set: {0} \n'.format(
        np.round(nn_test_bal_acc, 2)))
コード例 #24
0
def train(args, io):
    """Train a point-cloud classifier on ModelNet40.

    Builds the architecture selected by ``args.model``, optionally restores
    a checkpoint from ``args.model_path``, then alternates one training and
    one evaluation pass per epoch, saving a checkpoint after every epoch
    (flagged as best whenever the test accuracy improves).

    Parameters
    ----------
    args : experiment configuration namespace (argparse-style).
    io : logger object exposing ``cprint`` for persistent experiment logs.
    """
    train_loader = DataLoader(ModelNet40(partition='train', num_points=args.num_points), num_workers=8,
                              batch_size=args.batch_size, shuffle=True, drop_last=False)
    test_loader = DataLoader(ModelNet40(partition='test', num_points=args.num_points), num_workers=8,
                             batch_size=args.test_batch_size, shuffle=False, drop_last=False)

    device = torch.device("cuda" if args.cuda else "cpu")

    # instantiate the requested architecture
    if args.model == 'pointnet':
        model = PointNet(args).to(device)
    elif args.model == 'dgcnn':
        model = DGCNN(args).to(device)
    elif args.model == 'TransformerBaseline':
        model = DGCNN_Transformer(args).to(device)
    elif args.model == 'TemporalTransformer':
        model = DGCNN_TemporalTransformer(args).to(device)
    elif args.model == 'TemporalTransformer_v2':
        model = DGCNN_TemporalTransformer_v2(args).to(device)
    elif args.model == 'TemporalTransformer_v3':
        model = DGCNN_TemporalTransformer_v3(args).to(device)
    elif args.model == 'pi':
        model = pi_DGCNN(args).to(device)
    elif args.model == 'pi2':
        model = pi_DGCNN_v2(args).to(device)
    elif args.model == 'pipoint':
        model = pipoint_DGCNN(args).to(device)
    else:
        raise Exception("Not implemented")
    print(str(model))

    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    # get the number of model parameters
    print('Number of model parameters: {}'.format(sum([p.data.nelement() for p in model.parameters()])))

    if args.use_sgd:
        print("Use SGD")
        optimizer = optim.SGD(model.parameters(), lr=args.lr*100, momentum=args.momentum, weight_decay=1e-4)
    else:
        print("Use Adam")
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    scheduler = CosineAnnealingLR(optimizer, args.epochs, eta_min=args.lr)

    criterion = cal_loss

    best_test_acc = 0

    # optionally resume from a checkpoint (full dict or bare state_dict)
    if args.model_path:
        if os.path.isfile(args.model_path):
            print("=> loading checkpoint '{}'".format(args.model_path))
            checkpoint = torch.load(args.model_path)
            print(checkpoint)

            if 'epoch' in checkpoint:
                args.start_epoch = checkpoint['epoch']

            if 'state_dict' in checkpoint:
                model.load_state_dict(checkpoint['state_dict'])
            else:
                model.load_state_dict(checkpoint, strict=False)

            if 'optimizer' in checkpoint:
                optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.model_path, args.start_epoch))
        else:
            # BUG FIX: the original formatted args.resume here, but the
            # option actually probed above is args.model_path (args.resume
            # may not even exist on this namespace).
            print("=> no checkpoint found at '{}'".format(args.model_path))

    end = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        train_pred = []
        train_true = []

        batch_time = AverageMeter()
        data_time = AverageMeter()

        end = time.time()
        for i, (data, label) in enumerate(train_loader):
            data_time.update(time.time()-end)

            data, label = data.to(device), label.to(device).squeeze()

            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            optimizer.zero_grad()

            # some architectures return auxiliary outputs alongside logits
            if args.model in ["pi", "pipoint", "pi2"]:
                logits, atts = model(data)
            elif args.model in ["TransformerBaseline", "TemporalTransformer", "TemporalTransformer_v2", "TemporalTransformer_v3"]:
                logits = model(data)
            else:
                logits, degree = model(data)

            loss = criterion(logits, label, smoothing=True)
            loss.backward()
            optimizer.step()

            preds = logits.max(dim=1)[1]
            count += batch_size
            train_loss += loss.item() * batch_size
            train_true.append(label.cpu().numpy())
            train_pred.append(preds.detach().cpu().numpy())

            batch_time.update(time.time()-end)
            end = time.time()

            if i % 10 == 0:
                print_str = 'Train {}, loss {}, Time {batch_time.val:.3f} ({batch_time.avg:.3f}), Data {data_time.val:.3f} ({data_time.avg:.3f})'.format(epoch, train_loss*1.0/count, batch_time=batch_time, data_time=data_time)
                print(print_str)

        # BUG FIX: scheduler.step() originally ran at the *start* of each
        # epoch, before any optimizer.step(); PyTorch >= 1.1 requires the
        # scheduler to be stepped after the epoch's optimizer updates.
        scheduler.step()

        train_true = np.concatenate(train_true)
        train_pred = np.concatenate(train_pred)

        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f' % (epoch,
                                                                                 train_loss*1.0/count,
                                                                                 metrics.accuracy_score(
                                                                                     train_true, train_pred),
                                                                                 metrics.balanced_accuracy_score(
                                                                                     train_true, train_pred))
        io.cprint(outstr)

        ####################
        # Test
        ####################
        with torch.no_grad():
            test_loss = 0.0
            count = 0.0
            model.eval()
            test_pred = []
            test_true = []

            batch_time = AverageMeter()
            losses = AverageMeter()

            end = time.time()
            for j, (data, label) in enumerate(test_loader):
                data, label = data.to(device), label.to(device).squeeze()
                data = data.permute(0, 2, 1)
                batch_size = data.size()[0]

                if args.model in ["pi", "pipoint", "pi2"]:
                    logits, atts = model(data)
                elif args.model in ["TransformerBaseline", "TemporalTransformer", "TemporalTransformer_v2", "TemporalTransformer_v3"]:
                    logits = model(data)
                else:
                    logits, degree = model(data)

                loss = criterion(logits, label, smoothing=True)
                preds = logits.max(dim=1)[1]
                count += batch_size
                test_loss += loss.item() * batch_size
                test_true.append(label.cpu().numpy())
                test_pred.append(preds.detach().cpu().numpy())

                batch_time.update(time.time() - end)
                end = time.time()

                if j % 10 == 0:
                    print('Test {}, Loss {}, Time {batch_time.val:.3f} ({batch_time.avg:.3f})'.format(j, test_loss*1.0/count, batch_time=batch_time))

            test_true = np.concatenate(test_true)
            test_pred = np.concatenate(test_pred)
            test_acc = metrics.accuracy_score(test_true, test_pred)
            # balanced accuracy == average per-class accuracy
            avg_per_class_acc = metrics.balanced_accuracy_score(test_true, test_pred)

            outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f' % (epoch,
                                                                              test_loss*1.0/count,
                                                                              test_acc,
                                                                              avg_per_class_acc)
            io.cprint(outstr)
            # report mean attention weights for attention-based models
            if args.model in ["pi", "pipoint", "pi2"]:
                for j in range(4):
                    io.cprint('Att {} : {}'.format(j, atts[j].mean().item()))

            is_best = test_acc >= best_test_acc

            if is_best:
                best_test_acc = test_acc

            # checkpoint every epoch; mark as best on improvement
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer' : optimizer.state_dict(),
                }, is_best, args.exp_name)
# STAGE 03: PREDICTION
# NOTE(review): script fragment — `classifier_lgbm`, `X_maio_19` and
# `Y_maio_19` are defined earlier in the file (presumably a fitted LightGBM
# classifier and the May/2019 features/labels; confirm against the caller).

Y_maio_19_pred_lgbm = classifier_lgbm.predict(X_maio_19)

# Metrics analysis

from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score

# Accuracy Score

mtrc_accuracy_score_lgbm = accuracy_score(Y_maio_19, Y_maio_19_pred_lgbm)
print('Accuracy Score : ' + str(mtrc_accuracy_score_lgbm))

# Balanced Accuracy (mean of per-class recalls; robust to class imbalance)

mtrc_balanced_accuracy_score_lgbm = balanced_accuracy_score(Y_maio_19, Y_maio_19_pred_lgbm)
print('Balanced Accuracy Score : ' + str(mtrc_balanced_accuracy_score_lgbm))

# Precision Score
# Number of times a class was predicted correctly divided by the number of
# times the class was predicted.

mtrc_precision_score_lgbm = precision_score(Y_maio_19, Y_maio_19_pred_lgbm)
print('Precision Score : ' + str(mtrc_precision_score_lgbm))

# Recall Score
# Number of times a class was predicted correctly (TP) divided by the number
# of times the class appears in the test data (TP + FN).

mtrc_recall_score_lgbm = recall_score(Y_maio_19, Y_maio_19_pred_lgbm)
print('Recall Score : ' + str(mtrc_recall_score_lgbm))

# F1 Score
コード例 #26
0
def train_CNN(X, epochs, lr):
    """Build the training graph for ``CNN_model`` and train it in a TF1 session.

    Trains with class-balanced mini-batches, logs TensorBoard summaries per
    epoch, prints validation metrics each epoch and test metrics at the end,
    then checkpoints the model to ``./models/dnn``.

    Parameters
    ----------
    X : tf.placeholder
        Input placeholder fed with feature batches.
    epochs : int
        Number of training epochs.
    lr : float
        Learning rate for the Adam optimizer.

    NOTE(review): relies on module-level names (``y``, ``keep_prob``,
    ``batch_size``, ``X_train``/``y_train``, ``X_val``/``y_val``,
    ``X_test``/``y_test``, ``next_batch_balanced`` and the sklearn metric
    functions) — confirm they are defined before calling.
    """
    pred = CNN_model(X)
    pred_classes = tf.argmax(pred, axis=1)
    labels = tf.argmax(y, axis=1)

    # Error function and Optimizer
    with tf.name_scope('Error'):
        error = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y))
    tf.summary.scalar('cross_entropy', error)

    with tf.name_scope('Optimizer'):
        optimizer = tf.train.AdamOptimizer(lr).minimize(error)

    # Tensorflow Session
    with tf.Session() as sess:
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter('./Visualization', sess.graph)
        sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            epoch_loss = 0
            start = 0
            # Training step: iterate enough balanced batches to cover the
            # training set once per epoch.
            while start < len(y_train):
                end = start + batch_size

                # Balanced sampling instead of a plain slice of the epoch.
                X_batch, y_batch = next_batch_balanced(batch_size, X_train, y_train)
                start += batch_size

                # Running training (keep_prob 0.6 enables dropout).
                _, cost = sess.run([optimizer, error], feed_dict={X: X_batch, y: y_batch, keep_prob: 0.6})
                epoch_loss += cost

            s = sess.run(merged, feed_dict={X: X_val, y: y_val, keep_prob: 1})
            writer.add_summary(s, epoch + 1)
            # BUG FIX: the original printed the undefined global ``n_epochs``;
            # use the ``epochs`` parameter instead.
            print('Training:\nEpoch', epoch + 1, 'completed out of', epochs, 'loss:', epoch_loss)

            y_true = sess.run(labels, feed_dict={X: X_val, y: y_val, keep_prob: 1})
            y_pred = sess.run(pred_classes, feed_dict={X: X_val, y: y_val, keep_prob: 1})
            print('Validation')
            print("Balanced Accuracy: {:.4f}".format(balanced_accuracy_score(y_true, y_pred)))
            print("Precision (positive): {:.4f}".format(precision_score(y_true, y_pred, pos_label=1)))
            print("Precision (negative): {:.4f}".format(precision_score(y_true, y_pred, pos_label=0)))
            print("Recall (positive): {:.4f}".format(recall_score(y_true, y_pred, pos_label=1)))
            print("Recall (negative): {:.4f}".format(recall_score(y_true, y_pred, pos_label=0)))

        # Testing (dropout disabled via keep_prob=1)
        y_true = sess.run(labels, feed_dict={X: X_test, y: y_test, keep_prob: 1})
        y_pred = sess.run(pred_classes, feed_dict={X: X_test, y: y_test, keep_prob: 1})

        print('Testing')
        print("Balanced Accuracy: {:.4f}".format(balanced_accuracy_score(y_true, y_pred)))
        print("Precision (positive): {:.4f}".format(precision_score(y_true, y_pred, pos_label=1)))
        print("Precision (negative): {:.4f}".format(precision_score(y_true, y_pred, pos_label=0)))
        print("Recall (positive): {:.4f}".format(recall_score(y_true, y_pred, pos_label=1)))
        print("Recall (negative): {:.4f}".format(recall_score(y_true, y_pred, pos_label=0)))

        saver = tf.train.Saver()
        saver.save(sess, './models/dnn', global_step=1)
        writer.close()
コード例 #27
0
    def run(self):
        """Analyze the trained predictor on the validation/test splits:
        per-target sanity checks, column importances, probabilistic
        validation and inductive conformal prediction (ICP) calibration."""
        np.seterr(divide='warn', invalid='warn')
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['columns_to_ignore']
        ]

        # Make predictions on the validation dataset normally and with various columns missing
        normal_predictions = self.transaction.model_backend.predict('validate')

        normal_predictions_test = self.transaction.model_backend.predict(
            'test')
        normal_accuracy = evaluate_accuracy(
            normal_predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

        # Sanity-check each target: mark it as failed when the model does no
        # better than balanced guessing (categorical) or has negative R^2
        # (numeric).
        for col in output_columns:
            if self.transaction.lmd['tss']['is_timeseries']:
                # Time series: score only the rows flagged for prediction.
                reals = list(self.transaction.input_data.validation_df[
                    self.transaction.input_data.
                    validation_df['make_predictions'] == True][col])
            else:
                reals = self.transaction.input_data.validation_df[col]
            preds = normal_predictions[col]

            fails = False

            data_type = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_subtype']

            if data_type == DATA_TYPES.CATEGORICAL:
                if data_subtype == DATA_SUBTYPES.TAGS:
                    # TAGS are multi-label: compare via the mixer's encoder.
                    encoder = self.transaction.model_backend.predictor._mixer.encoders[
                        col]
                    if balanced_accuracy_score(
                            encoder.encode(reals).argmax(axis=1),
                            encoder.encode(preds).argmax(
                                axis=1)) <= self.transaction.lmd['stats_v2'][
                                    col]['balanced_guess_probability']:
                        fails = True
                else:
                    if balanced_accuracy_score(
                            reals, preds) <= self.transaction.lmd['stats_v2'][
                                col]['balanced_guess_probability']:
                        fails = True
            elif data_type == DATA_TYPES.NUMERIC:
                if r2_score(reals, preds) < 0:
                    fails = True
            else:
                # Other data types are not sanity-checked here.
                pass

            if fails:
                if not self.transaction.lmd['force_predict']:

                    # Disable prediction on this session (raise on use)
                    # instead of aborting the whole analysis immediately.
                    def predict_wrapper(*args, **kwargs):
                        raise Exception('Failed to train model')

                    self.session.predict = predict_wrapper
                log.error('Failed to train model to predict {}'.format(col))

        empty_input_predictions = {}
        empty_input_accuracy = {}
        empty_input_predictions_test = {}

        # Columns we may ablate to estimate importance: skip file paths and,
        # for time series, the order-by columns.
        ignorable_input_columns = [
            x for x in input_columns if self.transaction.lmd['stats_v2'][x]
            ['typing']['data_type'] != DATA_TYPES.FILE_PATH and (
                not self.transaction.lmd['tss']['is_timeseries']
                or x not in self.transaction.lmd['tss']['order_by'])
        ]

        # Re-predict with each ignorable column removed, on both splits.
        for col in ignorable_input_columns:
            empty_input_predictions[
                col] = self.transaction.model_backend.predict(
                    'validate', ignore_columns=[col])
            empty_input_predictions_test[
                col] = self.transaction.model_backend.predict(
                    'test', ignore_columns=[col])
            empty_input_accuracy[col] = evaluate_accuracy(
                empty_input_predictions[col],
                self.transaction.input_data.validation_df,
                self.transaction.lmd['stats_v2'],
                output_columns,
                backend=self.transaction.model_backend)

        # Get some information about the importance of each column
        self.transaction.lmd['column_importances'] = {}
        for col in ignorable_input_columns:
            # Importance = accuracy drop when the column is ablated.
            accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
            # normalize from 0 to 10
            self.transaction.lmd['column_importances'][col] = 10 * max(
                0, accuracy_increase)

        # Run Probabilistic Validator
        overall_accuracy_arr = []
        self.transaction.lmd['accuracy_histogram'] = {}
        self.transaction.lmd['confusion_matrices'] = {}
        self.transaction.lmd['accuracy_samples'] = {}
        self.transaction.hmd['probabilistic_validators'] = {}

        self.transaction.lmd['train_data_accuracy'] = {}
        self.transaction.lmd['test_data_accuracy'] = {}
        self.transaction.lmd['valid_data_accuracy'] = {}

        # Per-target accuracy on each of the three splits.
        for col in output_columns:

            # Training data accuracy
            predictions = self.transaction.model_backend.predict(
                'predict_on_train_data',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['train_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.train_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Testing data accuracy
            predictions = self.transaction.model_backend.predict(
                'test',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['test_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.test_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Validation data accuracy
            predictions = self.transaction.model_backend.predict(
                'validate',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['valid_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.validation_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

        # Fit one ProbabilisticValidator per target on the test-set
        # predictions (the normal run plus every column-ablated run).
        for col in output_columns:
            pval = ProbabilisticValidator(
                col_stats=self.transaction.lmd['stats_v2'][col],
                col_name=col,
                input_columns=input_columns)
            predictions_arr = [normal_predictions_test] + [
                x for x in empty_input_predictions_test.values()
            ]

            pval.fit(self.transaction.input_data.test_df, predictions_arr,
                     [[ignored_column]
                      for ignored_column in empty_input_predictions_test])
            overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats(
            )
            overall_accuracy_arr.append(overall_accuracy)

            self.transaction.lmd['accuracy_histogram'][
                col] = accuracy_histogram
            self.transaction.lmd['confusion_matrices'][col] = cm
            self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
            # Serialized so the validator survives transaction persistence.
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                pval)

        # Mean of the per-target overall accuracies.
        self.transaction.lmd['validation_set_accuracy'] = sum(
            overall_accuracy_arr) / len(overall_accuracy_arr)

        # conformal prediction confidence estimation
        self.transaction.lmd['stats_v2']['train_std_dev'] = {}
        self.transaction.hmd['label_encoders'] = {}
        self.transaction.hmd['icp'] = {'active': False}

        for target in output_columns:
            data_type = self.transaction.lmd['stats_v2'][target]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][target]['typing'][
                'data_subtype']
            is_classification = data_type == DATA_TYPES.CATEGORICAL

            fit_params = {
                'target': target,
                'all_columns': self.transaction.lmd['columns'],
                'columns_to_ignore': []
            }
            fit_params['columns_to_ignore'].extend(
                self.transaction.lmd['columns_to_ignore'])
            # Other targets are also ignored when fitting this target's ICP.
            fit_params['columns_to_ignore'].extend(
                [col for col in output_columns if col != target])

            if is_classification:
                if data_subtype != DATA_SUBTYPES.TAGS:
                    # One-hot encoder fitted over the classes seen in any of
                    # the three splits.
                    all_targets = [
                        elt[1][target].values for elt in inspect.getmembers(
                            self.transaction.input_data)
                        if elt[0] in {'test_df', 'train_df', 'validation_df'}
                    ]
                    all_classes = np.unique(
                        np.concatenate([np.unique(arr)
                                        for arr in all_targets]))

                    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
                    enc.fit(all_classes.reshape(-1, 1))
                    fit_params['one_hot_enc'] = enc
                    self.transaction.hmd['label_encoders'][target] = enc
                else:
                    fit_params['one_hot_enc'] = None
                    self.transaction.hmd['label_encoders'][target] = None

                adapter = ConformalClassifierAdapter
                nc_function = MarginErrFunc(
                )  # better than IPS as we'd need the complete distribution over all classes
                nc_class = ClassifierNc
                icp_class = IcpClassifier

            else:
                adapter = ConformalRegressorAdapter
                nc_function = AbsErrorErrFunc()
                nc_class = RegressorNc
                icp_class = IcpRegressor

            # ICP applies only to numeric targets or non-TAGS classification,
            # and never to time series.
            if (data_type == DATA_TYPES.NUMERIC or
                (is_classification and data_subtype != DATA_SUBTYPES.TAGS)
                ) and not self.transaction.lmd['tss']['is_timeseries']:
                model = adapter(self.transaction.model_backend.predictor,
                                fit_params=fit_params)
                nc = nc_class(model, nc_function)

                X = deepcopy(self.transaction.input_data.train_df)
                y = X.pop(target)

                if is_classification:
                    self.transaction.hmd['icp'][target] = icp_class(
                        nc, smoothing=False)
                else:
                    self.transaction.hmd['icp'][target] = icp_class(nc)
                    # Std-dev of the target is kept to scale regression
                    # confidence intervals later.
                    self.transaction.lmd['stats_v2']['train_std_dev'][
                        target] = self.transaction.input_data.train_df[
                            target].std()

                X = clean_df(X, self.transaction.lmd['stats_v2'],
                             output_columns)
                self.transaction.hmd['icp'][target].fit(X.values, y.values)
                self.transaction.hmd['icp']['active'] = True

                # calibrate conformal estimator on test set
                X = deepcopy(self.transaction.input_data.validation_df)
                y = X.pop(target).values

                if is_classification:
                    # Map string labels to integer indices using the encoder
                    # fitted above.
                    if isinstance(enc.categories_[0][0], str):
                        cats = enc.categories_[0].tolist()
                        y = np.array([cats.index(i) for i in y])
                    y = y.astype(int)

                X = clean_df(X, self.transaction.lmd['stats_v2'],
                             output_columns)
                self.transaction.hmd['icp'][target].calibrate(X.values, y)
コード例 #28
0
def dlModel_classifier(X_train,
                       y_train,
                       X_test,
                       y_test,
                       EPOCHS=100,
                       batch_size=5,
                       loss='binary_crossentropy',
                       metrics=['accuracy'],
                       optimizer=None):
    """Train the baseline Keras binary classifier and report a metric battery.

    Fits ``DLmodel_baseline`` on the training data with a 20% validation
    split, plots the accuracy/loss learning curves and the ROC curve, prints
    sklearn metrics on the test set and appends them to the project log file.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and binary labels.
    X_test, y_test : array-like
        Held-out features and binary labels.
    EPOCHS : int
        Number of training epochs.
    batch_size : int
        Mini-batch size.
    loss, metrics, optimizer :
        Passed through to ``DLmodel_baseline``; ``optimizer`` defaults to a
        fresh ``SGD(lr=0.01, momentum=0.9)`` per call.

    Returns
    -------
    tuple
        ``(estimator, y_pred)`` — the fitted model and test-set predictions.
    """
    start_time = time.time()
    # BUG FIX: the original default ``optimizer=SGD(...)`` was constructed
    # once at import time and shared (with its internal state) across every
    # call; build a fresh optimizer lazily instead.
    if optimizer is None:
        optimizer = SGD(lr=0.01, momentum=0.9)
    estimator = DLmodel_baseline(X_train, y_train, X_test, y_test, loss,
                                 metrics, optimizer)
    # Validation:
    # BUG FIX: honour the EPOCHS / batch_size parameters — the original
    # hard-coded epochs=180, batch_size=10 and silently ignored them.
    history = estimator.fit(X_train,
                            y_train,
                            validation_split=0.20,
                            epochs=EPOCHS,
                            batch_size=batch_size,
                            verbose=0)

    # list all data in history
    print(history.history.keys())

    y_pred = estimator.predict_classes(X_test)

    # Keras >= 2.3 logs 'accuracy' instead of 'acc'; support both.
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'

    # summarizing historical accuracy
    plt.plot(history.history[acc_key])
    plt.plot(history.history['val_' + acc_key])
    plt.title('Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    time_end = time.time() - start_time

    # Scores
    acc = accuracy_score(y_pred=y_pred, y_true=y_test) * 100
    print('accuracy ' + str(acc))
    print('balanced_accuracy_score ' +
          str(balanced_accuracy_score(y_pred=y_pred, y_true=y_test) * 100))

    #f1_score
    f1 = f1_score(y_pred=y_pred, y_true=y_test, average='macro') * 100
    print('f1_score ' + str(f1))
    #precision_score
    prec = precision_score(y_pred=y_pred, y_true=y_test,
                           average='weighted') * 100
    print('precision_score ' + str(prec))
    #log_loss
    print('Log loss ' + str(log_loss(y_pred=y_pred, y_true=y_test)))
    #recall_score
    recall = recall_score(y_pred=y_pred, y_true=y_test) * 100
    print('recall_score ' + str(recall))
    #roc_curve from predicted probabilities
    y_pred_proba = estimator.predict_proba(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        y_true=y_test, y_score=y_pred_proba)

    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive_rate, true_positive_rate, label="Keras")
    plt.xlabel('false_positive_rate')
    plt.ylabel('true_positive_rate')
    plt.title('ROC curve of ' + "Keras")
    plt.show()

    #roc_auc_score
    roc = roc_auc_score(y_test, y_pred_proba) * 100
    print('roc_auc_score ' + str(roc))

    #confusion_matrix
    print('confusion_matrix ' + str(confusion_matrix(y_test, y_pred)))

    class_report = classification_report(y_test, y_pred)
    print("classification_report" + str(class_report))

    # FIX: use a context manager so the log file is closed even on error.
    with open(mlresult_dir + str(project_identifier) + "_log_dlModels.csv",
              "a") as f:
        f.write("\n Time taken to execute the Keras is " + str(time_end) + "\n" +
                "Dated on" + str(datetime.datetime.now()) + "\n" +
                ' Accuracy Score' + str(acc) + "\n" + 'f1_score ' + str(f1) +
                "\n" + 'precision_score ' + str(prec) + "\n" + 'recall_score ' +
                str(recall) + "\n" + 'roc_auc_score ' + str(roc) + "\n" +
                'classification_report ' + str(class_report))
    print("\n Time taken to execute the Keras is " + str(time_end))

    print("\n" + "Dated on" + str(datetime.datetime.now()) + "\n")
    # return model object

    return estimator, y_pred
コード例 #29
0
# Evaluate the trained white-wine quality model on the held-out test split.
import joblib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

x_test = joblib.load('data/x_test_w.joblib')
y_test = joblib.load('data/y_test_w.joblib')

loaded = joblib.load('data/white-final-trained-model.joblib')
pred = loaded.predict(x_test)

# BUG FIX: sklearn metrics take (y_true, y_pred) in that order — the original
# passed them swapped, which changes balanced accuracy (per-class recall is
# computed over the wrong axis) and swaps per-class precision/recall in the
# average=None F1 scores. Plain accuracy and micro-F1 happen to be symmetric.
print('test acc= ', accuracy_score(y_test, pred))
print('bal test acc= ', balanced_accuracy_score(y_test, pred))
print('F1 score= ', f1_score(y_test, pred, average='micro'))
print('F1 score= ', f1_score(y_test, pred, average=None))

classNames = ['poor', 'average', 'excellent']
cm = confusion_matrix(y_test, pred)
# BUG FIX: the original opened a second, empty figure (plt.figure(1)) and
# applied the title and class-name ticks to it; label the confusion-matrix
# axes directly and call plt.show() without arguments.
display = ConfusionMatrixDisplay(cm, display_labels=classNames)
display.plot()
display.ax_.set_title('Test white wine')
plt.show()
コード例 #30
0
def infer(valid_queue, model, criterion):
    """Run one evaluation pass over ``valid_queue``.

    Accumulates loss and top-1/top-5 accuracy, prints a sklearn
    classification report, balanced accuracy and the confusion matrix, and
    appends raw logits/names/scores to ``log_score.txt``.

    Parameters
    ----------
    valid_queue : iterable of (input, target, name) batches.
    model : the network under evaluation (moved to eval mode here).
    criterion : loss function applied to (logits, target).

    Returns
    -------
    tuple
        ``(top1.avg, objs.avg)`` — average top-1 accuracy and average loss.

    NOTE(review): relies on module-level ``utils``, ``args`` and CUDA being
    available — confirm in the calling context.
    """
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()
    preds = np.asarray([])
    targets = np.asarray([])
    logits_pred = []
    names = []

    # BUG FIX: the original used the removed ``Variable(..., volatile=True)``
    # API and ``.cuda(async=True)`` — a SyntaxError on Python >= 3.7 since
    # ``async`` became a keyword. ``torch.no_grad()`` + ``non_blocking=True``
    # is the modern equivalent.
    with torch.no_grad():
        for step, (input, target, name) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            logits = model(input)
            loss = criterion(logits, target)

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)

            # Hard class predictions for the sklearn reports below.
            _, predicted = torch.max(logits.data, 1)
            preds = np.concatenate((preds, predicted.cpu().numpy().ravel()))
            targets = np.concatenate((targets, target.data.cpu().numpy().ravel()))
            names.append(name)
            logits_pred.append(logits.data.cpu().numpy())

            if step % args.report_freq == 0:
                logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg,
                             top5.avg)

    print(preds.shape)
    print(targets.shape)
    print('np.unique(targets):', np.unique(targets))
    print('np.unique(preds): ', np.unique(preds))
    from sklearn.metrics import classification_report
    cr = classification_report(targets, preds, output_dict=True)
    a1, a2, a3 = cr['macro avg']['f1-score'], cr['macro avg']['precision'], cr[
        'macro avg']['recall']
    topover = (a1 + a2 + a3) / 3  # NOTE(review): computed but never used
    print(classification_report(targets, preds))
    from sklearn.metrics import balanced_accuracy_score, accuracy_score
    print(balanced_accuracy_score(targets, preds))
    acc = accuracy_score(targets, preds)

    # FIX: context manager guarantees the log file is closed even on error.
    with open("log_score.txt", "a") as log_score:
        log_score.write('logits: ' + str(logits_pred) + "\n")
        log_score.write('names: ' + str(names) + "\n")
        log_score.write('accuracy:' + str(acc) + '\n')
        log_score.write('report: ' + str(cr) + '\n')
    print("SAVED!!")

    from sklearn.metrics import confusion_matrix
    matrix = confusion_matrix(targets, preds)
    # Diagonal over row sums = per-class recall.
    print(matrix.diagonal() / matrix.sum(axis=1))
    print(matrix)

    return top1.avg, objs.avg
コード例 #31
0
ファイル: utils.py プロジェクト: viviwang1008/ETH-ML
 def bmac(self, clf, x_test, y_test):
     """Return the balanced accuracy of ``clf`` on ``(x_test, y_test)``.

     BUG FIX: the original ignored the ``clf`` argument and always scored
     ``self.clf``; score the classifier that was actually passed in.
     """
     from sklearn.metrics import balanced_accuracy_score
     return balanced_accuracy_score(y_test, clf.predict(x_test))
コード例 #32
0
# Classification using a single decision tree
###############################################################################
# We train a decision tree classifier which will be used as a baseline for the
# rest of this example.

###############################################################################
# The results are reported in terms of balanced accuracy and geometric mean,
# which are metrics widely used in the literature to validate models trained
# on imbalanced datasets.

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=np.unique(satimage.target), ax=ax,
                      title='Decision tree')

###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we check whether an ensemble of decision
# trees can alleviate the issue induced by class imbalance. First, we use a
# bagging classifier and its counterpart which internally uses random
# under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
コード例 #33
0
ファイル: custom_scores.py プロジェクト: cl0vis/Auto_ViML
def gini_bal_accuracy(truth, predictions):
    """Balanced accuracy with a graceful fallback to plain accuracy.

    Falls back to ``accuracy_score`` (imported at module level) when
    ``balanced_accuracy_score`` is unavailable (sklearn < 0.20) or the
    metric cannot be computed for these labels.
    """
    try:
        from sklearn.metrics import balanced_accuracy_score
        return balanced_accuracy_score(truth, predictions)
    except Exception:
        # FIX: narrowed from a bare ``except:`` so KeyboardInterrupt and
        # SystemExit are no longer swallowed by the fallback.
        return accuracy_score(truth, predictions)