Esempio n. 1
0
	def plot_cm_roc(self, data_loader):
		"""Compute and plot the confusion matrix and ROC curve for data_loader.

		Runs inference through self.test, post-processes the outputs with
		prepareData/one_hot, writes both plots into the model's data directory
		and, when not in debug mode, logs the saved images to Weights & Biases.

		:param data_loader: batches consumed by self.test.
		:return: None
		"""
		print("*****************Calculating Confusion Matrix*****************")

		save_path = os.path.join(self.output_model_dir, self.config.model_name, self.config.file_type, "data/")

		classes = np.arange(self.num_classes)
		probOutput, trueOutput, predictedOutput = self.test(data_loader)

		trueOutput = prepareData(trueOutput)
		predictedOutput = prepareData(predictedOutput)
		probOutput = prepareData(probOutput)

		# Multi-class ROC plotting needs one-hot ground-truth labels.
		one_hot_true_out = one_hot(trueOutput)

		normalized_confusion_matrix(trueOutput, predictedOutput, classes, save_path)
		plot_roc_curve(one_hot_true_out, probOutput, classes, save_path)

		if not self.config.debug:
			# Reuse save_path rather than rebuilding the identical path, and
			# join file names with os.path.join instead of string concatenation.
			cm = Image.open(os.path.join(save_path, "confusion_matrix.png"))
			roc = Image.open(os.path.join(save_path, "roc_curve.png"))

			wandb.log({"Confusion Matrix": [wandb.Image(cm, caption="Confusion Matrix")]})
			wandb.log({"ROC Curve": [wandb.Image(roc, caption="ROC Curve")]})

		return None
Esempio n. 2
0
    def _train_party_classifier(self, force: bool = False):
        """
        Trains classifier learning to predict political party from moral relevance weight vectors.

        If no pickled model exists (or force is set): shuffles the user frame,
        normalizes each moral-value score vector by the user's word count,
        folds Libertarians into the Republican class so the task is binary,
        trains an XGBoost classifier on a single stratified 50/50 split,
        pickles it to data/party_predictor.pkl and prints an evaluation
        report plus precision/recall, ROC and confusion-matrix plots.
        Otherwise the previously pickled model is loaded.
        :param force: Trains and overwrites classifier even if already available.
        :return: Fitted (or loaded) XGBoost classifier.
        """

        pp_model_path = "data/party_predictor.pkl"
        pp_predictor = None

        # Build model predicting moral values for word.
        if force or not os.path.isfile(pp_model_path):
            # sample(frac=1) returns a shuffled copy of the user frame.
            df = self._users_df.sample(frac=1)
            # Normalize score vectors by per-user word counts.
            df.mv_scores = df.mv_scores.values / df.num_words.values
            # Merge Libertarians into Republicans to keep the task binary.
            df.loc[df.party == "Libertarians", "party"] = "Republican Party"
            class_names = ["Republican Party", "Democratic Party"]

            x = np.asarray([np.asarray(x) for x in df.mv_scores.values])
            le = preprocessing.LabelEncoder()
            le.fit(class_names)
            y = le.transform(df.party.values)

            # n_splits=1: the loop body runs exactly once.
            for train_index, test_index in StratifiedShuffleSplit(
                    n_splits=1, test_size=0.5).split(x, y):
                x_train, x_test = x[train_index], x[test_index]
                y_train, y_test = y[train_index], y[test_index]

                pp_predictor = xgb.XGBClassifier(objective='binary:logistic',
                                                 colsample_bytree=0.7,
                                                 learning_rate=0.05,
                                                 n_estimators=6000,
                                                 n_jobs=0,
                                                 nthread=0)
                pp_predictor.fit(x_train, y_train)
                # Persist the fitted model for later runs.
                pickle.dump(pp_predictor, open(pp_model_path, "wb"))

                y_pred = pp_predictor.predict(x_test)
                print(
                    classification_report(y_test,
                                          y_pred,
                                          target_names=class_names))
                # NOTE(review): the curves below are computed from hard class
                # predictions, not probabilities -- confirm this is intended.
                utils.plot_precision_recall_curve(y_test, y_pred)
                utils.plot_roc_curve(y_test, y_pred, 2)
                utils.plot_confusion_matrix(
                    y_test,
                    y_pred, ["Republican Party", "Democratic Party"],
                    title="Confusion Matrix")
                # scores = cross_val_score(pp_predictor, x, y, cv=20, scoring='f1_macro')
                # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

        # Load built model.
        else:
            pp_predictor = pickle.load(open(
                pp_model_path, "rb"))  # pd.read_pickle(path=pp_model_path)

        return pp_predictor
Esempio n. 3
0
def test():
    """Evaluate the restored DeepSEA model on the test set.

    Rebuilds the network, restores its checkpointed weights, predicts over
    the test inputs, averages the two halves of the prediction matrix, then
    computes and plots per-task AUROC / AUPR and writes summary tables.
    """
    data = get_test_data()
    inputs, labels = data[0], data[1]

    # Recreate the architecture so the saved weights can be restored into it.
    net = DeepSEA()
    net.compile(optimizer=tf.keras.optimizers.SGD(momentum=0.9),
                loss=tf.keras.losses.BinaryCrossentropy())
    net.build(input_shape=(None, 1000, 4))
    net.summary()

    # The checkpoint stores both model weights and optimizer state; TensorFlow
    # creates optimizer variables lazily, so the optimizer part is only
    # restored once the model trains a step (e.g. train_on_batch(x[0:1], y[0:1])).
    net.load_weights('./result/model/ckpt')
    # net.load_weights('./result/model/bestmodel.h5')

    predictions = net.predict(inputs)  # shape = (455024, 919)

    np.savez('./result/test_result.npz', result=predictions, label=labels)

    # Average the two halves of the prediction matrix; keep matching labels.
    predictions = np.mean((predictions[0:227512], predictions[227512:]), axis=0)
    labels = labels[0:227512]

    num_tasks = np.shape(predictions)[1]
    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for task in tqdm(range(num_tasks), ascii=True):
        fpr, tpr, auroc = calculate_auroc(predictions[:, task], labels[:, task])
        precision, recall, aupr = calculate_aupr(predictions[:, task],
                                                 labels[:, task])

        fpr_list.append(fpr)
        tpr_list.append(tpr)
        precision_list.append(precision)
        recall_list.append(recall)
        auroc_list.append(auroc)
        aupr_list.append(aupr)

    plot_roc_curve(fpr_list, tpr_list, './result/')
    plot_pr_curve(precision_list, recall_list, './result/')

    # Per-task table with a header row, written in both CSV and TXT form.
    table = np.concatenate((np.array([['auroc', 'aupr']]),
                            np.stack((auroc_list, aupr_list), axis=1)),
                           axis=0)
    write2csv(table, './result/result.csv')
    write2txt(table, './result/result.txt')
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(np.nanmean(auroc_list),
                                                        np.nanmean(aupr_list)))
Esempio n. 4
0
def test():
    """Evaluate the trained DanQ model on the test set.

    Runs the trainer's test pass, averages the two halves of the prediction
    matrix, then computes and plots per-task AUROC / AUPR and writes summary
    tables under ./result/DanQ/.
    """
    dataset_test = get_test_data(64)

    # Rebuild the training harness so the stored checkpoint can be evaluated.
    trainer = Trainer(model=DanQ(),
                      loss_object=keras.losses.BinaryCrossentropy(),
                      optimizer=keras.optimizers.Adam(),
                      experiment_dir='./result/DanQ')

    result, label = trainer.test(dataset_test,
                                 test_steps=int(np.ceil(455024 / 64)),
                                 dis_show_bar=True)

    # Average the two halves of the prediction matrix; keep matching labels.
    result = np.mean((result[0:227512], result[227512:]), axis=0)
    label = label[0:227512]

    num_tasks = np.shape(result)[1]
    fpr_list, tpr_list, auroc_list = [], [], []
    precision_list, recall_list, aupr_list = [], [], []
    for task in tqdm(range(num_tasks), ascii=True):
        fpr, tpr, auroc = calculate_auroc(result[:, task], label[:, task])
        precision, recall, aupr = calculate_aupr(result[:, task],
                                                 label[:, task])

        fpr_list.append(fpr)
        tpr_list.append(tpr)
        precision_list.append(precision)
        recall_list.append(recall)
        auroc_list.append(auroc)
        aupr_list.append(aupr)

    plot_roc_curve(fpr_list, tpr_list, './result/DanQ/')
    plot_pr_curve(precision_list, recall_list, './result/DanQ/')

    # Per-task table with a header row, written in both CSV and TXT form.
    table = np.concatenate((np.array([['auroc', 'aupr']]),
                            np.stack((auroc_list, aupr_list), axis=1)),
                           axis=0)
    write2csv(table, './result/DanQ/result.csv')
    write2txt(table, './result/DanQ/result.txt')
    print('AVG-AUROC:{:.3f}, AVG-AUPR:{:.3f}.\n'.format(np.nanmean(auroc_list),
                                                        np.nanmean(aupr_list)))
Esempio n. 5
0
def main():
    """Train an NBSVM to separate emotional from rational sentences.

    Uses the first half of each corpus for training and the second half for
    testing; reports accuracy, macro F1 and a ROC curve per class.
    """
    emotionals, rationals = emotional_rational()

    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    # Half/half split per corpus: first half trains, second half tests.
    mid_emo = len(emotionals) // 2
    mid_rat = len(rationals) // 2
    train_pos, test_pos = emotionals[:mid_emo], emotionals[mid_emo:]
    train_neg, test_neg = rationals[:mid_rat], rationals[mid_rat:]

    # Bag-of-words features; label 1 = emotional, 0 = rational.
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_pos + train_neg)
    y_train = np.array([1] * len(train_pos) + [0] * len(train_neg))
    X_test = vectorizer.transform(test_pos + test_neg)
    y_test = np.array([1] * len(test_pos) + [0] * len(test_neg))

    print('Vocabulary size : {}'.format(len(vectorizer.vocabulary_)))

    nbsvm = NBSVM()
    nbsvm.fit(X_train, y_train)

    print('Test accuracy : {}'.format(nbsvm.score(X_test, y_test)))

    y_pred = nbsvm.predict(X_test)
    print('F1 score : {}'.format(f1_score(y_test, y_pred, average='macro')))

    # ROC treating emotional sentences as the positive class.
    # (Curves are computed from hard predictions, as in the original script.)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)
    print('AUC of emotionals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_emotional_roc.png')

    # ROC treating rational sentences as the positive class.
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=0)
    roc_auc = auc(fpr, tpr)
    print('AUC of rationals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_rational_roc.png')
Esempio n. 6
0
"""Testing A Simple Prediction"""
#print("Feature vector: %s" % X_test[:1])
print("Label: %s" % str(y_test[0]))
print("Predicted: %s" % str(net0.predict(X_test[:1])))


"""Metrics"""

# layer_info = PrintLayerInfo()
# net0.verbose = 3
# net0.initialize()
#print layer_info(net0)

print "[Classification Report]: "
print classification_report(y_test, predicted)
print "[Train dataset] Score: ", net0.score(X_train, y_train)
print "[Test dataset] Score: ", net0.score(X_test, y_test)
plot_matrix(net0, X_test, y_test, filename)

valid_accuracies = np.array([i["valid_accuracy"] for i in net0.train_history_])
plot_accuracy(valid_accuracies, filename)

train_loss = [row['train_loss'] for row in net0.train_history_]
valid_loss = [row['valid_loss'] for row in net0.train_history_]
plot_loss(valid_loss, train_loss, filename)

y_score = net0.predict_proba(X_test) #[:, 1]
y_test_bin = np.array(label_binarize(y_test, classes=np.unique(y)))
n_classes = y_test_bin.shape[1]
plot_roc_curve(n_classes, y_test_bin, y_score, filename=filename)
Esempio n. 7
0
        gt_labels = np.array(gt_labels)

        return logits, gt_labels

if __name__ == '__main__':
    # Train a Naive Bayes text classifier on the Yelp training set using
    # stemmed tokens, then evaluate previously cached test predictions.
    model = NaiveBayes()
    tokenizer = stemmedTokenizer

    model.create_dict(json_reader("col774_yelp_data/train.json"), tokenizer)
    model.train(json_reader("col774_yelp_data/train.json"), tokenizer)

    # outputs = model.predict(json_reader("col774_yelp_data/test.json"), tokenizer)
    # f = open("outputs_stemmed_test.pickle","wb")
    # pickle.dump(outputs, f)
    # f.close()

    # Test-set predictions were precomputed and pickled (commented block above).
    logits, gt_labels = _load_object("outputs_stemmed_test.pickle")
    conf_matrix = create_confusion_matrix(logits, gt_labels)
    
    print(calc_accuracy(logits, gt_labels) * 100)
    print(conf_matrix)

    plot_confusion_matrix(conf_matrix, model.classes)

    # NOTE(review): probs is computed but never used -- the raw logits are
    # passed to plot_roc_curve; confirm which was intended.
    probs = logits_to_prob_vector(logits)
    plot_roc_curve(logits, gt_labels)
    



Esempio n. 8
0
def main():
    """Train MelanomaNet on one CV fold and checkpoint the best-AUC model.

    Parses CLI args, requires CUDA (exits otherwise), builds fold-specific
    dataloaders, trains with BCE-with-logits + AdamW + StepLR, optionally
    logs scalars and ROC-curve images to TensorBoard, and saves the model
    state whenever validation ROC AUC improves.
    """
    args = parse_args()

    # set random seed
    utils.seed_torch(args.seed)

    # Setup CUDA, GPU -- training is CUDA-only; bail out if unavailable.
    if not torch.cuda.is_available():
        print("cuda is not available")
        exit(0)
    else:
        args.device = torch.device("cuda")
        args.n_gpus = torch.cuda.device_count()
        print(f"available cuda: {args.n_gpus}")

    # Setup model; wrapped in DataParallel only when several GPUs are present.
    model = MelanomaNet(arch=args.arch)
    if args.n_gpus > 1:
        model = torch.nn.DataParallel(module=model)
    model.to(args.device)
    model_path = f'{configure.MODEL_PATH}/{args.arch}_fold_{args.fold}.pth'

    # Setup data; effective batch size scales with the number of GPUs.
    total_batch_size = args.per_gpu_batch_size * args.n_gpus
    train_loader, valid_loader = datasets.get_dataloader(
        image_dir=configure.TRAIN_IMAGE_PATH,
        fold=args.fold,
        batch_size=total_batch_size,
        num_workers=args.num_workers)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.BCEWithLogitsLoss()
    # criterion = MarginFocalBCEWithLogitsLoss()
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=args.learning_rate,
                                  weight_decay=args.weight_decay)

    # Halve the learning rate every 5 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=5,
                                                gamma=0.5)
    """ Train the model """
    current_time = datetime.now().strftime('%b%d_%H_%M_%S')
    log_dir = f'{configure.TRAINING_LOG_PATH}/{args.arch}_fold_{args.fold}_{current_time}'

    tb_writer = None
    if args.log:
        tb_writer = SummaryWriter(log_dir=log_dir)

    print(f'training started: {current_time}')
    best_score = 0.0
    for epoch in range(args.epochs):
        train_loss = train(dataloader=train_loader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           args=args)

        valid_loss, y_true, y_score = valid(dataloader=valid_loader,
                                            model=model,
                                            criterion=criterion,
                                            args=args)

        # Model-selection metric: validation ROC AUC.
        valid_score = roc_auc_score(y_true=y_true, y_score=y_score)

        learning_rate = scheduler.get_lr()[0]
        if args.log:
            tb_writer.add_scalar("learning_rate", learning_rate, epoch)
            tb_writer.add_scalar("Loss/train", train_loss, epoch)
            tb_writer.add_scalar("Loss/valid", valid_loss, epoch)
            tb_writer.add_scalar("Score/valid", valid_score, epoch)

            # Log the roc curve as an image summary.
            figure = utils.plot_roc_curve(y_true=y_true, y_score=y_score)
            figure = utils.plot_to_image(figure)
            tb_writer.add_image("ROC curve", figure, epoch)

        # Checkpoint whenever validation AUC improves.
        # NOTE(review): model.module only exists under DataParallel
        # (n_gpus > 1); a single-GPU run would need model.state_dict().
        if valid_score > best_score:
            best_score = valid_score
            state = {
                'state_dict': model.module.state_dict(),
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'valid_score': valid_score
            }
            torch.save(state, model_path)

        current_time = datetime.now().strftime('%b%d_%H_%M_%S')
        print(
            f"epoch:{epoch:02d}, "
            f"train:{train_loss:0.3f}, valid:{valid_loss:0.3f}, "
            f"score:{valid_score:0.3f}, best:{best_score:0.3f}, date:{current_time}"
        )

        scheduler.step()

    current_time = datetime.now().strftime('%b%d_%H_%M_%S')
    print(f'training finished: {current_time}')

    if args.log:
        tb_writer.close()
Esempio n. 9
0
# Fit the network and predict over the held-out test set.
net0.fit(X_train, y_train)
predicted = net0.predict(X_test)
"""Testing A Simple Prediction"""
#print("Feature vector: %s" % X_test[:1])
# Sanity check: first test sample's true label vs the network's prediction.
print("Label: %s" % str(y_test[0]))
print("Predicted: %s" % str(net0.predict(X_test[:1])))
"""Metrics"""

# layer_info = PrintLayerInfo()
# net0.verbose = 3
# net0.initialize()
#print layer_info(net0)

# NOTE(review): the statements below use Python 2 print syntax; this
# fragment will not parse under Python 3.
print "[Classification Report]: "
print classification_report(y_test, predicted)
print "[Train dataset] Score: ", net0.score(X_train, y_train)
print "[Test dataset] Score: ", net0.score(X_test, y_test)
# Confusion-matrix plot, saved using the given filename.
plot_matrix(net0, X_test, y_test, filename)

# Per-epoch validation accuracy pulled from the network's train_history_.
valid_accuracies = np.array([i["valid_accuracy"] for i in net0.train_history_])
plot_accuracy(valid_accuracies, filename)

# Train/validation loss curves from the same history.
train_loss = [row['train_loss'] for row in net0.train_history_]
valid_loss = [row['valid_loss'] for row in net0.train_history_]
plot_loss(valid_loss, train_loss, filename)

# One-vs-rest ROC: binarize labels over the classes present in y, then plot
# one curve per class from predicted probabilities.
y_score = net0.predict_proba(X_test)  #[:, 1]
y_test_bin = np.array(label_binarize(y_test, classes=np.unique(y)))
n_classes = y_test_bin.shape[1]
plot_roc_curve(n_classes, y_test_bin, y_score, filename=filename)
# Grid-search an RBF SVM over C, timing training.
t0 = time()
grid_params = {'C': [0.1, 0.5, 1, 2, 3, 4, 5, 10, 15, 20, 30, 100]}

gs = GridSearchCV(SVC(kernel='rbf', probability=True),
                  grid_params,
                  verbose=1,
                  cv=5,
                  n_jobs=-1)
gs_results = gs.fit(X_train, y_train)
print("SVM Training done in %0.3fs\n" % (time() - t0))
print("Best estimator after cross validation:")
print("C-support - %d\n" % gs.best_estimator_.C)

# Testing
t0 = time()
y_pred = gs.predict(X_test)
print("SVM Testing done in %0.3fs\n" % (time() - t0))

# ROC Curve plot from positive-class probabilities.
probs = gs.predict_proba(X_test)
probs = probs[:, 1]
# NOTE(review): this rebinds the name `auc` to a float, shadowing any
# auc() function imported earlier in the file.
auc = metrics.roc_auc_score(y_test, probs)
print('AUC: %.2f\n' % auc)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
utils.plot_roc_curve('SVM ROC', fpr, tpr)

# Confusion Matrix
print('SVM Confusion Matrix')
print('-------------------------')
print(confusion_matrix(y_test, y_pred))
        scores = []
        for c in np.random.uniform(1, 50, 50):
            model = SVC(C=c)  # , probability=True
            cv_score = cross_val_score(model,
                                       x_train,
                                       y_train,
                                       cv=4,
                                       scoring='f1_micro')
            cs.append(c)
            scores.append(np.mean(cv_score))
            print('C: {}. CV score. Mean: {}. Sd: {}'.format(
                c, np.mean(cv_score), np.std(cv_score)))
        print(list(zip(cs, cv_score)))
    elif do == 'test':
        model.fit(x_train, y_train)
        test_score = model.score(x_test, y_test)
        print('Test score: {}'.format(test_score))

        class_dict_inv = {v: k for k, v in class_dict.items()}
        y_pred = model.predict(x_test)
        f1s = f1_score(y_test, y_pred, average=None)
        print('F1 scores:')
        for k, i in class_dict.items():
            print('{}: {}'.format(k, f1s[i]))
        plot_confusion_matrix(
            y_pred,
            y_test, [class_dict_inv[i] for i in range(len(class_dict))],
            normalize=True)
        y_pred = model.predict_proba(x_test)
        plot_roc_curve(y_pred, y_test, class_dict)
}

# Grid-search a KNN classifier; grid_params is defined just above this
# fragment (its opening is out of view here).
gs = GridSearchCV(KNeighborsClassifier(),
                  grid_params,
                  verbose=1,
                  cv=5,
                  n_jobs=-1)
gs_results = gs.fit(X_train, y_train)
print("KNN Training done in %0.3fs\n" % (time() - t0))
print("Best estimator after cross validation:")
print("Metric - %s\nK - %d\n" %
      (gs.best_estimator_.metric, gs.best_estimator_.n_neighbors))

# Testing
t0 = time()
y_pred = gs.predict(X_test)
print("KNN Testing done in %0.3fs\n" % (time() - t0))

# ROC Curve plot from positive-class probabilities.
probs = gs.predict_proba(X_test)
probs = probs[:, 1]
# NOTE(review): rebinds the name `auc` to a float, shadowing any auc()
# function imported earlier in the file.
auc = metrics.roc_auc_score(y_test, probs)
print('AUC: %.2f\n' % auc)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
utils.plot_roc_curve('KNN ROC', fpr, tpr)

# Confusion Matrix
print('KNN Confusion Matrix')
print('-------------------------')
print(confusion_matrix(y_test, y_pred))
Esempio n. 13
0
# Grid-search an AdaBoost classifier over the number of estimators,
# timing training.
t0 = time()
grid_params = {'n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200, 300]}

gs = GridSearchCV(AdaBoostClassifier(),
                  grid_params,
                  verbose=1,
                  cv=5,
                  n_jobs=-1)
gs_results = gs.fit(X_train, y_train)
print("Adaboost Training done in %0.3fs\n" % (time() - t0))
print("Best estimator after cross validation:")
print("Decision Stumps - %d\n" % gs.best_estimator_.n_estimators)

# Testing
t0 = time()
y_pred = gs.predict(X_test)
print("Adaboost Testing done in %0.3fs\n" % (time() - t0))

# ROC Curve plot from positive-class probabilities.
probs = gs.predict_proba(X_test)
probs = probs[:, 1]
# NOTE(review): rebinds the name `auc` to a float, shadowing any auc()
# function imported earlier in the file.
auc = metrics.roc_auc_score(y_test, probs)
print('AUC: %.2f\n' % auc)
fpr, tpr, thresholds = metrics.roc_curve(y_test, probs)
utils.plot_roc_curve('Adaboost ROC', fpr, tpr)

# Confusion Matrix
print('Adaboost Confusion Matrix')
print('-------------------------')
print(confusion_matrix(y_test, y_pred))
    flist = [os.path.join(root_dir, _dir) for _dir in os.listdir(root_dir) \
            if 'preds_labels' in _dir]

    aucs, aps = [], []
    preds, trues = [], []
    for fname in flist:
        with open(fname, 'rb') as f:
            data = pickle.load(f)  # (pred, label) tuple

        aucs.append(get_auroc(data[1], data[0]))
        aps.append(get_ap(data[1], data[0]))
        preds.append(data[0])
        trues.append(data[1])

        plot_args = {'lw': 1, 'alpha': 0.5, 'color': 'gray', 'ls': '-'}
        plot_roc_curve(0, data[1], data[0], **plot_args)
        plot_pr_curve(1, data[1], data[0], **plot_args)

    plot_args = {'lw': 1, 'alpha': 0.9, 'color': 'black', 'ls': '-'}
    preds = np.concatenate(preds, axis=0)
    trues = np.concatenate(trues, axis=0)

    auc_cint = np.std(aucs) / np.sqrt(len(aucs)) * 1.96
    ap_cint = np.std(aps) / np.sqrt(len(aps)) * 1.96

    aucstr = '{} AUC: {:.4f} ({} {:.4f})'.format(model, np.mean(aucs),
                                                 u"\u00B1", auc_cint)
    apstr = '{} AP: {:.4f} ({} {:.4f})'.format(model, np.mean(aps), u"\u00B1",
                                               ap_cint)
    plot_roc_curve(0, trues, preds, legend=aucstr, **plot_args)
    plt.savefig(os.path.join(root_dir, 'auc'))
Esempio n. 15
0
def model(fout, X_train, X_valid, X_test, y_train, y_valid, ids, cat_vars,
          cat_sz, emb_szs, params, verbose):
    """Build and fit model for given train/validation/test/out files.

    Drops low-importance features, engineers aggregate features (row means,
    top-feature sums, missing-value fractions and simple combinations),
    trains an XGBoost booster with early stopping on the validation set,
    prints feature scores and validation AUC, plots the ROC curve and, when
    fout is given, writes test-set predictions to it as CSV.

    :param fout: output CSV path for predictions (falsy to skip prediction)
    :param X_train: training features (pandas DataFrame)
    :param X_valid: validation features (pandas DataFrame)
    :param X_test: test features (pandas DataFrame)
    :param y_train: training target
    :param y_valid: validation target
    :param ids: QuoteNumber values matching the X_test rows
    :param cat_vars: unused here; kept for interface compatibility
    :param cat_sz: unused here; kept for interface compatibility
    :param emb_szs: unused here; kept for interface compatibility
    :param params: dict, may carry 'nround' and 'early_stopping_rounds'
    :param verbose: print model parameters when truthy
    :return: None
    """

    nround = params.get('nround', 10)
    early_stopping_rounds = params.get('early_stopping_rounds', 50)

    # Features to discard before training (previously found to be of low
    # importance -- see the get_score() printout below).
    drop_this = [
        "PersonalField41",
        "PersonalField37",
        "PropertyField10",
        "PersonalField46",
        "GeographicField23A",
        "PersonalField32",
        "GeographicField21A",
        "GeographicField64_2",
        "GeographicField56A",
        "PersonalField51",
        "PersonalField52",
        "PersonalField30",
        "PersonalField71",
        "PersonalField68",
        "GeographicField22A",
        "PersonalField7_1",
        "PersonalField47",
        "Field12_1",
        "PropertyField2A",
        "PersonalField62",
        "PropertyField11A",
        "GeographicField64_0",
        "GeographicField63_0",
        "GeographicField5A",
        "PersonalField29",
        "SalesField9",
        "PersonalField72",
        "PersonalField23",
        "GeographicField60A",
        "PersonalField44",
        "GeographicField12A",
        "PersonalField78",
        "PersonalField48",
        "PersonalField58",
        "GeographicField13A",
        "PropertyField4_1",
        "PropertyField4_0",
        "PersonalField33",
        "GeographicField62A",
        "PropertyField36_1",
        "PersonalField74",
        "PropertyField38_0",
        "PersonalField36",
        "PersonalField50",
        "GeographicField61A",
        "PersonalField54",
        "PersonalField53",
        "PropertyField30_1",
        "PropertyField22",
        "PersonalField38",
        "PersonalField55",
        "GeographicField63_1",
        "GeographicField18A",
        "PropertyField38_1",
        "GeographicField64_1",
        "PropertyField30_0",
        "Field12_0",
        "SalesField13",
        "PersonalField59",
        "PersonalField56",
        "PropertyField28_2",
        "SalesField15",
        "PersonalField19_23",
        "PersonalField76",
        "PropertyField31_2",
        "SalesField14",
        "Field6_3",
        "PropertyField36_0",
        "PersonalField19_21",
        "PersonalField57",
        "GeographicField15A",
        "PersonalField63",
        "PropertyField23",
        "PersonalField7_0",
        "Field6_4",
        "Field6_2",
        "PropertyField14_1",
        "PersonalField75",
        "PropertyField13",
        "PropertyField11B",
        "Field6_1",
        "PersonalField19_22",
        "PersonalField19_24",
        "PersonalField31",
        "PersonalField19_8",
        "PersonalField19_20",
        "PropertyField28_0",
        "PersonalField77",
        "PersonalField61",
        "PersonalField25",
        "PersonalField17_4",
        "PersonalField19_17",
        "PropertyField32_1",
        "GeographicField7A",
        "PersonalField19_5",
        "year_1",
        "PropertyField3_1",
        "PersonalField19_12",
        "PropertyField14_2",
        "PersonalField19_10",
        "GeographicField12B",
        "GeographicField11A",
        "PersonalField18_21",
        "PersonalField79",
        "PropertyField17",
        "PropertyField28_1",
        "PersonalField19_18",
        "PersonalField19_3",
        "PersonalField80",
        "PropertyField7_5",
        "PersonalField19_4",
        "PropertyField15",
        "PropertyField7_2",
        "PersonalField19_7",
        "PersonalField18_12",
        "PersonalField19_0",
        "PersonalField18_20",
        "PersonalField19_25",
        "Field6_0",
    ]

    # Drop all unwanted columns in a single call per frame (the previous
    # per-column loop copied each DataFrame once per dropped column).
    X_train = X_train.drop(drop_this, axis=1)
    X_valid = X_valid.drop(drop_this, axis=1)
    X_test = X_test.drop(drop_this, axis=1)

    # Highest-importance base features used for the sumTop aggregate.
    TOPF = [
        'PersonalField10A', 'SalesField1A', 'PersonalField9', 'SalesField1B',
        'PersonalField10B'
    ]

    # The same engineered columns are added to all three frames; column
    # assignment mutates each frame in place, so looping over this tuple is
    # equivalent to the original triplicated statements.
    frames = (X_train, X_valid, X_test)

    # Row-level aggregates: mean over all features, sum over top features.
    for frame in frames:
        frame["avg"] = frame.mean(axis=1)
        frame["sumTop"] = frame[TOPF].sum(axis=1)

    # Column count taken after avg/sumTop exist, matching the original order.
    ncols = X_test.columns.size

    for frame in frames:
        # Fraction of missing (NaN) values per row.
        frame['value_count'] = frame.apply(
            lambda x: (ncols - x.count()) / ncols, axis=1)
        # Simple pairwise combinations of the aggregates.
        frame["comb1"] = frame["sumTop"] * frame["value_count"]
        frame["comb2"] = frame["sumTop"] * frame["avg"]
        frame["comb5"] = frame["sumTop"] - frame["value_count"]
        frame["comb6"] = frame["sumTop"] - frame["avg"]

    # Extended top-feature list (includes the engineered sumTop column).
    newTOP = [
        'SalesField8', 'SalesField6', 'PersonalField10A', 'SalesField2B',
        'PersonalField10B', 'PropertyField29', 'SalesField5', 'sumTop',
        'SalesField1B', 'PersonalField9'
    ]

    for frame in frames:
        frame["avgTop2"] = frame[newTOP].mean(axis=1)
        frame["sumTop2"] = frame[newTOP].sum(axis=1)

    print("Train shape", np.shape(X_train))
    print("Valid shape", np.shape(X_valid))
    print("Test  shape", np.shape(X_test))

    # Prepare DMatrix objects for training/evaluation/prediction.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    deval = xgb.DMatrix(X_valid, label=y_valid)
    dtest = xgb.DMatrix(X_test)

    # model parameters
    args = {
        'max_depth': 6,
        'eta': 0.012,
        'subsample': 0.86,
        'colsample_bytree': 0.38,
        'eval_metric': 'auc',
        'silent': 0,
        'n_jobs': 4,
        'objective': 'binary:logistic'
    }
    if verbose:
        print("model parameters")
        pprint.pprint(args)

    # Use an evaluation list while training so we can watch for overfitting.
    # NOTE(review): xgb.train early-stops on the LAST pair in evallist, which
    # here is the training set -- confirm that is intended.
    evallist = [(deval, 'eval'), (dtrain, 'train')]
    bst = xgb.train(args,
                    dtrain,
                    nround,
                    evallist,
                    early_stopping_rounds=early_stopping_rounds)

    # Best-effort eli5 explanation of the model; eli5/html2text are optional
    # dependencies, so any failure here is deliberately non-fatal.
    # See permutation importance:
    # https://www.kaggle.com/dansbecker/permutation-importance?utm_medium=email&utm_source=mailchimp&utm_campaign=ml4insights
    try:
        import eli5
        html_obj = eli5.show_weights(bst, top=10)
        import html2text
        print(html2text.html2text(html_obj.data))
    except Exception:
        # Was a bare except; Exception still covers ImportError but lets
        # SystemExit/KeyboardInterrupt propagate.
        pass

    # Validate results on the held-out set.
    pred = bst.predict(deval)
    # Print feature scores, lowest first, in a copy/paste friendly format.
    myscores = bst.get_score()
    for i in sorted(myscores, key=myscores.get):
        print("\"" + i + "\"," + str(myscores[i]))
    print("AUC", metrics.roc_auc_score(y_valid, pred))

    # create AUC/ROC plot
    plot_roc_curve(y_valid, pred)

    # Make the final prediction and write the submission file.
    if fout:
        pred = bst.predict(dtest)
        data = {'QuoteNumber': ids, 'QuoteConversion_Flag': pred}
        sub = pd.DataFrame(data,
                           columns=['QuoteNumber', 'QuoteConversion_Flag'])
        print("Write prediction to %s" % fout)
        sub.to_csv(fout, index=False)
Esempio n. 16
0
def main():
    """Train an attention-based Keras model separating emotional from
    rational sentences, report metrics, and dump learned attention weights."""
    emotionals, rationals = emotional_rational()

    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    # The first slice is a no-op; the second truncates rationals so both
    # classes contain the same number of sentences.
    emotionals = emotionals[:len(emotionals)]
    rationals = rationals[:len(emotionals)]

    sentences = emotionals + rationals
    # One-hot targets: [0, 1] = emotional, [1, 0] = rational.
    Y = np.array([[0, 1]] * len(emotionals) + [[1, 0]] * len(rationals))

    max_features = 200
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(sentences)

    X = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(X, maxlen=MAX_LEN)

    epochs = 15

    # --- Add Features ---
    dict_loader = EmotionalDict('dataset/nouns', 'dataset/verbs')
    emotional_dict = dict_loader.load()

    features_loader = AdditionalFeatures(emotionals+rationals, emotional_dict)
    add_features = features_loader.emotional_features()
    ######################

    # NOTE(review): the auxiliary features are split at a fixed index (848)
    # while X/Y are split randomly below -- confirm rows stay aligned.
    x_aux_train = add_features[:848]
    x_aux_test = add_features[848:]

    model = build_model(x_aux_train.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)

    batch_size = 32
    model.fit({'main_input': X_train, 'add_input': x_aux_train}, Y_train, epochs=epochs, batch_size=batch_size, verbose=2)

    score, acc = model.evaluate({'main_input': X_test, 'add_input': x_aux_test}, Y_test, verbose=2, batch_size=batch_size)

    print('score: {}'.format(score))
    print('acc: {}'.format(acc))

    Y_pred = model.predict({'main_input': X_test, 'add_input': x_aux_test}, batch_size=1, verbose=2)

    # Column 1 is the "emotional" indicator (see the construction of Y above).
    print(classification_report(Y_test[:, 1], np.round(Y_pred[:, 1]), target_names=['rationals', 'emotionals']))

    fpr, tpr, _ = roc_curve(Y_test[:, 1], Y_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    plot_roc_curve(fpr, tpr, roc_auc, 'roc.png')

    cnf_matrix = confusion_matrix(Y_test[:, 1], np.round(Y_pred[:, 1]))
    plot_confusion_matrix(cnf_matrix, ['rationals', 'emotionals'], 'cnf.png')

    # Average the attention activations over the feature axis, then over the
    # batch, leaving one weight per input position.
    attention_vector = np.mean(get_activations(model, X_test, True, 'attention_vec')[0], axis=2).squeeze()
    attention_vector = np.mean(attention_vector, axis=0)

    import matplotlib.pyplot as plt
    import pandas as pd
    pd.DataFrame(attention_vector, columns=['attention (%)']).plot(kind='bar', title='Attention')
    plt.savefig('attention_vec.png')

    # Rank positions by attention weight, highest first.
    attention_vector_indices = np.argsort(attention_vector)[::-1]

    word_index = tokenizer.word_index
    word_index_inv = {v: k for k, v in word_index.items()}

    # Write the words corresponding to the most-attended positions; indices
    # without a vocabulary entry are skipped.
    with open('attention_word.txt', 'w') as f:
        for i, attention_index in enumerate(attention_vector_indices, start=1):
            try:
                print('No.{} : {}'.format(i, word_index_inv[attention_index]), file=f)
            except KeyError:
                continue
Esempio n. 17
0
def run_main(args):
    """
    End-to-end training pipeline for a drug-response predictor on expression data.

    Workflow:
      1. Unpack hyper-parameters and file paths from ``args``.
      2. Load the expression matrix and drug-response labels; drop samples whose
         label equals the missing-value placeholder.
      3. Optionally restrict to highly variable genes and/or apply PCA.
      4. Min-max scale features and encode labels (regression or classification).
      5. Split into train/valid/test sets and apply the requested re-sampling.
      6. Optionally pre-train an AE/VAE encoder, then train a DNN or GCN predictor.
      7. Log performance metrics; for classification, also save a report CSV and
         ROC / PR curve figures.

    :param args: argparse.Namespace carrying all CLI options referenced below
                 (epochs, bottleneck, drug, data_path, label_path, ...).
    :return: None. Side effects: saved encoder/predictor weights, log files,
             a report CSV, and figures on disk.
    """

    # ---- Unpack parameters ---------------------------------------------------
    epochs = args.epochs
    dim_au_out = args.bottleneck  # autoencoder bottleneck width: 8, 16, 32, 64, 128, 256, 512
    dim_dnn_in = dim_au_out
    dim_dnn_out = 1
    select_drug = args.drug
    na = args.missing_value
    data_path = args.data_path
    label_path = args.label_path
    test_size = args.test_size
    valid_size = args.valid_size
    g_disperson = args.var_genes_disp
    model_path = args.source_model_path
    encoder_path = args.encoder_path
    log_path = args.logging_file
    batch_size = args.batch_size
    encoder_hdims = args.encoder_h_dims.split(",")
    preditor_hdims = args.predictor_h_dims.split(",")
    reduce_model = args.dimreduce
    prediction = args.predition  # NOTE: CLI attribute name is misspelled upstream; kept for compatibility
    sampling = args.sampling
    PCA_dim = args.PCA_dim

    encoder_hdims = list(map(int, encoder_hdims))
    preditor_hdims = list(map(int, preditor_hdims))
    load_model = bool(args.load_source_model)

    # Path under which the trained predictor is saved / loaded.
    preditor_path = model_path + reduce_model + args.predictor + prediction + select_drug + '.pkl'

    # ---- Read data -----------------------------------------------------------
    data_r = pd.read_csv(data_path, index_col=0)
    label_r = pd.read_csv(label_path, index_col=0)
    label_r = label_r.fillna(na)

    now = time.strftime("%Y-%m-%d-%H-%M-%S")

    ut.save_arguments(args, now)

    # ---- Initialize logging and redirect stderr ------------------------------
    out_path = log_path + now + ".err"
    log_path = log_path + now + ".log"

    out = open(out_path, "w")
    # NOTE(review): stderr stays redirected (and the file handle open) for the
    # rest of the process; kept as-is to preserve the original behavior.
    sys.stderr = out

    logging.basicConfig(
        level=logging.INFO,  # minimum level written to the log file
        filename=log_path,
        filemode='a',  # 'a' appends to the log; 'w' would overwrite it on each run
        format=
        '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )
    # matplotlib's font manager is very chatty at INFO level.
    logging.getLogger('matplotlib.font_manager').disabled = True

    logging.info(args)

    # ---- Filter out samples without a label for the selected drug ------------
    selected_idx = label_r.loc[:, select_drug] != na

    if g_disperson is not None:  # fixed: identity test instead of `!= None` (PEP 8 E711)
        hvg, adata = ut.highly_variable_genes(data_r, min_disp=g_disperson)
        # Rename columns if duplication exists
        data_r.columns = adata.var_names
        # Keep only the highly variable genes
        data = data_r.loc[selected_idx, hvg]
    else:
        data = data_r.loc[selected_idx, :]

    # Do PCA if PCA_dim != 0 (dropped the original no-op `else: data = data`)
    if PCA_dim != 0:
        data = PCA(n_components=PCA_dim).fit_transform(data)

    # Extract labels
    label = label_r.loc[selected_idx, select_drug]

    # ---- Scale features and encode labels ------------------------------------
    mmscaler = preprocessing.MinMaxScaler()
    lbscaler = preprocessing.MinMaxScaler()

    data = mmscaler.fit_transform(data)
    label = label.values.reshape(-1, 1)

    if prediction == "regression":
        label = lbscaler.fit_transform(label)
        dim_model_out = 1
    else:
        le = LabelEncoder()
        label = le.fit_transform(label)
        dim_model_out = 2

    logging.info(np.std(data))
    logging.info(np.mean(data))

    # ---- Split train / valid / test sets -------------------------------------
    X_train_all, X_test, Y_train_all, Y_test = train_test_split(
        data, label, test_size=test_size, random_state=42)
    X_train, X_valid, Y_train, Y_valid = train_test_split(X_train_all,
                                                          Y_train_all,
                                                          test_size=valid_size,
                                                          random_state=42)
    # Re-sampling of the training split (class-imbalance handling)
    if sampling is None:  # fixed: identity test instead of `== None` (PEP 8 E711)
        X_train, Y_train = sam.nosampling(X_train, Y_train)
        logging.info("nosampling")
    elif sampling == "upsampling":
        X_train, Y_train = sam.upsampling(X_train, Y_train)
        logging.info("upsampling")
    elif sampling == "downsampling":
        X_train, Y_train = sam.downsampling(X_train, Y_train)
        logging.info("downsampling")
    elif sampling == "SMOTE":
        X_train, Y_train = sam.SMOTEsampling(X_train, Y_train)
        logging.info("SMOTE")
    else:
        logging.info("not a legal sampling method")

    logging.info(data.shape)
    logging.info(label.shape)
    logging.info(X_train.max())
    logging.info(X_train.min())

    # ---- Select the training device ------------------------------------------
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.info(device)
    if torch.cuda.is_available():
        # fixed: torch.cuda.set_device() accepts only CUDA devices; the
        # previous unconditional call crashed on CPU-only machines.
        torch.cuda.set_device(device)

    # ---- Construct tensors, datasets and data loaders ------------------------
    X_trainTensor = torch.FloatTensor(X_train).to(device)
    X_validTensor = torch.FloatTensor(X_valid).to(device)
    X_testTensor = torch.FloatTensor(X_test).to(device)
    X_allTensor = torch.FloatTensor(data).to(device)

    # Regression needs float targets (MSE); classification needs long targets
    # (CrossEntropy).
    if prediction == "regression":
        Y_trainTensor = torch.FloatTensor(Y_train).to(device)
        Y_trainallTensor = torch.FloatTensor(Y_train_all).to(device)
        Y_validTensor = torch.FloatTensor(Y_valid).to(device)
    else:
        Y_trainTensor = torch.LongTensor(Y_train).to(device)
        Y_trainallTensor = torch.LongTensor(Y_train_all).to(device)
        Y_validTensor = torch.LongTensor(Y_valid).to(device)

    # Autoencoder datasets: the input doubles as the reconstruction target.
    train_dataset = TensorDataset(X_trainTensor, X_trainTensor)
    valid_dataset = TensorDataset(X_validTensor, X_validTensor)
    test_dataset = TensorDataset(X_testTensor, X_testTensor)
    all_dataset = TensorDataset(X_allTensor, X_allTensor)

    X_trainDataLoader = DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True)
    X_validDataLoader = DataLoader(dataset=valid_dataset,
                                   batch_size=batch_size,
                                   shuffle=True)
    X_allDataLoader = DataLoader(dataset=all_dataset,
                                 batch_size=batch_size,
                                 shuffle=True)

    # Supervised datasets for the predictor head.
    trainreducedDataset = TensorDataset(X_trainTensor, Y_trainTensor)
    validreducedDataset = TensorDataset(X_validTensor, Y_validTensor)

    trainDataLoader_p = DataLoader(dataset=trainreducedDataset,
                                   batch_size=batch_size,
                                   shuffle=True)
    validDataLoader_p = DataLoader(dataset=validreducedDataset,
                                   batch_size=batch_size,
                                   shuffle=True)

    dataloaders_train = {'train': trainDataLoader_p, 'val': validDataLoader_p}

    # ---- Optional unsupervised pre-training of the encoder -------------------
    if bool(args.pretrain):  # simplified from `bool(args.pretrain) != False`
        dataloaders_pretrain = {
            'train': X_trainDataLoader,
            'val': X_validDataLoader
        }
        if reduce_model == "VAE":
            encoder = VAEBase(input_dim=data.shape[1],
                              latent_dim=dim_au_out,
                              h_dims=encoder_hdims)
        else:
            encoder = AEBase(input_dim=data.shape[1],
                             latent_dim=dim_au_out,
                             h_dims=encoder_hdims)

        if torch.cuda.is_available():
            encoder.cuda()

        logging.info(encoder)
        encoder.to(device)

        optimizer_e = optim.Adam(encoder.parameters(), lr=1e-2)
        loss_function_e = nn.MSELoss()
        exp_lr_scheduler_e = lr_scheduler.ReduceLROnPlateau(optimizer_e)

        if reduce_model == "AE":
            encoder, loss_report_en = t.train_AE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                loss_function=loss_function_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)
        elif reduce_model == "VAE":
            encoder, loss_report_en = t.train_VAE_model(
                net=encoder,
                data_loaders=dataloaders_pretrain,
                optimizer=optimizer_e,
                n_epochs=epochs,
                scheduler=exp_lr_scheduler_e,
                save_path=encoder_path)

        logging.info("Pretrained finished")

    # ---- Build the predictor model -------------------------------------------
    if args.predictor == "DNN":
        if reduce_model == "AE":
            model = PretrainedPredictor(input_dim=X_train.shape[1],
                                        latent_dim=dim_au_out,
                                        h_dims=encoder_hdims,
                                        hidden_dims_predictor=preditor_hdims,
                                        output_dim=dim_model_out,
                                        pretrained_weights=encoder_path,
                                        freezed=bool(args.freeze_pretrain))
        elif reduce_model == "VAE":
            model = PretrainedVAEPredictor(
                input_dim=X_train.shape[1],
                latent_dim=dim_au_out,
                h_dims=encoder_hdims,
                hidden_dims_predictor=preditor_hdims,
                output_dim=dim_model_out,
                pretrained_weights=encoder_path,
                freezed=bool(args.freeze_pretrain),
                z_reparam=bool(args.VAErepram))

    elif args.predictor == "GCN":
        # Embed samples with a pre-trained encoder, build KNN graphs on the
        # embeddings, then train a GCN predictor on graph + node features.
        if reduce_model == "VAE":
            gcn_encoder = VAEBase(input_dim=data.shape[1],
                                  latent_dim=dim_au_out,
                                  h_dims=encoder_hdims)
        else:
            gcn_encoder = AEBase(input_dim=data.shape[1],
                                 latent_dim=dim_au_out,
                                 h_dims=encoder_hdims)

        gcn_encoder.load_state_dict(torch.load(args.GCNreduce_path))
        gcn_encoder.to(device)

        train_embeddings = gcn_encoder.encode(X_trainTensor)
        zOut_tr = train_embeddings.cpu().detach().numpy()
        valid_embeddings = gcn_encoder.encode(X_validTensor)
        zOut_va = valid_embeddings.cpu().detach().numpy()
        test_embeddings = gcn_encoder.encode(X_testTensor)
        zOut_te = test_embeddings.cpu().detach().numpy()

        # 10-nearest-neighbour graphs in euclidean space, one per split.
        adj_tr, edgeList_tr = g.generateAdj(
            zOut_tr,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)
        adj_va, edgeList_va = g.generateAdj(
            zOut_va,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)
        adj_te, edgeList_te = g.generateAdj(
            zOut_te,
            graphType='KNNgraphStatsSingleThread',
            para='euclidean' + ':' + str('10'),
            adjTag=True)

        Adj_trainTensor = preprocess_graph(adj_tr)
        Adj_validTensor = preprocess_graph(adj_va)
        Adj_testTensor = preprocess_graph(adj_te)

        Z_trainTensor = torch.FloatTensor(zOut_tr).to(device)
        Z_validTensor = torch.FloatTensor(zOut_va).to(device)
        Z_testTensor = torch.FloatTensor(zOut_te).to(device)

        if (args.binarizied == 0):
            # Binarize embeddings against their per-dimension mean.
            zDiscret_tr = zOut_tr > np.mean(zOut_tr, axis=0)
            zDiscret_tr = 1.0 * zDiscret_tr
            zDiscret_va = zOut_va > np.mean(zOut_va, axis=0)
            zDiscret_va = 1.0 * zDiscret_va
            zDiscret_te = zOut_te > np.mean(zOut_te, axis=0)
            zDiscret_te = 1.0 * zDiscret_te

            Z_trainTensor = torch.FloatTensor(zDiscret_tr).to(device)
            Z_validTensor = torch.FloatTensor(zDiscret_va).to(device)
            Z_testTensor = torch.FloatTensor(zDiscret_te).to(device)

        ZTensors_train = {'train': Z_trainTensor, 'val': Z_validTensor}
        XTensors_train = {'train': X_trainTensor, 'val': X_validTensor}

        YTensors_train = {'train': Y_trainTensor, 'val': Y_validTensor}
        AdjTensors_train = {'train': Adj_trainTensor, 'val': Adj_validTensor}

        # GCN node features: raw expression ("x") or encoder embeddings.
        if (args.GCNfeature == "x"):
            dim_GCNin = X_allTensor.shape[1]
            GCN_trainTensors = XTensors_train
            GCN_testTensor = X_testTensor
        else:
            dim_GCNin = Z_testTensor.shape[1]
            GCN_trainTensors = ZTensors_train
            GCN_testTensor = Z_testTensor

        model = GCNPredictor(input_feat_dim=dim_GCNin,
                             hidden_dim1=encoder_hdims[0],
                             hidden_dim2=dim_au_out,
                             dropout=0.5,
                             hidden_dims_predictor=preditor_hdims,
                             output_dim=dim_model_out,
                             pretrained_weights=encoder_path,
                             freezed=bool(args.freeze_pretrain))

    logging.info(model)
    if torch.cuda.is_available():
        model.cuda()
    model.to(device)

    # ---- Train the predictor --------------------------------------------------
    optimizer = optim.Adam(model.parameters(), lr=1e-2)

    if prediction == "regression":
        loss_function = nn.MSELoss()
    else:
        loss_function = nn.CrossEntropyLoss()

    exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

    if args.predictor == "GCN":
        model, report = t.train_GCNpreditor_model(model=model,
                                                  z=GCN_trainTensors,
                                                  y=YTensors_train,
                                                  adj=AdjTensors_train,
                                                  optimizer=optimizer,
                                                  loss_function=loss_function,
                                                  n_epochs=epochs,
                                                  scheduler=exp_lr_scheduler,
                                                  save_path=preditor_path)
    else:
        model, report = t.train_predictor_model(model,
                                                dataloaders_train,
                                                optimizer,
                                                loss_function,
                                                epochs,
                                                exp_lr_scheduler,
                                                load=load_model,
                                                save_path=preditor_path)

    # ---- Evaluate on the held-out test set ------------------------------------
    if args.predictor != 'GCN':
        dl_result = model(X_testTensor).detach().cpu().numpy()
    else:
        dl_result = model(GCN_testTensor,
                          Adj_testTensor).detach().cpu().numpy()

    logging.info('Performances: R/Pearson/Mse/')

    if prediction == "regression":
        logging.info(r2_score(dl_result, Y_test))
        logging.info(pearsonr(dl_result.flatten(), Y_test.flatten()))
        logging.info(mean_squared_error(dl_result, Y_test))
    else:
        lb_results = np.argmax(dl_result, axis=1)
        pb_results = dl_result[:, 1]  # score for the positive class

        report_dict = classification_report(Y_test,
                                            lb_results,
                                            output_dict=True)
        report_df = pd.DataFrame(report_dict).T
        ap_score = average_precision_score(Y_test, pb_results)
        auroc_score = roc_auc_score(Y_test, pb_results)

        report_df['auroc_score'] = auroc_score
        report_df['ap_score'] = ap_score

        report_df.to_csv("saved/logs/" + reduce_model + args.predictor +
                         prediction + select_drug + now + '_report.csv')

        logging.info(classification_report(Y_test, lb_results))
        logging.info(average_precision_score(Y_test, pb_results))
        logging.info(roc_auc_score(Y_test, pb_results))

        # Stratified random baseline as a reference for the ROC figure.
        model = DummyClassifier(strategy='stratified')
        model.fit(X_train, Y_train)
        yhat = model.predict_proba(X_test)
        naive_probs = yhat[:, 1]

        ut.plot_roc_curve(Y_test,
                          naive_probs,
                          pb_results,
                          title=str(roc_auc_score(Y_test, pb_results)),
                          path="saved/figures/" + reduce_model +
                          args.predictor + prediction + select_drug + now +
                          '_roc.pdf')
        ut.plot_pr_curve(Y_test,
                         pb_results,
                         title=average_precision_score(Y_test, pb_results),
                         path="saved/figures/" + reduce_model +
                         args.predictor + prediction + select_drug + now +
                         '_prc.pdf')
    SCORES['Y_TRUE'][tooth_type] = np.concatenate(
        [SCORES['Y_TRUE'][tooth_type],
         final_true.flatten()])
    SCORES['Y_PRED'][tooth_type] = np.concatenate(
        [SCORES['Y_PRED'][tooth_type],
         final_pred.flatten()])

    print('========')
'''
######  PLOT ROC/AUC CURVE #############
'''
for group_type in SCORES['GROUP_TYPE']:
    print("=============  GROUPE {} =====================".format(group_type))

    plot_roc_curve(SCORES['Y_TRUE'][group_type], SCORES['Y_PRED'][group_type],
                   group_type)

    print('DICE : {}'.format(
        np_dice_coef(SCORES['Y_TRUE'][group_type],
                     SCORES['Y_PRED'][group_type])))

    matrix_data = confusion_matrix(SCORES['Y_TRUE'][group_type],
                                   SCORES['Y_PRED'][group_type])
    plot_confusion_matrix(cm=matrix_data,
                          normalize=True,
                          target_names=['Background', 'Carry'],
                          title='Confusion Matrix for {}'.format(group_type),
                          cmap=plt.cm.Blues)

    try:
        tn, fp, fn, tp = confusion_matrix(