Example #1
def pmid_26033813_analysis(drug: str):
    tree = build_tree()

    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_expr = expr.loc[selected_samples, :]

    fit_tree(selected_expr, selected_labels, tree)

    predictions = pd.Series(
        [
            predict_sample(sample_name, selected_expr, tree)
            for sample_name in selected_samples
        ],
        index=selected_samples,
    )

    rd = RocData.calculate(selected_labels, predictions)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26033813 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, predictions)
    plot_pr(pr, f'PMID26033813 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
Example #2
    def roc_pr(self):
        '''
        tn, fp, fn, tp = metrics.confusion_matrix(self.labels, self.predicts).ravel()
        fpr = fp / (fp + tn)
        fnr = fn / (fn + tp)
        utils.plot_det(fpr, fnr)
        '''

        # label 1 marks the positive (True) class
        # ROC curve
        self.fpr, self.tpr, self.ths = metrics.roc_curve(y_true=self.labels,
                                                         y_score=self.predicts,
                                                         pos_label=1)
        print('fpr', self.fpr)
        print('tpr', self.tpr)
        print('ths', self.ths)
        self.auc = metrics.auc(self.fpr, self.tpr)
        print('AUC', self.auc)

        utils.plot_roc(self.fpr,
                       self.tpr,
                       self.ths,
                       self.auc,
                       save_path='output/roc.png')

        # PR-curve
        self.precision, self.recall, thresholds = metrics.precision_recall_curve(
            y_true=self.labels, probas_pred=self.predicts, pos_label=1)
        print("precision len", len(self.precision))
        print("recall len", len(self.recall))
        print("thresholds len", len(thresholds))
        utils.plot_pr(self.precision,
                      self.recall,
                      thresholds,
                      save_path='output/pr.png')
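The utils module is not shown in this example; a minimal matplotlib sketch of a plot_roc matching the call signature above (names and styling are assumptions) might be:

# A minimal sketch, assuming this signature; not the project's actual utils.
import matplotlib.pyplot as plt

def plot_roc(fpr, tpr, ths, auc, save_path=None):
    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC (AUC = {auc:.3f})')
    plt.plot([0, 1], [0, 1], '--', color='grey')  # chance line
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')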
Example #3
def test(model, dataloader, epoch, is_graph=False):
    global best_test
    labels, distances = [], []
    with torch.set_grad_enabled(False):
        comparer = FullPairComparer().cuda()
        model.eval()
        for batch_idx, (data1, data2, target) in enumerate(dataloader):
            target = target.cuda(non_blocking=True)

            output1 = model(data1, False)
            output2 = model(data2, False)
            dist = comparer(output1, output2)  #TODO: sign - torch.sign()
            #dist = comparer(torch.sign(F.relu(output1)), torch.sign(F.relu(output2)))  # TODO: sign - torch.sign()
            distances.append(dist.data.cpu().numpy())
            labels.append(target.data.cpu().numpy())
            if batch_idx % 50 == 0:
                print('Batch-Index -{}'.format(str(batch_idx)))

    labels = np.array([sublabel for label in labels for sublabel in label])
    distances = np.array([subdist for dist in distances for subdist in dist])
    tpr, fpr, fnr, fpr_optimum, fnr_optimum, accuracy, threshold = evaluate(
        distances, labels)

    EER = np.mean(fpr_optimum + fnr_optimum) / 2
    print('TEST - Accuracy           = {:.12f}'.format(accuracy))
    print('TEST - EER                = {:.12f}'.format(EER))
    is_best = EER <= best_test
    best_test = min(EER, best_test)

    if is_best and is_graph:
        plot_roc(fpr,
                 tpr,
                 figure_name=args.outdir + '/Test_ROC-{}.png'.format(epoch))
        plot_DET_with_EER(fpr,
                          fnr,
                          fpr_optimum,
                          fnr_optimum,
                          figure_name=args.outdir +
                          '/Test_DET-{}.png'.format(epoch))
        plot_density(distances,
                     labels,
                     figure_name=args.outdir +
                     '/Test_DENSITY-{}.png'.format(epoch))
        df_results = pd.DataFrame({
            'distances': distances.transpose(),
            'labels': labels.transpose()
        })
        df_results.to_csv(args.outdir + "/test_outputs.csv", index=False)

        if not args.evaluate:
            shutil.copyfile(args.outdir + '/model_best.pth.tar',
                            args.outdir + '/test_model_best.pth.tar')

    return EER
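The evaluate() helper is not shown here; a plausible sketch matching the unpacking above, assuming verification-style labels where a small distance means a matching pair (all names are assumptions), could be:

# A sketch of evaluate(): sweep distance thresholds and score "same"
# whenever the distance falls below the threshold.
import numpy as np

def evaluate(distances, labels, n_thresholds=1000):
    thresholds = np.linspace(distances.min(), distances.max(), n_thresholds)
    tpr = np.empty(n_thresholds)
    fpr = np.empty(n_thresholds)
    fnr = np.empty(n_thresholds)
    acc = np.empty(n_thresholds)
    pos = labels == 1
    neg = ~pos
    for k, t in enumerate(thresholds):
        pred = distances < t  # "same" when embeddings are close
        tpr[k] = (pred & pos).sum() / max(pos.sum(), 1)
        fpr[k] = (pred & neg).sum() / max(neg.sum(), 1)
        fnr[k] = 1.0 - tpr[k]
        acc[k] = (pred == pos).mean()
    eer_idx = np.argmin(np.abs(fpr - fnr))  # equal-error-rate operating point
    best_idx = np.argmax(acc)
    return (tpr, fpr, fnr, fpr[eer_idx], fnr[eer_idx],
            acc[best_idx], thresholds[best_idx])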
Example #4
def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
    selected_labels = labels_all.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_expr)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'Ki67 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, selected_expr)
    plot_pr(pr, f'Ki67 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
Example #5
def pmid_26892682_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # .as_matrix() was removed in pandas 1.0; .to_numpy() is the replacement
    ln_p_over_1_minus_p = selected_expr.to_numpy() @ coefs.to_numpy()
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
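RocData and PrData are helpers from this project and are not shown; a minimal RocData sketch built on scikit-learn (an assumption about the real implementation) might look like:

# A sketch only; the project's actual RocData is not shown.
import pickle
from dataclasses import dataclass
import numpy as np
from sklearn.metrics import auc, roc_curve

@dataclass
class RocData:
    fpr: np.ndarray
    tpr: np.ndarray
    thresholds: np.ndarray
    auc: float

    @classmethod
    def calculate(cls, labels, scores):
        fpr, tpr, thresholds = roc_curve(labels, scores)
        return cls(fpr, tpr, thresholds, auc(fpr, tpr))

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)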
Example #6
def validate(model, epoch):
    model.eval()
    labels, distances = [], []

    pbar = tqdm(enumerate(test_loader))

    for batch_idx, (data_a, data_p, label) in pbar:  # label is 0 or 1
        if args.cuda:
            data_a, data_p = data_a.cuda(), data_p.cuda()
        # Variable(..., volatile=True) is deprecated; plain tensors work here,
        # ideally with this loop run under `with torch.no_grad():`.

        out_a = model(data_a, None, None)
        out_p = model(data_p, None, None)

        #one batch dists
        dists = l2_dist.forward(
            out_a, out_p
        )  #torch.sqrt(torch.sum((out_a - out_p) ** 2, 1))  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(label.data.cpu().numpy())

        if batch_idx % args.log_interval == 0:
            pbar.set_description('Test Epoch: {} [{}/{} ({:.0f}%)]'.format(
                epoch, batch_idx * len(data_a), len(test_loader.dataset),
                100. * batch_idx / len(test_loader)))

    labels = np.array([sublabel for label in labels for sublabel in label])
    distances = np.array([subdist for dist in distances for subdist in dist])

    tpr, fpr, accuracy, best_threshold = evaluate(distances, labels)
    print('\n\33[91mTest set: Accuracy: {:.8f} best_threshold: {:.2f}\33[0m'.
          format(np.mean(accuracy), best_threshold))
    logger.log_value('Test Accuracy', np.mean(accuracy))
    plot_roc(fpr,
             tpr,
             args.log_dir,
             figure_name="roc_test_epoch_{}.png".format(epoch))
    return np.mean(accuracy)
Example #7
    def train(self):
        with self.graph.as_default():
            # TODO: add resume option
            # TODO: add exception handler
            train_handle = self.sess.run(
                self.data.train_iterator.string_handle())
            valid_handle = self.sess.run(
                self.data.valid_iterator.string_handle())
            for epoch in range(self.hparams.epoch_num):
                self.sess.run(self.data.train_iterator.initializer)
                self.sess.run(self.data.valid_iterator.initializer)

                log.infov('Epoch %i' % epoch)
                # Train
                self._train_epoch('epoch %i (training)' % epoch, train_handle)
                # Validate
                y_score, y = self._evaluate('epoch %i (evaluating)' % epoch,
                                            valid_handle)

            # call roc
            y_score = np.concatenate(y_score, axis=0)
            y = np.concatenate(y, axis=0)
            plot_roc(y_score, y)
Example #8
    def test_detector(self, test_path, saved_model):

        test_data, test_labels = self.load_dataset(test_path)

        model = load_model(saved_model)

        pred = model.predict(test_data)
        pred = np.argmax(pred, axis=1)

        # y_compare = np.argmax(test_labels, axis=1)
        score = metrics.accuracy_score(test_labels, pred)
        print("Final accuracy: {}".format(score))

        # Compute confusion matrix
        cm = confusion_matrix(test_labels, pred)
        np.set_printoptions(precision=2)
        print('Confusion matrix, without normalization')
        print(cm)
        plt.figure()
        utils.plot_confusion_matrix(cm, names=['M', 'B'], plot_name='metrics/conf_matrix.png')

        #plot ROC curve
        # pred = [pred[i] for i in np.nonzero(pred)]  # Only positive cases - benign
        utils.plot_roc(pred, test_labels, plot_name='metrics/roc_curve.png')
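Note that pred holds hard 0/1 labels after the argmax, so the resulting ROC has only one interior point. Assuming utils.plot_roc accepts continuous scores (an assumption about its signature), passing the positive-class probability instead yields the usual curve:

# Hypothetical variant: score with class probabilities instead of hard labels.
probs = model.predict(test_data)   # softmax outputs, shape (n_samples, 2)
utils.plot_roc(probs[:, 1], test_labels, plot_name='metrics/roc_curve.png')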
Example #9
# 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
prediction_scores = np.empty(y.shape[0], dtype='object')

for train_idx, val_idx in tqdm(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train = y[train_idx]

    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_val)[:, 1]

    # Save the predictions for this fold
    prediction_scores[val_idx] = y_pred

plt.title('SVM 5-fold cross validation ROC AUC')
plot_roc(y, prediction_scores)
plt.savefig('report/figures/svm_roc.png', dpi=300)

plot_prediction_samples(imgs, y, prediction_scores, 'SVM Prediction Samples')
plt.savefig('report/figures/svm_confmat.png', dpi=300)
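The manual out-of-fold loop above can also be expressed with scikit-learn's built-in helper; given the same clf, skf, X, and y, this produces effectively the same prediction_scores (modulo the estimator being cloned per fold):

# Out-of-fold positive-class scores via scikit-learn's built-in helper.
from sklearn.model_selection import cross_val_predict

prediction_scores = cross_val_predict(clf, X, y, cv=skf,
                                      method='predict_proba')[:, 1]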
# %%

# load and preprocess test data then create submission
X_test, test_ids = get_data(test=True)
X_test = np.stack([get_HOG(img, **hog_params) for img in X_test])

clf = clf.fit(X, y)
test_predictions = clf.predict_proba(X_test)[:, 1]
make_submission(test_ids,
                test_predictions,
                fname='submissions/svc_10_hog_16_4_fulltrain.csv')
Example #10
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)

    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(
        list), defaultdict(list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []

    cms = []

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median], desc, precisions[label][median],
                    recalls[label][median], label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc, tprs[label][median],
                     fprs[label][median], label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #11
        # (The beginning of this example is truncated; a per-permutation loop
        # like the following is assumed.)
        for i, rd in roc_data.items():
            plt.plot(
                rd.fpr,
                rd.tpr,
                lw=1,
                label=f'Permutation {i} (area = {rd.auc:.{SIGNIFICANT_DIGITS}f})')

        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.gca().set_aspect('equal', 'datalim')
        plt.legend(loc='lower right')
        plt.title(f'SNF Clustering: {drug.title()}')

        figure_path = output_path / f'roc_comparison_{drug}.pdf'
        print('Saving ROC plot to', figure_path)
        plt.savefig(str(figure_path), bbox_inches='tight')

    best_permutation = aucs.idxmax()
    rd = roc_data[best_permutation]

    plot_roc(
        roc_data[best_permutation],
        f'SNF Clustering ROC: {drug.title()}',
        output_path / f'roc_best_{drug}.pdf',
    )

    rd.save(data_path / f'roc_data_{drug}.pickle')
Example #12
def train_model(clf_factory, X, Y, name, plot=False):
    """
        Trains and saves model to disk.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    # print("cv =", cv)
    train_errors = []
    test_errors = []

    scores = []

    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = (defaultdict(list), defaultdict(list),
                                       defaultdict(list))

    roc_scores = defaultdict(list)
    tprs, fprs = defaultdict(list), defaultdict(list)

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
    """ for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)"""

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median],
                    desc,
                    precisions[label][median],
                    recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print(summary)

    #save the trained model to disk
    joblib.dump(
        clf,
        r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl'
    )

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #13
patient_gene_set_muts = pd.DataFrame(0, index=muts.index, columns=range(len(entrez_gene_sets)))

for i, gene_set in enumerate(entrez_gene_sets):
    patient_gene_set_muts.loc[:, i] = muts.loc[:, gene_set].any(axis=1).astype(int)

pathway_mut_counts = patient_gene_set_muts.sum(axis=1)

gene_set_mut_matrix_path = data_path / 'gene_set_mut_matrix.pickle'
print('Saving gene set mutation matrix to', gene_set_mut_matrix_path)
patient_gene_set_muts.to_pickle(gene_set_mut_matrix_path)

pathway_mut_count_path = data_path / 'pathway_mut_counts.pickle'
print('Saving pathway mutation counts to', pathway_mut_count_path)
pathway_mut_counts.to_pickle(pathway_mut_count_path)

drugs = ['ai_all', 'arimidex']

feature_label_path = find_newest_data_path(f'compute_drug_features_labels_alpha_{args.alpha:.2f}')

for drug in drugs:
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, pathway_mut_counts.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_counts = pathway_mut_counts.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_counts)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'WExT Pathway Mutation Count ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')
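As a side note, the column-by-column loop near the top of this example can be written as a single constructor call; a sketch with the same muts and entrez_gene_sets:

# Equivalent one-pass construction of the gene-set membership matrix.
patient_gene_set_muts = pd.DataFrame({
    i: muts.loc[:, gene_set].any(axis=1).astype(int)
    for i, gene_set in enumerate(entrez_gene_sets)
})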
Example #14
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)

    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting", genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median],
                    desc,
                    precisions[label][median],
                    recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #15
def test_task(dl_model,
              processed_train_outputs,
              best_checkpoint,
              version,
              batch_size,
              device='0'):
    checkpoint = best_checkpoint
    DEVICE = device
    VERSION = version
    BATCH_SIZE = batch_size

    [
        numeric_input_train, numeric_input_dev, numeric_input_test,
        numeric_input_train_bureau, numeric_input_dev_bureau,
        numeric_input_test_bureau, numeric_input_train_prev_app,
        numeric_input_dev_prev_app, numeric_input_test_prev_app
    ] = processed_train_outputs["normalized_data"]

    [
        categorical_input_train, categorical_input_dev, categorical_input_test,
        categorical_input_train_bureau, categorical_input_dev_bureau,
        categorical_input_test_bureau, categorical_input_train_prev_app,
        categorical_input_dev_prev_app, categorical_input_test_prev_app
    ] = processed_train_outputs["categorical_inputs"]

    [data_normalizer_app, data_normalizer_bur,
     data_normalizer_prev_app] = processed_train_outputs["data_normalizers"]
    [target_train, target_dev,
     target_test] = processed_train_outputs["targets"]
    [application_predict, bureau_predict,
     previous_application_predict] = processed_train_outputs["predict_data"]

    # Instantiating Batcher object
    predict_batcher = U.Batcher([
        numeric_input_test, categorical_input_test, numeric_input_test_bureau,
        categorical_input_test_bureau, numeric_input_test_prev_app,
        categorical_input_test_prev_app
    ],
                                BATCH_SIZE,
                                shuffle_on_reset=False)

    predictions_test = []
    # predictions = np.zeros([len(all_ids_test)])
    with M.start_tensorflow_session(device=DEVICE) as sess:
        dl_model.model_saver.restore(
            sess,
            "models/dl_model_" + str(VERSION) + "/model_" + str(VERSION) +
            "-" + str(checkpoint),
        )
        for i in range(predict_batcher.n_batches):
            # Get next batch
            (batch_numeric_input_predict, batch_categorical_input_predict,
             batch_numeric_input_predict_bu, batch_categorical_input_predict_bu,
             batch_numeric_input_predict_prev_app,
             batch_categorical_input_predict_prev_app) = predict_batcher.next()

            # Creating feed_dict

            feed_dict_predict = {
                dl_model.placeholders.numeric_input:
                batch_numeric_input_predict,
                dl_model.placeholders.numeric_input_bureau:
                batch_numeric_input_predict_bu,
                dl_model.placeholders.numeric_input_prev_app:
                batch_numeric_input_predict_prev_app
            }

            # Use j for the embedding loops so the batch index i isn't shadowed.
            for j in range(
                    len(C.col_classes["application_train"]["categorical"])):
                feed_dict_predict[dl_model.placeholders.embedding[
                    C.col_classes["application_train"]["categorical"]
                    [j]]] = batch_categorical_input_predict[:, j].reshape([-1, 1])

            for j in range(len(C.col_classes["bureau"]["categorical"])):
                feed_dict_predict[dl_model.placeholders.embedding_bureau[
                    C.col_classes["bureau"]["categorical"]
                    [j]]] = np.expand_dims(
                        batch_categorical_input_predict_bu[:, :, j], axis=2)

            for j in range(
                    len(C.col_classes["previous_application"]["categorical"])):
                feed_dict_predict[dl_model.placeholders.embedding_prev_app[
                    C.col_classes["previous_application"]["categorical"]
                    [j]]] = np.expand_dims(
                        batch_categorical_input_predict_prev_app[:, :, j],
                        axis=2)

            # Run forward prop
            pred = sess.run(dl_model.forward.pred, feed_dict=feed_dict_predict)
            predictions_test.append(pred)

    final_prediction_test = []
    for pred_i in predictions_test:
        for elem in pred_i:
            final_prediction_test.append(elem)
    final_prediction_test = np.squeeze(np.array(final_prediction_test))
    U.plot_roc(target_test, final_prediction_test)
    U.print_distribution(final_prediction_test)
    uplift = U.get_uplift(target_test,
                          final_prediction_test,
                          N=100,
                          plot="uplift_acum")

    return {"prediction": final_prediction_test, "uplift": uplift}
Example #16
def train_using_pretrained_model(images, labels, path, net, epochs=10, learning_rate=0.0001, batch_size=32):
    best_accuracy = 0.0
    train_loss, test_loss = [], []
    train_acc, test_acc = [], []
    roc = []
    roc_score = []
    roc_true = []

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Training data
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=6)
    train_data = Dataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)

    # Testing data
    test_data = Dataset(X_test, y_test)
    test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=batch_size)

    for epoch in range(epochs):
        train_loss_it, train_acc_it = [], []
        test_loss_it, test_acc_it = [], []
        roc_score_it, roc_true_it = [], []
        total_step = len(train_loader)
        for i, (images, labels) in enumerate(train_loader):
            images = images.reshape(len(images), 1, 224, 224)
            optimizer.zero_grad()
            outputs = net(images)

            loss = criterion(outputs.double(), labels)

            loss.backward()
            optimizer.step()

            # Accuracy
            predicted = torch.round(outputs.data)
            total = labels.size(0) * labels.size(1)
            correct = (predicted == labels).sum().item()
            accuracy = 100 * correct / total

            print('Epoch [{}/{}], Step [{}/{}], Train-Loss: {:.4f}, Train-Acc: {:.2f} %'
                  .format(epoch + 1, epochs, i + 1, total_step, loss.item(), accuracy))

            train_acc_it.append(accuracy)
            train_loss_it.append(loss.item())

        train_acc.append(np.mean(np.array(train_acc_it)))
        train_loss.append(np.mean(np.array(train_loss_it)))

        total = 0.0
        correct = 0.0
        total_step = len(test_loader)
        # Note: this evaluation loop should ideally run under net.eval() and
        # torch.no_grad(), as in the next example.
        for i, (images, labels) in enumerate(test_loader):
            images = images.reshape(len(images), 1, 224, 224)
            outputs = net(images)

            loss = criterion(outputs.double(), labels)

            predicted = torch.round(outputs.data)
            total += labels.size(0) * labels.size(1)
            correct += (predicted == labels).sum().item()
            accuracy = 100 * correct / total

            true = np.array(labels).reshape(-1)
            score = np.array(outputs.data).reshape(-1)
            roc.append(roc_auc_score(true, score))

            roc_score_it.extend(np.array(outputs.data).reshape(-1))
            roc_true_it.extend(np.array(labels).reshape(-1))

            test_acc_it.append(accuracy)
            test_loss_it.append(loss.item())

        test_accuracy = 100 * correct / total

        test_acc.append(np.mean(np.array(test_acc_it)))
        test_loss.append(np.mean(np.array(test_loss_it)))

        print('[Test] Epoch [{}/{}], Acc: {:.2f}'.format(epoch + 1, epochs, test_accuracy))


        if test_accuracy > best_accuracy:
            torch.save(net.state_dict(), path)
            best_accuracy = test_accuracy

        if (epoch + 1) % 10 == 0:
            roc_score.append(roc_score_it)
            roc_true.append(roc_true_it)

    # ROC
    if epochs > 9:
        true = np.array(roc_true)
        score = np.array(roc_score)

        plot_roc_binary(true, score, './results/transfer_bin_roc.pdf', 'Transfer Binary Classifier COVID')

    plot_roc(roc, './results/transfer_bin_roc_auc.pdf', 'Transfer Binary Classifier COVID')
    plot_loss(train_loss, test_loss, './results/transfer_bin_loss.pdf', 'Transfer Binary Classifier COVID')
    plot_acc(train_acc, test_acc, './results/transfer_bin_acc.pdf', 'Transfer Binary Classifier COVID')
Example #17
def train_model(images, labels, path, epochs=10, learning_rate=0.0001, batch_size=32):
    net = Net()
    train_loss, test_loss = [], []
    train_acc, test_acc = [], []
    roc_score = []
    roc_true = []
    roc = []

    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # Generate dataset
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=6)
    train_data = Dataset(X_train, y_train)
    train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=batch_size)

    test_data = Dataset(X_test, y_test)
    test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=batch_size)

    for epoch in range(epochs):
        train_loss_it, train_acc_it = [], []
        test_loss_it, test_acc_it = [], []
        roc_score_it, roc_true_it = [], []
        net.train()
        total_step = len(train_loader)
        for i, (images, labels) in enumerate(train_loader):
            images = images.reshape(len(images), 1, 224, 224)

            # Forward pass
            outputs = net(images)
            loss = criterion(outputs, labels)

            # Accuracy
            predicted = torch.round(outputs.data).reshape(len(labels))
            total = labels.size(0)
            correct = (predicted == labels).sum().item()
            accuracy = 100 * correct / total

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print('[Train] Epoch [{}/{}], Step [{}/{}], Train-Loss: {:.4f}, Train-Acc: {:.2f} %'
                  .format(epoch + 1, epochs, i + 1, total_step, loss.item(), accuracy))

            train_acc_it.append(accuracy)
            train_loss_it.append(loss.item())

        train_acc.append(np.mean(np.array(train_acc_it)))
        train_loss.append(np.mean(np.array(train_loss_it)))

        net.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            total_step = len(test_loader)
            for i, (images, labels) in enumerate(test_loader):
                images = images.reshape(len(images), 1, 224, 224)
                outputs = net(images)

                loss = criterion(outputs, labels)

                predicted = torch.round(outputs.data).reshape(len(labels))
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                accuracy = 100 * correct / total

                roc_score_it.extend(np.array(outputs.data).reshape(-1))
                roc_true_it.extend(np.array(labels).reshape(-1))

                test_acc_it.append(accuracy)
                test_loss_it.append(loss.item())

                true = np.array(labels).reshape(-1)
                score = np.array(outputs.data).reshape(-1)
                roc.append(roc_auc_score(true, score))

                print('[Test] Epoch [{}/{}], Step [{}/{}], Test-Loss: {:.4f}, Test-Acc: {:.2f}%'
                      .format(epoch + 1, epochs, i + 1, total_step, loss.item(), accuracy))

            if (epoch + 1) % 10 == 0:
                roc_score.append(roc_score_it)
                roc_true.append(roc_true_it)

            test_acc.append(np.mean(np.array(test_acc_it)))
            test_loss.append(np.mean(np.array(test_loss_it)))

    # Save the model checkpoint
    torch.save(net.state_dict(), path)

    # ROC
    if epochs > 9:
        true = np.array(roc_true)
        score = np.array(roc_score)


        plot_roc_binary(true, score, './results/bin_roc.pdf', 'Binary Classifier COVID')
    plot_loss(train_loss, test_loss, './results/bin_loss.pdf', 'Binary Classifier COVID')
    plot_acc(train_acc, test_acc, './results/bin_acc.pdf', 'Binary Classifier COVID')
    plot_roc(roc, './results/bin_roc_auc.pdf', 'Simple Binary Classifier COVID')

    return net
Example #18
grdtruth = []
with torch.no_grad():
    dataloader = DataLoader(train_dataset, batch_size=1000, shuffle=False)
    for data in dataloader:
        truth = data[2]
        if len(grdtruth) <= 0:
            grdtruth = truth
            continue
        grdtruth = torch.cat((grdtruth, truth))
    grdtruth = grdtruth.numpy()
classifier.fit(features, grdtruth)  # fit() returns the estimator, not predictions
accuracy_clf = classifier.score(features, grdtruth)
print('accuracy: ', accuracy_clf)


def print_acc(model, dataset, print_note=''):
    acc = models.cal_accuracy(dataset, model)
    print(print_note, 'accuracy: ', acc)


print('Network')
print_acc(model, train_dataset, print_note='train')
print_acc(model, dev_dataset, print_note='validation')
# plot roc
fpr_train, tpr_train, rocauc_train = models.get_roc(train_dataset, model)
fpr_dev, tpr_dev, rocauc_dev = models.get_roc(dev_dataset, model)
plot_roc([fpr_train, fpr_dev], [tpr_train, tpr_dev],
         [rocauc_train, rocauc_dev])

# save model
torch.save(model.state_dict(), f'{model_path}/ae_on_{data_class}.pth')
Example #19
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)  # get the list of classes
    # Randomly pick 30% of X's 600 samples as the test set, one split.
    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []  # stores test-set accuracies
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)  # per-label true-positive rates
    fprs = defaultdict(list)  # per-label false-positive rates

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv.split(X):
        # `train` holds 420 random indices out of the 600; `test` the other 180.
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        # X_train now holds the 420 training 13-dimensional vectors and y_train
        # their class names; X_test / y_test hold the 180 test counterparts.
        clf = clf_factory()
        clf.fit(X_train, y_train)  # fit a logistic-regression model on the training set

        clfs.append(clf)  # keep the model fitted on each split

        train_score = clf.score(X_train, y_train)  # accuracy on the training set
        test_score = clf.score(X_test, y_test)  # accuracy on the test set
        scores.append(test_score)

        train_errors.append(1 - train_score)  # training error rate
        test_errors.append(1 - test_score)  # test error rate

        y_pred = clf.predict(X_test)  # predicted genre for each of the 180 test songs
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            # y_test has 180 rows giving each song's true class.
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)  # X_test is 180 rows x 13 columns
            # proba is 180 rows x 6 columns: per-class probabilities per song.
            proba_label = proba[:, label]
            # proba_label is one 180-row column of proba.
            # precision_recall_curve takes two arguments: y_label_test, a
            # 180-row 0/1 vector marking whether each song belongs to `label`,
            # and proba_label, the predicted probability of that class.
            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print "Plotting", genre_list[label]
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

            desc = "%s %s" % (name, genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #20
    def plot_roc(self, X, y, x_size=12, y_size=12):
        """Plot the ROC curve for X and y."""
        plot_roc(self.clf, X, y, x_size, y_size)
Example #21
def measure(clf_class, parameters, name, qa_X, qa_Y, data_size=None, plot=False):

    feature_names = np.array((
        'NumTextTokens',
        'NumCodeLines',
        'LinkCount',
        'AvgSentLen',
        'AvgWordLen',
        'NumAllCaps',
        'NumExclams',
        'NumImages',
    ))

    classifying_answer = "good"
    avg_scores_summary = []
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n_splits=10)

    train_errors = []
    test_errors = []

    scores = []
    roc_scores = []
    fprs, tprs = [], []

    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
        # print(classification_report(y_test, proba[:, label_idx] >
        #       0.63, target_names=['not accepted', 'accepted']))

    # get medium clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium], classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)
    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    mean_train = np.mean(train_errors)
    mean_test = np.mean(test_errors)

    return mean_train, mean_test, avg_scores_summary
Example #22
import time
from logistic_example import run_logistic_model
from deep_wide import run_deep_wide_model
from deepFM import run_deepfm_model
from fm2 import run_fm_model
from xdeepfm import run_xdeepfm_model
from utils import plot_roc
from sklearn.metrics import roc_curve

if __name__ == '__main__':
    fpr_list, tpr_list, auc_list, name_list, timing_list = [], [], [], [], []
    func_list = [run_logistic_model, run_deep_wide_model, run_deepfm_model, run_xdeepfm_model, run_fm_model]
    for func in func_list:
        print('Running', func.__name__)
        t = time.process_time()
        pred_ans, y_test, auc, model_name = func()
        elapsed_time = time.process_time() - t
        timing_list.append((func.__name__, elapsed_time))
        fpr, tpr, thresholds = roc_curve(y_test, pred_ans, pos_label=0)  # NB: class 0 is treated as positive here
        fpr_list.append(fpr)
        tpr_list.append(tpr)
        auc_list.append(auc)
        name_list.append(model_name)
    for func_name, elapsed in timing_list:
        print(func_name, ': ', elapsed, 's')
    plot_roc(tpr_list, fpr_list, auc_list, name_list)
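The utils.plot_roc used here overlays several models on one plot; a minimal sketch compatible with the call above (names and styling are assumptions, not the project's actual utils) might be:

# A sketch of a multi-model ROC overlay matching this call signature.
import matplotlib.pyplot as plt

def plot_roc(tpr_list, fpr_list, auc_list, name_list):
    plt.figure()
    for tpr, fpr, auc_value, name in zip(tpr_list, fpr_list, auc_list, name_list):
        plt.plot(fpr, tpr, lw=1, label=f'{name} (AUC = {auc_value:.3f})')
    plt.plot([0, 1], [0, 1], '--', color='grey', label='Chance')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.show()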

Example #23
def train(train_loader, model, optimizer, epoch):
    model.train()
    pbar = tqdm(enumerate(train_loader))
    labels, distances = [], []

    for batch_idx, (data_a, data_p, data_n, label_p, label_n) in pbar:
        if args.cuda:
            data_a, data_p, data_n = data_a.cuda(), data_p.cuda(), data_n.cuda(
            )

        # compute output
        #triplet_loss,distsAN,distsAP,len_hard_triplets0= model(data_a,data_p,data_n,label_p,label_n,args)
        out_a, out_p, out_n = model(data_a), model(data_p), model(data_n)
        # because of the custom loss function, we can't compute the loss in the forward pass

        # Choose the hard negatives
        d_p = l2_dist.forward(out_a, out_p)
        d_n = l2_dist.forward(out_a, out_n)
        mask = (d_n - d_p < args.margin).cpu().data.numpy().flatten()
        hard_triplets = np.where(mask == 1)
        if len(hard_triplets[0]) == 0:
            continue

        out_selected_a = out_a[hard_triplets]
        out_selected_p = out_p[hard_triplets]
        out_selected_n = out_n[hard_triplets]

        # we only use triplet loss,not combine with softmax there
        #selected_data_a = Variable(torch.from_numpy(data_a.cpu().data.numpy()[hard_triplets]).cuda())
        #selected_data_p = Variable(torch.from_numpy(data_p.cpu().data.numpy()[hard_triplets]).cuda())
        #selected_data_n = Variable(torch.from_numpy(data_n.cpu().data.numpy()[hard_triplets]).cuda())

        #selected_label_p = torch.from_numpy(label_p.cpu().numpy()[hard_triplets])
        #selected_label_n= torch.from_numpy(label_n.cpu().numpy()[hard_triplets])
        triplet_loss = TripletMarginLoss(args.margin).forward(
            out_selected_a, out_selected_p, out_selected_n)

        #cls_a = model.forward_classifier(selected_data_a)
        #cls_p = model.forward_classifier(selected_data_p)
        #cls_n = model.forward_classifier(selected_data_n)

        #criterion = nn.CrossEntropyLoss()
        #predicted_labels = torch.cat([cls_a,cls_p,cls_n])
        #true_labels = torch.cat([Variable(selected_label_p.cuda()),Variable(selected_label_p.cuda()),Variable(selected_label_n.cuda())])

        #cross_entropy_loss = criterion(predicted_labels.cuda(),true_labels.cuda())

        #loss = cross_entropy_loss + triplet_loss

        # compute gradient and update weights
        optimizer.zero_grad()
        triplet_loss.backward()
        optimizer.step()

        # update the optimizer learning rate
        #adjust_learning_rate(optimizer)

        logger.log_value('triplet_loss', triplet_loss.item()).step()

        if batch_idx % args.log_interval == 0:
            pbar.set_description(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} \t # of Selected Triplets: {}'
                .format(epoch, batch_idx * len(data_a),
                        len(train_loader.dataset),
                        100. * batch_idx / len(train_loader),
                        triplet_loss.item(), len(hard_triplets[0])))

        dists = l2_dist.forward(
            out_selected_a, out_selected_n
        )  #torch.sqrt(torch.sum((out_a - out_n) ** 2, 1))  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(np.zeros(dists.size(0)))

        dists = l2_dist.forward(
            out_selected_a, out_selected_p
        )  #torch.sqrt(torch.sum((out_a - out_p) ** 2, 1))  # euclidean distance
        distances.append(dists.data.cpu().numpy())
        labels.append(np.ones(dists.size(0)))

        if batch_idx % args.val_interval == 0:  # validate every val_interval batches
            testaccuracy = validate(model, epoch)
            model.train()
        if batch_idx % args.save_interval == 0:  # and batch_idx != 0:  # checkpoint every save_interval batches
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict()
            }, '{}/triplet_loss_checkpoint_{}_epoch{}_lfwAcc{:.4f}.pth'.format(
                args.log_dir, get_time(), epoch, testaccuracy))
            print(
                '=>saving model:triplet_loss_checkpoint_{}_epoch{}_lfwAcc{:.4f}.pth'
                .format(get_time(), epoch, testaccuracy))

    labels = np.array([sublabel for label in labels for sublabel in label])
    distances = np.array(
        [subdist[0] for dist in distances for subdist in dist])
    tpr, fpr, accuracy, val, val_std, far = evaluate(distances, labels)

    print('\n\33[91mTrain set: Accuracy: {:.8f}\33[0m'.format(
        np.mean(accuracy)))
    logger.log_value('Train Accuracy', np.mean(accuracy))

    plot_roc(fpr, tpr, figure_name="roc_train_epoch_{}.png".format(epoch))

    # do checkpointing
    torch.save({
        'epoch': epoch + 1,
        'state_dict': model.state_dict()
    }, '{}/triplet_loss_checkpoint_{}_epoch{}_lfwAcc{:.4f}.pth'.format(
        args.log_dir, get_time(), epoch, testaccuracy))
    print('=>saving model:triplet_loss_checkpoint_{}_epoch{}_lfwAcc{:.4f}.pth'.
          format(get_time(), epoch, testaccuracy))
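As an aside, recent PyTorch versions ship a built-in triplet loss, so the custom TripletMarginLoss above could plausibly be swapped out (an assumption that the margin semantics match this project's implementation):

# Built-in equivalent of the custom TripletMarginLoss used above.
import torch.nn as nn

triplet_loss_fn = nn.TripletMarginLoss(margin=args.margin, p=2)
triplet_loss = triplet_loss_fn(out_selected_a, out_selected_p, out_selected_n)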