def print_and_plot_scores(scores,
                          pr_scores,
                          train_errors,
                          test_errors,
                          precisions,
                          recalls,
                          name="NaiveBayes ngram"):
    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    plot_pr(pr_scores[median],
            name,
            "01",
            precisions[median],
            recalls[median],
            label=name)

    summary = (np.mean(scores), np.std(scores), np.mean(pr_scores),
               np.std(pr_scores))
    print("AVG Scores\tSTD Scores\tAVG PR Scores\tSTD PR Scores")
    print "%.3f\t\t%.3f\t\t%.3f\t\t\t%.3f\t" % summary

    avg_train_err, avg_test_err = np.mean(train_errors), np.mean(test_errors)
    print("AVG Training Error: %.3f  -- AVG Testing Error: %.3f" %
          (avg_train_err, avg_test_err))
    return avg_train_err, avg_test_err
Example #2
    def roc_pr(self):
        '''
        tn, fp, fn, tp = metrics.confusion_matrix(self.labels,
                                                  self.predicts).ravel()
        fpr = fp / (fp + tn)
        fnr = fn / (fn + tp)
        utils.plot_det(fpr, fnr)
        '''

        # use label 1 (True) as the positive class
        # ROC curve
        self.fpr, self.tpr, self.ths = metrics.roc_curve(y_true=self.labels,
                                                         y_score=self.predicts,
                                                         pos_label=1)
        print('fpr', self.fpr)
        print('tpr', self.tpr)
        print('ths', self.ths)
        self.auc = metrics.auc(self.fpr, self.tpr)
        print('AUC', self.auc)

        utils.plot_roc(self.fpr,
                       self.tpr,
                       self.ths,
                       self.auc,
                       save_path='output/roc.png')

        # PR-curve
        self.precision, self.recall, thresholds = metrics.precision_recall_curve(
            y_true=self.labels, probas_pred=self.predicts, pos_label=1)
        print("precision len", len(self.precision))
        print("recall len", len(self.recall))
        print("thresholds len", len(thresholds))
        utils.plot_pr(self.precision,
                      self.recall,
                      thresholds,
                      save_path='output/pr.png')
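
The method above is a thin wrapper around sklearn's curve functions. As a self-contained illustration (with made-up labels and scores, not data from the example), the same roc_curve / precision_recall_curve / auc calls look like this:

import numpy as np
from sklearn import metrics

labels = np.array([0, 0, 1, 1])            # toy ground truth
scores = np.array([0.1, 0.4, 0.35, 0.8])   # toy classifier scores

fpr, tpr, ths = metrics.roc_curve(y_true=labels, y_score=scores, pos_label=1)
print('AUC', metrics.auc(fpr, tpr))

precision, recall, pr_ths = metrics.precision_recall_curve(
    y_true=labels, probas_pred=scores, pos_label=1)
# precision and recall each have one more element than pr_ths
print(len(precision), len(recall), len(pr_ths))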
Example #3
def pmid_26033813_analysis(drug: str):
    tree = build_tree()

    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_expr = expr.loc[selected_samples, :]

    fit_tree(selected_expr, selected_labels, tree)

    predictions = pd.Series(
        [
            predict_sample(sample_name, selected_expr, tree)
            for sample_name in selected_samples
        ],
        index=selected_samples,
    )

    rd = RocData.calculate(selected_labels, predictions)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26033813 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, predictions)
    plot_pr(pr, f'PMID26033813 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
Example #4
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(n=len(X),
                      n_iter=10,
                      test_size=0.3,
                      indices=True,
                      random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        # NOTE: the same estimator object is re-fit on every fold, so each
        # entry in clfs ends up referring to the final fit
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        phase = "01"  # phase tag for plot_pr; "01" as in the sibling examples
        plot_pr(pr_scores[median],
                name,
                phase,
                precisions[median],
                recalls[median],
                label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores), np.mean(pr_scores),
               np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)
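
A recurring idiom in these examples is np.argsort(scores)[len(scores) // 2], which picks the index of the median-scoring fold so that a "typical" PR curve gets plotted. A quick sketch with made-up scores:

import numpy as np

pr_scores = [0.71, 0.93, 0.85, 0.78, 0.90]
median = np.argsort(pr_scores)[len(pr_scores) // 2]
print(median, pr_scores[median])  # 2 0.85 -- the middle-ranked fold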
Example #5
def train_model(clf_factory, X, Y, name, plot=False):
    # setting random_state to get deterministic behaviour
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, indices=True,
                      random_state=0)
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = clf_factory()
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        # clf.predict_proba returns the per-class probabilities for each entry in X_test
        proba = clf.predict_proba(X_test)

        # precision is the ability of the classifier not to label as positive a sample that is negative
        # recall is the ability to find all the positive samples
        precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:, 1])
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        phase = "02"
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
        u.plot_pr(pr_scores[median], name, phase, precisions[median],
                  recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores), np.mean(pr_scores),
               np.std(pr_scores))

    print("%.3f\t%.3f\t%.3f\t%.3f" % summary)
Example #6
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # cross-validation with shuffled splits
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        # train
        clf.fit(X_train, y_train)

        # compute the mean accuracy
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        # get probability estimates
        proba = clf.predict_proba(X_test)

        # fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        # compute precision and recall
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        # AUC
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    # draw the plot
    if plot:
        plot_pr(pr_scores[median],
                name,
                "01",
                precisions[median],
                recalls[median],
                label=name)

        summary = (np.mean(scores), np.std(scores), np.mean(pr_scores),
                   np.std(pr_scores))
        print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
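
Most of these examples target the pre-0.18 sklearn.cross_validation API (n=, n_iter=, indices=, and iterating the splitter object directly). Under the current sklearn.model_selection API, a roughly equivalent setup (a sketch, assuming X and Y are the arrays used above) would be:

from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
for train, test in cv.split(X):
    X_train, y_train = X[train], Y[train]
    X_test, y_test = X[test], Y[test]
    # ... fit and score exactly as in the loop bodies above ...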
Example #7
def train_model(SVMType, X, Y, name="SVM ngram", plot=False):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, random_state=0)
    vectorizer = StemmedTfidfCountVectorizer(min_df=1, stop_words='english')
    X = vectorizer.fit_transform(X)
    # cv = KFold(n=len(X), n_folds=10, indices=True)
    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    Fmeasures = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        #print X_train
        #print y_train
        #clf = clf_factory()
        #clf = svm.SVC(kernel='rbf')
        #clf = svm.SVC(kernel='poly')
        #clf = svm.SVC(kernel='linear')
        clf = svm.SVC(kernel=SVMType)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        #print train_score, test_score
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        
        # NOTE: predict() returns hard class labels, not probabilities,
        # so the PR curve below collapses to a single operating point
        proba = clf.predict(X_test)
        # print(proba)
        """
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        """
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba)
        Fmeasure = (2 * precision * recall) / (precision + recall)
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
        Fmeasures.append(Fmeasure)
    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
    
    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        summary = (np.mean(scores), np.std(scores),
                   np.mean(pr_scores), np.std(pr_scores))
        print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
        print("F-measure:" + str(np.mean(Fmeasures)))
    return np.mean(train_errors), np.mean(test_errors)
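
Example #7 feeds hard predict() outputs into precision_recall_curve, which collapses the curve to a single operating point. If continuous scores are wanted from an SVC, one option (a sketch, not part of the original example, assuming the X_train/X_test splits from the loop above) is decision_function, or probability=True for Platt-scaled probabilities:

from sklearn import svm

clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
scores = clf.decision_function(X_test)   # signed distances to the margin

# or, at extra training cost:
clf_p = svm.SVC(kernel='linear', probability=True)
clf_p.fit(X_train, y_train)
proba = clf_p.predict_proba(X_test)[:, 1]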
Example #8
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # setup cross-validation
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    # loop through cross-validation/test datasets and train/test the classifier
    for train, test in cv:
        # setup datasets
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        # invoke classifier (e.g., ngram_model) and fit data
        clf = clf_factory()
        clf.fit(X_train, y_train)

        # return correct prediction rate for cv datasets
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)


        # store errors
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        # returns pairs of probability for each observation of being 0 / 1
        proba = clf.predict_proba(X_test)

        # extract quality of model indicators
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        summary = (np.mean(scores), np.std(scores),
                   np.mean(pr_scores), np.std(pr_scores))
        print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)
Example #9
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    clfs = [] # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        phase = "01"  # phase tag for plot_pr; "01" as in the sibling examples
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)

        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)
Example #11
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    cv = ShuffleSplit(
        n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

        summary = (np.mean(scores), np.std(scores),
                   np.mean(pr_scores), np.std(pr_scores))
        print "%.3f\t%.3f\t%.3f\t%.3f\t" % summary

    return np.mean(train_errors), np.mean(test_errors)
Example #12
def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
    selected_labels = labels_all.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_expr)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'Ki67 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, selected_expr)
    plot_pr(pr, f'Ki67 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
Example #13
def pmid_26892682_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    ln_p_over_1_minus_p = selected_expr.to_numpy() @ coefs.to_numpy()
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}',
             output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}',
            output_path / f'{drug}_pr.pdf')
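
expit is the logistic function expit(x) = 1 / (1 + exp(-x)), so the matrix product above is a log-odds score that expit maps into [0, 1]. A small numeric check:

import numpy as np
from scipy.special import expit

x = np.array([-2.0, 0.0, 2.0])
print(expit(x))              # [0.1192 0.5    0.8808] (approx)
print(1 / (1 + np.exp(-x)))  # same values, computed directly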
Example #14
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)

    cv = ShuffleSplit(
        n=len(X), n_iter=1, test_size=0.3, indices=True, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(
        list), defaultdict(list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median], desc, precisions[label][median],
                    recalls[label][median], label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc, tprs[label][median],
                     fprs[label][median], label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #15
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []

    scores = []
    roc_scores = []
    fprs, tprs = [], []

    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
        print(classification_report(y_test, proba[:, label_idx] >
              0.63, target_names=['not accepted', 'accepted']))

    # get the median clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium],
                recalls[medium], classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)
    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0], recalls[
          idx80][0], thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
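
The tail of measure() prepends a 0 to the thresholds so they line up with precisions and recalls (precision_recall_curve returns one more precision/recall value than thresholds), then reads off the first operating point with precision >= 0.8. In isolation, with made-up arrays:

import numpy as np

precisions = np.array([0.60, 0.75, 0.82, 0.90, 1.00])
recalls = np.array([1.00, 0.80, 0.60, 0.30, 0.00])
thresholds = np.hstack(([0], np.array([0.2, 0.4, 0.6, 0.8])))

idx80 = precisions >= 0.8
print("P=%.2f R=%.2f thresh=%.2f" %
      (precisions[idx80][0], recalls[idx80][0], thresholds[idx80][0]))
# -> P=0.82 R=0.60 thresh=0.40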
Example #16
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []

    scores = []
    roc_scores = []
    fprs, tprs = [], []

    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for fold_idx, (train, test) in enumerate(cv):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        only_one_class_in_train = len(set(y_train)) == 1
        only_one_class_in_test = len(set(y_test)) == 1
        if only_one_class_in_train or only_one_class_in_test:
            # this would pose problems later on
            continue

        clf = clf_class(**parameters)

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        # This threshold is determined at the end of the chapter 5,
        # where we find conditions such that precision is in the area of
        # about 80%. With it we trade off recall for precision.
        threshold_for_detecting_good_answers = 0.59

        print("Clone #%i" % fold_idx)
        print(
            classification_report(
                y_test,
                proba[:, label_idx] > threshold_for_detecting_good_answers,
                target_names=['not accepted', 'accepted']))

    # get the median clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
    print("Medium clone is #%i" % medium)

    if plot:
        #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name, np.mean(scores), np.std(scores), np.mean(roc_scores),
               np.std(roc_scores), np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)
    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" %
          (precisions[idx80][0], recalls[idx80][0], thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
Example #17
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []

    scores = []
    roc_scores = []
    fprs, tprs = [], []

    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for fold_idx, (train, test) in enumerate(cv):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        only_one_class_in_train = len(set(y_train)) == 1
        only_one_class_in_test = len(set(y_test)) == 1
        if only_one_class_in_train or only_one_class_in_test:
            # this would pose problems later on
            continue

        clf = clf_class(**parameters)

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        # This threshold is determined at the end of the chapter 5,
        # where we find conditions such that precision is in the area of
        # about 80%. With it we trade off recall for precision.
        threshold_for_detecting_good_answers = 0.59

        print("Clone #%i" % fold_idx)
        print(classification_report(y_test, proba[:, label_idx] >
              threshold_for_detecting_good_answers, target_names=['not accepted', 'accepted']))

    # get the median clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
    print("Medium clone is #%i" % medium)

    if plot:
        #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium],
                recalls[medium], classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)
    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0], recalls[
          idx80][0], thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
Example #18
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    # cv = KFold(n=len(X), n_folds=10, indices=True)
    # cv = KFold(n=len(X), n_folds=10)
    # cv = KFold(n_splits=10, shuffle=True)
    cv = KFold(n=len(X), n_folds=10, shuffle=True)

    train_errors = []
    test_errors = []

    scores = []
    roc_scores = []
    fprs, tprs = [], []

    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
        print(
            classification_report(y_test,
                                  proba[:, label_idx] > 0.63,
                                  target_names=['not accepted', 'accepted']))

    # get the median clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

    if plot:
        # plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name, np.mean(scores), np.std(scores), np.mean(roc_scores),
               np.std(roc_scores), np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)
    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" %
          (precisions[idx80][0], recalls[idx80][0], thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
Example #19
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []

    scores = []
    roc_scores = []
    fprs, tprs = [], []

    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    
    # loop through n cross validation folds
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)

        # fit model
        clf.fit(X_train, y_train)
        
        # predict training set, predict test set
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        # save training error
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)   # return probability test dataset
                                            # [0.8  0.2], --> prob for 0 and 1 (average of n closest neighbors) 

        # calculate the false positive rate and true positive rate
        # for different threshold values (here the thresholds are the
        # proportion m/n of positive votes among the n nearest neighbours)
        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        
        # calculate precision, recall, and precision thresholds
        precision, recall, pr_thresholds = precision_recall_curve(y_test, proba[:, label_idx])

        # store results in container variables
        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
        
        print "classification report (accepted -> p > 0.63)"
        print(classification_report(y_test, proba[:, label_idx] >
              0.63, target_names=['not accepted', 'accepted']))

    # get the median clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        #plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium],
                recalls[medium], classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    
    print "summary: name, validation_dataset_correct_prediction_mean, score_std, rocscore_mean, rocscore_std, prscore_mean, prscore_std, time"
    print(summary)
    avg_scores_summary.append(summary)
    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0], recalls[
          idx80][0], thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
Example #20
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X),
                      n_iter=1,
                      test_size=0.3,
                      indices=True,
                      random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting", genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median],
                    desc,
                    precisions[label][median],
                    recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #21
def train_model(clf_factory, X, Y, name, plot=False):
    """
        Trains and saves model to disk.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)
    # print("cv =", cv)
    train_errors = []
    test_errors = []

    scores = []

    pr_scores, precisions, recalls, thresholds = (
        defaultdict(list), defaultdict(list), defaultdict(list),
        defaultdict(list))

    roc_scores, tprs, fprs = (
        defaultdict(list), defaultdict(list), defaultdict(list))

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
    """ for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)"""

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median],
                    desc,
                    precisions[label][median],
                    recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores))
    print(summary)

    #save the trained model to disk
    joblib.dump(
        clf,
        r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl'
    )

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
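
For completeness, a model persisted with joblib.dump as above can be restored with joblib.load (a usage sketch; recent scikit-learn versions use the standalone joblib package rather than sklearn.externals.joblib):

import joblib

clf = joblib.load(
    r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl')
# predict on new data shaped like the training features
# predictions = clf.predict(X_new)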