def print_and_plot_scores(scores, pr_scores, train_errors, test_errors,
                          precisions, recalls, name="NaiveBayes ngram"):
    # pick the fold with the median P/R AUC for plotting
    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
    plot_pr(pr_scores[median], name, "01", precisions[median],
            recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("AVG Scores\tSTD Scores\tAVG PR Scores\tSTD PR Scores")
    print("%.3f\t\t%.3f\t\t%.3f\t\t\t%.3f\t" % summary)

    avg_train_err, avg_test_err = np.mean(train_errors), np.mean(test_errors)
    print("AVG Training Error: %.3f -- AVG Testing Error: %.3f"
          % (avg_train_err, avg_test_err))
    return avg_train_err, avg_test_err
def roc_pr(self):
    '''
    tn, fp, fn, tp = metrics.confusion_matrix(self.labels, self.predicts).ravel()
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp)
    utils.plot_det(fpr, fnr)
    '''
    # using True to stand in for the positive example

    # ROC curve
    self.fpr, self.tpr, self.ths = metrics.roc_curve(
        y_true=self.labels, y_score=self.predicts, pos_label=1)
    print('fpr', self.fpr)
    print('tpr', self.tpr)
    print('ths', self.ths)
    self.auc = metrics.auc(self.fpr, self.tpr)
    print('AUC', self.auc)
    utils.plot_roc(self.fpr, self.tpr, self.ths, self.auc,
                   save_path='output/roc.png')

    # PR curve
    self.precision, self.recall, thresholds = metrics.precision_recall_curve(
        y_true=self.labels, probas_pred=self.predicts, pos_label=1)
    print("precision len", len(self.precision))
    print("recall len", len(self.recall))
    print("thresholds len", len(thresholds))
    utils.plot_pr(self.precision, self.recall, thresholds,
                  save_path='output/pr.png')
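# The utils.plot_roc helper used above is not shown in this section.
# A minimal matplotlib sketch of what it might look like, assuming it only
# needs the curve arrays, the AUC, and a save path (names and signature are
# illustrative, not the project's actual implementation; thresholds are
# accepted but unused, mirroring the call above):
import matplotlib.pyplot as plt

def plot_roc(fpr, tpr, thresholds, auc_score, save_path):
    # draw the ROC curve with the chance diagonal for reference
    plt.figure()
    plt.plot(fpr, tpr, label='ROC (AUC = %.3f)' % auc_score)
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')
    plt.savefig(save_path)
    plt.close()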
def pmid_26033813_analysis(drug: str):
    tree = build_tree()
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_labels = labels_all.loc[selected_samples]
    selected_expr = expr.loc[selected_samples, :]

    fit_tree(selected_expr, selected_labels, tree)
    predictions = pd.Series(
        [
            predict_sample(sample_name, selected_expr, tree)
            for sample_name in selected_samples
        ],
        index=selected_samples,
    )

    rd = RocData.calculate(selected_labels, predictions)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26033813 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, predictions)
    plot_pr(pr, f'PMID26033813 Precision-Recall: {drug.title()}', output_path / f'{drug}_pr.pdf')
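# RocData, PrData, plot_roc and plot_pr above are project helpers that are
# not shown here. A minimal sketch of what RocData.calculate could wrap,
# assuming it stores sklearn's curve arrays plus the AUC (field names are
# guesses, not the project's actual API):
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc as sk_auc

@dataclass
class RocData:
    fpr: np.ndarray
    tpr: np.ndarray
    thresholds: np.ndarray
    auc: float

    @classmethod
    def calculate(cls, labels, scores):
        # binary labels plus continuous scores -> ROC curve and its AUC
        fpr, tpr, thresholds = roc_curve(labels, scores)
        return cls(fpr, tpr, thresholds, sk_auc(fpr, tpr))

    def save(self, path):
        # persist as a pickle, matching how the analyses above store it
        pd.to_pickle(self, path)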
def train_model(clf, X, Y, name="NB ngram", plot=False, phase="01"):
    # create it again for plotting
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3,
                      indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    clfs = []  # just to later get the median

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)
        # note: this stores the same object each fold, so every entry
        # ends up refit on the last split
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        # `phase` was undefined in the original; it is now a parameter
        plot_pr(pr_scores[median], name, phase, precisions[median],
                recalls[median], label=name)
        log_false_positives(clfs[median], X_test, y_test, name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf_factory, X, Y, name, plot=False):
    # setting random_state to get deterministic behaviour
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3,
                      indices=True, random_state=0)

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        # clf.predict_proba returns the class probabilities
        # for each entry in X_test
        proba = clf.predict_proba(X_test)

        # precision is the ability of the classifier not to label as
        # positive a sample that is negative; recall is the ability
        # to find all the positive samples
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        phase = "02"
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
        u.plot_pr(pr_scores[median], name, phase, precisions[median],
                  recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f" % summary)
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # cross-validation with shuffled data
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        # train
        clf.fit(X_train, y_train)

        # compute mean accuracy
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        # obtain probability estimates
        proba = clf.predict_proba(X_test)

        # fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])

        # compute precision and recall
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        # AUC
        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    # draw the plot
    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
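# A short usage sketch for the factory-based variants above, assuming a
# scikit-learn MultinomialNB and an already-vectorized feature matrix
# (X and Y are placeholders for the caller's data):
from functools import partial

from sklearn.naive_bayes import MultinomialNB

# pass a zero-argument factory so every fold fits a fresh classifier
# instead of refitting one shared instance
create_ngram_model = partial(MultinomialNB, alpha=0.01)
train_err, test_err = train_model(create_ngram_model, X, Y,
                                  name="NB ngram", plot=True)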
def train_model(SVMType, X, Y, name="SVM ngram", plot=False):
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    vectorizer = StemmedTfidfCountVectorizer(min_df=1, stop_words='english')
    X = vectorizer.fit_transform(X)
    # cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []
    Fmeasures = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        # kernel is passed in: 'rbf', 'poly' or 'linear'
        clf = svm.SVC(kernel=SVMType)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        # SVC has no predict_proba unless probability=True, so use the
        # decision function as the score (the original used predict(),
        # which yields hard labels and a degenerate curve)
        decision_scores = clf.decision_function(X_test)

        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, decision_scores)
        # guard against 0/0 at the ends of the curve and store the
        # fold's mean F-measure
        with np.errstate(divide='ignore', invalid='ignore'):
            Fmeasure = (2 * precision * recall) / (precision + recall)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
        Fmeasures.append(np.nanmean(Fmeasure))

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    print("F-measure: " + str(np.mean(Fmeasures)))

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    # setup cross-validation
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3,
                      indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    # loop through cross-validation/test datasets and train/test the classifier
    for train, test in cv:
        # set up datasets
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        # invoke classifier (e.g., ngram_model) and fit data
        clf = clf_factory()
        clf.fit(X_train, y_train)

        # correct-prediction rate for each split
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        # store errors
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        # returns pairs of probabilities for each observation of being 0 / 1
        proba = clf.predict_proba(X_test)

        # extract model-quality indicators
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf_factory, X, Y, name="NB ngram", plot=False):
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3,
                      indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    scores_to_sort = pr_scores
    median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        plot_pr(pr_scores[median], name, "01", precisions[median],
                recalls[median], label=name)

    summary = (np.mean(scores), np.std(scores),
               np.mean(pr_scores), np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
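# The ShuffleSplit(n=..., n_iter=..., indices=True) signature used above is
# from pre-0.18 scikit-learn. On current releases the splitter lives in
# sklearn.model_selection and yields indices from split(); a drop-in sketch
# for the loop header only:
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
for train, test in cv.split(X):
    X_train, y_train = X[train], Y[train]
    X_test, y_test = X[test], Y[test]
    # ...fit and evaluate as above...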
def ki67_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, gene]
    selected_labels = labels_all.loc[selected_samples]

    rd = RocData.calculate(selected_labels, selected_expr)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'Ki67 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, selected_expr)
    plot_pr(pr, f'Ki67 Precision-Recall: {drug.title()}', output_path / f'{drug}_pr.pdf')
def pmid_26892682_analysis(drug: str):
    feature_label_path = find_newest_data_path(
        f'compute_drug_features_labels_alpha_{args.alpha:.2f}')
    labels_all = pd.read_pickle(feature_label_path / f'labels_{drug}.pickle')

    selected_samples = sorted_intersection(labels_all.index, expr.index)
    selected_expr = expr.loc[selected_samples, selected_genes]
    selected_labels = labels_all.loc[selected_samples]

    # linear predictor of the published logistic model;
    # DataFrame.as_matrix() was removed in pandas 1.0, so use to_numpy()
    ln_p_over_1_minus_p = selected_expr.to_numpy() @ coefs.to_numpy()
    probs = expit(ln_p_over_1_minus_p)

    rd = RocData.calculate(selected_labels, probs)
    rd.save(data_path / f'roc_data_{drug}.pickle')
    plot_roc(rd, f'PMID26892682 ROC: {drug.title()}', output_path / f'{drug}_roc.pdf')

    pr = PrData.calculate(selected_labels, probs)
    plot_pr(pr, f'PMID26892682 Precision-Recall: {drug.title()}', output_path / f'{drug}_pr.pdf')
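# expit is scipy's inverse logit, so the line above turns the published
# model's log-odds into probabilities. A tiny self-contained check:
import numpy as np
from scipy.special import expit

log_odds = np.array([-2.0, 0.0, 2.0])
probs = expit(log_odds)  # 1 / (1 + exp(-x))
assert np.allclose(probs, 1 / (1 + np.exp(-log_odds)))
print(probs)  # approximately [0.119 0.5 0.881]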
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3,
                      indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = (
        defaultdict(list), defaultdict(list), defaultdict(list))
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # one-vs-rest curves for each genre label
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])

            plot_pr(pr_scores[label][median], desc,
                    precisions[label][median], recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    # dict views need an explicit list() under Python 3
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []
    scores = []
    roc_scores = []
    fprs, tprs = [], []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        print(classification_report(y_test, proba[:, label_idx] > 0.63,
                                    target_names=['not accepted', 'accepted']))

    # get medium clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        # plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)

    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []
    scores = []
    roc_scores = []
    fprs, tprs = [], []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for fold_idx, (train, test) in enumerate(cv):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        only_one_class_in_train = len(set(y_train)) == 1
        only_one_class_in_test = len(set(y_test)) == 1
        if only_one_class_in_train or only_one_class_in_test:
            # this would pose problems later on
            continue

        clf = clf_class(**parameters)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        # This threshold is determined at the end of chapter 5, where we
        # find conditions such that precision is around 80%. With it we
        # trade off recall for precision.
        threshold_for_detecting_good_answers = 0.59

        print("Clone #%i" % fold_idx)
        print(classification_report(
            y_test,
            proba[:, label_idx] > threshold_for_detecting_good_answers,
            target_names=['not accepted', 'accepted']))

    # get medium clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
    print("Medium clone is #%i" % medium)

    if plot:
        # plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)

    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
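# The tail of measure() scans the median fold's curve for the first point
# with precision >= 0.8. A standalone sketch of that threshold search,
# assuming binary labels y_true and continuous scores y_score
# (placeholder names):
import numpy as np
from sklearn.metrics import precision_recall_curve

def threshold_for_precision(y_true, y_score, min_precision=0.8):
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    # precision/recall have one more entry than thresholds; pad the
    # thresholds as the code above does so all arrays align
    thresholds = np.hstack(([0], thresholds))
    idx = precision >= min_precision
    return precision[idx][0], recall[idx][0], thresholds[idx][0]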
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    # the original assigned X from an undefined method_name(), routed the
    # splitter through a stray `good_job` alias, and set an unused
    # `l = RANGE`; restored here to match the sibling implementations
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []
    scores = []
    roc_scores = []
    fprs, tprs = [], []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for fold_idx, (train, test) in enumerate(cv):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        only_one_class_in_train = len(set(y_train)) == 1
        only_one_class_in_test = len(set(y_test)) == 1
        if only_one_class_in_train or only_one_class_in_test:
            # this would pose problems later on
            continue

        clf = clf_class(**parameters)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        # This threshold is determined at the end of chapter 5, where we
        # find conditions such that precision is around 80%. With it we
        # trade off recall for precision.
        threshold_for_detecting_good_answers = 0.59

        print("Clone #%i" % fold_idx)
        print(classification_report(
            y_test,
            proba[:, label_idx] > threshold_for_detecting_good_answers,
            target_names=['not accepted', 'accepted']))

    # get medium clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
    print("Medium clone is #%i" % medium)

    if plot:
        # plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)

    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    # older sklearn releases took n/n_folds (and an `indices` flag);
    # newer ones use KFold(n_splits=10, shuffle=True).split(X)
    cv = KFold(n=len(X), n_folds=10, shuffle=True)

    train_errors = []
    test_errors = []
    scores = []
    roc_scores = []
    fprs, tprs = [], []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)
        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        proba = clf.predict_proba(X_test)

        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        print(classification_report(y_test, proba[:, label_idx] > 0.63,
                                    target_names=['not accepted', 'accepted']))

    # get medium clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[int(len(scores_to_sort) / 2)]

    if plot:
        # plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print(summary)
    avg_scores_summary.append(summary)

    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
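# For reference, the same 10-fold shuffled split on scikit-learn >= 0.18,
# where KFold moved to sklearn.model_selection and takes n_splits:
from sklearn.model_selection import KFold

cv = KFold(n_splits=10, shuffle=True, random_state=0)
for train, test in cv.split(X):
    X_train, y_train = X[train], Y[train]
    X_test, y_test = X[test], Y[test]
    # ...fit and evaluate as above...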
def measure(clf_class, parameters, name, data_size=None, plot=False):
    start_time_clf = time.time()
    if data_size is None:
        X = qa_X
        Y = qa_Y
    else:
        X = qa_X[:data_size]
        Y = qa_Y[:data_size]

    cv = KFold(n=len(X), n_folds=10, indices=True)

    train_errors = []
    test_errors = []
    scores = []
    roc_scores = []
    fprs, tprs = [], []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    # loop through n cross-validation folds
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_class(**parameters)

        # fit model
        clf.fit(X_train, y_train)

        # score training set and test set
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        # save training and test errors
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)

        # class probabilities for the test set, e.g. [0.8, 0.2] for
        # classes 0 and 1 (for kNN: the average over the n closest neighbors)
        proba = clf.predict_proba(X_test)

        # false positive rate and true positive rate for different
        # threshold values (for kNN the thresholds are the proportions
        # m/n of the nearest neighbors)
        label_idx = 1
        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, label_idx])

        # precision, recall, and the matching thresholds
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, label_idx])

        # store results in container variables
        roc_scores.append(auc(fpr, tpr))
        fprs.append(fpr)
        tprs.append(tpr)

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

        print("classification report (accepted -> p > 0.63)")
        print(classification_report(y_test, proba[:, label_idx] > 0.63,
                                    target_names=['not accepted', 'accepted']))

    # get medium clone
    scores_to_sort = pr_scores  # roc_scores
    medium = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

    if plot:
        # plot_roc(roc_scores[medium], name, fprs[medium], tprs[medium])
        plot_pr(pr_scores[medium], name, precisions[medium], recalls[medium],
                classifying_answer + " answers")

        if hasattr(clf, 'coef_'):
            plot_feat_importance(feature_names, clf, name)

    summary = (name,
               np.mean(scores), np.std(scores),
               np.mean(roc_scores), np.std(roc_scores),
               np.mean(pr_scores), np.std(pr_scores),
               time.time() - start_time_clf)
    print("summary: name, mean test accuracy, accuracy std, mean ROC AUC, "
          "ROC AUC std, mean P/R AUC, P/R AUC std, time")
    print(summary)
    avg_scores_summary.append(summary)

    precisions = precisions[medium]
    recalls = recalls[medium]
    thresholds = np.hstack(([0], thresholds[medium]))
    idx80 = precisions >= 0.8
    print("P=%.2f R=%.2f thresh=%.2f" % (precisions[idx80][0],
                                         recalls[idx80][0],
                                         thresholds[idx80][0]))

    return np.mean(train_errors), np.mean(test_errors)
def train_model(clf_factory, X, Y, name, plot=False):
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3,
                      indices=True, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = (
        defaultdict(list), defaultdict(list), defaultdict(list))
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []  # just to later get the median
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            print("Plotting", genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])

            plot_pr(pr_scores[label][median], desc,
                    precisions[label][median], recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    # dict views need an explicit list() under Python 3
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def train_model(clf_factory, X, Y, name, plot=False):
    """
    Trains and saves model to disk.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    # the original wrapped these in list(defaultdict(list)), which just
    # yields empty lists; plain defaultdicts are what the code below expects
    pr_scores, precisions, recalls, thresholds = (
        defaultdict(list), defaultdict(list), defaultdict(list),
        defaultdict(list))
    roc_scores, tprs, fprs = (
        defaultdict(list), defaultdict(list), defaultdict(list))
    clfs = []  # just to later get the median
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        """
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
        """

    # note: plotting relies on the per-label metrics collected in the
    # block above, which is currently commented out
    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])

            plot_pr(pr_scores[label][median], desc,
                    precisions[label][median], recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median], desc,
                     tprs[label][median], fprs[label][median],
                     label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores))
    print(summary)

    # save the trained model to disk
    joblib.dump(
        clf,
        r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
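# Usage note: the model pickled above can be restored with joblib and used
# directly (X_new is a placeholder for features shaped like X):
import joblib

clf = joblib.load(r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl')
predictions = clf.predict(X_new)
probabilities = clf.predict_proba(X_new)  # LogisticRegression supports this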