def pr_curve(i):
    label = labels[i]
    statistics_l = Statistics()
    print('Doing label {}'.format(label))
    for train_idx, valid_idx in folds:
        rng = np.random.RandomState()
        rng.seed(seeds[i])

        training_fold = developement_df.loc[train_idx, ]
        training_fold = training_fold.reset_index(drop=True)
        validation_fold = developement_df.loc[valid_idx, ]
        validation_fold = validation_fold.reset_index(drop=True)

        base_estimators = make_classifiers(method, balanced, labels, random_state=rng)

        # Find the best params, then do a final proper calibration.
        base_estimator = base_estimators[label]
        estimator = RandomizedSearchCV(
            estimator=base_estimator, param_distributions=params,
            n_iter=60, scoring='f1', cv=3, random_state=rng,
            error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs', refit=True
        )

        # Set up the vectorizer for the bag-of-words representation.
        if vectorizer_method == 'tf-idf':
            vectorizer = TfidfVectorizer(
                stop_words=['go', '', ' '], binary=binary, lowercase=True,
                sublinear_tf=False, max_df=1.0, min_df=0
            )
            vectorizer.fit(training_fold['terms'].values)
        elif vectorizer_method == 'count':
            vectorizer = CountVectorizer(
                stop_words=['go', '', ' '], binary=binary, lowercase=True
            )
            vectorizer.fit(training_fold['terms'].values)

        # Fit and evaluate the performance of the classifier.
        x_train = vectorizer.transform(training_fold['terms'].values)
        y_train = np.asarray(training_fold[label].values, dtype=int)
        x_valid = vectorizer.transform(validation_fold['terms'].values)
        y_valid = np.asarray(validation_fold[label].values, dtype=int)

        estimator.fit(x_train, y_train)

        for t in thresholds:
            y_pred = [int(p[1] >= t) for p in estimator.predict_proba(x_valid)]
            precision = precision_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
            recall = recall_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
            f1 = f1_score(y_valid, y_pred, labels=[0, 1], pos_label=1)
            statistics_l.update_statistics(label=t, s_type='Precision', data=precision)
            statistics_l.update_statistics(label=t, s_type='Recall', data=recall)
            statistics_l.update_statistics(label=t, s_type='F1-Score', data=f1)

    statistics_l.frame()['reaction'] = label
    return statistics_l
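# NOTE: pr_curve(i) takes a label index, which suggests it is meant to be
# mapped over labels, for example in parallel. The driver below is a hedged
# sketch and not part of the original script; the pool size is arbitrary and
# pr_curve's module-level globals (folds, seeds, developement_df, params,
# thresholds, ...) are assumed to be defined before the pool is created.
from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool(processes=4)
    # One Statistics object per label, computed in parallel.
    per_label_stats = pool.map(pr_curve, range(len(labels)))
    pool.close()
    pool.join()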
def do_iteration(i):
    print("Iteration " + str(i+1))
    dev_df = train.copy(deep=True)
    test_df = test.copy(deep=True)
    rng = np.random.RandomState()
    rng.seed(seeds[i])
    folds_i = iterative_stratification(dev_df, labels, cv_folds, rng)

    # Create the appropriate statistics containers for this iteration.
    validation_stats_i = Statistics()
    testing_stats_i = Statistics()

    for train_idx, valid_idx in cv_iterator(folds_i):
        training_fold = dev_df.loc[train_idx, ]
        validation_fold = dev_df.loc[valid_idx, ]

        # Shuffle the folds.
        training_fold = training_fold.reindex(rng.permutation(training_fold.index))
        validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

        stats_valid, stats_test = multi_label_crf(
            labels=labels,
            df_train=training_fold,
            df_valid=validation_fold,
            df_test=test_df,
            binary=binary,
            connectivity='full',
            vectorizer_method=vectorizer_method
        )
        validation_stats_i.merge(stats_valid)
        testing_stats_i.merge(stats_test)

    log.write('Iteration {}\n'.format(i))
    validation_stats_i.write(log, 'a')
    testing_stats_i.write(log, 'a')
    return validation_stats_i, testing_stats_i
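# NOTE: cv_iterator is used above but not defined in this listing. The sketch
# below is one plausible implementation, under the assumption that
# iterative_stratification returns one row-index array per fold; each fold is
# then held out in turn as the validation set. If the folds are already
# (train_idx, valid_idx) pairs, cv_iterator would simply iterate over them.
def cv_iterator(folds):
    # Yield (train_idx, valid_idx), holding out one fold per split.
    for k, valid_idx in enumerate(folds):
        train_idx = np.concatenate([f for j, f in enumerate(folds) if j != k])
        yield train_idx, valid_idx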
def do_fold(j):
    print("\tFold " + str(j+1))
    train_idx = folds_i[j][0]
    valid_idx = folds_i[j][1]
    training_fold = developement_df.loc[train_idx, ]
    training_fold = training_fold.reset_index(drop=True)
    validation_fold = developement_df.loc[valid_idx, ]
    validation_fold = validation_fold.reset_index(drop=True)

    # Statistics containers for this fold.
    training_stats_i_f = Statistics()
    validation_stats_i_f = Statistics()
    testing_stats_i_f = Statistics()

    # Init the label ranking lists.
    label_pred_proba_train = []
    label_pred_proba_valid = []
    label_pred_proba_test = []

    label_y_train = []
    label_y_valid = []
    label_y_test = []

    # Set up the vectorizer for the bag-of-words representation.
    if vectorizer_method == 'tf-idf':
        vectorizer = TfidfVectorizer(
            stop_words=['go', '', ' '], binary=binary, lowercase=True,
            sublinear_tf=True, max_df=1.0, min_df=0
        )
        vectorizer.fit(training_fold['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer = CountVectorizer(
            stop_words=['go', '', ' '], binary=binary, lowercase=True
        )
        vectorizer.fit(training_fold['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise TypeError("Vectorizer_method has type {}.".format(type(vectorizer_method)))

    selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
    base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)

    for label in sorted(labels):
        print("\t\tFitting for label {}...".format(label))

        # SVMs assume standardised features, so we scale the features while
        # avoiding mean-centering to preserve the sparsity structure of the
        # counts. Scaling may also help linear models converge faster.
        x_train_l = vectorizer.transform(training_fold['terms'].values)
        y_train_l = np.asarray(training_fold[label].values, dtype=int)

        x_valid_l = vectorizer.transform(validation_fold['terms'].values)
        y_valid_l = np.asarray(validation_fold[label].values, dtype=int)

        x_test_l = vectorizer.transform(testing_df['terms'].values)
        y_test_l = np.asarray(test_df_i[label].values, dtype=int)

        if scale:
            x_train_l = mean_center(x_train_l, with_mean=False)
            x_valid_l = mean_center(x_valid_l, with_mean=False)
            x_test_l = mean_center(x_test_l, with_mean=False)

        # We generate the folds for randomised search up-front. One fold is
        # held out for probability calibration so each sampled parameter set
        # gets calibrated on the same data. This leaves cv_folds-2 folds for
        # the randomised search cross-validation.
        # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
        base_estimator_l = base_estimators[label]
        fresh_estimator = clone(base_estimator_l)

        # Find the best params, then do a final proper calibration.
        params = sk_generate_params(method, selection)
        estimator_l = RandomizedSearchCV(
            estimator=base_estimator_l, param_distributions=params,
            n_iter=60, scoring='f1', cv=3, random_state=rng,
            error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs', refit=True
        )

        # Test if there's any signal when we permute the labels; the
        # classifier should do poorly if we do so.
        if permute:
            y_train_l = rng.permutation(y_train_l)

        threshold = 0.5
        estimator_l.fit(x_train_l, y_train_l)
        best_params_l = estimator_l.best_params_

        # Calibrate the classifier with the best hyperparameters.
        if method not in ['lr']:
            estimator_l = CalibratedClassifierCV(
                fresh_estimator.set_params(**best_params_l),
                cv=3, method='sigmoid'
            )
            estimator_l.fit(x_train_l, y_train_l)

        # Evaluate performance characteristics; also test on the training
        # data to check for overfitting.
        y_train_prob_l = estimator_l.predict_proba(x_train_l)
        y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
        y_test_prob_l = estimator_l.predict_proba(x_test_l)

        training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
        validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold))

        # Compute independent test data performance.
        testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

        # Get label ranking info.
        label_pred_proba_train.append([p[1] for p in y_train_prob_l])
        label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
        label_pred_proba_test.append([p[1] for p in y_test_prob_l])

        label_y_train.append(y_train_l)
        label_y_valid.append(y_valid_l)
        label_y_test.append(y_test_l)

    print(validation_stats_i_f.frame())

    # Compute multi-label performance statistics.
    y = np.vstack(list(zip(*label_y_train)))
    y_prob = np.vstack(list(zip(*label_pred_proba_train)))
    training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(list(zip(*label_y_valid)))
    y_prob = np.vstack(list(zip(*label_pred_proba_valid)))
    validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(list(zip(*label_y_test)))
    y_prob = np.vstack(list(zip(*label_pred_proba_test)))
    testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
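# NOTE: mean_center is not shown in this section. Given the comment above
# about preserving count sparsity, it presumably scales features to unit
# variance without subtracting the mean. A minimal sketch under that
# assumption (a stricter version would fit one scaler on the training matrix
# and reuse it on the validation and test matrices):
from sklearn.preprocessing import StandardScaler

def mean_center(x, with_mean=False):
    # with_mean=False keeps sparse count matrices sparse; only the variance
    # of each feature is normalised.
    return StandardScaler(with_mean=with_mean).fit_transform(x)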
labels = get_labels_from_file('data/labels.tsv')
n = len(labels)

split_train = {l: 0 for l in labels}
for l in labels:
    split_train[l] = sum(developement_df[l].values)

split_test = {l: 0 for l in labels}
for l in labels:
    split_test[l] = sum(testing_df[l].values)

n_samples_train = len(developement_df)
n_samples_test = len(testing_df)

# Create the appropriate statistics containers for the whole experiment.
training_stats = Statistics()
validation_stats = Statistics()
testing_stats = Statistics()

seeds = create_seeds(iterations)
min_class_freq = min(split_train.values())
cv_folds = min([min_class_freq, cv_folds])

statistics_objects = []
best_params = {l: {'score': 0.0, 'params': {}} for l in labels}

print("Running Supervised Ensemble Classification...")
# def do_iteration(i):
for i in range(iterations):
    print("Iteration " + str(i+1))
    rng = np.random.RandomState()
    rng.seed(seeds[i])
    dev_df_i = developement_df.copy(deep=True)
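# NOTE: create_seeds is not defined in this listing. The sketch below shows
# the behaviour assumed by the loop above: one reproducible integer seed per
# iteration. The master seed and the upper bound are arbitrary assumptions.
def create_seeds(n, master_seed=42):
    master = np.random.RandomState(master_seed)
    return master.randint(0, np.iinfo(np.int32).max, size=n)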
def evaluate_crf_model(x, y, estimator, labels, uniprot=None, verbose=0):
    y_pred = np.asarray(estimator.predict(x))
    statistics = Statistics()
    statistics.update_statistics('all_labels', 'accuracy', estimator.score(x, y))
    bin_labels = [0, 1]
    for i, l in enumerate(labels):
        y_true_binary_l = y[:, i].astype(int)
        y_pred_binary_l = y_pred[:, i].astype(int)
        label_stats = compute_label_statistics(y_true_binary_l, y_pred_binary_l, labels=bin_labels)
        statistics.update_statistics(l, 'Accuracy', accuracy_score(y_true_binary_l, y_pred_binary_l))
        statistics.update_statistics(l, 'Specificity', label_stats[1]['specificity'])
        statistics.update_statistics(l, 'Recall', label_stats[1]['sensitivity'])
        statistics.update_statistics(l, 'Precision', label_stats[1]['precision'])
        statistics.update_statistics(l, 'FDR', label_stats[1]['fdr'])
        statistics.update_statistics(l, 'F-Score (beta=0.5)', fbeta_score(
            y_true_binary_l, y_pred_binary_l, beta=0.5, labels=bin_labels, average='binary'
        ))
        statistics.update_statistics(l, 'F-Score (beta=1)', fbeta_score(
            y_true_binary_l, y_pred_binary_l, beta=1.0, labels=bin_labels, average='binary'
        ))
        # The AUC metrics can raise when only one class is present in y_true;
        # record NaN in that case.
        try:
            roc_auc = roc_auc_score(y_true_binary_l, y_pred_binary_l)
            statistics.update_statistics(l, 'ROC-AUC', roc_auc)
        except (ValueError, AssertionError):
            statistics.update_statistics(l, 'ROC-AUC', np.NaN)
        try:
            pr_auc = average_precision_score(y_true_binary_l, y_pred_binary_l)
            statistics.update_statistics(l, 'PR-AUC', pr_auc)
        except (ValueError, AssertionError):
            statistics.update_statistics(l, 'PR-AUC', np.NaN)

    if verbose:
        for l in labels:
            statistics.print_statistics(l)
    if uniprot and verbose:
        for u, p1, p2 in zip(uniprot, y, y_pred):
            print("\t\t\tResult for {} \n\t\t\t\tTrue: \t{} ||| Pred: \t{}".format(u, p1, p2))
    return statistics
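# NOTE: compute_label_statistics is referenced above but not defined here.
# The per-class keys it is expected to expose (specificity, sensitivity,
# precision, fdr) can be derived from a binary confusion matrix; the sketch
# below is a hypothetical stand-in keyed by the positive class, matching the
# label_stats[1][...] lookups above.
from sklearn.metrics import confusion_matrix

def compute_label_statistics(y_true, y_pred, labels=(0, 1)):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=list(labels)).ravel()
    return {
        1: {
            'specificity': tn / float(tn + fp) if (tn + fp) else 0.0,
            'sensitivity': tp / float(tp + fn) if (tp + fn) else 0.0,
            'precision': tp / float(tp + fp) if (tp + fp) else 0.0,
            'fdr': fp / float(fp + tp) if (fp + tp) else 0.0,
        }
    }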
def multi_label_crf(
        labels, df_train, df_valid, df_test, binary,
        connectivity='full', vectorizer_method='count'
):
    """
    Do you suffer from acute label correlations? Are your samples a part of
    more than one class? Do your labels show signs of dependency? If you
    answered yes to at least one of those questions, then sign up for
    structured learning today. For a low monthly membership fee of $39.99 you
    can solve all your multi-label woes!

    @param labels: List of class labels.
    @param df_train: Training DataFrame.
    @param df_valid: Validation DataFrame.
    @param df_test: Independent test DataFrame, or None to skip testing.
    @param binary: Whether the vectorizer produces binary features.
    @param connectivity: Label graph structure: 'full', 'tree', or anything else for unary.
    @param vectorizer_method: 'tf-idf' or 'count'.
    @return: Validation Statistics, plus test Statistics when df_test is a DataFrame.
    """
    stats_container_valid = Statistics()
    stats_container_test = Statistics()

    if vectorizer_method == 'tf-idf':
        vectorizer_node = TfidfVectorizer(
            stop_words=['go:', '', ' '], binary=binary, lowercase=True,
            sublinear_tf=False, max_df=1.0, min_df=0
        )
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer_node = CountVectorizer(stop_words=['go', '', ' '], binary=binary, lowercase=True)
        vectorizer_node.fit(df_train['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise TypeError("Vectorizer_method has type {}.".format(type(vectorizer_method)))

    x_node_train, y_train, feature_names, selector_node = prep.select_features(
        df=df_train,
        vectorizer=vectorizer_node,
        feature_col='terms',
        label_col='label',
        select_method=None,
        continuous_col=[],
        alpha=alpha,
        percentile=percentile
    )
    x_node_valid, y_valid = prep.transform_features(
        df=df_valid,
        vectorizer=vectorizer_node,
        selector=selector_node,
        feature_col='terms',
        label_col='label',
        continuous_cols=[]
    )
    y_train = np.asarray([prep.binarise_labels(x, labels) for x in y_train], dtype=int)
    y_valid = np.asarray([prep.binarise_labels(x, labels) for x in y_valid], dtype=int)

    if connectivity == 'full':
        n_labels = len(labels)
        edges = np.vstack([x for x in itertools.combinations(range(n_labels), 2)])
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='ad3')
    elif connectivity == 'tree':
        edges = chow_liu_tree(y_train)
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='max-product')
    else:
        edges = None
        model = MultiLabelClf(n_labels=len(labels), edges=edges, inference_method='unary')

    x_train = x_node_train.toarray()
    x_valid = x_node_valid.toarray()

    # -------------------- MAKE THE ESTIMATOR -------------------- #
    estimator = OneSlackSSVM(model, max_iter=2, tol=0.001, n_jobs=1)

    # -------------------- LEARN/STATS -------------------- #
    estimator.fit(x_train, y_train)
    stats_container_valid.merge(evaluate_crf_model(x_valid, y_valid, estimator, labels))

    if isinstance(df_test, pd.DataFrame):
        x_node_test, y_test = prep.transform_features(
            df=df_test,
            vectorizer=vectorizer_node,
            selector=selector_node,
            feature_col='terms',
            label_col='label',
            continuous_cols=[]
        )
        y_test = np.asarray([prep.binarise_labels(x, labels) for x in y_test], dtype=int)
        x_test = x_node_test.toarray()
        stats_container_test.merge(evaluate_crf_model(x_test, y_test, estimator, labels))

    # -------------------- RETURN -------------------- #
    if isinstance(df_test, pd.DataFrame):
        return stats_container_valid, stats_container_test
    else:
        return stats_container_valid
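# NOTE: prep.binarise_labels is assumed to turn each sample's label set into
# a 0/1 indicator vector ordered by `labels`, as in the hypothetical sketch
# below. The real helper lives in the prep module and may differ.
def binarise_labels(sample_labels, all_labels):
    # A single label string is treated as a one-element label set.
    if isinstance(sample_labels, str):
        sample_labels = [sample_labels]
    return [int(l in sample_labels) for l in all_labels]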
def multi_label_evaluate(y, y_prob, threshold):
    statistics = Statistics()
    y_pred = (y_prob >= threshold).astype(int)
    y_pred_50 = (y_prob >= 0.5).astype(int)

    ranking_loss = label_ranking_loss(y, y_pred)
    lraps = label_ranking_average_precision_score(y, y_pred)
    ranking_loss_50 = label_ranking_loss(y, y_pred_50)
    lraps_50 = label_ranking_average_precision_score(y, y_pred_50)
    f1_macro = f1_score(y, y_pred, average='macro')
    f1_macro_50 = f1_score(y, y_pred_50, average='macro')

    statistics.update_statistics("Multi-Label", "Ranking Loss", ranking_loss)
    statistics.update_statistics("Multi-Label", "Ranking Precision", lraps)
    statistics.update_statistics("Multi-Label", "Ranking Loss (t=0.5)", ranking_loss_50)
    statistics.update_statistics("Multi-Label", "Ranking Precision (t=0.5)", lraps_50)
    statistics.update_statistics("Multi-Label", "Macro F1", f1_macro)
    statistics.update_statistics("Multi-Label", "Macro F1 (t=0.5)", f1_macro_50)

    try:
        auc_macro = roc_auc_score(y, y_pred, average='macro')
        auc_macro_50 = roc_auc_score(y, y_pred_50, average='macro')
        auc_pr_macro = roc_auc_score(y, y_prob, average='macro')
        statistics.update_statistics("Multi-Label", "Macro AUC", auc_macro)
        statistics.update_statistics("Multi-Label", "Macro AUC (t=0.5)", auc_macro_50)
        statistics.update_statistics("Multi-Label", "Macro AUC (Pr)", auc_pr_macro)
    except ValueError:
        statistics.update_statistics("Multi-Label", "Macro AUC", np.NaN)
        statistics.update_statistics("Multi-Label", "Macro AUC (t=0.5)", np.NaN)
        statistics.update_statistics("Multi-Label", "Macro AUC (Pr)", np.NaN)

    return statistics
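# For readers unfamiliar with the two ranking metrics used above, a small
# worked example with scikit-learn; the numbers are illustrative only and not
# part of the experiment.
import numpy as np
from sklearn.metrics import label_ranking_loss, label_ranking_average_precision_score

y_true_demo = np.array([[1, 0, 0],
                        [0, 0, 1]])
y_score_demo = np.array([[0.75, 0.50, 0.10],
                         [0.10, 0.20, 0.80]])

# Every true label out-ranks every false label here, so the loss is 0.0 and
# the ranking average precision is 1.0.
print(label_ranking_loss(y_true_demo, y_score_demo))
print(label_ranking_average_precision_score(y_true_demo, y_score_demo))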
def evaluate_model(y, y_prob, label, threshold):
    statistics = Statistics()
    y_pred = (y_prob[:, 1] >= threshold).astype(int)
    bin_labels = [0, 1]
    label_stats = compute_label_statistics(y, y_pred, labels=bin_labels)

    statistics.update_statistics(label, 'F (beta=1)',
                                 f1_score(y, y_pred, average='binary', labels=[0, 1], pos_label=1))
    statistics.update_statistics(label, 'Specificity', label_stats[1]['specificity'])
    statistics.update_statistics(label, 'Recall', label_stats[1]['sensitivity'])
    statistics.update_statistics(label, 'Precision', label_stats[1]['precision'])
    statistics.update_statistics(label, 'FDR', label_stats[1]['fdr'])

    try:
        statistics.update_statistics(label, 'ROC-AUC', roc_auc_score(y, y_pred, average='weighted'))
    except (ValueError, AssertionError):
        statistics.update_statistics(label, 'ROC-AUC', 0.0)
    try:
        pr_auc = average_precision_score(y, y_pred, average='weighted')
        if np.isnan(pr_auc):
            pr_auc = 0.0
        statistics.update_statistics(label, 'PR-AUC', pr_auc)
    except (ValueError, AssertionError):
        statistics.update_statistics(label, 'PR-AUC', 0.0)

    return statistics
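# A minimal usage sketch of evaluate_model on toy data; the classifier and
# data are placeholders, not part of the original experiment (only the label
# name 'activation' is taken from the scripts above).
from sklearn.linear_model import LogisticRegression

x_demo = np.random.RandomState(0).rand(8, 3)
y_demo = np.array([0, 1, 0, 1, 1, 0, 1, 0])

clf_demo = LogisticRegression().fit(x_demo, y_demo)
y_prob_demo = clf_demo.predict_proba(x_demo)   # column 1 holds P(class = 1)
stats_demo = evaluate_model(y_demo, y_prob_demo, label='activation', threshold=0.5)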
def do_iteration(data):
    # for i in xrange(iterations):
    i = data[0]
    print("Iteration " + str(i+1))
    train_i = train.copy(deep=True)
    rng = np.random.RandomState()
    rng.seed(seeds[i])
    su_make_dir('llda/models/iteration-{}'.format(i+1))

    folds_i = iterative_stratification(train_i, labels, cv_folds, rng)
    combinations = itertools.combinations(range(0, cv_folds), cv_folds-1)

    # Create the appropriate statistics containers for this iteration.
    validation_stats_i = Statistics()
    testing_stats_i = Statistics()

    for n, j in enumerate(combinations):
        # print('\tFold ' + str(n+1))
        su_make_dir('llda/models/iteration-{}/fold-{}'.format(i+1, n+1))
        file_path = 'llda/models/iteration-{}/fold-{}/'.format(i+1, n+1)

        training_folds_j = folds_i[list(j)]
        validation_fold_j = folds_i[[f for f in range(0, cv_folds) if f not in j]]
        assert len(validation_fold_j) == 1

        training_fold = reduce(
            lambda x, y: pd.concat([x, y], ignore_index=True, copy=True),
            training_folds_j[1:], training_folds_j[0]
        )
        validation_fold = validation_fold_j[0]

        # Shuffle the folds.
        if balanced:
            training_fold = training_fold.reindex(rng.permutation(training_fold.index))
            training_fold = prep.reduce_label_bias(training_fold, labels, 'activation', 5, random_state=rng)

        training_fold = training_fold.reindex(rng.permutation(training_fold.index))
        validation_fold = validation_fold.reindex(rng.permutation(validation_fold.index))

        write_tsv(training_fold, file_path + '/train.tsv', test=False)
        write_tsv(validation_fold, file_path + '/valid.tsv', test=True)
        write_tsv(test, file_path + '/test.tsv', test=True)

        # ------------------ CALL JAVA TO LLDA ON THIS PARTAY ----------------- #
        DEVNULL = open(os.devnull, 'w')
        args = [
            'java', '-jar', '-Xmx2048m', 'llda/tmt-0.4.0.jar', 'llda/llda.scala',
            file_path, '/train.tsv', '/valid.tsv', '/test.tsv',
            'model-{}-{}'.format(i+1, n+1), '{}-{}'.format(i+1, n+1)
        ]
        p = Popen(args, stdout=DEVNULL, stderr=STDOUT)
        p.wait()

        # Perform evaluation.
        validation_proba = np.genfromtxt(
            'llda/results/validation-{}-{}.csv'.format(i+1, n+1), delimiter=','
        )[:, 1:]
        test_proba = np.genfromtxt(
            'llda/results/test-{}-{}.csv'.format(i+1, n+1), delimiter=','
        )[:, 1:]
        labels_j = get_labels_from_model(file_path + '/llda-cvb0-model-{}-{}'.format(1+i, n+1))

        validation_stats_i_j = Statistics()
        testing_stats_i_j = Statistics()
        for l_index, l in enumerate(labels_j):
            y_validation = [p for p in validation_fold[l].values]
            y_proba_validation = [[1-p, p] for p in validation_proba[:, l_index]]
            y_pred_validation = [int(p >= threshold) for p in validation_proba[:, l_index]]

            y_hprd = [p for p in test[l].values]
            y_proba_hprd = [[1-p, p] for p in test_proba[:, l_index]]
            y_pred_hprd = [int(p >= threshold) for p in test_proba[:, l_index]]

            validation_stats_i_j = evaluate_model(
                y=y_validation,
                y_pred=y_pred_validation,
                y_pred_prob=y_proba_validation,
                label=l,
                statistics=validation_stats_i_j,
                verbose=0
            )
            testing_stats_i_j = evaluate_model(
                y=y_hprd,
                y_pred=y_pred_hprd,
                y_pred_prob=y_proba_hprd,
                label=l,
                statistics=testing_stats_i_j,
                verbose=0
            )

        validation_stats_i.merge(validation_stats_i_j)
        testing_stats_i.merge(testing_stats_i_j)

    return validation_stats_i, testing_stats_i
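# NOTE: su_make_dir and write_tsv are project helpers not shown in this
# section. The hypothetical sketches below reflect only the behaviour assumed
# by the calls above; what the `test` flag toggles in the real write_tsv is
# not visible here.
import os
import errno

def su_make_dir(path):
    # Create the directory tree, ignoring "already exists" errors.
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

def write_tsv(df, path, test=False):
    # Dump the frame as tab-separated values for the TMT/LLDA tool.
    df.to_csv(path, sep='\t', index=False)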
# ----------------------------- LOAD DATA ----------------------------------- #
train, test = prep.prep_data_frames(selection, load_interactome=False)
labels = get_labels_from_file('data/labels.tsv')
n = len(labels)

split_train = {l: 0 for l in labels}
for l in labels:
    split_train[l] = sum(train[l].values)

split_test = {l: 0 for l in labels}
for l in labels:
    split_test[l] = sum(test[l].values)

# Create the appropriate statistics containers for the whole experiment.
validation_stats = Statistics()
testing_stats = Statistics()
seeds = create_seeds(iterations)

print("Running Labeled Latent Dirichlet Allocation...")
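# NOTE: get_labels_from_file only needs to return the list of label names
# from data/labels.tsv. The sketch below assumes one label per line (first
# tab-separated column); the real file layout may differ.
def get_labels_from_file(path):
    with open(path, 'rt') as handle:
        return [line.strip().split('\t')[0] for line in handle if line.strip()]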