def do_fold(j):
    print("\tFold " + str(j + 1))
    train_idx = folds_i[j][0]
    valid_idx = folds_i[j][1]
    training_fold = developement_df.loc[train_idx, ]
    training_fold = training_fold.reset_index(drop=True)
    validation_fold = developement_df.loc[valid_idx, ]
    validation_fold = validation_fold.reset_index(drop=True)

    # Statistics accumulators for this fold.
    training_stats_i_f = Statistics()
    validation_stats_i_f = Statistics()
    testing_stats_i_f = Statistics()

    # Init the label ranking lists.
    label_pred_proba_train = []
    label_pred_proba_valid = []
    label_pred_proba_test = []

    label_y_train = []
    label_y_valid = []
    label_y_test = []

    # Set up the vectorizer for the bag-of-words representation.
    if vectorizer_method == 'tf-idf':
        vectorizer = TfidfVectorizer(
            stop_words=['go', '', ' '], binary=binary, lowercase=True,
            sublinear_tf=True, max_df=1.0, min_df=0
        )
        vectorizer.fit(training_fold['terms'].values)
        alpha = None
        percentile = 100
    elif vectorizer_method == 'count':
        vectorizer = CountVectorizer(
            stop_words=['go', '', ' '], binary=binary, lowercase=True
        )
        vectorizer.fit(training_fold['terms'].values)
        alpha = None
        percentile = 100
    else:
        raise ValueError("Unsupported vectorizer_method: {}.".format(vectorizer_method))

    selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
    base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)

    for label in sorted(labels):
        print("\t\tFitting for label {}...".format(label))

        # SVMs assume standardised features, so we scale the features while
        # avoiding centering by the mean to preserve the sparsity structure of
        # the counts. Scaling may also help with linear model convergence speed.
        x_train_l = vectorizer.transform(training_fold['terms'].values)
        y_train_l = np.asarray(training_fold[label].values, dtype=int)

        x_valid_l = vectorizer.transform(validation_fold['terms'].values)
        y_valid_l = np.asarray(validation_fold[label].values, dtype=int)

        x_test_l = vectorizer.transform(testing_df['terms'].values)
        y_test_l = np.asarray(testing_df[label].values, dtype=int)

        if scale:
            x_train_l = mean_center(x_train_l, with_mean=False)
            x_valid_l = mean_center(x_valid_l, with_mean=False)
            x_test_l = mean_center(x_test_l, with_mean=False)

        # We generate the folds for randomised search up-front. We hold out one
        # of the folds for probability calibration so each sampled param set gets
        # calibrated on the same data. This leaves cv_folds-2 folds for
        # randomised search cross-validation.
        # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
        base_estimator_l = base_estimators[label]
        fresh_estimator = clone(base_estimator_l)

        # Find the best params, then do a final proper calibration.
        params = sk_generate_params(method, selection)
        estimator_l = RandomizedSearchCV(
            estimator=base_estimator_l, param_distributions=params,
            n_iter=60, scoring='f1', cv=3, random_state=rng,
            error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs', refit=True
        )

        # Test if there's any signal when we permute the labels.
        # The classifier should do poorly if we do so.
        if permute:
            y_train_l = rng.permutation(y_train_l)

        threshold = 0.5
        estimator_l.fit(x_train_l, y_train_l)
        best_params_l = estimator_l.best_params_

        # Calibrate the classifier with the best hyperparameters (logistic
        # regression already produces calibrated probabilities).
        if method not in ['lr']:
            estimator_l = CalibratedClassifierCV(
                fresh_estimator.set_params(**best_params_l),
                cv=3, method='sigmoid'
            )
            estimator_l.fit(x_train_l, y_train_l)

        # Evaluate performance characteristics and test on training data to
        # check for overfitting.
        y_train_prob_l = estimator_l.predict_proba(x_train_l)
        y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
        y_test_prob_l = estimator_l.predict_proba(x_test_l)

        training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
        validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold))

        # Compute independent test data performance.
        testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

        # Get label ranking info.
        label_pred_proba_train.append([p[1] for p in y_train_prob_l])
        label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
        label_pred_proba_test.append([p[1] for p in y_test_prob_l])

        label_y_train.append(y_train_l)
        label_y_valid.append(y_valid_l)
        label_y_test.append(y_test_l)

    print(validation_stats_i_f.frame())

    # Compute multi-label performance statistics.
    y = np.vstack(zip(*label_y_train))
    y_prob = np.vstack(zip(*label_pred_proba_train))
    training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(zip(*label_y_valid))
    y_prob = np.vstack(zip(*label_pred_proba_valid))
    validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    y = np.vstack(zip(*label_y_test))
    y_prob = np.vstack(zip(*label_pred_proba_test))
    testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

    return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
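
# A minimal usage sketch (not part of the original pipeline): run do_fold over
# each pre-computed (train_idx, valid_idx) pair in folds_i and pool the
# per-fold Statistics objects. The accumulator names below and the sequential
# loop are assumptions; the original may instead dispatch folds in parallel
# before merging. This assumes Statistics.merge accepts another Statistics, as
# the per-label merges above suggest.
training_stats_i = Statistics()
validation_stats_i = Statistics()
testing_stats_i = Statistics()

for j in range(len(folds_i)):
    train_stats_f, valid_stats_f, test_stats_f = do_fold(j)
    training_stats_i.merge(train_stats_f)
    validation_stats_i.merge(valid_stats_f)
    testing_stats_i.merge(test_stats_f)

# Inspect pooled validation performance across folds.
print(validation_stats_i.frame())
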
pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])
pina_corpus_cc = compute_corpus(interactome_df, ['induced_go_cc'])
pina_corpus_mf = compute_corpus(interactome_df, ['induced_go_mf'])

mean, std = depths(interactome_df, 'terms')
interactome_df['depth_mu'] = mean
interactome_df['depth_std'] = std

accessions = list(set(
    list(interactome_df.uniprot_a.values) + list(interactome_df.uniprot_b.values)
))
gene_keys = uniprot.get_batch_accession_data(accessions, data_types=[UniProt.GENE])
gene_keys = {k: v[uniprot.GENE].split(' ')[0] for k, v in gene_keys.iteritems()}

predictions = {}
vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
vectorizer.fit(training_df['terms'].values)
selectors = generate_selectors(columns, vectorizer.get_feature_names(), dag)
estimators = make_classifiers(method, 'balanced', labels, selectors, columns, None)
best_features = pd.DataFrame(data={'label': [], 'feature': []})


def rename(x):
    # Restore the 'GO:' prefix for GO term features; Pfam ('pf') and InterPro
    # ('ipr') identifiers only need upper-casing.
    if 'pf' not in x and 'ipr' not in x:
        return 'GO:' + x.upper()
    else:
        return x.upper()


for l in sorted(labels):
    print("Fitting/Predicting {}...".format(l))
    params = sk_generate_params(method, columns)
    clf = estimators[l]
    clf, selector, vectorizer = fit(
        x='terms', y=l, estimator=clf, dataframe=training_df, params=params