Example #1
        def do_fold(j):
            print("\tFold " + str(j+1))
            train_idx = folds_i[j][0]
            valid_idx = folds_i[j][1]
            training_fold = developement_df.loc[train_idx, ]
            training_fold = training_fold.reset_index(drop=True)
            validation_fold = developement_df.loc[valid_idx, ]
            validation_fold = validation_fold.reset_index(drop=True)

            # Per-fold statistics trackers for the training, validation and test sets.
            training_stats_i_f = Statistics()
            validation_stats_i_f = Statistics()
            testing_stats_i_f = Statistics()

            # Init the label ranking lists.
            label_pred_proba_train = []
            label_pred_proba_valid = []
            label_pred_proba_test = []

            label_y_train = []
            label_y_valid = []
            label_y_test = []

            # Set up the vectorizer for the bag-of-words representation
            if vectorizer_method == 'tf-idf':
                vectorizer = TfidfVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True,
                    sublinear_tf=True, max_df=1.0, min_df=0
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            elif vectorizer_method == 'count':
                vectorizer = CountVectorizer(
                    stop_words=['go', '', ' '], binary=binary, lowercase=True
                )
                vectorizer.fit(training_fold['terms'].values)
                alpha = None
                percentile = 100
            else:
                raise ValueError("Unsupported vectorizer_method '{}'.".format(vectorizer_method))

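            # Note: after fitting, either vectorizer exposes its learned vocabulary via
            # vectorizer.vocabulary_, and transform() returns a scipy.sparse matrix of shape
            # (n_samples, n_features) that serves as the bag-of-words input below.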
            selectors = generate_selectors(selection, vectorizer.get_feature_names(), dag)
            base_estimators = make_classifiers(method, balanced, labels, selectors, selection, rng)
            for label in sorted(labels):
                print("\t\tFitting for label {}...".format(label))

                # SVMs assume standardised features, so we scale the features without centring on
                # the mean in order to preserve the sparsity structure of the counts. Scaling may
                # also help linear models converge faster.
                x_train_l = vectorizer.transform(training_fold['terms'].values)
                y_train_l = np.asarray(training_fold[label].values, dtype=int)

                x_valid_l = vectorizer.transform(validation_fold['terms'].values)
                y_valid_l = np.asarray(validation_fold[label].values, dtype=int)

                x_test_l = vectorizer.transform(testing_df['terms'].values)
                y_test_l = np.asarray(testing_df[label].values, dtype=int)

                if scale:
                    x_train_l = mean_center(x_train_l, with_mean=False)
                    x_valid_l = mean_center(x_valid_l, with_mean=False)
                    x_test_l = mean_center(x_test_l, with_mean=False)
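                # Assumption (helper not shown here): mean_center is presumably a thin wrapper
                # around variance scaling with with_mean=False (e.g. sklearn.preprocessing.scale),
                # which rescales features without centring them, keeping the sparse counts sparse.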

                # We generate the folds for randomised search up-front. One fold is held out for
                # probability calibration so that each sampled parameter set is calibrated on the
                # same data, leaving cv_folds - 2 folds for the randomised search cross-validation.
                # cv_rand = StratifiedKFold(n_splits=3, shuffle=True, random_state=rng)
                base_estimator_l = base_estimators[label]
                fresh_estimator = clone(base_estimator_l)

                # Find the best params, then do a final proper calibration.
                params = sk_generate_params(method, selection)
                estimator_l = RandomizedSearchCV(
                    estimator=base_estimator_l, param_distributions=params,
                    n_iter=60, scoring='f1', cv=3, random_state=rng,
                    error_score=0.0, n_jobs=1, pre_dispatch='2*n_jobs',
                    refit=True
                )
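                # With refit=True, the search refits the best parameter setting on the full
                # training split once the randomised search finishes, so the predict_proba calls
                # below use that refitted model (via best_estimator_) unless it is replaced by the
                # calibrated classifier further down.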

                # Test if there's any signal if we permute the labels.
                # Classifier should do poorly if we do so.
                if permute:
                    y_train_l = rng.permutation(y_train_l)

                threshold = 0.5
                estimator_l.fit(x_train_l, y_train_l)
                best_params_l = estimator_l.best_params_

                # Calibrate the classifier (all methods except logistic regression) using the best
                # hyperparameters found by the randomised search.
                if method not in ['lr']:
                    estimator_l = CalibratedClassifierCV(fresh_estimator.set_params(**best_params_l),
                                                         cv=3, method='sigmoid')
                    estimator_l.fit(x_train_l, y_train_l)
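                # With cv=3 and method='sigmoid', CalibratedClassifierCV fits a clone of the
                # estimator on two thirds of the training data and a Platt (sigmoid) calibrator on
                # the held-out third, repeating this for each split; predict_proba then averages
                # the calibrated per-split models.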

                # Evaluate performance characteristics; scoring on the training data as well helps
                # to check for overfitting.
                y_train_prob_l = estimator_l.predict_proba(x_train_l)
                y_valid_prob_l = estimator_l.predict_proba(x_valid_l)
                y_test_prob_l = estimator_l.predict_proba(x_test_l)
                training_stats_i_f.merge(evaluate_model(y_train_l, y_train_prob_l, label, threshold))
                validation_stats_i_f.merge(evaluate_model(y_valid_l, y_valid_prob_l, label, threshold))

                # Compute independent test data performance
                testing_stats_i_f.merge(evaluate_model(y_test_l, y_test_prob_l, label, threshold))

                # Get label ranking info
                label_pred_proba_train.append([p[1] for p in y_train_prob_l])
                label_pred_proba_valid.append([p[1] for p in y_valid_prob_l])
                label_pred_proba_test.append([p[1] for p in y_test_prob_l])

                label_y_train.append(y_train_l)
                label_y_valid.append(y_valid_l)
                label_y_test.append(y_test_l)
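                # predict_proba returns one column per class, so p[1] above is the predicted
                # probability of the positive class (class 1) for each sample, which is what the
                # label-ranking evaluation consumes.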

                print(validation_stats_i_f.frame())

            # Compute multi-label performance statistics. Stacking the per-label vectors
            # sample-wise gives arrays of shape (n_samples, n_labels).
            y = np.vstack(list(zip(*label_y_train)))
            y_prob = np.vstack(list(zip(*label_pred_proba_train)))
            training_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.vstack(list(zip(*label_y_valid)))
            y_prob = np.vstack(list(zip(*label_pred_proba_valid)))
            validation_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            y = np.vstack(list(zip(*label_y_test)))
            y_prob = np.vstack(list(zip(*label_pred_proba_test)))
            testing_stats_i_f.merge(multi_label_evaluate(y, y_prob, threshold))

            return training_stats_i_f, validation_stats_i_f, testing_stats_i_f
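        # Hedged sketch (assumed driver, not shown in the original source): run do_fold for every
        # fold index and unpack the per-fold Statistics objects for later aggregation. The variable
        # names below are illustrative assumptions.
        fold_results = [do_fold(j) for j in range(len(folds_i))]
        fold_train_stats, fold_valid_stats, fold_test_stats = zip(*fold_results)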
    pina_corpus_bp = compute_corpus(interactome_df, ['induced_go_bp'])
    pina_corpus_cc = compute_corpus(interactome_df, ['induced_go_cc'])
    pina_corpus_mf = compute_corpus(interactome_df, ['induced_go_mf'])

    mean, std = depths(interactome_df, 'terms')
    interactome_df['depth_mu'] = mean
    interactome_df['depth_std'] = std

    accessions = list(set(list(interactome_df.uniprot_a.values) + list(interactome_df.uniprot_b.values)))
    gene_keys = uniprot.get_batch_accession_data(accessions, data_types=[UniProt.GENE])
    gene_keys = {k: v[uniprot.GENE].split(' ')[0] for k, v in gene_keys.items()}
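    # gene_keys now maps each UniProt accession to its primary gene name (the first
    # whitespace-separated token of the GENE field returned by the batch query).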

    predictions = {}
    vectorizer = CountVectorizer(stop_words=['go', '', ' '], binary=False, lowercase=True)
    vectorizer.fit(training_df['terms'].values)
    selectors = generate_selectors(columns, vectorizer.get_feature_names(), dag)
    estimators = make_classifiers(method, 'balanced', labels, selectors, columns, None)
    best_features = pd.DataFrame(data={'label': [], 'feature': []})

    def rename(x):
        if 'pf' not in x and 'ipr' not in x:
            return 'GO:' + x.upper()
        else:
            return x.upper()
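    # For illustration (hypothetical inputs): rename('0008150') -> 'GO:0008150', while Pfam or
    # InterPro identifiers such as 'pf00069' or 'ipr000719' are only upper-cased
    # ('PF00069', 'IPR000719').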

    for l in sorted(labels):
        print("Fitting/Predicting {}...".format(l))
        params = sk_generate_params(method, columns)
        clf = estimators[l]
        clf, selector, vectorizer = fit(
            x='terms', y=l, estimator=clf, dataframe=training_df, params=params