Code example #1
    def _get_support_mask(self):
        k = self.k
        chi2_scores = self.scores_
        chi2_mask = np.ones(chi2_scores.shape, dtype=bool)

        if k != 'all' and k < len(chi2_scores):
            # we don't want to keep all features, and the number we want is less than the number
            # available: mask out everything except the k highest-scoring features
            chi2_scores = _clean_nans(chi2_scores)
            dropped_indices = np.argsort(chi2_scores)[:-k]
            chi2_mask[dropped_indices] = False

        mask = chi2_mask & self.vectors_mask & self.log_odds_mask
        logging.info('%d/%d features survived feature selection', np.count_nonzero(mask), len(mask))

        # Only keep the scores of the features that survived. This array is used to check that the
        # input data shape at train and decode time matches. However, because the post-feature-selection
        # vocabulary is passed back into the vectorizer, at decode time the input will likely be smaller. This is
        # like doing feature selection in the vectorizer.
        self.scores_ = self.scores_[mask]
        self.log_odds_mask = self.log_odds_mask[mask]
        self.vectors_mask = self.vectors_mask[mask]

        self.vocabulary_ = update_dict_according_to_mask(self.vocabulary_, mask)
        return mask
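
_clean_nans is not defined on this page. A minimal sketch of what it might do, modeled on the
helper of the same name in scikit-learn's univariate feature selection internals (an assumption,
not necessarily this project's implementation):

import numpy as np

def _clean_nans(scores):
    # Replace NaN scores with the lowest representable float so they sort
    # below every real score and are never among the k features kept.
    scores = np.asarray(scores, dtype=np.float64).copy()
    scores[np.isnan(scores)] = np.finfo(scores.dtype).min
    return scores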
Code example #2
import numpy as np

def test_update_dict_according_to_mask():
    assert update_dict_according_to_mask({'a': 0, 'b': 1, 'c': 2}, [True, False, True]) == {'a': 0, 'c': 1}

    mask = np.array([True, False, True])
    assert update_dict_according_to_mask({'a': 0, 'b': 1, 'c': 2}, mask) == {'a': 0, 'c': 1}
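
update_dict_according_to_mask itself is not shown here. A sketch consistent with the test above
(a hedged reconstruction, not the project's actual code): surviving features keep their relative
order and are re-numbered contiguously, as a vectorizer vocabulary requires.

import numpy as np

def update_dict_according_to_mask(vocabulary, mask):
    # mask[i] says whether the feature with old column index i survives
    mask = np.asarray(mask, dtype=bool)
    # old index -> new index: count the surviving columns up to and including
    # each position, then shift down by one to get 0-based indices
    new_index = np.cumsum(mask) - 1
    return {feature: int(new_index[old])
            for feature, old in vocabulary.items()
            if mask[old]}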
Code example #3
File: evaluate.py Project: mbatchkarov/dc_evaluation
def _cv_loop(config, cv_i, score_func, test_idx, train_idx, X, y):
    logging.info('Starting CV fold %d', cv_i)
    pipeline, fit_params = _build_pipeline(config, cv_i)

    # code below is a simplified version of sklearn's _cross_val_score
    train_text = [X[idx] for idx in train_idx]
    test_text = [X[idx] for idx in test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    # vectorize all data in advance; it is the same across all classifiers
    tr_matrix = pipeline.fit_transform(train_text, y_train, **fit_params)
    # Update the vocabulary of the vectorizer. The feature selector may remove some vocabulary entries,
    # but the vectorizer will be unaware of this. Because the vectorizer's logic is conditional on whether
    # features are in-vocabulary (IV) or out-of-vocabulary (OOV), this is a problem. (The issue is caused by
    # the fact that the vectorizer and feature selector are not independent. The special logic of the
    # vectorizer should have been implemented as a third transformer, but that would require too much work
    # at this time.)
    if 'fs' in pipeline.named_steps:
        pipeline.named_steps['vect'].vocabulary_ = pipeline.named_steps['fs'].vocabulary_
    test_matrix = pipeline.transform(test_text)
    stats = pipeline.named_steps['vect'].stats

    # remove documents with too few features
    to_keep_train = tr_matrix.sum(axis=1) >= config['min_train_features']
    to_keep_train = np.ravel(np.array(to_keep_train))
    logging.info('%d/%d train documents have enough features', sum(to_keep_train), len(y_train))
    tr_matrix = tr_matrix[to_keep_train, :]
    y_train = y_train[to_keep_train]

    # the slice above may remove all occurrences of a feature,
    # e.g. when it only occurs in one document (very common) and the document
    # doesn't have enough features. Drop empty columns in the term-doc matrix
    column_mask = tr_matrix.sum(axis=0) > 0
    column_mask = np.squeeze(np.array(column_mask))
    tr_matrix = tr_matrix[:, column_mask]

    voc = update_dict_according_to_mask(pipeline.named_steps['vect'].vocabulary_, column_mask)
    inv_voc = {index: feature for (feature, index) in voc.items()}

    # do the same for the test set
    to_keep_test = test_matrix.sum(axis=1) >= config['min_test_features']  # todo need unit test
    to_keep_test = np.ravel(np.array(to_keep_test))
    logging.info('%d/%d test documents have enough features', np.count_nonzero(to_keep_test), len(y_test))
    test_matrix = test_matrix[to_keep_test, :]
    y_test = y_test[to_keep_test]

    np.savetxt(os.path.join(config['output_dir'], 'gold-cv%d.csv' % cv_i),
               y_test, delimiter=',', fmt="%s")

    scores_this_cv_run = []
    for clf in _build_classifiers(config['classifiers']):
        if not (np.count_nonzero(to_keep_train) and np.count_nonzero(to_keep_test)):
            logging.error("There isn't enough train or test data for a proper evaluation, skipping this fold!")
            continue  # if there's no training data or test data, just ignore the fold
        logging.info('Starting training of %s', clf)
        clf = clf.fit(tr_matrix, y_train)
        predictions = clf.predict(test_matrix)
        scores = score_func(y_test, predictions)

        tr_set_scores = score_func(y_train, clf.predict(tr_matrix))
        logging.info('Training set scores: %r', tr_set_scores)
        clf_name = clf.__class__.__name__.split('.')[-1]
        np.savetxt(os.path.join(config['output_dir'], 'predictions-%s-cv%d.csv' % (clf_name, cv_i)),
                   predictions, delimiter=',', fmt="%s")

        if config['debug_level'] > 1:
            # if a feature selector exists, use its vocabulary
            # step_name = 'fs' if 'fs' in pipeline.named_steps else 'vect'
            with open('%s.%s.pkl' % (stats.prefix, clf_name), 'wb') as outf:
                logging.info('Pickling trained classifier to %s', outf.name)
                b = Bunch(clf=clf, inv_voc=inv_voc, tr_matrix=tr_matrix,
                          test_matrix=test_matrix, predictions=predictions,
                          y_tr=y_train, y_ev=y_test, train_mask=to_keep_train,
                          test_mask=to_keep_test)
                pickle.dump(b, outf)

        for metric, score in scores.items():
            scores_this_cv_run.append(
                [type(clf).__name__,
                 cv_i,
                 metric.split('.')[-1],
                 score])
        logging.info('Done with %s', clf)
    logging.info('Finished CV fold %d', cv_i)
    try:
        v = shared_fit_args['vector_source']
        logging.info('Cache info: %s', v.get_nearest_neighbours.cache_info())
    except Exception:
        # can fail for a number of reasons, don't care much if it does
        pass
    return scores_this_cv_run
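
_cv_loop handles a single fold, so some driver must build the splits and fan the folds out. A
hedged sketch of what such a driver could look like; run_all_folds, the joblib usage and the
StratifiedKFold settings are illustrative assumptions, not code from dc_evaluation:

from joblib import Parallel, delayed
from sklearn.model_selection import StratifiedKFold

def run_all_folds(config, score_func, X, y, n_folds=5, n_jobs=-1):
    # One _cv_loop call per fold; each call returns rows of
    # [classifier, fold, metric, score], flattened here into a single table.
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    per_fold = Parallel(n_jobs=n_jobs)(
        delayed(_cv_loop)(config, i, score_func, test_idx, train_idx, X, y)
        for i, (train_idx, test_idx) in enumerate(cv.split(X, y)))
    return [row for fold_scores in per_fold for row in fold_scores]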