# imports this excerpt relies on; _clean_nans, _build_pipeline, _build_classifiers and
# shared_fit_args are assumed to be defined elsewhere in the module
import logging
import os
import pickle

import numpy as np
from sklearn.utils import Bunch


def _get_support_mask(self):
    k = self.k
    chi2_scores = self.scores_
    chi2_mask = np.ones(chi2_scores.shape, dtype=bool)

    if k != 'all' and k < len(chi2_scores):
        # we don't want to keep all features, and the number we want is
        # less than the number available
        chi2_scores = _clean_nans(chi2_scores)
        # mark all but the k highest-scoring features for removal
        low_score_indices = np.argsort(chi2_scores)[:len(chi2_scores) - k]
        chi2_mask[low_score_indices] = False

    mask = chi2_mask & self.vectors_mask & self.log_odds_mask
    logging.info('%d/%d features survived feature selection',
                 np.count_nonzero(mask), len(mask))

    # Only keep the scores of the features that survived. This array is used to check that the
    # input data shape matches at train and decode time. However, because the post-feature-selection
    # vocabulary is passed back into the vectorizer, at decode time the input will likely be
    # smaller. This is like doing feature selection in the vectorizer.
    self.scores_ = self.scores_[mask]
    self.log_odds_mask = self.log_odds_mask[mask]
    self.vectors_mask = self.vectors_mask[mask]

    self.vocabulary_ = update_dict_according_to_mask(self.vocabulary_, mask)
    return mask
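
# update_dict_according_to_mask is used above but not defined in this section. The minimal
# sketch below is an assumption, inferred from the unit test that follows: drop the vocabulary
# entries whose mask value is False and re-index the survivors so the values are contiguous
# again, matching the columns of the masked term-document matrix.
def update_dict_according_to_mask(vocabulary, mask):
    # keep features whose column survived the mask, in their original index order
    kept = [feature for feature, index in sorted(vocabulary.items(), key=lambda x: x[1])
            if mask[index]]
    # re-assign contiguous indices to the surviving features
    return {feature: new_index for new_index, feature in enumerate(kept)}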
def test_update_dict_according_to_mask():
    # the mask may be a plain list or a numpy boolean array
    assert update_dict_according_to_mask({'a': 0, 'b': 1, 'c': 2},
                                         [True, False, True]) == {'a': 0, 'c': 1}

    mask = np.array([True, False, True])
    assert update_dict_according_to_mask({'a': 0, 'b': 1, 'c': 2}, mask) == {'a': 0, 'c': 1}
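
# A couple of extra edge-case checks (author's assumption about the intended semantics:
# an all-True mask should be a no-op and an all-False mask should empty the vocabulary).
def test_update_dict_with_degenerate_masks():
    voc = {'a': 0, 'b': 1, 'c': 2}
    assert update_dict_according_to_mask(voc, [True, True, True]) == voc
    assert update_dict_according_to_mask(voc, [False, False, False]) == {}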
def _cv_loop(config, cv_i, score_func, test_idx, train_idx, X, y):
    logging.info('Starting CV fold %d', cv_i)
    pipeline, fit_params = _build_pipeline(config, cv_i)

    # the code below is a simplified version of sklearn's _cross_val_score
    train_text = [X[idx] for idx in train_idx]
    test_text = [X[idx] for idx in test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    # vectorize all data in advance; it is the same across all classifiers
    tr_matrix = pipeline.fit_transform(train_text, y_train, **fit_params)

    # Update the vocabulary of the vectorizer. The feature selector may remove some vocabulary
    # entries, but the vectorizer will be unaware of this. Because the vectorizer's logic is
    # conditional on whether features are IV or OOV, this is a problem. (The issue is caused by
    # the fact that the vectorizer and feature selector are not independent. The special logic
    # of the vectorizer should have been implemented as a third transformer, but this would
    # require too much work at this time.)
    if 'fs' in pipeline.named_steps:
        pipeline.named_steps['vect'].vocabulary_ = pipeline.named_steps['fs'].vocabulary_
    test_matrix = pipeline.transform(test_text)
    stats = pipeline.named_steps['vect'].stats

    # remove documents with too few features
    to_keep_train = tr_matrix.sum(axis=1) >= config['min_train_features']
    to_keep_train = np.ravel(np.array(to_keep_train))
    logging.info('%d/%d train documents have enough features',
                 np.count_nonzero(to_keep_train), len(y_train))
    tr_matrix = tr_matrix[to_keep_train, :]
    y_train = y_train[to_keep_train]

    # The slice above may remove all occurrences of a feature, e.g. when it only occurs in one
    # document (very common) and that document doesn't have enough features. Drop the resulting
    # empty columns in the term-document matrix.
    column_mask = tr_matrix.sum(axis=0) > 0
    column_mask = np.squeeze(np.array(column_mask))
    tr_matrix = tr_matrix[:, column_mask]
    voc = update_dict_according_to_mask(pipeline.named_steps['vect'].vocabulary_, column_mask)
    inv_voc = {index: feature for (feature, index) in voc.items()}

    # do the same for the test set
    # TODO: this needs a unit test; a sketch is given after this function
    to_keep_test = test_matrix.sum(axis=1) >= config['min_test_features']
    to_keep_test = np.ravel(np.array(to_keep_test))
    logging.info('%d/%d test documents have enough features',
                 np.count_nonzero(to_keep_test), len(y_test))
    test_matrix = test_matrix[to_keep_test, :]
    y_test = y_test[to_keep_test]

    np.savetxt(os.path.join(config['output_dir'], 'gold-cv%d.csv' % cv_i),
               y_test, delimiter=',', fmt="%s")

    scores_this_cv_run = []
    for clf in _build_classifiers(config['classifiers']):
        if not (np.count_nonzero(to_keep_train) and np.count_nonzero(to_keep_test)):
            # if there is no training data or no test data, just skip the fold
            logging.error("There isn't enough training or test data for a proper evaluation, "
                          "skipping this fold!")
            continue

        logging.info('Starting training of %s', clf)
        clf = clf.fit(tr_matrix, y_train)
        predictions = clf.predict(test_matrix)
        scores = score_func(y_test, predictions)

        tr_set_scores = score_func(y_train, clf.predict(tr_matrix))
        logging.info('Training set scores: %r', tr_set_scores)

        clf_name = clf.__class__.__name__.split('.')[-1]
        np.savetxt(os.path.join(config['output_dir'],
                                'predictions-%s-cv%d.csv' % (clf_name, cv_i)),
                   predictions, delimiter=',', fmt="%s")

        if config['debug_level'] > 1:
            with open('%s.%s.pkl' % (stats.prefix, clf_name), 'wb') as outf:
                logging.info('Pickling trained classifier to %s', outf.name)
                b = Bunch(clf=clf, inv_voc=inv_voc, tr_matrix=tr_matrix,
                          test_matrix=test_matrix, predictions=predictions,
                          y_tr=y_train, y_ev=y_test,
                          train_mask=to_keep_train, test_mask=to_keep_test)
                pickle.dump(b, outf)

        for metric, score in scores.items():
            scores_this_cv_run.append([type(clf).__name__, cv_i, metric.split('.')[-1], score])
        logging.info('Done with %s', clf)

    logging.info('Finished CV fold %d', cv_i)
    try:
        v = shared_fit_args['vector_source']
        logging.info('Cache info: %s', v.get_nearest_neighbours.cache_info())
    except Exception:
        # this can fail for a number of reasons; we don't care much if it does
        pass
    return scores_this_cv_run
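
# A sketch of the unit test the TODO in _cv_loop asks for: the row-filtering logic, applied to
# a small sparse matrix. The threshold of 2 stands in for config['min_test_features'] and is an
# arbitrary illustrative value, not one taken from any real configuration.
def test_documents_with_too_few_features_are_dropped():
    from scipy.sparse import csr_matrix

    # three documents with 3, 1 and 2 non-zero features respectively
    test_matrix = csr_matrix(np.array([[1, 1, 1],
                                       [1, 0, 0],
                                       [0, 1, 1]]))
    to_keep_test = test_matrix.sum(axis=1) >= 2
    to_keep_test = np.ravel(np.array(to_keep_test))

    # only the middle document falls below the threshold
    assert to_keep_test.tolist() == [True, False, True]
    assert test_matrix[to_keep_test, :].shape == (2, 3)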