def get_sparse_repr(docs, V, sort_data):
    """Build a bag-of-words count matrix and vocabulary from raw documents.

    The documents are vectorized with scikit-learn's ``CountVectorizer``
    using its built-in English stop-word list and a vocabulary capped at
    ``V`` entries. Before tokenization each document goes through the
    vectorizer's default preprocessor and then has space-delimited digit
    runs replaced by the placeholder ``anumber``.

    Args:
        docs: Collection of documents handed directly to
            ``CountVectorizer.fit_transform``.
        V: Maximum vocabulary size (passed as ``max_features``).
        sort_data: If truthy, the counts and vocabulary are passed through
            ``sort_vocab`` and the result is checked with
            ``is_column_sorted``.

    Returns:
        A ``(counts, words)`` pair: the document-term count matrix cast to
        ``np.uint32`` and the vocabulary terms for its columns. When
        ``sort_data`` is set, both are whatever ``sort_vocab`` returns.
    """
    # Imported lazily so scikit-learn is only required when this is called.
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(stop_words="english", max_features=V)
    default_preproc = vectorizer.build_preprocessor()

    # Compiled once here rather than looked up again for every document.
    # NOTE(review): the match swallows the leading space but the replacement
    # does not restore it, so "foo 12 bar" becomes "fooanumber bar". Kept
    # as-is to avoid changing the learned vocabulary -- confirm intent.
    number_pattern = re.compile(r' \d+ ')

    def preproc(s):
        return number_pattern.sub('anumber ', default_preproc(s))

    vectorizer.preprocessor = preproc

    counts = vectorizer.fit_transform(docs).astype(np.uint32)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer the replacement and convert its ndarray to a plain list of
    # str so the return type matches what the legacy method produced.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    if sort_data:
        counts, words = sort_vocab(counts, words)
        # Internal sanity check on the helper's output, not input validation.
        assert is_column_sorted(counts)

    print('loaded {} documents with a size {} vocabulary'.format(*counts.shape))
    print('with {} words per document on average'.format(np.mean(counts.sum(1))))
    print()

    return counts, words