def get_temporal_mnist_dataset(smoothing_steps=1000, **mnist_kwargs):
    """
    Return an MNIST DataSet whose samples are reordered so that consecutive
    samples vary smoothly (a "temporal" ordering produced by ``temporalize``).

    :param smoothing_steps: Number of smoothing steps forwarded to ``temporalize``.
    :param mnist_kwargs: Extra keyword arguments forwarded to ``get_mnist_dataset``.
    :return: A DataSet with temporally-ordered training and test sets.
    """
    x_train, y_train, x_test, y_test = get_mnist_dataset(**mnist_kwargs).xyxy
    # Compute the reordering indices separately for the training and test sets.
    train_order = temporalize(x_train, smoothing_steps=smoothing_steps)
    test_order = temporalize(x_test, smoothing_steps=smoothing_steps)
    return DataSet.from_xyxy(
        x_train[train_order], y_train[train_order],
        x_test[test_order], y_test[test_order],
        )
def get_20_newsgroups_dataset(filter_most_common=2000, numeric=False, shuffling_seed=1234, bag_of_words=False, count_scaling=None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words contained in
    posts in the forums.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not in the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation:
        vector = log(1+word_counts)
        To generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """
    training_set_file = get_file(
        relative_name='data/20ng-train-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
        )
    test_set_file = get_file(
        relative_name='data/20ng-test-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
        )
    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up, because the raw files are sorted by topic.
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter out most-common-but-not-too-common-words.  The per-word counts
    # returned alongside the vocabulary are not needed here.
    all_train_words = np.concatenate(train_words)
    filtered_vocab, _ = _find_most_common(all_train_words, filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if not (numeric or bag_of_words):
        # Plain representation: lists of (filtered) word strings, string labels.
        return DataSet.from_xyxy(training_inputs=train_words, training_targets=train_labels,
                                 test_inputs=test_words, test_targets=test_labels)

    # Numeric representation: posts become lists of vocabulary indices and
    # labels become integer class ids (indexed by sorted unique training label).
    train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
    test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
    label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
    train_labels = _words_to_ints(train_labels, label_vocab)
    test_labels = _words_to_ints(test_labels, label_vocab)

    if not bag_of_words:
        return DataSet.from_xyxy(training_inputs=train_ixs_list, training_targets=train_labels,
                                 test_inputs=test_ixs_list, test_targets=test_labels)

    # Bag-of-words representation: one count vector per post, optionally
    # log(1+count) scaled.
    train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
    test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
    if count_scaling == 'log':
        train_counts = np.log(1 + train_counts)
        test_counts = np.log(1 + test_counts)
    return DataSet.from_xyxy(training_inputs=train_counts, training_targets=train_labels,
                             test_inputs=test_counts, test_targets=test_labels)
# NOTE(review): this is a token-for-token duplicate of the
# get_20_newsgroups_dataset defined earlier in this file.  Because it is
# defined second, it silently overrides the first definition at import time.
# One of the two copies should be deleted.
def get_20_newsgroups_dataset(filter_most_common = 2000, numeric = False, shuffling_seed = 1234, bag_of_words = False, count_scaling = None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words contained in
    posts in the forums.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not in the N most common workds
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation:
        vector = log(1+word_counts)
        To generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """
    # Download (or reuse cached copies of) the pre-stemmed train/test files.
    training_set_file = get_file(
        relative_name = 'data/20ng-train-stemmed.txt',
        url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
        )
    test_set_file = get_file(
        relative_name = 'data/20ng-test-stemmed.txt',
        url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
        )
    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels =_shuffle((train_words, train_labels), rng)
    test_words, test_labels =_shuffle((test_words, test_labels), rng)

    # Filter out most-common-but-not-too-common-words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words, filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        # Convert posts to lists of vocabulary indices and labels to ints.
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
            # One count vector per post, optionally log(1+count) scaled.
            train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1+train_counts)
                test_counts = np.log(1+test_counts)
            return DataSet.from_xyxy(training_inputs = train_counts, training_targets = train_labels, test_inputs = test_counts, test_targets = test_labels)
        else:
            return DataSet.from_xyxy(training_inputs = train_ixs_list, training_targets = train_labels, test_inputs = test_ixs_list, test_targets = test_labels)
    else:
        # Plain representation: lists of (filtered) word strings, string labels.
        return DataSet.from_xyxy(training_inputs = train_words, training_targets = train_labels, test_inputs = test_words, test_targets = test_labels)