def get_temporal_mnist_dataset(smoothing_steps=1000, **mnist_kwargs):
    # Build a "temporal MNIST" dataset.  get_mnist_dataset, temporalize and
    # DataSet are assumed to come from the surrounding module.  temporalize
    # returns an index permutation over the samples (presumably ordering
    # similar frames consecutively, with smoothing_steps controlling the
    # smoothness), which is then used to re-order both inputs and labels.
    tr_x, tr_y, ts_x, ts_y = get_mnist_dataset(**mnist_kwargs).xyxy
    tr_ixs = temporalize(tr_x, smoothing_steps=smoothing_steps)
    ts_ixs = temporalize(ts_x, smoothing_steps=smoothing_steps)
    return DataSet.from_xyxy(tr_x[tr_ixs], tr_y[tr_ixs], ts_x[ts_ixs],
                             ts_y[ts_ixs])
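
A minimal usage sketch (the parameter value is illustrative; .xyxy is the same
accessor the function itself uses to unpack a DataSet into train/test arrays):

    dataset = get_temporal_mnist_dataset(smoothing_steps=1000)
    tr_x, tr_y, ts_x, ts_y = dataset.xyxy
    # Consecutive entries of tr_x should now form a smoothly-varying sequence.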
Example #2
def get_20_newsgroups_dataset(filter_most_common=2000,
                              numeric=False,
                              shuffling_seed=1234,
                              bag_of_words=False,
                              count_scaling=None):
    """
    The 20 newsgroups dataset.  The task is to predict which newsgroup (topic) a
    post came from, given the words that the post contains.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not among the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: If True, return one vector of word counts per post instead of the list of words
    :param count_scaling: If using bag_of_words, setting this to 'log' applies the transformation:
        vector = log(1 + word_counts)
        to the input data (this scaling makes the counts more suitable for some types of classifiers).
    :return: A DataSet object
    """

    training_set_file = get_file(
        relative_name='data/20ng-train-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt')

    test_set_file = get_file(
        relative_name='data/20ng-test-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt')

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle: the raw files are sorted by topic.
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Restrict the vocabulary according to filter_most_common (see docstring).
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words,
                                               filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
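        # Map each post to a list of integer word indices, and each topic label to an int.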
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words,
                                                       filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words,
                                                      filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
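            # Collapse each post's index list into a fixed-length vector of word counts.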
            train_counts = _list_of_ixs_to_count_matrix(
                train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(
                test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1 + train_counts)
                test_counts = np.log(1 + test_counts)
            return DataSet.from_xyxy(training_inputs=train_counts,
                                     training_targets=train_labels,
                                     test_inputs=test_counts,
                                     test_targets=test_labels)
        else:
            return DataSet.from_xyxy(training_inputs=train_ixs_list,
                                     training_targets=train_labels,
                                     test_inputs=test_ixs_list,
                                     test_targets=test_labels)
    else:
        return DataSet.from_xyxy(training_inputs=train_words,
                                 training_targets=train_labels,
                                 test_inputs=test_words,
                                 test_targets=test_labels)
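
A minimal usage sketch of the bag-of-words path (values are illustrative, and
the .xyxy accessor is assumed to mirror DataSet.from_xyxy, as in the
temporal-MNIST example above):

    ds = get_20_newsgroups_dataset(filter_most_common=2000,
                                   bag_of_words=True,
                                   count_scaling='log')
    tr_x, tr_y, ts_x, ts_y = ds.xyxy
    # tr_x is an (n_train_posts, vocab_size) matrix of log(1 + count) features;
    # tr_y holds integer topic labels, since bag_of_words takes the numeric branch.
    # To skip the 10 most frequent words instead, pass filter_most_common=(10, 2000).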