def get_cifar_10_dataset(n_training_samples=None, n_test_samples=None, normalize_inputs=False):
    """
    :param n_training_samples: Number of training samples, or None to leave it at 50000
    :param n_test_samples: Number of test samples, or None to leave it at 10000
    :param normalize_inputs: True to normalize inputs, and turn them from uint8 to double
    :return: The CIFAR-10 dataset, which consists of 50000 training and 10000 test images.
        Images are 32x32 uint8 RGB images (n_samples, 3, 32, 32) of 10 categories of objects.
        Targets are integer labels in the range [0, 9]
    """
    # TODO: Make method for downloading/unpacking data (from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz)
    # We have this for single files already, but in this case the gz contains a folder with the files in it.
    directory = get_archive(relative_path='data/cifar-10', url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')
    n_batches_to_read = 5 if n_training_samples is None else int(np.ceil(n_training_samples / 10000.))
    file_paths = [get_file(os.path.join(directory, 'cifar-10-batches-py', 'data_batch_%s' % (i, ))) for i in xrange(1, n_batches_to_read + 1)] \
        + [get_file(os.path.join(directory, 'cifar-10-batches-py', 'test_batch'))]
    data = []
    for file_path in file_paths:
        with open(file_path) as f:
            batch_data = pickle.load(f)
        data.append(batch_data)
    x_tr = np.concatenate([d['data'] for d in data[:-1]], axis=0).reshape(-1, 3, 32, 32)
    y_tr = np.concatenate([d['labels'] for d in data[:-1]], axis=0)
    x_ts = data[-1]['data'].reshape(-1, 3, 32, 32)
    y_ts = np.array(data[-1]['labels'])
    if normalize_inputs:
        mean = x_tr.mean(axis=0, keepdims=True)
        std = x_tr.std(axis=0, keepdims=True)
        x_tr = (x_tr - mean) / std
        x_ts = (x_ts - mean) / std
    if n_training_samples is not None:
        x_tr = x_tr[:n_training_samples]
        y_tr = y_tr[:n_training_samples]
    if n_test_samples is not None:
        x_ts = x_ts[:n_test_samples]
        y_ts = y_ts[:n_test_samples]
    return DataSet(training_set=DataCollection(x_tr, y_tr), test_set=DataCollection(x_ts, y_ts), name='CIFAR-10')
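# A minimal usage sketch (illustrative only, not part of the loader above).  It just exercises
# get_cifar_10_dataset with its own keyword arguments; the shape comments follow directly from the
# reshape(-1, 3, 32, 32) calls in the loader.
def demo_cifar_10():
    cifar = get_cifar_10_dataset(n_training_samples=1000, n_test_samples=200, normalize_inputs=True)
    # 'cifar' wraps DataCollection(x_tr, y_tr) and DataCollection(x_ts, y_ts), where x_tr has shape
    # (1000, 3, 32, 32), x_ts has shape (200, 3, 32, 32), and targets are integer labels in [0, 9].
    return cifar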
def get_normalized_vgg_net(up_to_layer=None, force_shared_parameters=True):
    """
    Load the normalized version of VGG19 discussed here: https://bethgelab.org/deepneuralart/

    :param up_to_layer: Name of the layer to stop at (must come before 'fc6'), or a list of names,
        in which case the network is built up to the deepest of those layers.
    """
    norm_vgg19_file = get_file(
        relative_name='data/norm-vgg-19.pkl',
        url='https://s3.amazonaws.com/lasagne/recipes/pretrained/imagenet/vgg19_normalized.pkl',
        )
    with open(norm_vgg19_file) as f:
        vgg_struct = pickle.load(f)
    layer_names = ['conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1',
                   'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
                   'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3',
                   'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4',
                   'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4', 'pool5',
                   'fc6', 'relu6', 'fc7', 'relu7', 'fc8', 'prob']
    if isinstance(up_to_layer, list):
        up_to_layer = up_to_layer[np.argmax([layer_names.index(layer_name) for layer_name in up_to_layer])]
    assert up_to_layer is not None and layer_names.index(up_to_layer) < layer_names.index('fc6'), \
        "This can only be used to load the convolutional portion of vggnet.  Set up_to_layer to a layer before 'fc6'."
    net_spec = OrderedDict()
    param_iterator = (p for p in vgg_struct['param values'])
    for layer_name in layer_names:
        if layer_name.startswith('conv'):
            w = param_iterator.next()
            b = param_iterator.next()
            assert w.ndim == 4 and b.ndim == 1
            layer = ConvolverSpec(w=w, b=b, mode='same')
        elif layer_name.startswith('relu'):
            layer = NonlinearitySpec('relu')
        elif layer_name.startswith('pool'):
            layer = PoolerSpec(region=2, stride=2, mode='max')
        elif layer_name.startswith('fc'):
            w = param_iterator.next()
            b = param_iterator.next()
            # Here we'll express the "full" layers as convolutional.
            if layer_name == 'fc6':
                w = w.T.reshape(4096, 512, 7, 7)
            elif layer_name == 'fc7':
                w = w.T.reshape(4096, 4096, 1, 1)
            elif layer_name == 'fc8':
                w = w.T.reshape(1000, 4096, 1, 1)
            else:
                bad_value(layer_name)
            layer = ConvolverSpec(w=w, b=b, mode='valid')
        elif layer_name == 'prob':
            layer = NonlinearitySpec('softmax')
        else:
            raise Exception("Don't know how to handle layer: '%s'" % (layer_name, ))
        net_spec[layer_name] = layer
        if layer_name == up_to_layer:
            break
    if up_to_layer is None:
        assert_raises(StopIteration)
    return net_spec
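# Usage sketch (illustrative): the assert in get_normalized_vgg_net requires stopping before 'fc6',
# so a typical call loads just the convolutional portion, e.g. everything up to 'pool5'.
def demo_normalized_vgg_net():
    net_spec = get_normalized_vgg_net(up_to_layer='pool5')
    # net_spec is an OrderedDict mapping layer names ('conv1_1', 'relu1_1', ..., 'pool5') to
    # ConvolverSpec / NonlinearitySpec / PoolerSpec objects, in network order.
    return net_spec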
def smart_file(location, use_cache=False, make_dir=False):
    """
    :param location: Specifies where the file is.
        If it's formatted as a url, it's downloaded.
        If it begins with a "/", it's assumed to be a local path.
        Otherwise, it is assumed to be referenced relative to the data directory.
    :param use_cache: If True, and the location is a url, make a local cache of the file for future use
        (note: if the file at this url changes, the cached file will not).
    :param make_dir: Make the directory for this file, if it does not exist.
    :yield: The local path to the file.
    """
    its_a_url = is_url(location)
    if its_a_url:
        assert not make_dir, "We cannot 'make the directory' for a URL"
        local_path = get_file(url=location, use_cache=use_cache)
    else:
        local_path = get_artemis_data_path(location)
        if make_dir:
            make_file_dir(local_path)
    yield local_path
    if its_a_url and not use_cache:
        os.remove(local_path)
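# Usage sketch (illustrative): smart_file is a generator that yields the local path and, when resumed,
# removes uncached downloads.  In the full module it is presumably wrapped with contextlib.contextmanager
# (an assumption, not shown above); absent that, it can be driven directly as below.  The URL here is
# purely illustrative.
def demo_smart_file(location='https://example.com/some_data.csv'):
    handle = smart_file(location)
    local_path = next(handle)  # Resolve (and possibly download) the file, getting its local path
    with open(local_path) as f:
        contents = f.read()
    for _ in handle:  # Resume the generator so it can clean up an uncached download
        pass
    return contents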
def get_vgg_layer_specifiers(up_to_layer=None):
    """
    Load the 19-layer VGGNet from the mat file and produce a list of layer specifications which can be used
    to create layers in your architecture of choice.
    Info: https://gist.github.com/ksimonyan/3785162f95cd2d5fee77#file-readme-md
    More Details: http://cs231n.github.io/convolutional-networks/#case

    :param up_to_layer: The layer to stop at.  Or a list of layers, in which case the network will go to the highest.
        Layers are identified by their string names:
        ['conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2',
         'pool2', 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4',
         'pool3', 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4',
         'pool4', 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4',
         'pool5', 'fc6', 'relu6', 'fc7', 'relu7', 'fc8', 'prob']
    :return: An OrderedDict<str, PrimativeSpecifier> where PrimativeSpecifier objects represent the layers of the network.
    """
    filename = get_file(
        relative_name='data/vgg-19.mat',
        url='http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat',
        )
    network_params = loadmat(filename)

    def struct_to_layer(struct):
        layer_type = struct[1][0]
        layer_name = str(struct[0][0])
        assert isinstance(layer_type, basestring)
        if layer_type == 'conv':
            w_orig = struct[2][0, 0]  # (n_rows, n_cols, n_in_maps, n_out_maps)
            w = w_orig.T.swapaxes(2, 3)
            b = struct[2][0, 1][:, 0]
            layer = ConvolverSpec(w=w, b=b,
                mode='valid' if layer_name.startswith('fc') else 'same' if layer_name.startswith('conv') else bad_value(layer_name))
        elif layer_type in ('relu', 'softmax'):
            layer = NonlinearitySpec(layer_type)
        elif layer_type == 'pool':
            layer = PoolerSpec(
                region=tuple(struct[3][0].astype(int)),
                stride=tuple(struct[4][0].astype(int)),
                mode=struct[2][0])
        else:
            raise Exception("Don't know about this '%s' layer type." % layer_type)
        return layer_name, layer

    print 'Loading VGG Net...'
    network_layers = OrderedDict(struct_to_layer(network_params['layers'][0, i][0, 0]) for i in xrange(network_params['layers'].shape[1]))
    if up_to_layer is not None:
        if isinstance(up_to_layer, (list, tuple)):
            up_to_layer = network_layers.keys()[max(network_layers.keys().index(layer_name) for layer_name in up_to_layer)]
        layer_names = [network_params['layers'][0, i][0, 0][0][0] for i in xrange(network_params['layers'].shape[1])]
        network_layers = OrderedDict((k, network_layers[k]) for k in layer_names[:layer_names.index(up_to_layer) + 1])
    print 'Done.'
    return network_layers
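# Usage sketch (illustrative): load the VGG-19 specifiers only up to a named layer.  Passing a list
# stops at whichever of the named layers occurs last in the network ('relu4_2' here).
def demo_vgg_layer_specifiers():
    layers = get_vgg_layer_specifiers(up_to_layer=['relu4_2', 'pool3'])
    # 'layers' is an OrderedDict of ConvolverSpec / NonlinearitySpec / PoolerSpec objects, in network
    # order, which can be assembled into an actual network in your framework of choice.
    return layers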
def get_vggnet_labels():
    file_loc = get_file(
        relative_name='data/labels.txt',
        url='https://raw.githubusercontent.com/HoldenCaulfieldRye/caffe/master/data/ilsvrc12/synset_words.txt')
    with open(file_loc) as f:
        lines = f.readlines()
    labels = [line[10:-1] for line in lines]  # Drop the 9-character synset id, the following space, and the trailing newline
    return labels
def get_imagenet_label_names():
    url = 'https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/596b27d23537e5a1b5751d2b0481ef172f58b539/imagenet1000_clsid_to_human.txt'
    with open(get_file('data/imagenet/labels.json', url=url)) as f:
        label_items = f.read()
    # Each line has the form "<class_id>: '<human readable name>',"; keep the text after the colon,
    # stripping the surrounding quotes, commas and braces.
    labels = [line[line.index(':')+1:].lstrip(' \'').rstrip('}, \'') for line in label_items.split('\n')]
    return labels
def _read_formatted_file(file_relative_path):
    with open(get_file(file_relative_path)) as f:
        text = f.read()
    pairs = [line.split('\t') for line in text.split('\n')[:-1]]
    labels = [group for group, _ in pairs]
    words = [sentence.split(' ') for _, sentence in pairs]
    return words, labels
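# A small illustrative sketch of the line format _read_formatted_file assumes:
# "<label>\t<space-separated stemmed words>", one example per line, with the file ending in a newline.
# The labels and sentences below are made up for illustration.
def demo_read_formatted_parsing():
    text = "sci.space\tnasa launch shuttle orbit\nrec.autos\tengine brake wheel\n"
    pairs = [line.split('\t') for line in text.split('\n')[:-1]]  # Same parsing as above
    labels = [group for group, _ in pairs]
    words = [sentence.split(' ') for _, sentence in pairs]
    assert labels == ['sci.space', 'rec.autos']
    assert words[0] == ['nasa', 'launch', 'shuttle', 'orbit']
    return words, labels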
def get_imagenet_images(indices):
    """
    Get imagenet images at the given indices.
    :param indices: A sequence of integer indices into the imagenet fall-2011 url list
    :return: A list of the loaded images
    """
    highest_index = np.max(indices)
    code_url_pairs = get_imagenet_fall11_urls(highest_index + 1)
    files = [get_file('data/imagenet/%s%s' % (code_url_pairs[index][0], os.path.splitext(code_url_pairs[index][1])[1]), code_url_pairs[index][1]) for index in indices]
    return [smart_load(f) for f in files]
def test_get_file():

    with hold_file_root(get_artemis_data_path('file_getter_tests'), delete_after=True, delete_before=True):

        print('Testing get_file on unnamed file')
        path = get_file(url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read() == 'a,b,c'

        # Should not download this time
        path = get_file(url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read() == 'a,b,c'

        print('Testing get_file on named file')
        path = get_file(relative_name='my-test.txt', url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read() == 'a,b,c'

        # Should not download this time
        path = get_file(relative_name='my-test.txt', url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read() == 'a,b,c'
def get_mnist_dataset(n_training_samples=None, n_test_samples=None, flat=False, join_train_and_val=False, binarize=False):
    """
    The MNIST DataSet - the Drosophila of machine learning.

    :param n_training_samples: Cap on the number of training samples
    :param n_test_samples: Cap on the number of test samples
    :param flat: Set to True if we just want flat 784-dimensional input data instead of 28x28 images.
    :param join_train_and_val: If true, merge the validation set into the training set (giving 60000 training
        examples and 10000 test examples).  Otherwise you'll have 50000 training, 10000 validation, 10000 test.
    :param binarize: Binarize inputs by thresholding them at 0.5
    :return: A DataSet object containing the MNIST data
    """
    filename = get_file(
        relative_name='data/mnist.pkl',
        url='http://deeplearning.net/data/mnist/mnist.pkl.gz',
        data_transformation=unzip_gz)
    with open(filename, 'rb') as f:
        # data = pickle.load(f, encoding='latin1')
        data = np.load(f, encoding='latin1')
    x_tr, y_tr = data[0] if n_training_samples is None else (data[0][0][:n_training_samples], data[0][1][:n_training_samples])
    x_ts, y_ts = data[1] if n_test_samples is None else (data[1][0][:n_test_samples], data[1][1][:n_test_samples])
    x_vd, y_vd = data[2]
    if not flat:
        x_tr = x_tr.reshape(-1, 28, 28)
        x_ts = x_ts.reshape(-1, 28, 28)
        x_vd = x_vd.reshape(-1, 28, 28)
    if binarize:
        x_tr = x_tr > 0.5
        x_ts = x_ts > 0.5
        x_vd = x_vd > 0.5
    return \
        DataSet(
            training_set=DataCollection(np.concatenate([x_tr, x_vd], axis=0), np.concatenate([y_tr, y_vd], axis=0)),
            test_set=DataCollection(x_ts, y_ts)
            ) \
        if join_train_and_val else \
        DataSet(
            training_set=DataCollection(x_tr, y_tr),
            test_set=DataCollection(x_ts, y_ts),
            validation_set=DataCollection(x_vd, y_vd)
            )
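# Usage sketch (illustrative): load flat, binarized MNIST with the validation set merged into training.
def demo_mnist():
    mnist = get_mnist_dataset(flat=True, binarize=True, join_train_and_val=True)
    # With these options the training inputs are 60000 boolean vectors of length 784, and the test
    # inputs are 10000 such vectors (shapes follow from the reshape and concatenate calls above).
    return mnist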
def get_imagenet_fall11_urls(n_images=None):
    if n_images is None:
        n_images = 14197121
    imagenet_urls = get_file(
        'data/imagnet_urls.txt',
        url='http://image-net.org/imagenet_data/urls/imagenet_fall11_urls.tgz',
        data_transformation=unzip_gz)
    print 'Loading %s image URLs....' % (n_images, )
    with open(imagenet_urls) as f:
        f.readline()
        lines = list(line for _, line in izip(xrange(n_images), f))
    indices = [s.index('\t') for s in lines]
    pairs = [(line[:s], line[s + 1:-1]) for line, s in zip(lines, indices)]
    print 'Done.'
    return pairs
def read_the_bible(max_characters=None):
    """
    Returns the King James Bible as a single string.

    Thanks to Janel (http://janelwashere.com/pages/bible_daily_reading.html) for compiling it.

    :param max_characters: You have the option to truncate it to a length of max_characters (If you're
        Jewish, for instance)
    :return: A string.
    """
    filename = get_file(
        relative_name='data/king_james_bible.txt',
        url='http://janelwashere.com/files/bible_daily.txt',
        )
    with open(filename) as f:
        text = f.read(-1 if max_characters is None else max_characters)
    return text
def read_fifty_shades_of_grey(max_characters=None):
    """
    Returns Fifty Shades of Grey, by EL James.
    :param max_characters: You have the option to truncate it to a length of max_characters
    :return: A string.
    """
    filename = get_file(relative_name='data/fifty_shades_of_grey.txt', url=None)
    with open(filename) as f:
        text = f.read(-1 if max_characters is None else max_characters)
    # Need to remove some weird non-ascii stuff.
    # http://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text
def get_image(name, size=None):
    """
    Get an image by name.
    :param name: A string identifying an image from our dictionary.
    :param size: Optionally, a new size to which the image is resized.
    :return: The image as a numpy array.
    """
    assert name in IMAGE_COLLECTION, "We don't have the image '%s' in the gallery" % (name, )
    _, ext = os.path.splitext(IMAGE_COLLECTION[name])
    relative_path = os.path.join('images', name) + ext
    filename = get_file(
        relative_name=relative_path,
        url=IMAGE_COLLECTION[name],
        )
    im_array = imread(filename)
    if im_array.ndim == 2:  # Broadcast greyscale images up to 3 colour channels
        im_array = im_array[:, :, None] + [0, 0, 0]
    if size is not None:
        im_array = imresize(im_array, get_new_size(im_array.shape[:2], new_size=size))
    return im_array
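# Usage sketch (illustrative): the name must be a key of IMAGE_COLLECTION, which is defined elsewhere
# in this module; 'lena' is only a guess at a plausible key.
def demo_get_image():
    im = get_image('lena', size=(128, 128))
    # 'im' is an image array resized via get_new_size; greyscale sources come back with 3 channels.
    return im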
def get_20_newsgroups_dataset(filter_most_common=2000, numeric=False, shuffling_seed=1234, bag_of_words=False, count_scaling=None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words
    contained in posts in the forums.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not in the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation:
        vector = log(1 + word_counts)
        to generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """
    training_set_file = get_file(
        relative_name='data/20ng-train-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
        )
    test_set_file = get_file(
        relative_name='data/20ng-test-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
        )

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter down to the most-common-but-not-too-common words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words, filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
            train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1 + train_counts)
                test_counts = np.log(1 + test_counts)
            return DataSet.from_xyxy(training_inputs=train_counts, training_targets=train_labels, test_inputs=test_counts, test_targets=test_labels)
        else:
            return DataSet.from_xyxy(training_inputs=train_ixs_list, training_targets=train_labels, test_inputs=test_ixs_list, test_targets=test_labels)
    else:
        return DataSet.from_xyxy(training_inputs=train_words, training_targets=train_labels, test_inputs=test_words, test_targets=test_labels)
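# Usage sketch (illustrative): log-scaled bag-of-words features over the most common words, suitable
# for a linear classifier.
def demo_20_newsgroups():
    dataset = get_20_newsgroups_dataset(filter_most_common=2000, bag_of_words=True, count_scaling='log')
    # Inputs are log(1 + count) matrices with one column per retained vocabulary word; targets are
    # integer topic labels.
    return dataset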