def Mnist(binary=True, flatten=False, onehot=True, datadir='.'):
    '''Download (if needed) and load the MNIST dataset.

    Args:
        binary (bool): threshold pixel values at 0.5 into {0, 1}
        flatten (bool): flatten each image to a 1-D vector of 784 values
        onehot (bool): one-hot encode the labels (10 classes)
        datadir (str): directory under which 'mnist/' is created

    Returns:
        (X_train, y_train, X_test, y_test)
    '''
    datadir += '/mnist/'
    url = 'http://yann.lecun.com/exdb/mnist'
    paths = []
    for fname in ['train-images-idx3-ubyte', 'train-labels-idx1-ubyte',
                  't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte']:
        path = get_mnist_file('{}/{}'.format(datadir, fname),
                              origin='{}/{}.gz'.format(url, fname))
        paths.append(path)
    # add a trailing channel axis: (N, 28, 28) -> (N, 28, 28, 1)
    X_train = read_mnist_images(paths[0], dtype='float32')[:, :, :, np.newaxis]
    y_train = read_mnist_labels(paths[1])
    X_test = read_mnist_images(paths[2], dtype='float32')[:, :, :, np.newaxis]
    y_test = read_mnist_labels(paths[3])
    if flatten:
        X_train = X_train.reshape(X_train.shape[0], np.prod(X_train.shape[1:]))
        X_test = X_test.reshape(X_test.shape[0], np.prod(X_test.shape[1:]))
    if binary:
        # BUGFIX: previously the threshold was applied to a concatenated
        # copy `X` that was never used, so binary=True silently did nothing.
        # Apply it to the arrays that are actually returned.
        X_train = (X_train >= 0.5).astype(int)
        X_test = (X_test >= 0.5).astype(int)
    if onehot:
        y_train = make_one_hot(y_train, 10)
        y_test = make_one_hot(y_test, 10)
    return X_train, y_train, X_test, y_test
def Cifar100(flatten=False, onehot=True, datadir='./cifar100/', fine_label=True):
    '''Download (if needed) and load the CIFAR-100 dataset.

    Args:
        flatten (bool): return flat (N, 3072) vectors instead of image arrays
        onehot (bool): one-hot encode the labels
        datadir (str): directory for the downloaded archive
        fine_label (bool): True (100 classes) False (20 classes)

    Returns:
        (X_train, y_train, X_test, y_test)
    '''
    url = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    save_path = '{}/cifar-100-python.tar.gz'.format(datadir)
    datadir = get_file_from_url(save_path=save_path, origin=url, untar=True)
    print('untar dir', datadir)
    sav_dir = datadir + '/cifar-100-python'
    # The class count and label key are fixed by fine_label; previously they
    # were pointlessly recomputed inside the batch loop (and an outer
    # nclass = None was dead code).
    nclass = 100 if fine_label else 20
    label_key = b'fine_labels' if fine_label else b'coarse_labels'

    def _load_pickle(fp):
        # CIFAR batches are python-2 pickles; on py3 the keys come back as
        # bytes, hence encoding='bytes'. Import hoisted out of the batch loop.
        with open(fp, 'rb') as fin:
            if sys.version_info.major == 2:
                import cPickle
                return cPickle.load(fin)
            elif sys.version_info.major == 3:
                import pickle
                return pickle.load(fin, encoding='bytes')
            else:
                raise Exception('python version not 2 or 3')

    def make_data(batchnames):
        X = []
        y = []
        for data_batch in batchnames:
            tbl = _load_pickle(sav_dir + '/' + data_batch)
            X.append(tbl[b'data'])
            y.append(tbl[label_key])
        X = np.concatenate(X, axis=0).astype('f4')
        y = np.concatenate(y, axis=0).astype('int')
        X /= 255.0  # scale raw pixel bytes to [0, 1]
        return X, y

    X_train, y_train = make_data(['train'])
    X_test, y_test = make_data(['test'])
    if onehot:
        y_train = make_one_hot(y_train, nclass)
        y_test = make_one_hot(y_test, nclass)
    if not flatten:
        # (N, 3072) -> (N, 3, 32, 32), then swap axes 1 and 3 (channels last)
        X_train = X_train.reshape((-1, 3, 32, 32)).swapaxes(1, 3)
        X_test = X_test.reshape((-1, 3, 32, 32)).swapaxes(1, 3)
    return X_train, y_train, X_test, y_test
def Cifar10(flatten=False, onehot=True, contrast_normalize=False, whiten=False, datadir='./cifar10/'):
    '''Download (if needed) and load the CIFAR-10 dataset.

    Args:
        flatten (bool): return flat (N, 3072) vectors instead of image arrays
        onehot (bool): one-hot encode the labels (10 classes)
        contrast_normalize (bool): apply global contrast normalization
        whiten (bool): apply ZCA whitening (cached on disk next to the data)
        datadir (str): directory for the downloaded archive

    Returns:
        (X_train, y_train, X_test, y_test)
    '''
    url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    save_path = '{}/cifar-10-python.tar.gz'.format(datadir)
    datadir = get_file_from_url(save_path=save_path, origin=url, untar=True)
    sav_dir = datadir + '/cifar-10-batches-py'

    def _load_pickle(fp):
        # CIFAR batches are python-2 pickles; on py3 the keys come back as
        # bytes, hence encoding='bytes'. The version check and module import
        # were previously repeated for every batch inside the loop.
        with open(fp, 'rb') as fin:
            if sys.version_info.major == 2:
                import cPickle
                return cPickle.load(fin)
            elif sys.version_info.major == 3:
                import pickle
                return pickle.load(fin, encoding='bytes')
            else:
                raise Exception('python version not 2 or 3')

    def make_data(batchnames):
        X = []
        y = []
        for data_batch in batchnames:
            tbl = _load_pickle(sav_dir + '/' + data_batch)
            X.append(tbl[b'data'])
            y.append(tbl[b'labels'])
        X = np.concatenate(X, axis=0).astype('f4')
        y = np.concatenate(y, axis=0).astype('int')
        X /= 255.0  # scale raw pixel bytes to [0, 1]
        return X, y

    X_train, y_train = make_data(['data_batch_1', 'data_batch_2', 'data_batch_3',
                                  'data_batch_4', 'data_batch_5'])
    X_test, y_test = make_data(['test_batch'])
    if contrast_normalize:
        norm_scale = 55.0  # Goodfellow
        X_train = global_contrast_normalize(X_train, scale=norm_scale)
        X_test = global_contrast_normalize(X_test, scale=norm_scale)
    if whiten:
        zca_cache = os.path.join(datadir, 'cifar-10-zca-cache.pkl')
        X_train, X_test = zca_whiten(X_train, X_test, cache=zca_cache)
    if onehot:
        y_train = make_one_hot(y_train, 10)
        y_test = make_one_hot(y_test, 10)
    if not flatten:
        # (N, 3072) -> (N, 3, 32, 32), then swap axes 1 and 3 (channels last)
        X_train = X_train.reshape((-1, 3, 32, 32)).swapaxes(1, 3)
        X_test = X_test.reshape((-1, 3, 32, 32)).swapaxes(1, 3)
    return X_train, y_train, X_test, y_test
def binary_f1_test():
    '''Smoke test: compare tg.cost.binary_f1 against sklearn's f1_score
    on 100 random binary labels (results printed side by side).'''
    truth_ph = tf.placeholder('int32', [None, 2])
    pred_ph = tf.placeholder('int32', [None, 2])
    f1_node = tg.cost.binary_f1(truth_ph, pred_ph)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        labels_a = np.random.randint(0, 2, 100)
        labels_b = np.random.randint(0, 2, 100)
        # reference value from sklearn
        print(f1_score(labels_a, labels_b))
        feed = {truth_ph: make_one_hot(labels_a, 2),
                pred_ph: make_one_hot(labels_b, 2)}
        # value from the tensorgraph cost node
        print(sess.run(f1_node, feed_dict=feed))
def char2num(self, charlen=None, onehot=False):
    '''Encode each paragraph as a fixed-length sequence of character ids.

    Sequences longer than ``charlen`` are truncated, shorter ones are
    zero-padded at the back. When ``charlen`` is None it is derived from
    the corpus as mean length + 2 * std. Characters missing from
    ``self.char_map`` are reported and skipped. With ``onehot`` the result
    is reshaped to (N, charlen, vocab_size).
    '''
    if self.char_map is None:
        print('..no char_map, building new character map')
        self.build_char_map()
    print(('..char_map size = {}'.format(len(self.char_map))))
    encoded = []
    for paragraph in self.data_iterator:
        ids = []
        for ch in str(paragraph):
            if ch in self.char_map:
                ids.append(self.char_map[ch])
            else:
                print(('{} not in character map'.format(ch)))
        encoded.append(ids)
    if charlen is None:
        lengths = [len(ids) for ids in encoded]
        mean_len = int(np.mean(lengths))
        std_len = int(np.std(lengths))
        max_len = mean_len + 2 * std_len
        print(
            ('..mean char len = {}, std char len = {}, max char len = {}'.
             format(mean_len, std_len, max_len)))
        charlen = max_len
    else:
        print(('..char len = {}'.format(charlen)))
    # truncate / zero-pad every sequence to exactly `charlen` entries
    padded = []
    for ids in encoded:
        if len(ids) > charlen:
            padded.append(ids[:charlen])
        else:
            tail = np.zeros(charlen - len(ids)).astype(int)
            padded.append(ids + list(tail))
    encoded = padded
    if onehot:
        flat = []
        for ids in encoded:
            flat += ids
        flat = make_one_hot(flat, len(self.char_map))
        encoded = flat.reshape((-1, charlen, len(self.char_map)))
    return np.asarray(encoded)
def tweets(word_len, sent_len, train_valid_ratio=(5, 1), csv_path='tweets_large.csv'):
    '''Load a tweets csv and encode it for character-level models.

    Args:
        word_len (int): number of character slots per word
        sent_len (int): number of word slots per sentence
        train_valid_ratio: kept for API compatibility; currently unused.
            (Default changed from the mutable [5, 1] to an immutable tuple.)
        csv_path (str): path to the csv file; must contain 'text' and
            'label' columns (generalizes the previously hard-coded path)

    Returns:
        (encode_X, encode_y, nclass) where encode_y is one-hot encoded
    '''
    df = pandas.read_csv(csv_path)
    field = 'text'
    label = 'label'
    tokenizer = RegexpTokenizer(r'\w+')
    # encode characters into numbers
    encoder = CharNumberEncoder(df[field].values, tokenizer=tokenizer,
                                word_len=word_len, sent_len=sent_len)
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()
    # encode categories into one hot array
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()
    encode_y = cat_encoder.make_cat_embed()
    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)
    return encode_X, encode_y, nclass
def make_char_embed(self, onehot=False, reverse_words=False, pad_mode='back', return_seqlen=False):
    '''DESCRIPTIONS:
        Build a (num_sents, sent_len, word_len) int16 array: each paragraph
        is tokenized into words, each word converted to character ids via
        ``self.spawn_word_vec``, and sentences are truncated or zero-padded
        to ``sent_len`` words. Words whose characters are all unmappable
        are skipped; unmappable characters are reported.

    PARAMS:
        onehot (bool): one-hot encode the result to
            (N, vocab, word_len, sent_len)
        reverse_words (bool): reverse the word order in a sentence
        pad_mode (back or front): pad zero at the back or front of sentence
        return_seqlen (bool): also return per-sentence word counts
            (capped at sent_len)
    '''
    if self.char_map is None:
        print('..no char_map, building new character map')
        self.build_char_map()
    print(('..total {} characters in char_map'.format(len(self.char_map))))
    sents = []
    seqlens = []
    for paragraph in self.data_iterator:
        tokens = self.tokenizer.tokenize(str(paragraph))
        word_vecs = []
        for tok in tokens:
            tok = tok.strip()
            char_ids = []
            for ch in tok:
                if ch in self.char_map:
                    char_ids.append(self.char_map[ch])
                else:
                    print(('{} not in character map'.format(ch)))
            if char_ids:
                word_vecs.append(self.spawn_word_vec(char_ids))
        word_vecs = np.asarray(word_vecs).astype('int16')
        num_words = len(word_vecs)
        seqlens.append(min(num_words, self.sent_len))
        if num_words > self.sent_len:
            # sentence too long: keep the first sent_len words
            kept = word_vecs[:self.sent_len].astype('int16')
            if reverse_words:
                kept = np.flipud(kept)
            sents.append(kept)
        else:
            if reverse_words:
                word_vecs = np.flipud(word_vecs)
            zero_pad = np.zeros((self.sent_len - num_words, self.word_len))
            if num_words > 0:
                if pad_mode == 'back':
                    stacked = np.vstack([np.asarray(word_vecs), zero_pad])
                    sents.append(stacked.astype('int16'))
                elif pad_mode == 'front':
                    stacked = np.vstack([zero_pad, np.asarray(word_vecs)])
                    sents.append(stacked.astype('int16'))
                else:
                    raise Exception(
                        'pad_mode ({}) is neither (front) nor (back)'.
                        format(pad_mode))
            else:
                # empty sentence: all zeros
                sents.append(zero_pad.astype('int16'))
    arr = np.asarray(sents).astype('int16')
    seqlens = np.asarray(seqlens).astype('int16')
    if onehot:
        b, sl, wl = arr.shape
        flat = arr.flatten().astype('int16')
        flat = make_one_hot(flat, len(self.char_map))
        arr = flat.reshape(b, sl, wl, len(self.char_map)).swapaxes(1, 3)
    if return_seqlen:
        return arr, seqlens
    return arr