Example #1
import numpy as np

# get_mnist_file, read_mnist_images, read_mnist_labels and make_one_hot are
# helpers from the surrounding module.
def Mnist(binary=True, flatten=False, onehot=True, datadir='.'):
    datadir += '/mnist/'

    url = 'http://yann.lecun.com/exdb/mnist'
    paths = []
    for fname in ['train-images-idx3-ubyte', 'train-labels-idx1-ubyte',
                  't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte']:
        path = get_mnist_file('{}/{}'.format(datadir,fname), origin='{}/{}.gz'.format(url,fname))
        paths.append(path)

    X_train = read_mnist_images(paths[0], dtype='float32')[:,:,:,np.newaxis]
    y_train = read_mnist_labels(paths[1])

    X_test = read_mnist_images(paths[2], dtype='float32')[:,:,:,np.newaxis]
    y_test = read_mnist_labels(paths[3])

    if flatten:
        X_train = X_train.reshape(X_train.shape[0], np.prod(X_train.shape[1:]))
        X_test = X_test.reshape(X_test.shape[0], np.prod(X_test.shape[1:]))

    if binary:
        # binarize the pixel intensities of both splits to {0, 1}
        X_train = (X_train >= 0.5).astype(int)
        X_test = (X_test >= 0.5).astype(int)

    if onehot:
        y_train = make_one_hot(y_train, 10)
        y_test = make_one_hot(y_test, 10)

    return X_train, y_train, X_test, y_test
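
Every example here calls make_one_hot, which is not shown. Below is a minimal numpy sketch consistent with how it is called (a 1-D array of integer class indices in, an (N, nclass) array out); the library's own helper may differ in dtype or details:

import numpy as np

def make_one_hot(labels, nclass):
    # hypothetical stand-in: one row per label, a single 1 at the label index
    labels = np.asarray(labels, dtype=int)
    onehot = np.zeros((len(labels), nclass), dtype='float32')
    onehot[np.arange(len(labels)), labels] = 1.0
    return onehot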
Example #2
import sys
import numpy as np

def Cifar100(flatten=False,
             onehot=True,
             datadir='./cifar100/',
             fine_label=True):
    '''
    Args:
        fine_label (bool): True for the 100 fine-grained classes,
            False for the 20 coarse superclasses.
    '''

    url = 'http://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    save_path = '{}/cifar-100-python.tar.gz'.format(datadir)
    datadir = get_file_from_url(save_path=save_path, origin=url, untar=True)
    print('untar dir', datadir)
    sav_dir = datadir + '/cifar-100-python'

    def make_data(batchnames):
        X = []
        y = []
        for data_batch in batchnames:
            fp = sav_dir + '/' + data_batch
            with open(fp, 'rb') as fin:
                # python2
                if sys.version_info.major == 2:
                    import cPickle
                    tbl = cPickle.load(fin)
                # python 3
                elif sys.version_info.major == 3:
                    import pickle
                    tbl = pickle.load(fin, encoding='bytes')
                else:
                    raise Exception('python version not 2 or 3')
                X.append(tbl[b'data'])

                if fine_label:
                    y.append(tbl[b'fine_labels'])
                    nclass = 100
                else:
                    y.append(tbl[b'coarse_labels'])
                    nclass = 20

        X = np.concatenate(X, axis=0).astype('f4')
        y = np.concatenate(y, axis=0).astype('int')
        X /= 255.0
        return X, y, nclass

    X_train, y_train, nclass = make_data(['train'])
    X_test, y_test, nclass = make_data(['test'])
    if onehot:
        y_train = make_one_hot(y_train, nclass)
        y_test = make_one_hot(y_test, nclass)

    if not flatten:
        X_train = X_train.reshape((-1, 3, 32, 32)).swapaxes(1, 3)
        X_test = X_test.reshape((-1, 3, 32, 32)).swapaxes(1, 3)

    return X_train, y_train, X_test, y_test
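
A hedged usage sketch, assuming the loader and its helpers are importable; with flatten=False the reshape and swapaxes(1, 3) above yield channel-last (N, 32, 32, 3) images:

# hypothetical usage
X_train, y_train, X_test, y_test = Cifar100(fine_label=True)
print(X_train.shape, y_train.shape)   # (50000, 32, 32, 3) (50000, 100)

X_train, y_train, X_test, y_test = Cifar100(fine_label=False)
print(y_train.shape)                  # (50000, 20)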
Example #3
import os
import sys
import numpy as np

def Cifar10(flatten=False, onehot=True, contrast_normalize=False, whiten=False, datadir='./cifar10/'):
    url = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    save_path = '{}/cifar-10-python.tar.gz'.format(datadir)
    datadir = get_file_from_url(save_path=save_path, origin=url, untar=True)
    sav_dir = datadir + '/cifar-10-batches-py'

    def make_data(batchnames):
        X = []
        y = []
        for data_batch in batchnames:
            fp = sav_dir + '/' + data_batch
            with open(fp, 'rb') as fin:
                # python2
                if sys.version_info.major == 2:
                    import cPickle
                    tbl = cPickle.load(fin)
                # python 3
                elif sys.version_info.major == 3:
                    import pickle
                    tbl = pickle.load(fin, encoding='bytes')
                else:
                    raise Exception('python version not 2 or 3')
                X.append(tbl[b'data'])
                y.append(tbl[b'labels'])
        X = np.concatenate(X, axis=0).astype('f4')
        y = np.concatenate(y, axis=0).astype('int')
        X /= 255.0
        return X, y

    X_train, y_train = make_data(['data_batch_1', 'data_batch_2', 'data_batch_3',
                                  'data_batch_4', 'data_batch_5'])
    X_test, y_test = make_data(['test_batch'])

    if contrast_normalize:
        norm_scale = 55.0  # Goodfellow
        X_train = global_contrast_normalize(X_train, scale=norm_scale)
        X_test = global_contrast_normalize(X_test, scale=norm_scale)

    if whiten:
        zca_cache = os.path.join(datadir, 'cifar-10-zca-cache.pkl')
        X_train, X_test = zca_whiten(X_train, X_test, cache=zca_cache)

    if onehot:
        y_train = make_one_hot(y_train, 10)
        y_test = make_one_hot(y_test, 10)

    if not flatten:
        X_train = X_train.reshape((-1, 3, 32, 32)).swapaxes(1, 3)
        X_test = X_test.reshape((-1, 3, 32, 32)).swapaxes(1, 3)

    return X_train, y_train, X_test, y_test
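
A hedged usage sketch; global_contrast_normalize and zca_whiten are assumed to come from the same module as the loader:

# hypothetical usage
X_train, y_train, X_test, y_test = Cifar10(contrast_normalize=True, whiten=True)
print(X_train.shape, y_train.shape)   # (50000, 32, 32, 3) (50000, 10)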
Example #4
import numpy as np
import tensorflow as tf
import tensorgraph as tg  # assumed source of tg.cost.binary_f1
from sklearn.metrics import f1_score

# TF1-style graph code (tf.placeholder / tf.Session).
def binary_f1_test():
    ph1 = tf.placeholder('int32', [None, 2])
    ph2 = tf.placeholder('int32', [None, 2])

    f1_sb = tg.cost.binary_f1(ph1, ph2)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        y1 = np.random.randint(0, 2, 100)
        y2 = np.random.randint(0, 2, 100)
        print(f1_score(y1, y2))

        y1_oh = make_one_hot(y1, 2)
        y2_oh = make_one_hot(y2, 2)
        print(sess.run(f1_sb, feed_dict={ph1:y1_oh, ph2:y2_oh}))
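
For reference, the F1 value being compared can be computed from raw counts; a minimal numpy sketch of binary F1 over 0/1 label vectors:

import numpy as np

def binary_f1_numpy(y_true, y_pred):
    # hypothetical reference implementation from true/false positive counts
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0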
Example #5
    def char2num(self, charlen=None, onehot=False):
        '''Convert the characters of each sentence to numbers, truncating or
           zero-padding every sentence to charlen characters.
           charlen (int): fixed character length per sentence; if None, it is
               set to the mean sentence length plus two standard deviations.
        '''
        if self.char_map is None:
            print('..no char_map, building new character map')
            self.build_char_map()
            print('..char_map size = {}'.format(len(self.char_map)))
        sents = []
        for paragraph in self.data_iterator:
            sent_vec = []
            for c in str(paragraph):
                if c not in self.char_map:
                    print('{} not in character map'.format(c))
                else:
                    sent_vec.append(self.char_map[c])

            sents.append(sent_vec)

        if charlen is None:
            sents_lens = [len(sent) for sent in sents]
            mean_len = int(np.mean(sents_lens))
            std_len = int(np.std(sents_lens))
            max_len = mean_len + 2 * std_len
            print('..mean char len = {}, std char len = {}, max char len = {}'
                  .format(mean_len, std_len, max_len))
            charlen = max_len
        else:
            print('..char len = {}'.format(charlen))

        new_sents = []
        for sent_vec in sents:
            if len(sent_vec) > charlen:
                new_sents.append(sent_vec[:charlen])
            else:
                zero_pad = np.zeros(charlen - len(sent_vec)).astype(int)
                sent_vec = sent_vec + list(zero_pad)
                new_sents.append(sent_vec)

        sents = new_sents
        if onehot:
            onehot_sents = []
            for sent in sents:
                onehot_sents += sent

            onehot_sents = make_one_hot(onehot_sents, len(self.char_map))
            sents = onehot_sents.reshape((-1, charlen, len(self.char_map)))

        return np.asarray(sents)
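
The pad-or-truncate step above is self-contained; a small standalone demonstration of the same logic with hypothetical values:

charlen = 5
for sent_vec in ([3, 1, 4, 1, 5, 9, 2], [2, 7]):
    if len(sent_vec) > charlen:
        out = sent_vec[:charlen]                          # truncate long sentences
    else:
        out = sent_vec + [0] * (charlen - len(sent_vec))  # zero-pad short ones
    print(out)
# [3, 1, 4, 1, 5]
# [2, 7, 0, 0, 0]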
Example #6
import numpy as np
import pandas
from nltk.tokenize import RegexpTokenizer

# CharNumberEncoder, CatNumberEncoder and make_one_hot are helpers from the
# surrounding module; train_valid_ratio is accepted but unused in this snippet.
def tweets(word_len, sent_len, train_valid_ratio=[5,1]):
    df = pandas.read_csv('tweets_large.csv')
    field = 'text'
    label = 'label'
    tokenizer = RegexpTokenizer(r'\w+')

    # encode characters into numbers
    encoder = CharNumberEncoder(df[field].values, tokenizer=tokenizer,
                                word_len=word_len, sent_len=sent_len)
    encoder.build_char_map()
    encode_X = encoder.make_char_embed()

    # encode categories into one hot array
    cat_encoder = CatNumberEncoder(df[label])
    cat_encoder.build_cat_map()

    encode_y = cat_encoder.make_cat_embed()
    nclass = len(np.unique(encode_y))
    encode_y = make_one_hot(encode_y, nclass)

    return encode_X, encode_y, nclass
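
The category-to-integer step can be reproduced with plain numpy when CatNumberEncoder is unavailable; a hypothetical equivalent:

import numpy as np

labels = np.array(['happy', 'sad', 'happy', 'angry'])   # hypothetical labels
classes, encode_y = np.unique(labels, return_inverse=True)
print(classes)    # ['angry' 'happy' 'sad']
print(encode_y)   # [1 2 1 0]
nclass = len(classes)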
Example #7
    def make_char_embed(self,
                        onehot=False,
                        reverse_words=False,
                        pad_mode='back',
                        return_seqlen=False):
        '''DESCRIPTIONS:
               Build array vectors of words and sentences, automatically skipping
               characters that are not in the character map. First tokenize the
               sentence into words, then convert each word into numbers, then
               stack them together.
           PARAMS:
               onehot (bool): return one-hot character encodings instead of indices
               reverse_words (bool): reverse the word order in a sentence
               pad_mode (back or front): pad zeros at the back or front of a sentence
               return_seqlen (bool): also return the sequence length of each sentence
        '''
        if self.char_map is None:
            print('..no char_map, building new character map')
            self.build_char_map()

        print('..total {} characters in char_map'.format(len(self.char_map)))

        sents = []
        seqlens = []
        for paragraph in self.data_iterator:
            word_toks = self.tokenizer.tokenize(str(paragraph))
            word_vecs = []
            for word in word_toks:
                word = word.strip()
                word_vec = []
                for c in word:
                    if c not in self.char_map:
                        print('{} not in character map'.format(c))
                    else:
                        word_vec.append(self.char_map[c])

                if len(word_vec) > 0:
                    word_vecs.append(self.spawn_word_vec(word_vec))
            word_vecs = np.asarray(word_vecs).astype('int16')
            seqlen = min(len(word_vecs), self.sent_len)
            seqlens.append(seqlen)

            if len(word_vecs) > self.sent_len:
                words = word_vecs[:self.sent_len].astype('int16')
                if reverse_words:
                    words = np.flipud(words)
                sents.append(words)

            else:
                if reverse_words:
                    word_vecs = np.flipud(word_vecs)

                zero_pad = np.zeros((self.sent_len - len(word_vecs), self.word_len))
                if len(word_vecs) > 0:
                    if pad_mode == 'back':
                        sents.append(np.vstack([np.asarray(word_vecs),
                                                zero_pad]).astype('int16'))
                    elif pad_mode == 'front':
                        sents.append(np.vstack([zero_pad,
                                                np.asarray(word_vecs)]).astype('int16'))
                    else:
                        raise Exception('pad_mode ({}) is neither (front) nor (back)'
                                        .format(pad_mode))
                else:
                    sents.append(zero_pad.astype('int16'))

        arr = np.asarray(sents).astype('int16')
        seqlens = np.asarray(seqlens).astype('int16')
        if onehot:
            b, sl, wl = arr.shape
            arr = arr.flatten().astype('int16')
            arr = make_one_hot(arr, len(self.char_map))
            arr = arr.reshape(b, sl, wl, len(self.char_map))
            arr = arr.swapaxes(1, 3)

        if return_seqlen:
            return arr, seqlens
        else:
            return arr
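
The onehot branch turns (b, sent_len, word_len) index arrays into (b, nchar, word_len, sent_len) after swapaxes(1, 3); a tiny shape check with hypothetical sizes:

import numpy as np

b, sl, wl, nchar = 2, 5, 8, 26          # batch, sent_len, word_len, alphabet size
arr = np.random.randint(0, nchar, (b, sl, wl))
flat = np.zeros((arr.size, nchar))      # stand-in for make_one_hot
flat[np.arange(arr.size), arr.flatten()] = 1
onehot = flat.reshape(b, sl, wl, nchar).swapaxes(1, 3)
print(onehot.shape)                     # (2, 26, 8, 5)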