Example #1
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (X_train, y_train), (X_test, y_test) = reuters.load_data()
        (X_train, y_train), (X_test, y_test) = reuters.load_data(maxlen=10)
Example #2
def test_reuters_load_does_not_affect_global_rng(fake_downloaded_reuters_path):
    np.random.seed(1337)
    before = np.random.randint(0, 100, size=10)

    np.random.seed(1337)
    reuters.load_data(path=fake_downloaded_reuters_path, seed=9876)
    after = np.random.randint(0, 100, size=10)

    assert np.array_equal(before, after)
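A minimal sketch of the pattern that makes such a test pass: the loader shuffles with its own RandomState instead of reseeding the global NumPy RNG (illustrative only, not the actual keras.datasets.reuters source):

import numpy as np

def shuffle_with_local_seed(data, labels, seed=113):
    # a dedicated RandomState leaves np.random's global state untouched
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(data))
    return data[indices], labels[indices]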
Example #3
def test_reuters():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
    def load_data(self, sample_size=None):
        print('Load Data...')
        (X_train, y_train), (X_test, y_test) = reuters.load_data(
            start_char=None, index_from=None, nb_words=self.word_vocab_size)
        if sample_size:
            sample_indices_train = random.sample(range(len(X_train)),
                                                 sample_size)
            X_train = itemgetter(*sample_indices_train)(X_train)
            y_train = itemgetter(*sample_indices_train)(y_train)

            sample_indices_test = random.sample(range(len(X_test)),sample_size)
            X_test = itemgetter(*sample_indices_test)(X_test)
            y_test = itemgetter(*sample_indices_test)(y_test)
        index_word = dict((v, k) for k, v in reuters.get_word_index().items())
        X_train_char = [[index_word[idx] for idx in x] for x in X_train]
        X_test_char = [[index_word[idx] for idx in x] for x in X_test]
        X_test_char, X_train_char, vocab_char_size = \
            self.tokenize(X_test_char, X_train_char)
        X_test, X_train, X_test_char, X_train_char = \
            self.pad(X_test_char, X_train_char,X_test, X_train)
        nb_classes = np.max(y_train+y_test)+1
        Y_train = np_utils.to_categorical(y_train, nb_classes)
        Y_test = np_utils.to_categorical(y_test, nb_classes)
        return X_train, X_train_char, Y_train, X_test, X_test_char, Y_test, \
               vocab_char_size, nb_classes
def run_keras_example():
	max_words = 1000
	batch_size = 32
	nb_epoch = 5

	print('Loading data...')
	(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
	print(len(X_train), 'train sequences')
	print(len(X_test), 'test sequences')

	nb_classes = np.max(y_train)+1
	print(nb_classes, 'classes')

	print('Vectorizing sequence data...')
	tokenizer = Tokenizer(nb_words=max_words)
	X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
	X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)

	print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
	Y_train = np_utils.to_categorical(y_train, nb_classes)
	Y_test = np_utils.to_categorical(y_test, nb_classes)
	print('Y_train shape:', Y_train.shape)
	print('Y_test shape:', Y_test.shape)

	print('Building model...')
	model = Sequential()
	model.add(Dense(512, input_shape=(max_words,)))
	model.add(Activation('tanh'))
	model.add(Dropout(0.5))
	model.add(Dense(nb_classes))
	model.add(Activation('softmax'))

	model.compile(loss='categorical_crossentropy', optimizer='adam')

	history = model.fit(X_train, Y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=1, show_accuracy=True, validation_split=0.1)
	score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1, show_accuracy=True)
	print('Test score:', score[0])
	print('Test accuracy:', score[1])
Example #6
from keras.datasets import cifar10
((trainX4, trainY4), (testX4, testY4)) = cifar10.load_data()

print(trainX4.shape)
print(testX4.shape)

from keras.datasets import cifar100
((trainX5, trainY5), (testX5, testY5)) = cifar100.load_data()

print(trainX5.shape)
print(testX5.shape)

print('')

from keras.datasets import reuters
((trainX6, trainY6), (testX6, testY6)) = reuters.load_data()

print(trainX6.shape)
print(testX6.shape)

from keras.datasets import boston_housing
((trainX7, trainY7), (testX7, testY7)) = boston_housing.load_data()

print(trainX7.shape)
print(testX7.shape)

print('')

# we use: https://medium.com/startup-grind/fueling-the-ai-gold-rush-7ae438505bc2
Example #7
import keras
from keras.layers import SimpleRNN, Embedding, Dense
from keras.models import Sequential
import numpy as np
from keras.datasets import reuters
from keras.preprocessing import sequence
import sys
from keras.utils import np_utils

max_features = 10000
maxlen = 500
batch_size = 64

print('Loading Data...')
(input_train, y_train), (input_test,
                         y_test) = reuters.load_data(num_words=max_features)

input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)

y_train = np_utils.to_categorical(y_train, 46)
y_test = np_utils.to_categorical(y_test, 46)

model = Sequential()
model.add(Embedding(max_features, 64))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64))
model.add(Dense(46, activation='softmax'))
from sklearn.metrics import accuracy_score
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier

# parameters for data load
num_words = 30000
maxlen = 50
test_split = 0.3

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=num_words,
                                                         maxlen=maxlen,
                                                         test_split=test_split)

# pad the sequences with zeros
# padding parameter is set to 'post' => 0's are appended to end of sequences
X_train = pad_sequences(X_train, padding='post')
X_test = pad_sequences(X_test, padding='post')

X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1], 1))

y_data = np.concatenate((y_train, y_test))
y_data = to_categorical(y_data)
y_train = y_data[:1395]
y_test = y_data[1395:]
from keras.datasets import reuters
from keras import models
from keras import layers
import numpy as np
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt


def vectorize_sequence(data, dimension=10000):
    results = np.zeros((len(data), dimension))
    for ind, cols in enumerate(data):
        results[ind, cols] = 1
    return results


(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000)
y_train = np.array(y_train)
y_test = np.array(y_test)
x_train = vectorize_sequence(x_train)
x_test = vectorize_sequence(x_test)
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000, )))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
x_val = x_train[:1000]
partial_x = x_train[1000:]
y_val = y_train[:1000]
Example #10
import numpy as np
from keras.datasets import reuters
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from utils import init_keras, vectorize_sequences

init_keras()

(x_train, t_train), (x_test, t_test) = reuters.load_data(num_words=10000)

x_train = vectorize_sequences(x_train)
x_test = vectorize_sequences(x_test)
t_train = to_categorical(t_train)
t_test = to_categorical(t_test)

x_train, x_cv, t_train, t_cv = train_test_split(
    x_train, t_train, test_size=0.33)

network = models.Sequential()
network.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
network.add(layers.Dense(64, activation='relu'))
network.add(layers.Dense(46, activation='softmax'))

network.compile(optimizer=optimizers.RMSprop(lr=0.001),
                loss=losses.categorical_crossentropy, metrics=[metrics.categorical_accuracy])
network.fit(x=x_train, y=t_train, epochs=15,
Example #11
from keras import models
from keras import layers

# ### Binary representation

# In[104]:

import keras
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# In[139]:

# load reuters dataset
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                         test_split=0.2)

no_features = max(y_train) + 1

max_words = 20000

# one-hot encoding
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

y_train = keras.utils.to_categorical(y_train, no_features)
y_test = keras.utils.to_categorical(y_test, no_features)

print(x_train[0])
print(len(x_train[0]))
Example #12
def test_reuters():
    (X_train, y_train), (X_test, y_test) = reuters.load_data()
Example #13
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

import cnn_utils


vocab_size = 20000
batch_size = 128
embedding_size = 500
max_len = 100
n_filters = 300
n_gram = 6

print("Loading data...")
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=vocab_size, test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train)+1
print(nb_classes, 'classes')

X_train = cnn_utils.prepare_sequence(X_train, length=max_len)
X_train = np.array(X_train)
X_test = cnn_utils.prepare_sequence(X_test, length=max_len)
X_test = np.array(X_test)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print("Convert class vector to binary class matrix (for use with categorical_crossentropy)")
Y_train = np_utils.to_categorical(y_train, nb_classes)
Example #14
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

MAX_WORDS = 1000

# network and training
N_HIDDEN = 512
NB_EPOCH = 10
BATCH_SIZE = 32
VERBOSE = 1
VALIDATION_SPLIT=0.2


print('Loading data...')
(X_train, y_train), (X_test, y_test) = \
  reuters.load_data(nb_words=MAX_WORDS, test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train)+1
print(nb_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(nb_words=MAX_WORDS)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Convert class vector to binary class matrix (for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(y_train, nb_classes)
Example #15
def test_reuters(self):
    print('reuters')
    (X_train, y_train), (X_test, y_test) = reuters.load_data()
Example #16
#!/usr/bin/env python3
#
# Reuters newswire topic classification training example
#
from keras.datasets import reuters
from lib.featurizer import Featurizer
from lib.categorizer import Categorizer
from lib.classifier import Classifier

# load the reuters dataset
(x, y), _ = reuters.load_data(test_split=0, index_from=2)
word_index = reuters.get_word_index()

def dict_inv(d):
    '''Invert a dictionary'''
    return {v: k for k, v in d.items()}

def x2text(x, word_index_inv):
    '''Return text from an x vector and inverted word index'''
    words = [word_index_inv.get(i) for i in x]
    words = [w for w in words if w]
    return ' '.join(words)

# we use our own featurizer, so first reconstruct input text
word_index_inv = dict_inv(word_index)
texts = [{'text': x2text(a, word_index_inv)} for a in x]
del x, word_index, word_index_inv

# extract features, save results
f = Featurizer()
x = f.fit_transform(texts)
Example #17
def main():
    # model parameters:
    maxlen = 400
    max_words = 10000
    batch_size = 32
    epochs = 20
    embedding_dims = 50
    cnn_filters = 100
    cnn_kernel_size = 5
    dense_hidden_dims = 200

    # 1. Loading started
    (x_train, y_train), (x_test,
                         y_test) = reuters.load_data(num_words=max_words,
                                                     test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")

    num_classes = max(y_train) + 1
    # 2. pad_sequences
    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)

    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # 3. Build CNN model...
    model = Sequential()
    model.add(Embedding(max_words, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.2))
    model.add(
        Conv1D(cnn_filters,
               cnn_kernel_size,
               padding='valid',
               activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(dense_hidden_dims, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))  # softmax for single-label, multi-class output
    model.summary()

    # 4. compile network
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    # 5.  train model
    history = model.fit(x_train,
                        y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1)

    # 6. evaluate model
    loss_and_metrics = model.evaluate(x_test, y_test, batch_size, verbose=1)
    print('Test loss:{}\nTest accuracy:{}'.format(loss_and_metrics[0],
                                                  loss_and_metrics[1]))

    # Create a graph of accuracy and loss over time
    history_dict = history.history
    history_dict.keys()

    acc = history_dict['categorical_accuracy']
    val_acc = history_dict['val_categorical_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']

    epochs = range(1, len(acc) + 1)
    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
    theano.config.exception_verbosity = 'high'
    theano.config.optimizer = 'None'
    theano.config.optimizer = 'fast_run' # also 'fast_run' or 'None' for debugging
    theano.config.linker = 'py'
    theano.config.floatX = 'float32'

    print('initialising...')

    V = 1001
    E = 12
    total_trainset = 10000
    total_iterations = 9000
    train_x_entropy = 0

    (X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.pkl", \
                nb_words=None, skip_top=0, maxlen=None, test_split=0.1, seed=10086)

    word_map_tmp = reuters.get_word_index(path="reuters_word_index.pkl")
    word_dict = dict((v, k) for k, v in word_map_tmp.items())
    word_dict[0] = "<UNK>"

    def real_words(l, eos):
        sent = []
        for word in l:
            if word == eos :
                sent.append("<EOS>")
            elif word > eos:
                sent.append(word_dict[0])
            else:
                sent.append(word_dict[word])
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models, layers

# Set the random seed
np.random.seed(0)

# Set the desired number of features
number_of_features = 5000

# Load the data and target vectors from the Reuters newswire dataset
(data_train, target_vector_train), (data_test,
                                    target_vector_test) = reuters.load_data(
                                        num_words=number_of_features)

# Convert the newswire data into a one-hot-encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# One-hot encode the target vectors
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)

# Create the neural network object
network = models.Sequential()

# Add a fully connected layer with a ReLU activation function
network.add(
# convert index sequences into binary bag-of-words vectors
def vectorize_sequences(sequences, dimension=LEN_DICT):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


# prints the original newswire
def print_newswire(newswire):
    word_index = reuters.get_word_index()
    word_lookup = dict([(value, key) for (key, value) in word_index.items()])
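    # indices 0, 1 and 2 are reserved for padding, start-of-sequence and unknown words, hence the offset of 3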
    print(' '.join([word_lookup.get(i - 3, '?') for i in newswire]))


# extract training and test data
# num_words=LEN_DICT extracts only the LEN_DICT most frequently used words; the rest is discarded
(train_data,
 train_labels), (test_data,
                 test_labels) = reuters.load_data(num_words=LEN_DICT)

# prepare input data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# prepare target data
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

x_val = x_train[:VAL_SET_SIZE]
y_val = one_hot_train_labels[:VAL_SET_SIZE]
partial_x_train = x_train[VAL_SET_SIZE:]
partial_y_train = one_hot_train_labels[VAL_SET_SIZE:]

model = models.Sequential()
Example #21
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
import numpy as np
from keras import models
from keras import layers




(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)



def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    
    return results
def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results

one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

x_train = vectorize_sequences(train_data)
Example #22
def vectorize_sequences(sequences, dimension=10000):
    result = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        result[i, sequence] = 1
    return result


def to_one_hot(labels, dimensions=46):  # the same as to_categorical!
    result = np.zeros((len(labels), dimensions))
    for i, label in enumerate(labels):
        result[i, label] = 1
    return result


(train_data, train_labels), (test_data, test_labels) = reuters.load_data(
    num_words=10000)  # we only want the 10,000 most frequent words

x_test = vectorize_sequences(test_data)
y_test = to_categorical(test_labels)

x_train = vectorize_sequences(train_data)
y_train = to_categorical(train_labels)

from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(10000, )))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
Example #23
# classifying newswires, a multiclass classification examples
# for text classification
from keras.datasets import reuters
from utils import decoding_newswires, vectorize_sequences, to_one_hot, create_validation_set
from keras.utils.np_utils import to_categorical
from keras import models, layers
from keras import optimizers, losses, metrics

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

# train dataset's length
print(len(train_data))

# test_dataset's length
print(len(test_data))

# decoding a sequence
print(decoding_newswires(reuters, train_data[0]))

# the label is an integer between 0 and 45.
# preparing the data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# set the labels to one_hot encoding
one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

# there is a built-in way to do this in keras
one_hot_train_labels_ = to_categorical(train_labels)
one_hot_test_labels_ = to_categorical(test_labels)
Example #24
from keras.datasets import reuters
(train_data, train_labels), (test_data,
                             test_labels) = reuters.load_data(num_words=10000)

word_index = reuters.get_word_index()

reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])


def decodeData(data, index):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in data[index]])


#print(decodeData(train_data, 0))

import numpy as np


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)


def to_one_hot(labels, dimension=46):
Example #25
gberg_sents = gutenberg.sents()

len(gutenberg.words())

# In[45]:

word2index = reuters.get_word_index()
index2word = dict([(i, w) for (w, i) in word2index.items()])

# In[57]:

print(index2word[124])

# In[51]:

(x_train, y_train), (x_test, y_test) = reuters.load_data()

# In[64]:

for i, j in x_train:
    print(i)
    print(type(i))
    for j in index2word:
        if int(i) + 3 == j:
            print(j)

# In[27]:

model = Word2Vec(sentences=gberg_sents,
                 size=64,
                 sg=1,
model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train,
          y_train,
          validation_data=(X_test, y_test),
          epochs=3,
          batch_size=64)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

(X2_train, y2_train), (X2_test,
                       y2_test) = reuters.load_data(num_words=top_words)
X2_train = sequence.pad_sequences(X2_train, maxlen=max_review_length)
X2_test = sequence.pad_sequences(X2_test, maxlen=max_review_length)

embedding_vecor_length = 32
model = Sequential()
model.add(
    Embedding(top_words,
              embedding_vecor_length,
              input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()
Example #27
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Merge
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences

vocab_size = 20000
batch_size = 128
embedding_size = 100
maxlen = 75
nb_feature_maps = 100

print("Loading data...")
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=vocab_size,
                                                         test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print(
    "Convert class vector to binary class matrix (for use with categorical_crossentropy)"
)
Y_train = np_utils.to_categorical(y_train, nb_classes)
   Each label is converted into a 46-element output vector using the
   keras.utils.to_categorical function; that is, if a newswire belongs to
   category 10, that position in the vector is set to 1 and all others to 0.
"""

import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer

#(x_train, y_train), (x_test, y_test) =\
# reuters.load_data(num_words=None, test_split=0.2)
max_words = 10000
(train_data, train_labels), (test_data, test_labels) =\
reuters.load_data(num_words=max_words)
 
 
word_index = reuters.get_word_index()

print('# of Training Samples: {}'.format(len(train_data)))
print('# of Test Samples: {}'.format(len(test_data)))
      
word_index = reuters.get_word_index()

    
num_classes = max(train_labels) + 1

print('# of Classes: {}'.format(num_classes))
      
index_to_word = {}
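For reference, the one-hot label encoding described in the docstring fragment above can be sketched like this (a minimal illustration, assuming 46 classes; not part of the original example):

import numpy as np
from keras.utils import to_categorical

label = 10
vector = to_categorical(label, num_classes=46)
print(vector.shape)       # (46,)
print(np.argmax(vector))  # 10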
Example #29
import numpy as np
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.optimizers import SGD

word_index = reuters.get_word_index(path="reuters_word_index.json")
idx_to_word = dict(zip(word_index.values(), word_index.keys()))

max_words = 20706
batch_size = 32
epochs = 5

print('Loading data...')
#(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)
(x_train, y_train), (x_test, y_test) = reuters.load_data(test_split=0.2)

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

#print(x_train[1], y_train[1])
#print("words")
#words = [idx_to_word[i] for i in x_train[1]]
#print(" ".join(words))

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
from keras.models import Sequential
from keras.layers import Dense,Flatten,Dropout
from keras.layers.embeddings import Embedding
from sklearn.metrics import classification_report,confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
from keras import preprocessing
from keras.utils import np_utils
from nltk.tokenize import word_tokenize

# Number of words to consider as features
max_words = 7000
# Cut texts after this number of words 
maxlen = 200

#Loading the data from built in Reuters dataset in keras
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=max_words,test_split=0.3, maxlen=maxlen)
print("Train_data ", X_train.shape)
print("Train_labels ", y_train.shape)
print("Test_data ", X_test.shape)
print("Test_labels ", y_test.shape)

#This dataset also makes available the word index used for encoding the sequences:
#Note there are 30979 words (will be used for our embedding)
word_index = reuters.get_word_index(path="reuters_word_index.json")

#********************************************************************************************************
#Building reverse data
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

#Building text for first x articles from Training data
x=len(X_train)
Example #31
# GRU
gru_output_size = 64
#LSTM
lstm_output_size = 70


def newacti(x, alpha=m.exp(-1)):
    return K.elu(x, alpha)


#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words)
(X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=top_words,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)
# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

#one hot encoding
i = 0
out_train = np.zeros((len(y_train), max(y_train) + 1))
for x in y_train:
    out_train[i][x] = 1
    i = i + 1
Example #32
np.random.seed(1337)  # for reproducibility

from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

max_words = 1000  # vocabulary size
batch_size = 32  # mini_batch_size
nb_epoch = 5  # number of training epochs

print('Loading data...')
(X_train,
 y_train), (X_test,
            y_test) = reuters.load_data(nb_words=max_words,
                                        test_split=0.2)  # load the Reuters corpus
print('train sequences', X_train.shape)
print(len(X_test), 'test sequences')
# number of classes -- as I recall the original Reuters set had 10; this must be the larger corpus
nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

print('Vectorizing sequence data...')  # tokenize
tokenizer = Tokenizer(
    nb_words=max_words
)  # vectorize, keeping only the top-1000 words by document frequency
# A fun detail: X_train initially stores word indices, and those indices are
# assigned by word frequency (presumably, since rarer words are simply dropped),
# so this step is quite efficient. The conversion mode is still binary;
# tf-idf is not used by default.
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
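A tiny illustration of the binary conversion described in the comment above (written against the newer Tokenizer API, which takes num_words instead of nb_words; the values are made up):

from keras.preprocessing.text import Tokenizer

demo = Tokenizer(num_words=10)
# word indices 3 and 7 occur in the sequence, so columns 3 and 7 are set to 1;
# repeated occurrences still yield a 1 because mode='binary'
print(demo.sequences_to_matrix([[3, 7, 3]], mode='binary')[0])
# [0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]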
from keras.datasets import cifar10
from math import pi, floor
from random import random
from codeepneat import codeepneat, config, population, chromosome, genome, visualize
import pickle
import numpy as np
import keras
from keras.utils import plot_model
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras import preprocessing

#from .config import Config
max_words = 10000
(x_train_all, y_train_all), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2)
num_classes = np.max(y_train_all) + 1
x_train = preprocessing.sequence.pad_sequences(x_train_all, maxlen=30)[:8970]
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=30)[:2208]
y_train = keras.utils.to_categorical(y_train_all, num_classes)[:8970]
y_test = keras.utils.to_categorical(y_test, num_classes)[:2208]

print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')



def visualize_model(model_file):
    model = keras.models.load_model(model_file)
    layer_list = model.layers
    #module_1 = model.get_layer("model_869")
os.environ['KERAS_BACKEND'] = 'tensorflow'

from keras.datasets import reuters
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
import numpy as np
from reuters_model import createHierarchicalAttentionModel

batch_size = 16
max_features = 20000
maxlen = 80  # maximum sequence length; longer sequences are truncated, shorter ones are zero-padded at the end

print('loading data...')
(X_train, y_train), (X_test,
                     y_test) = reuters.load_data(path="reuters.npz",
                                                 num_words=max_features,
                                                 test_split=0.2)

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(len(X_train), type(X_train), X_train.shape)
print(len(X_test), type(X_test), X_test.shape)
print('wds')
print(y_test[0:300])
#8982 <class 'numpy.ndarray'> (8982,)
#2246 <class 'numpy.ndarray'> (2246,)
print(len(X_train[0]), X_train[0], '\n')
print(len(X_train[1]), X_train[1], '\n')

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils

import numpy
import tensorflow as tf
import matplotlib.pyplot as plt

# set the random seed
seed = 0
numpy.random.seed(seed)
tf.set_random_seed(seed)

# split the loaded data into training and test sets
(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=1000,
                                                         test_split=0.2)

# inspect the data
category = numpy.max(Y_train) + 1
print(category, 'categories')
print(len(X_train), 'training news articles')
print(len(X_test), 'test news articles')
print(X_train[0])

# preprocess the data
x_train = sequence.pad_sequences(X_train, maxlen=100)
x_test = sequence.pad_sequences(X_test, maxlen=100)
y_train = np_utils.to_categorical(Y_train)
y_test = np_utils.to_categorical(Y_test)

# model configuration
Example #36
from keras.utils import to_categorical

seed = 1337
np.random.seed(seed)

# IMPORTANT! => In case h5py has been installed, please restart the kernel by clicking on "Kernel"->"Restart and Clear Output" and wait until all output disappears. Then your changes are being picked up
#
# As you can see, we use Keras' Sequential model with only two types of layers: Dense and Dropout. We also specify a random seed to make our results reproducible. Next, we load the Reuters data set:

# In[5]:

from keras.datasets import reuters

max_words = 1000
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2,
                                                         seed=seed)
num_classes = np.max(y_train) + 1  # 46 topics

# Note that we restrict the vocabulary to the 1,000 most frequent words by specifying the *num_words* keyword. Also, 20% of the data will be test data and we ensure reproducibility by setting our random seed.
#
# Our training features are still simply sequences of indexes and we need to further preprocess them, so that we can plug them into a *Dense* layer. For this we use a *Tokenizer* from Keras' text preprocessing module. This tokenizer will take an index sequence and map it to a vector of length *max_words=1000*. Each of the 1000 vector positions corresponds to one of the words in our newswire corpus. The output of the tokenizer has a 1 at the i-th position of the vector, if the word corresponding to i is in the description of the newswire, and 0 otherwise. Even if this word appears multiple times, we still just put a 1 into our vector, i.e. our tokenizer is binary. We use this tokenizer to transform both train and test features:

# In[6]:

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
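The Dense/Dropout model the comments above describe is not part of this excerpt; a minimal sketch of what it could look like, assuming 46 topics, the 1000-dimensional binary inputs produced above, and otherwise arbitrary layer sizes and optimizer (the labels would still need to_categorical before fitting):

from keras.models import Sequential
from keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(max_words,)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])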
Example #37
def load_data(max_words, test_split_rate):
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_words, test_split=test_split_rate)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    return (x_train, y_train), (x_test, y_test)
def load_data_set(type, max_len, vocab_size, batch_size):
    """
        Loads the dataset: the Keras IMDB dataset for binary classification,
        or the Keras Reuters dataset for multiclass classification.

        Args:
            type      : {bool} 0 for binary classification (returns the IMDB set), 1 for multiclass classification (returns the Reuters set)
            max_len   : {int} timesteps used for padding
            vocab_size: {int} size of the vocabulary
            batch_size: {int} batch size
        Returns:
            train_loader: {torch.DataLoader} train dataloader
            x_test_pad  : padded, tokenized test data for cross-validation
            y_test      : y_test
            word_to_id  : {dict} words mapped to indices
        """
   
    INDEX_FROM = 3  # word index offset
    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top `vocab_size` words
 
        train_set,test_set = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = imdb.get_word_index()
        word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
 
        id_to_word = {value:key for key,value in word_to_id.items()}
        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
 
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train+n_valid]
        y_test = y[n_train:n_train+n_valid]
 
 
        #embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt",word_to_id,50)
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,x_test_pad,y_test,word_to_id
       
    else:
        from keras.datasets import reuters
 
        train_set,test_set = reuters.load_data(path="reuters.npz",num_words=vocab_size,skip_top=0,index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k:(v+3) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value:key for key,value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,train_set,test_set,x_test_pad,word_to_id
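A hypothetical usage sketch for the loader above (the argument values are illustrative; type=1 selects the Reuters branch, whose return signature is shown on the last line of the function):

train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=100, vocab_size=10000, batch_size=32)

for x_batch, y_batch in train_loader:
    print(x_batch.shape, y_batch.shape)  # torch.Size([32, 100]) torch.Size([32])
    break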
Example #39
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
'''
    Train and evaluate a simple MLP on the Reuters newswire topic classification task.
    GPU run command:
        THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python examples/reuters_mlp.py
    CPU run command:
        python examples/reuters_mlp.py
'''

max_words = 1000
batch_size = 32
nb_epoch = 5

print("Loading data...")
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words,
                                                         test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

print("Vectorizing sequence data...")
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode="binary")
X_test = tokenizer.sequences_to_matrix(X_test, mode="binary")
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print(
    "Convert class vector to binary class matrix (for use with categorical_crossentropy)"
Example #40
#Time:2019.04.20

from keras.datasets import reuters
from keras import models
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
import copy
#load datasets
#(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
paths = r'C:\Users\lianliang\Desktop\keras_deeplearing\reuters.npz'
(train_data, train_labels), (test_data,
                             test_labels) = reuters.load_data(path=paths,
                                                              num_words=10000)


#encoder datas and lebels
def vector_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


x_train = vector_sequences(train_data)
x_test = vector_sequences(test_data)


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
Example #41
from keras.datasets import reuters

n_samp = 5000
n_subset = 1000
n_epochs = 10

(train_data, train_labels), (test_data,
                             test_labels) = reuters.load_data(num_words=n_samp)

import numpy as np


def vectorize_sequences(sequences, dimension=n_samp):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# def to_one_hot(labels, dimension=46):
#     results = np.zeros((len(labels), dimension))
#     for i, label in enumerate(labels):
#         results[i, label] = 1.
#     return results
#
# one_hot_train_labels = to_one_hot(train_labels)
# one_hot_test_labels = to_one_hot(test_labels)
Example #42
def vectorize_seq(sequences, dimension=NUM_WORDS):
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        results[i, seq] = 1
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results


(train_data,
 train_labels), (test_data,
                 test_labels) = reuters.load_data(num_words=NUM_WORDS)

# word_index = reuters.get_word_index()
# reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])

x_train = vectorize_seq(train_data)
x_test = vectorize_seq(test_data)

one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(10000, )))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
def vectorize_sequence(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results


(train_data, train_labels), (test_data,
                             test_labels) = reuters.load_data(num_words=10000)

print(f"Length of Train data: {len(train_data)}")
print(f"Length of Test data: {len(test_data)}")

print(f"A look at Train data: \n train_data[1] = {train_data[1]}\n")

# indexing

word_index = reuters.get_word_index()
reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])

decoded_newswire = lambda x: ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[x]])
Example #44
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    '%(asctime)s %(name)s %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(LOG_LEVEL)

# Avoiding this issue:
# https://stackoverflow.com/questions/55890813
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

# Loading dataset
logger.debug("Loading dataset. | sf_split=%s", TEST_SPLIT)
train_set, test_set = reuters.load_data(num_words=None, test_split=TEST_SPLIT)
x_train, y_train = train_set
x_test, y_test = test_set
logger.debug("Dataset Loaded. | sf_train=%s | sf_test=%s", len(x_train),
             len(x_test))

# Loading the words index.
word_index = reuters.get_word_index()
logger.debug("Word index loaded. | sf_index=%s", len(word_index))
# Indexing all labels in the dataset.
word_by_id_index = {}
for key, value in word_index.items():
    word_by_id_index[value] = key
logger.debug("Indexed words by ID. | sf_index=%s", len(word_by_id_index))

# Avoiding this issue:
Example #45
'''
    Train and evaluate a simple MLP on the Reuters newswire topic classification task.

    GPU run command:
        THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python examples/reuters_mlp.py

    CPU run command:
        python examples/reuters_mlp.py
'''

max_words = 10000
batch_size = 16

print "Loading data..."
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words, test_split=0.2)
print len(X_train), 'train sequences'
print len(X_test), 'test sequences'

nb_classes = np.max(y_train)+1
print nb_classes, 'classes'

print "Vectorizing sequence data..."
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode="binary")
X_test = tokenizer.sequences_to_matrix(X_test, mode="binary")
print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape

print "Convert class vector to binary class matrix (for use with categorical_crossentropy)"
Y_train = np_utils.to_categorical(y_train, nb_classes)
Example #46
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout

from keras.preprocessing.text import Tokenizer
# using reuters data set from keras datasets
from keras.datasets import reuters
import matplotlib.pyplot as plt

# to create callbacks list
from keras.callbacks import EarlyStopping, ModelCheckpoint
# To set pickle = True
old = np.load
np.load = lambda *a, **k: old(*a, allow_pickle=True, **k)

n = 5000
#Loading the data for training and testing
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=n)

#Tokenizing
tokenizer = Tokenizer(num_words=n)
X_train_ = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test_ = tokenizer.sequences_to_matrix(X_test, mode='binary')

#building the Network

model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n, )))
#Using dropout to handle overfitting of the model
#This creates an ensemble network
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(46, activation='softmax'))