import random
import time

from keras.datasets import reuters


def test_reuters():
    # Only run data download tests 20% of the time
    # to speed up frequent testing.
    random.seed(time.time())
    if random.random() > 0.8:
        (X_train, y_train), (X_test, y_test) = reuters.load_data()
        (X_train, y_train), (X_test, y_test) = reuters.load_data(maxlen=10)
import numpy as np

from keras.datasets import reuters


def test_reuters_load_does_not_affect_global_rng(fake_downloaded_reuters_path):
    np.random.seed(1337)
    before = np.random.randint(0, 100, size=10)
    np.random.seed(1337)
    reuters.load_data(path=fake_downloaded_reuters_path, seed=9876)
    after = np.random.randint(0, 100, size=10)
    assert np.array_equal(before, after)
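A natural companion check, sketched here under the same assumed `fake_downloaded_reuters_path` fixture (not part of the original test file): passing the same `seed` to `load_data` should produce an identical shuffle across calls.

def test_reuters_load_is_deterministic_for_fixed_seed(fake_downloaded_reuters_path):
    # Two loads with the same seed should yield the same ordering of sequences.
    (a_train, _), _ = reuters.load_data(path=fake_downloaded_reuters_path, seed=9876)
    (b_train, _), _ = reuters.load_data(path=fake_downloaded_reuters_path, seed=9876)
    assert all(list(a) == list(b) for a, b in zip(a_train, b_train))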
import random
import time

from keras.datasets import reuters


def test_reuters():
    # Only run data download tests 20% of the time
    # to speed up frequent testing.
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228

        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)

        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
def load_data(self, sample_size=None):
    print('Load Data...')
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        start_char=None, index_from=None, nb_words=self.word_vocab_size)
    if sample_size:
        sample_indices_train = random.sample(range(len(X_train)), sample_size)
        X_train = itemgetter(*sample_indices_train)(X_train)
        y_train = itemgetter(*sample_indices_train)(y_train)
        sample_indices_test = random.sample(range(len(X_test)), sample_size)
        X_test = itemgetter(*sample_indices_test)(X_test)
        y_test = itemgetter(*sample_indices_test)(y_test)
    index_word = dict((v, k) for k, v in reuters.get_word_index().items())
    X_train_char = [[index_word[idx] for idx in x] for x in X_train]
    X_test_char = [[index_word[idx] for idx in x] for x in X_test]
    X_test_char, X_train_char, vocab_char_size = \
        self.tokenize(X_test_char, X_train_char)
    X_test, X_train, X_test_char, X_train_char = \
        self.pad(X_test_char, X_train_char, X_test, X_train)
    # list() forces concatenation here; when no subsampling happened the labels
    # are numpy arrays and a bare `+` would add them elementwise.
    nb_classes = np.max(list(y_train) + list(y_test)) + 1
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    return X_train, X_train_char, Y_train, X_test, X_test_char, Y_test, \
        vocab_char_size, nb_classes
def run_keras_example():
    # Note: this example uses the legacy Keras 0.x API
    # (nb_words, nb_epoch, show_accuracy).
    max_words = 1000
    batch_size = 32
    nb_epoch = 5

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words,
                                                             test_split=0.2)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    nb_classes = np.max(y_train) + 1
    print(nb_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(nb_words=max_words)
    X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
    X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    print('Building model...')
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation('tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    history = model.fit(X_train, Y_train, nb_epoch=nb_epoch,
                        batch_size=batch_size, verbose=1,
                        show_accuracy=True, validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size,
                           verbose=1, show_accuracy=True)
    print('Test score:', score[0])
    print('Test accuracy:', score[1])
from keras.datasets import cifar10

((trainX4, trainY4), (testX4, testY4)) = cifar10.load_data()
print(trainX4.shape)
print(testX4.shape)

from keras.datasets import cifar100

((trainX5, trainY5), (testX5, testY5)) = cifar100.load_data()
print(trainX5.shape)
print(testX5.shape)
print('')

from keras.datasets import reuters

((trainX6, trainY6), (testX6, testY6)) = reuters.load_data()
print(trainX6.shape)
print(testX6.shape)

from keras.datasets import boston_housing

((trainX7, trainY7), (testX7, testY7)) = boston_housing.load_data()
print(trainX7.shape)
print(testX7.shape)
print('')

# we use: https://medium.com/startup-grind/fueling-the-ai-gold-rush-7ae438505bc2
import keras
from keras.layers import SimpleRNN, Embedding, Dense
from keras.models import Sequential
import numpy as np
from keras.datasets import reuters
from keras.preprocessing import sequence
import sys
from keras.utils import np_utils

max_features = 10000
maxlen = 500
batch_size = 64

print('Loading Data...')
(input_train, y_train), (input_test, y_test) = reuters.load_data(num_words=max_features)
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
y_train = np_utils.to_categorical(y_train, 46)
y_test = np_utils.to_categorical(y_test, 46)

model = Sequential()
model.add(Embedding(max_features, 64))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64))
model.add(Dense(46, activation='softmax'))
import numpy as np
from sklearn.metrics import accuracy_score
from keras.datasets import reuters
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Activation
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier

# parameters for data load
num_words = 30000
maxlen = 50
test_split = 0.3

(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=num_words,
                                                         maxlen=maxlen,
                                                         test_split=test_split)

# pad the sequences with zeros
# padding parameter is set to 'post' => 0's are appended to end of sequences
X_train = pad_sequences(X_train, padding='post')
X_test = pad_sequences(X_test, padding='post')

X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1], 1))

y_data = np.concatenate((y_train, y_test))
y_data = to_categorical(y_data)
# 1395 is the training-set size for this num_words/maxlen/test_split combination
y_train = y_data[:1395]
y_test = y_data[1395:]
from keras.datasets import reuters
from keras import models
from keras import layers
import numpy as np
from keras.utils.np_utils import to_categorical
import matplotlib.pyplot as plt


def vectorize_sequence(data, dimension=10000):
    results = np.zeros((len(data), dimension))
    for ind, cols in enumerate(data):
        results[ind, cols] = 1
    return results


(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000)
y_train = np.array(y_train)
y_test = np.array(y_test)
x_train = vectorize_sequence(x_train)
x_test = vectorize_sequence(x_test)

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(4, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

x_val = x_train[:1000]
partial_x = x_train[1000:]
y_val = y_train[:1000]
import numpy as np
from keras.datasets import reuters
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

from utils import init_keras, vectorize_sequences

init_keras()

(x_train, t_train), (x_test, t_test) = reuters.load_data(num_words=10000)
x_train = vectorize_sequences(x_train)
x_test = vectorize_sequences(x_test)
t_train = to_categorical(t_train)
t_test = to_categorical(t_test)

x_train, x_cv, t_train, t_cv = train_test_split(
    x_train, t_train, test_size=0.33)

network = models.Sequential()
network.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
network.add(layers.Dense(64, activation='relu'))
network.add(layers.Dense(46, activation='softmax'))
network.compile(optimizer=optimizers.RMSprop(lr=0.001),
                loss=losses.categorical_crossentropy,
                metrics=[metrics.categorical_accuracy])
network.fit(x=x_train, y=t_train, epochs=15)
import keras
from keras import models
from keras import layers

# ### Binary representation

# In[104]:

from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# In[139]:

# load reuters dataset
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None,
                                                         test_split=0.2)
num_classes = max(y_train) + 1  # number of label classes
max_words = 20000

# one-hot encoding
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

print(x_train[0])
print(len(x_train[0]))
from keras.datasets import reuters


def test_reuters():
    (X_train, y_train), (X_test, y_test) = reuters.load_data()
import numpy as np
from keras.datasets import reuters
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

import cnn_utils

vocab_size = 20000
batch_size = 128
embedding_size = 500
max_len = 100
n_filters = 300
n_gram = 6

print("Loading data...")
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=vocab_size,
                                                         test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

X_train = cnn_utils.prepare_sequence(X_train, length=max_len)
X_train = np.array(X_train)
X_test = cnn_utils.prepare_sequence(X_test, length=max_len)
X_test = np.array(X_test)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print("Convert class vector to binary class matrix "
      "(for use with categorical_crossentropy)")
Y_train = np_utils.to_categorical(y_train, nb_classes)
import numpy as np
from keras.datasets import reuters
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

MAX_WORDS = 1000
# network and training
N_HIDDEN = 512
NB_EPOCH = 10
BATCH_SIZE = 32
VERBOSE = 1
VALIDATION_SPLIT = 0.2

print('Loading data...')
(X_train, y_train), (X_test, y_test) = \
    reuters.load_data(nb_words=MAX_WORDS, test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(nb_words=MAX_WORDS)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
Y_train = np_utils.to_categorical(y_train, nb_classes)
def test_reuters(self):
    print('reuters')
    (X_train, y_train), (X_test, y_test) = reuters.load_data()
#!/usr/bin/env python3
#
# Reuters newswire topic classification training example
#
from keras.datasets import reuters

from lib.featurizer import Featurizer
from lib.categorizer import Categorizer
from lib.classifier import Classifier

# load the reuters dataset
(x, y), _ = reuters.load_data(test_split=0, index_from=2)
word_index = reuters.get_word_index()


def dict_inv(d):
    '''Invert a dictionary'''
    return {v: k for k, v in d.items()}


def x2text(x, word_index_inv):
    '''Return text from an x vector and inverted word index'''
    words = [word_index_inv.get(i) for i in x]
    words = [w for w in words if w]
    return ' '.join(words)


# we use our own featurizer, so first reconstruct input text
word_index_inv = dict_inv(word_index)
texts = [{'text': x2text(a, word_index_inv)} for a in x]
del x, word_index, word_index_inv

# extract features, save results
f = Featurizer()
x = f.fit_transform(texts)
import keras
import matplotlib.pyplot as plt
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import (Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D)


def main():
    # model parameters:
    maxlen = 400
    max_words = 10000
    batch_size = 32
    epochs = 20
    embedding_dims = 50
    cnn_filters = 100
    cnn_kernel_size = 5
    dense_hidden_dims = 200

    # 1. Loading started
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                             test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    num_classes = max(y_train) + 1

    # 2. pad_sequences
    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # 3. Build CNN model...
    model = Sequential()
    model.add(Embedding(max_words, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.2))
    model.add(Conv1D(cnn_filters, cnn_kernel_size,
                     padding='valid', activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(dense_hidden_dims, activation='relu'))
    model.add(Dropout(0.5))
    # softmax (not sigmoid): the 46 topics are mutually exclusive and the loss
    # below is categorical_crossentropy
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()

    # 4. compile network
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    # 5. train model
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.1)

    # 6. evaluate model
    loss_and_metrics = model.evaluate(x_test, y_test, batch_size, verbose=1)
    print('Test loss:{}\nTest accuracy:{}'.format(loss_and_metrics[0],
                                                  loss_and_metrics[1]))

    # Create a graph of accuracy and loss over time
    history_dict = history.history
    acc = history_dict['categorical_accuracy']
    val_acc = history_dict['val_categorical_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    epochs = range(1, len(acc) + 1)

    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
theano.config.exception_verbosity = 'high'
theano.config.optimizer = 'None'
theano.config.optimizer = 'fast_run'  # also 'fast_run' or 'None' for debugging
theano.config.linker = 'py'
theano.config.floatX = 'float32'

print 'initialising...'

V = 1001
E = 12
total_trainset = 10000
total_iterations = 9000
train_x_entropy = 0

(X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.pkl",
                                                         nb_words=None,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.1,
                                                         seed=10086)
word_map_tmp = reuters.get_word_index(path="reuters_word_index.pkl")
word_dict = dict((v, k) for k, v in word_map_tmp.iteritems())
word_dict[0] = "<UNK>"


def real_words(l, eos):
    sent = []
    for word in l:
        if word == eos:
            sent.append("<EOS>")
        elif word > eos:
            sent.append(word_dict[0])
        else:
            sent.append(word_dict[word])
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import models, layers

# set the random seed
np.random.seed(0)

# set the desired number of features
number_of_features = 5000

# load the data and target vectors from the newswire dataset
(data_train, target_vector_train), (data_test, target_vector_test) = reuters.load_data(
    num_words=number_of_features)

# convert the newswire data into a one-hot-encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

# one-hot encode the target vectors into class matrices
target_train = to_categorical(target_vector_train)
target_test = to_categorical(target_vector_test)

# create the neural network object
network = models.Sequential()

# add a fully connected layer with a ReLU activation function
network.add(
# LEN_DICT and VAL_SET_SIZE are assumed to be defined earlier in the script.
def vectorize_sequences(sequences, dimension=LEN_DICT):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


# prints the original newswire
def print_newswire(newswire):
    word_index = reuters.get_word_index()
    word_lookup = dict([(value, key) for (key, value) in word_index.items()])
    # indices 0-2 are reserved (padding/start/unknown), hence the i - 3 offset
    print(' '.join([word_lookup.get(i - 3, '?') for i in newswire]))


# extract training and test data
# num_words=LEN_DICT keeps only the LEN_DICT most frequently used words; the rest is discarded
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=LEN_DICT)

# prepare input data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# prepare target data
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

x_val = x_train[:VAL_SET_SIZE]
y_val = one_hot_train_labels[:VAL_SET_SIZE]
partial_x_train = x_train[VAL_SET_SIZE:]
partial_y_train = one_hot_train_labels[VAL_SET_SIZE:]

model = models.Sequential()
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical
import numpy as np
from keras import models
from keras import layers

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results


one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

# Keras ships an equivalent built-in; these lines overwrite the handmade version.
one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

x_train = vectorize_sequences(train_data)
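A minimal check (illustrative labels, not from the snippet above) that the handmade encoder and Keras' built-in agree:

# to_one_hot and to_categorical produce the same 46-column matrix
labels = np.array([0, 3, 45])
assert np.array_equal(to_one_hot(labels), to_categorical(labels, 46))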
import numpy as np
from keras.datasets import reuters
from keras.utils.np_utils import to_categorical


def vectorize_sequences(sequences, dimension=10000):
    result = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        result[i, sequence] = 1
    return result


def to_one_hot(labels, dimensions=46):  # same thing as to_categorical!
    result = np.zeros((len(labels), dimensions))
    for i, label in enumerate(labels):
        result[i, label] = 1
    return result


(train_data, train_labels), (test_data, test_labels) = reuters.load_data(
    num_words=10000)  # we only want the 10,000 most frequent words

x_test = vectorize_sequences(test_data)
y_test = to_categorical(test_labels)
x_train = vectorize_sequences(train_data)
y_train = to_categorical(train_labels)

from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
# classifying newswires, a multiclass classification example
# for text classification
from keras.datasets import reuters
from utils import decoding_newswires, vectorize_sequences, to_one_hot, create_validation_set
from keras.utils.np_utils import to_categorical
from keras import models, layers
from keras import optimizers, losses, metrics

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

# train dataset's length
print(len(train_data))
# test dataset's length
print(len(test_data))

# decoding a sequence
print(decoding_newswires(reuters, train_data[0]))

# each label is an integer between 0 and 45

# preparing the data
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# set the labels to one-hot encoding
one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

# there is a built-in way to do this in keras
one_hot_train_labels_ = to_categorical(train_labels)
one_hot_test_labels_ = to_categorical(test_labels)
from keras.datasets import reuters

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

word_index = reuters.get_word_index()
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])


def decodeData(data, index):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in data[index]])


# print(decodeData(train_data, 0))

import numpy as np


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results
gberg_sents = gutenberg.sents()
len(gutenberg.words())

# In[45]:

word2index = reuters.get_word_index()
index2word = dict([(i, w) for (w, i) in word2index.items()])

# In[57]:

print(index2word[124])

# In[51]:

# load_data returns a (train, test) pair of tuples
(x_train, y_train), (x_test, y_test) = reuters.load_data()

# In[64]:

# walk the word indices of the first newswire and look up each one
for i in x_train[0]:
    print(i)
    print(type(i))
    for j in index2word:
        if int(i) + 3 == j:
            print(j)

# In[27]:

model = Word2Vec(sentences=gberg_sents, size=64, sg=1,
# model, top_words and max_review_length are assumed defined earlier in the script.
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=3, batch_size=64)

# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))

(X2_train, y2_train), (X2_test, y2_test) = reuters.load_data(num_words=top_words)
X2_train = sequence.pad_sequences(X2_train, maxlen=max_review_length)
X2_test = sequence.pad_sequences(X2_test, maxlen=max_review_length)

embedding_vector_length = 32
model = Sequential()
model.add(
    Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
# note: a single sigmoid output fits binary labels; Reuters' 46 topic labels
# would need Dense(46, activation='softmax') with categorical_crossentropy
model.add(Dense(1, activation='sigmoid'))
model.summary()
import numpy as np
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Merge
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences

vocab_size = 20000
batch_size = 128
embedding_size = 100
maxlen = 75
nb_feature_maps = 100

print("Loading data...")
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=vocab_size,
                                                         test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print("Convert class vector to binary class matrix "
      "(for use with categorical_crossentropy)")
Y_train = np_utils.to_categorical(y_train, nb_classes)
Each label is converted into a 46-element output vector with the
keras.utils.to_categorical function. That is, if a news item belongs to
category 10, that position in the vector is set to 1 and all others to 0.
"""
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing.text import Tokenizer

# (x_train, y_train), (x_test, y_test) = \
#     reuters.load_data(num_words=None, test_split=0.2)
max_words = 10000
(train_data, train_labels), (test_data, test_labels) = \
    reuters.load_data(num_words=max_words)
word_index = reuters.get_word_index()

print('# of Training Samples: {}'.format(len(train_data)))
print('# of Test Samples: {}'.format(len(test_data)))

num_classes = max(train_labels) + 1
print('# of Classes: {}'.format(num_classes))

index_to_word = {}
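A minimal sketch (illustrative values, not from the script above) of what to_categorical does to a single label:

from keras.utils import to_categorical

# category 10 becomes a 46-element row: 1 at index 10, 0 everywhere else
print(to_categorical([10], num_classes=46))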
import numpy as np
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.optimizers import SGD

word_index = reuters.get_word_index(path="reuters_word_index.json")
idx_to_word = dict(zip(word_index.values(), word_index.keys()))

max_words = 20706
batch_size = 32
epochs = 5

print('Loading data...')
# (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words, test_split=0.2)
(x_train, y_train), (x_test, y_test) = reuters.load_data(test_split=0.2)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# print(x_train[1], y_train[1])
# print("words")
# words = [idx_to_word[i] for i in x_train[1]]
# print(" ".join(words))

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Vectorizing sequence data...')
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.layers.embeddings import Embedding
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.callbacks import EarlyStopping
from keras import preprocessing
from keras.utils import np_utils
from nltk.tokenize import word_tokenize

# Number of words to consider as features
max_words = 7000
# Cut texts after this number of words
maxlen = 200

# Loading the data from the built-in Reuters dataset in Keras
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.3,
                                                         maxlen=maxlen)
print("Train_data ", X_train.shape)
print("Train_labels ", y_train.shape)
print("Test_data ", X_test.shape)
print("Test_labels ", y_test.shape)

# This dataset also makes available the word index used for encoding the sequences.
# Note there are 30979 words (will be used for our embedding).
word_index = reuters.get_word_index(path="reuters_word_index.json")

# ********************************************************************************
# Building the reverse index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Building text for the first x articles from the training data
x = len(X_train)
import math as m

import numpy as np
from keras import backend as K
from keras.datasets import reuters
from keras.preprocessing import sequence

# GRU
gru_output_size = 64

# LSTM
lstm_output_size = 70


def newacti(x, alpha=m.exp(-1)):
    return K.elu(x, alpha)


# top_words is assumed to be defined earlier in the script.
# (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words)
(X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=top_words,
                                                         skip_top=0,
                                                         maxlen=None,
                                                         test_split=0.2,
                                                         seed=113,
                                                         start_char=1,
                                                         oov_char=2,
                                                         index_from=3)

# truncate and pad input sequences
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# one-hot encoding
i = 0
out_train = np.zeros((len(y_train), max(y_train) + 1))
for x in y_train:
    out_train[i][x] = 1
    i = i + 1
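For reference, the manual loop above reproduces what Keras' built-in helper does; a minimal equivalence sketch:

from keras.utils.np_utils import to_categorical

# the handwritten one-hot matrix matches the built-in encoding
assert (out_train == to_categorical(y_train, max(y_train) + 1)).all()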
import numpy as np

np.random.seed(1337)  # for reproducibility

from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

max_words = 1000  # vocabulary size
batch_size = 32   # mini-batch size
nb_epoch = 5      # number of epochs

print('Loading data...')
# load the Reuters corpus
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words,
                                                         test_split=0.2)
print('train sequences', X_train.shape)
print(len(X_test), 'test sequences')

# number of classes -- I recall the original Reuters set having about 10;
# this must be using the larger corpus
nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

print('Vectorizing sequence data...')
# tokenize: keep the top 1000 words by document frequency.
# A fun detail: X_train initially holds word indices, and the indices are
# ordered by frequency (apparently, since the tail can simply be dropped),
# so this is quite efficient. The conversion is binary by default, not tf-idf.
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test = tokenizer.sequences_to_matrix(X_test, mode='binary')
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
from keras.datasets import cifar10
from math import pi, floor
from random import random
from codeepneat import codeepneat, config, population, chromosome, genome, visualize
import pickle
import numpy as np
import keras
from keras.utils import plot_model
from keras.datasets import reuters
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras import preprocessing
# from .config import Config

max_words = 10000
(x_train_all, y_train_all), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                                 test_split=0.2)
num_classes = np.max(y_train_all) + 1

x_train = preprocessing.sequence.pad_sequences(x_train_all, maxlen=30)[:8970]
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=30)[:2208]
y_train = keras.utils.to_categorical(y_train_all, num_classes)[:8970]
y_test = keras.utils.to_categorical(y_test, num_classes)[:2208]
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')


def visualize_model(model_file):
    model = keras.models.load_model(model_file)
    layer_list = model.layers
    # module_1 = model.get_layer("model_869")
import os

os.environ['KERAS_BACKEND'] = 'tensorflow'

from keras.datasets import reuters
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical
import numpy as np

from reuters_model import createHierarchicalAttentionModel

batch_size = 16
max_features = 20000
maxlen = 80  # maximum sequence length: longer sequences are truncated,
             # shorter ones are post-padded with zeros

print('loading data...')
(X_train, y_train), (X_test, y_test) = reuters.load_data(path="reuters.npz",
                                                         num_words=max_features,
                                                         test_split=0.2)
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
print(len(X_train), type(X_train), X_train.shape)
print(len(X_test), type(X_test), X_test.shape)
print('wds')
print(y_test[0:300])
# 8982 <class 'numpy.ndarray'> (8982,)
# 2246 <class 'numpy.ndarray'> (2246,)
print(len(X_train[0]), X_train[0], '\n')
print(len(X_train[1]), X_train[1], '\n')

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils
import numpy
import tensorflow as tf
import matplotlib.pyplot as plt

# set the random seeds
seed = 0
numpy.random.seed(seed)
tf.set_random_seed(seed)

# split the loaded data into training and test sets
(X_train, Y_train), (X_test, Y_test) = reuters.load_data(num_words=1000,
                                                         test_split=0.2)

# inspect the data
category = numpy.max(Y_train) + 1
print(category, 'categories')
print(len(X_train), 'training news articles')
print(len(X_test), 'test news articles')
print(X_train[0])

# preprocess the data
x_train = sequence.pad_sequences(X_train, maxlen=100)
x_test = sequence.pad_sequences(X_test, maxlen=100)
y_train = np_utils.to_categorical(Y_train)
y_test = np_utils.to_categorical(Y_test)

# configure the model
import numpy as np
from keras.utils import to_categorical

seed = 1337
np.random.seed(seed)

# IMPORTANT! => In case h5py has been installed, please restart the kernel by
# clicking on "Kernel" -> "Restart and Clear Output" and wait until all output
# disappears. Then your changes are being picked up.
#
# As you can see, we use Keras' Sequential model with only two types of layers:
# Dense and Dropout. We also specify a random seed to make our results
# reproducible. Next, we load the Reuters data set:

# In[5]:

from keras.datasets import reuters

max_words = 1000
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=max_words,
                                                         test_split=0.2,
                                                         seed=seed)
num_classes = np.max(y_train) + 1  # 46 topics

# Note that we cap the maximum number of words in a news item to 1000 by
# specifying the *num_words* keyword. Also, 20% of the data will be test data
# and we ensure reproducibility by setting our random seed.
#
# Our training features are still simply sequences of indexes and we need to
# further preprocess them, so that we can plug them into a *Dense* layer. For
# this we use a *Tokenizer* from Keras' text preprocessing module. This
# tokenizer will take an index sequence and map it to a vector of length
# *max_words=1000*. Each of the 1000 vector positions corresponds to one of
# the words in our newswire corpus. The output of the tokenizer has a 1 at the
# i-th position of the vector, if the word corresponding to i is in the
# description of the newswire, and 0 otherwise. Even if this word appears
# multiple times, we still just put a 1 into our vector, i.e. our tokenizer is
# binary. We use this tokenizer to transform both train and test features:

# In[6]:

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
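A tiny illustration of the binary behaviour described above (values chosen here for clarity, not taken from the notebook):

from keras.preprocessing.text import Tokenizer

t = Tokenizer(num_words=5)
# index 3 occurs twice in the sequence but still maps to a single 1
print(t.sequences_to_matrix([[1, 3, 3]], mode='binary'))  # [[0. 1. 0. 1. 0.]]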
from keras.datasets import reuters


def load_data(max_words, test_split_rate):
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_words, test_split=test_split_rate)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')
    return (x_train, y_train), (x_test, y_test)
import numpy as np
import torch
import torch.utils.data as data_utils
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences


def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads the dataset. Keras imdb dataset for binary classification.
    Keras reuters dataset for multiclass classification.

    Args:
        type       : {bool} 0 for binary classification returns the imdb dataset,
                     1 for multiclass classification returns the reuters dataset
        max_len    : {int} timesteps used for padding
        vocab_size : {int} size of the vocabulary
        batch_size : batch_size

    Returns:
        train_loader: {torch.Dataloader} train dataloader
        x_test_pad  : padded tokenized test_data for cross validating
        y_test      : y_test
        word_to_id  : {dict} words mapped to indices
    """
    INDEX_FROM = 3
    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top vocab_size words
        # word index offset
        train_set, test_set = imdb.load_data(num_words=NUM_WORDS,
                                             index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]

        word_to_id = imdb.get_word_index()
        word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        id_to_word = {value: key for key, value in word_to_id.items()}

        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train + n_valid]
        y_test = y[n_train:n_train + n_valid]

        # embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt", word_to_id, 50)
        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data, batch_size=batch_size,
                                             drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id
    else:
        from keras.datasets import reuters
        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]

        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}

        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)

        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data, batch_size=batch_size,
                                             drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
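A hypothetical usage sketch of the loader above (the parameter values are illustrative, not from the original file):

# multiclass path: type=1 selects the Reuters branch
train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=80, vocab_size=20000, batch_size=32)
for batch_x, batch_y in train_loader:
    # each batch is a LongTensor of padded index sequences plus a label vector
    print(batch_x.shape, batch_y.shape)
    break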
import numpy as np
from keras.datasets import reuters
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer

'''
    Train and evaluate a simple MLP on the Reuters newswire topic classification task.
    GPU run command:
        THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python examples/reuters_mlp.py
    CPU run command:
        python examples/reuters_mlp.py
'''

max_words = 1000
batch_size = 32
nb_epoch = 5

print("Loading data...")
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words,
                                                         test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

nb_classes = np.max(y_train) + 1
print(nb_classes, 'classes')

print("Vectorizing sequence data...")
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode="binary")
X_test = tokenizer.sequences_to_matrix(X_test, mode="binary")
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print("Convert class vector to binary class matrix "
      "(for use with categorical_crossentropy)")
# Time: 2019.04.20
from keras.datasets import reuters
from keras import models
from keras import layers
import numpy as np
import matplotlib.pyplot as plt
import copy

# load datasets
# (train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)
paths = r'C:\Users\lianliang\Desktop\keras_deeplearing\reuters.npz'
(train_data, train_labels), (test_data, test_labels) = reuters.load_data(path=paths,
                                                                         num_words=10000)


# encode the data and labels
def vector_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


x_train = vector_sequences(train_data)
x_test = vector_sequences(test_data)


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results
from keras.datasets import reuters

n_samp = 5000
n_subset = 1000
n_epochs = 10

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=n_samp)

import numpy as np


def vectorize_sequences(sequences, dimension=n_samp):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

# def to_one_hot(labels, dimension=46):
#     results = np.zeros((len(labels), dimension))
#     for i, label in enumerate(labels):
#         results[i, label] = 1.
#     return results
#
# one_hot_train_labels = to_one_hot(train_labels)
# one_hot_test_labels = to_one_hot(test_labels)
# NUM_WORDS is assumed to be defined earlier in the script
# (10000 here, to match the input_shape below).
def vectorize_seq(sequences, dimension=NUM_WORDS):
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        results[i, seq] = 1
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results


(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=NUM_WORDS)

# word_index = reuters.get_word_index()
# reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# decoded_newswire = ' '.join([reverse_word_index.get(i - 3, '?') for i in train_data[0]])

x_train = vectorize_seq(train_data)
x_test = vectorize_seq(test_data)
one_hot_train_labels = to_one_hot(train_labels)
one_hot_test_labels = to_one_hot(test_labels)

model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
import numpy as np
from keras.datasets import reuters


def vectorize_sequence(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results


def to_one_hot(labels, dimension=46):
    results = np.zeros((len(labels), dimension))
    for i, label in enumerate(labels):
        results[i, label] = 1.
    return results


(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)

print(f"Length of Train data: {len(train_data)}")
print(f"Length of Test data: {len(test_data)}")
print(f"A look at Train data: \n train_data[1] = {train_data[1]}\n")

# indexing
word_index = reuters.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# indices 0-2 are reserved (padding/start/unknown), hence the i - 3 offset
decoded_newswire = lambda x: ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[x]])
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    '%(asctime)s %(name)s %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(LOG_LEVEL)

# Avoiding this issue:
# https://stackoverflow.com/questions/55890813
np_load_old = np.load
np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)

# Loading dataset
logger.debug("Loading dataset. | sf_split=%s", TEST_SPLIT)
train_set, test_set = reuters.load_data(num_words=None, test_split=TEST_SPLIT)
x_train, y_train = train_set
x_test, y_test = test_set
logger.debug("Dataset Loaded. | sf_train=%s | sf_test=%s",
             len(x_train), len(x_test))

# Loading the words index.
word_index = reuters.get_word_index()
logger.debug("Word index loaded. | sf_index=%s", len(word_index))

# Indexing all words in the dataset by ID.
word_by_id_index = {}
for key, value in word_index.items():
    word_by_id_index[value] = key
logger.debug("Indexed words by ID. | sf_index=%s", len(word_by_id_index))

# Avoiding this issue:
'''
    Train and evaluate a simple MLP on the Reuters newswire topic classification task.
    GPU run command:
        THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python examples/reuters_mlp.py
    CPU run command:
        python examples/reuters_mlp.py
'''

max_words = 10000
batch_size = 16

print "Loading data..."
(X_train, y_train), (X_test, y_test) = reuters.load_data(nb_words=max_words,
                                                         test_split=0.2)
print len(X_train), 'train sequences'
print len(X_test), 'test sequences'

nb_classes = np.max(y_train) + 1
print nb_classes, 'classes'

print "Vectorizing sequence data..."
tokenizer = Tokenizer(nb_words=max_words)
X_train = tokenizer.sequences_to_matrix(X_train, mode="binary")
X_test = tokenizer.sequences_to_matrix(X_test, mode="binary")
print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape

print "Convert class vector to binary class matrix (for use with categorical_crossentropy)"
Y_train = np_utils.to_categorical(y_train, nb_classes)
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
# using reuters data set from keras datasets
from keras.datasets import reuters
import matplotlib.pyplot as plt
# to create callbacks list
from keras.callbacks import EarlyStopping, ModelCheckpoint

# To set pickle = True
old = np.load
np.load = lambda *a, **k: old(*a, allow_pickle=True, **k)

n = 5000

# Loading the data for training and testing
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=n)

# Tokenizing
tokenizer = Tokenizer(num_words=n)
X_train_ = tokenizer.sequences_to_matrix(X_train, mode='binary')
X_test_ = tokenizer.sequences_to_matrix(X_test, mode='binary')

# Building the network
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(n,)))
# Using dropout to handle overfitting of the model
# (this creates an ensemble-like network)
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(46, activation='softmax'))