def embToEnglish(sequence):
    # Renamed the parameter from `str` to avoid shadowing the builtin.
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in sequence])
    return decoded_review
def load_data(self, sample_size=None):
    print('Load Data...')
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        start_char=None, index_from=None, nb_words=self.word_vocab_size)
    if sample_size:
        sample_indices_train = random.sample(range(len(X_train)), sample_size)
        X_train = itemgetter(*sample_indices_train)(X_train)
        y_train = itemgetter(*sample_indices_train)(y_train)
        sample_indices_test = random.sample(range(len(X_test)), sample_size)
        X_test = itemgetter(*sample_indices_test)(X_test)
        y_test = itemgetter(*sample_indices_test)(y_test)
    index_word = dict((v, k) for k, v in reuters.get_word_index().items())
    X_train_char = [[index_word[idx] for idx in x] for x in X_train]
    X_test_char = [[index_word[idx] for idx in x] for x in X_test]
    X_test_char, X_train_char, vocab_char_size = \
        self.tokenize(X_test_char, X_train_char)
    X_test, X_train, X_test_char, X_train_char = \
        self.pad(X_test_char, X_train_char, X_test, X_train)
    # Concatenate before taking the max: `y_train + y_test` would add
    # elementwise if the labels are numpy arrays instead of lists/tuples.
    nb_classes = np.max(np.concatenate((y_train, y_test))) + 1
    Y_train = np_utils.to_categorical(y_train, nb_classes)
    Y_test = np_utils.to_categorical(y_test, nb_classes)
    return X_train, X_train_char, Y_train, X_test, X_test_char, Y_test, \
        vocab_char_size, nb_classes
def decode_words():
    # Relies on a module-level `train_data`; decodes its first sample.
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    decoded_words = ' '.join(
        [reverse_word_index.get(i - 3, "?") for i in train_data[0]])
    return decoded_words
def reuters_to_hdf5(file_name):
    if os.path.exists(file_name):
        return
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=1000)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')
    # Number of topic classes, derived from the training labels.
    num_classes = numpy.max(y_train) + 1
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    x_train = x_train.astype('object')
    x_test = x_test.astype('object')
    x = numpy.concatenate((x_train, x_test), axis=0)
    x = sequence.pad_sequences(x, maxlen=400)
    y = numpy.concatenate((y_train, y_test), axis=0)
    # y = sequence.pad_sequences(y, maxlen=400)
    with h5py.File(file_name, 'w') as f:
        f.create_dataset('x', data=x, compression='gzip')
        f.create_dataset('y', data=y, compression='gzip')
        dictionary = f.create_group('dictionary')
        for key in word_index:
            dictionary[key] = word_index[key]
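# A minimal read-back sketch for the file written by reuters_to_hdf5 above.
# `load_reuters_hdf5` is a hypothetical helper name; it assumes h5py is
# available and uses the dataset/group names written above.
def load_reuters_hdf5(file_name):
    with h5py.File(file_name, 'r') as f:
        x = f['x'][:]  # padded sequences, shape (11228, 400)
        y = f['y'][:]  # one-hot labels
        # Each word was stored as a scalar integer dataset in 'dictionary'.
        word_index = {key: int(f['dictionary'][key][()])
                      for key in f['dictionary']}
    return x, y, word_index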
def decodeToText(textSequence):
    wordIndex = reuters.get_word_index()
    reverseWordIndex = dict(
        [(value, key) for (key, value) in wordIndex.items()])
    decodedText = ' '.join(
        [reverseWordIndex.get(i - 3, '?') for i in textSequence])
    return decodedText
def running_retuter(modelname):
    maxlen = 400
    max_words = 10000
    # 1. Loading started
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_words, test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    num_classes = np.max(y_train) + 1
    # 2. Preprocess: pad sequences for the CNN, binary bag-of-words for the NN.
    # (A stray no-op Tokenizer(...) construction was dropped here.)
    if modelname == 'cnn':
        x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
        x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)
    elif modelname == 'nn':
        tokenizer = Tokenizer(num_words=max_words)
        x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
        x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    bulidModel(modelname, num_classes, x_test, y_test, x_train, y_train)
def decode_data(sentence_vec):
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    # Decode the sequence passed in, rather than a global sample.
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in sentence_vec])
    return decoded_newswire
def transpose_word(train_data):
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    return decoded_newswire
def keras_reuters_info():
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        path=os.path.join(root_path, "data", "reuters", "reuters.npz"),
        skip_top=0, maxlen=None, test_split=0.2, seed=113,
        start_char=1, oov_char=2, index_from=3)
    logger.info(X_train.shape)
    logger.info(y_train.shape)
    logger.info(X_test.shape)
    logger.info(y_test.shape)
    word_index = reuters.get_word_index(
        os.path.join(root_path, "data", "reuters", "reuters_word_index.json"))
    logger.info(word_index)
    num_words = max(max([len(x) for x in X_train]),
                    max([len(x) for x in X_test])) + 1
    num_classify = max(max(y_train), max(y_test)) + 1
    num_vocab = max(max([max(x) for x in X_train]),
                    max([max(x) for x in X_test])) + 1
    logger.info("num_words {0}".format(num_words))
    logger.info("num_classify {0}".format(num_classify))
    logger.info("num_vocab {0}".format(num_vocab))
def decode(wire_list):
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    # Look up each index in reverse_word_index with a default of '?' and join with spaces
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in wire_list])
    return decoded_newswire
def reuters_news_wire_texts():
    (x_train, y_train), (x_test, y_test) = reuters.load_data()
    wordDict = {y: x for x, y in reuters.get_word_index().items()}
    texts = []
    for x in x_train:
        texts.append(" ".join([
            wordDict.get(index - 3) for index in x
            if wordDict.get(index - 3) is not None
        ]))
    return texts, y_train
def decode_sample(datapoint, word_index=None, reverse_word_index=None):
    if word_index is None:
        word_index = reuters.get_word_index()
    if reverse_word_index is None:
        reverse_word_index = dict([(v, k) for (k, v) in word_index.items()])
    text = ' '.join(reverse_word_index.get(i - 3, '?') for i in datapoint)
    return text
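# Usage sketch for decode_sample: build the index maps once and reuse them
# across calls, instead of re-fetching the word index per sample. Assumes
# `reuters` is imported from keras.datasets as elsewhere in these snippets;
# the variable names below are illustrative.
word_index = reuters.get_word_index()
reverse_word_index = {v: k for k, v in word_index.items()}
(train_data, _), _ = reuters.load_data(num_words=10000)
first_article = decode_sample(train_data[0], word_index, reverse_word_index)
print(first_article)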
def num2word(input_data):
    word_index = reuters.get_word_index()
    print('Dictionary size: ' + str(len(word_index)))
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    # Join with spaces, since the decoded Reuters text is English.
    result = ' '.join(reverse_word_index.get(i - 3, '?') for i in input_data)
    return result
def train(): (train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000) print len(train_data) print len(train_labels) print train_data[10] word_index = reuters.get_word_index() reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) decoded_newswire = ' '.join( [reverse_word_index.get(i - 3, '?') for i in train_data[0]]) print train_labels[10] x_train = vectorize_sequences(train_data) x_test = vectorize_sequences(test_data) # one_hot_train_labels = to_one_hot(train_labels) # one_hot_test_labels = to_one_hot(test_labels) one_hot_train_labels = to_categorical(train_labels) one_hot_test_labels = to_categorical(test_labels) model = models.Sequential() model.add(layers.Dense(64, activation='relu', input_shape=(10000, ))) model.add(layers.Dense(64, activation='relu')) model.add(layers.Dense(46, activation='softmax')) model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) x_val = x_train[:1000] partial_x_train = x_train[1000:] y_val = one_hot_train_labels[:1000] partial_y_train = one_hot_train_labels[1000:] history = model.fit(partial_x_train, partial_y_train, epochs=20, batch_size=512, validation_data=(x_val, y_val)) loss = history.history['loss'] val_loss = history.history['val_loss'] epochs = range(1, len(loss) + 1) plt.plot(epochs, loss, 'bo', label='Training loss') plt.plot(epochs, val_loss, 'b', label='Validation loss') plt.title('Training and validation loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend() plt.show()
def decode_review(self):
    # Dictionary mapping words to integer indices
    word_index = reuters.get_word_index()
    # Map integer indices back to words
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    # Indices are offset by 3, as 0, 1 and 2 are reserved for
    # padding, start of sequence and unknown
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in self.train_data[0]])
    print(decoded_newswire)
def word_map():
    global reverse_word_index
    # A dictionary mapping words to an integer index
    word_index = reuters.get_word_index()
    # The first indices are reserved
    word_index = {k: (v + 3) for k, v in word_index.items()}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
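# Because word_map() already shifts the indices by 3, sequences loaded with
# the default index_from=3 can be decoded without the usual `i - 3` offset.
# A sketch of that usage; decode_with_map is a hypothetical helper name.
def decode_with_map(sequence):
    word_map()  # populates the global reverse_word_index
    return ' '.join(reverse_word_index.get(i, '?') for i in sequence)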
def main():
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=None, test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    print('# of Training Samples: {}'.format(len(x_train)))
    print('# of Test Samples: {}'.format(len(x_test)))
    num_classes = max(y_train) + 1
    print('# of Classes: {0}'.format(num_classes))
    max_words = 10000
    tokenizer = Tokenizer(num_words=max_words)
    x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
    x_test = tokenizer.sequences_to_matrix(x_test, mode='count')
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    print(x_train[0])
    print(len(x_train[0]))
    print(max(x_train[0]))
    print(y_train[0])
    print(len(y_train[0]))
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    # model.add(Activation('relu'))
    model.add(Activation('exponential'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.metrics_names)
    batch_size = 32
    epochs = 2
    model.fit(x_train, y_train,
              batch_size=batch_size, epochs=epochs,
              verbose=1, validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
def test_reuters():
    # Only run data-download tests 20% of the time,
    # to speed up frequent testing.
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
def multi_dataset_test():
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = boston_housing.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        assert len(x_train) == len(y_train) == 60000
        assert len(x_test) == len(y_test) == 10000
        (x_train, y_train), (x_test, y_test) = reuters.load_data()
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        assert len(x_train) + len(x_test) == 11228
        (x_train, y_train), (x_test, y_test) = reuters.load_data(maxlen=10)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = reuters.get_word_index()
        assert isinstance(word_index, dict)
        (x_train, y_train), (x_test, y_test) = cifar10.load_data()
        cifarDefaultTrainLength = 50000
        cifarDefaultTestLength = 10000
        assert len(x_train) == len(y_train) == cifarDefaultTrainLength
        assert len(x_test) == len(y_test) == cifarDefaultTestLength
        (x_train, y_train), (x_test, y_test) = cifar100.load_data('fine')
        cifarFineTrainLength = 50000
        cifarFineTestLength = 10000
        assert len(x_train) == len(y_train) == cifarFineTrainLength
        assert len(x_test) == len(y_test) == cifarFineTestLength
        (x_train, y_train), (x_test, y_test) = cifar100.load_data('coarse')
        cifarCoarseTrainLength = 50000
        cifarCoarseTestLength = 10000
        assert len(x_train) == len(y_train) == cifarCoarseTrainLength
        assert len(x_test) == len(y_test) == cifarCoarseTestLength
def train():
    (train_data, train_labels), (test_data, test_labels) = \
        reuters.load_data(num_words=10000)
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    # Note that our indices were offset by 3, because 0, 1 and 2 are
    # reserved indices for "padding", "start of sequence", and "unknown".
    decoded_newswire = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
    # Our vectorized training data
    x_train = vectorize_sequences(train_data)
    # Our vectorized test data
    x_test = vectorize_sequences(test_data)
    # Our vectorized training labels
    one_hot_train_labels = to_one_hot(train_labels)
    # Our vectorized test labels
    one_hot_test_labels = to_one_hot(test_labels)
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(46, activation='softmax'))
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    x_val = x_train[:1000]
    partial_x_train = x_train[1000:]
    y_val = one_hot_train_labels[:1000]
    partial_y_train = one_hot_train_labels[1000:]
    return model.fit(partial_x_train, partial_y_train,
                     epochs=20, batch_size=512,
                     validation_data=(x_val, y_val))
def load_retures_keras(text=False):
    from keras.datasets import reuters
    from keras.preprocessing.sequence import pad_sequences
    max_words = 10000
    print('Loading data...')
    (x, y), (_, _) = reuters.load_data(num_words=max_words, test_split=0.0)
    if not text:
        num_classes = np.max(y) + 1
        print(num_classes, 'classes')
        print('Vectorizing sequence data...')
        x = pad_sequences(x, maxlen=250)
        print('x_train shape:', x.shape)
        return x.astype(float), y
    else:
        word_index = reuters.get_word_index()
        word_index = {k: (v + 3) for k, v in word_index.items()}
        word_index["<PAD>"] = 0
        word_index["<START>"] = 1
        word_index["<UNK>"] = 2  # unknown
        word_index["<UNUSED>"] = 3
        reverse_word_index = dict(
            [(value, key) for (key, value) in word_index.items()])

        def decode_review(text):
            # The index map is already shifted by 3, so no `i - 3` here.
            return ' '.join([reverse_word_index.get(i, '?') for i in text])

        all_sentence = []
        for sent in x:
            all_sentence.append(decode_review(sent))
        return np.array(all_sentence), y
def main():
    # Model parameters
    maxlen = 400
    max_words = 10000
    batch_size = 32
    epochs = 20
    embedding_dims = 50
    cnn_filters = 100
    cnn_kernel_size = 5
    dense_hidden_dims = 200
    # 1. Load data
    (x_train, y_train), (x_test, y_test) = reuters.load_data(
        num_words=max_words, test_split=0.2)
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    num_classes = max(y_train) + 1
    # 2. Pad sequences and one-hot the labels
    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
    x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    # 3. Build CNN model
    model = Sequential()
    model.add(Embedding(max_words, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.2))
    model.add(Conv1D(cnn_filters, cnn_kernel_size,
                     padding='valid', activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(dense_hidden_dims, activation='relu'))
    model.add(Dropout(0.5))
    # softmax output, since this is single-label multiclass classification
    model.add(Dense(num_classes, activation='softmax'))
    model.summary()
    # 4. Compile network
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])
    # 5. Train model
    history = model.fit(x_train, y_train,
                        batch_size=batch_size, epochs=epochs,
                        verbose=1, validation_split=0.1)
    # 6. Evaluate model
    loss_and_metrics = model.evaluate(x_test, y_test, batch_size, verbose=1)
    print('Test loss:{}\nTest accuracy:{}'.format(loss_and_metrics[0],
                                                  loss_and_metrics[1]))
    # Create a graph of accuracy and loss over time
    history_dict = history.history
    acc = history_dict['categorical_accuracy']
    val_acc = history_dict['val_categorical_accuracy']
    loss = history_dict['loss']
    val_loss = history_dict['val_loss']
    epochs = range(1, len(acc) + 1)
    # "bo" is for "blue dot"
    plt.plot(epochs, loss, 'bo', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
def decode_newswire(newswire):
    word_index = reuters.get_word_index()
    reverse_word_index = dict(
        [(value, word) for (word, value) in word_index.items()])
    decoded_newswire = ' '.join(
        [reverse_word_index.get(value - 3, '?') for value in newswire])
    print(decoded_newswire)
def print_newswire(newswire):
    word_index = reuters.get_word_index()
    word_lookup = dict([(value, key) for (key, value) in word_index.items()])
    print(' '.join([word_lookup.get(i - 3, '?') for i in newswire]))
def load_data_set(type, max_len, vocab_size, batch_size):
    """
    Loads a dataset: the Keras IMDB dataset for binary classification,
    or the Keras Reuters dataset for multiclass classification.

    Args:
        type: {bool} 0 for binary classification (IMDB),
              1 for multiclass classification (Reuters)
        max_len: {int} timesteps used for padding
        vocab_size: {int} size of the vocabulary
        batch_size: batch size

    Returns:
        train_loader: {torch.utils.data.DataLoader} train dataloader
        x_test_pad: padded tokenized test data for cross validating
        y_test: y_test
        word_to_id: {dict} words mapped to indices
        (The multiclass branch instead returns train_loader, train_set,
        test_set, x_test_pad, word_to_id.)
    """
    INDEX_FROM = 3  # word index offset
    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top `vocab_size` words
        train_set, test_set = imdb.load_data(num_words=NUM_WORDS,
                                             index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]
        word_to_id = imdb.get_word_index()
        word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        id_to_word = {value: key for key, value in word_to_id.items()}
        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train + n_valid]
        y_test = y[n_train:n_train + n_valid]
        # embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt", word_to_id, 50)
        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)
        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, x_test_pad, y_test, word_to_id
    else:
        from keras.datasets import reuters
        train_set, test_set = reuters.load_data(path="reuters.npz",
                                                num_words=vocab_size,
                                                skip_top=0,
                                                index_from=INDEX_FROM)
        x_train, y_train = train_set[0], train_set[1]
        x_test, y_test = test_set[0], test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value: key for key, value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train, maxlen=max_len)
        x_test_pad = pad_sequences(x_test, maxlen=max_len)
        train_data = data_utils.TensorDataset(
            torch.from_numpy(x_train_pad).type(torch.LongTensor),
            torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,
                                             batch_size=batch_size,
                                             drop_last=True)
        return train_loader, train_set, test_set, x_test_pad, word_to_id
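# A short usage sketch for load_data_set, assuming its imports (np, torch,
# data_utils, pad_sequences, imdb) are in scope; note the two branches
# return differently shaped tuples. The argument values are illustrative.
train_loader, x_test_pad, y_test, word_to_id = load_data_set(
    type=0, max_len=200, vocab_size=1000, batch_size=32)  # IMDB, binary
train_loader, train_set, test_set, x_test_pad, word_to_id = load_data_set(
    type=1, max_len=200, vocab_size=1000, batch_size=32)  # Reuters, multiclass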
def decodeToWords(sequence):
    wordIndex = reuters.get_word_index()
    revIndex = dict([(value, key) for (key, value) in wordIndex.items()])
    decWords = " ".join([revIndex.get(i - 3, '?') for i in sequence])
    return decWords
# 1. Data
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=10000, test_split=0.2)  # load the 10000 most frequent words

print("x_train.shape, x_test.shape: ", x_train.shape, x_test.shape)
# x_train.shape, x_test.shape: (8982,) (2246,)
print("y_train.shape, y_test.shape: ", y_train.shape, y_test.shape)
# y_train.shape, y_test.shape: (8982,) (2246,)

print("First training newswire: \n", x_train[0])  # a list of word indices
print("Label of the first training newswire: \n", y_train[0])  # an index

# Check which word each number in x_train stands for
word_index = reuters.get_word_index()
print("word_index of the x data: \n", word_index)  # dict of word -> index

# Convert the indices back into words
from keras.preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts(reuters.get_word_index())  # fits in place; returns None
word = token.sequences_to_texts(x_train[0:1])
print("First word sequence of x_train: \n", word)

# Want the shape of x_train[0]? It is a plain list, so it has no shape;
# use len() instead.
print(len(x_train[0]))  # 87

# Print the number of y categories
def translate(sentence):
    word_index = reuters.get_word_index()
    reverse_index = dict(
        [(value, key) for (key, value) in word_index.items()])
    decoded = ' '.join([reverse_index.get(i - 3, '*') for i in sentence])
    return decoded
import keras
from keras.datasets import reuters

# Restrict to the 10000 most frequent words
(train_data, train_labels), (test_data, test_labels) = \
    reuters.load_data(num_words=10000)

# Turn a sample's list of integers back into words
word_index = reuters.get_word_index()
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])
# Note that our indices were offset by 3 because 0, 1 and 2 are reserved
# indices for "padding", "start of sequence", and "unknown".
decoded_newswire = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
print(decoded_newswire)

# Prepare the data: vectorize it (multi-hot encoding)
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.
    return results

# Our vectorized training data
x_train = vectorize_sequences(train_data)
# Our vectorized test data
x_test = vectorize_sequences(test_data)
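# A quick sanity check of vectorize_sequences on a toy input (illustrative
# only): each sequence becomes a fixed-length multi-hot vector with 1.0 at
# every index that occurs in it, regardless of how often it occurs.
demo = vectorize_sequences([[0, 3, 3]], dimension=5)
print(demo)  # [[1. 0. 0. 1. 0.]]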
# (Fragment: the tail of a model-building helper for binary classification;
# its `def` header is not shown in the source.)
    model.add(Dropout(0.25))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    model.summary()
    return model

import pandas as pd
from keras.datasets import reuters
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence

# word_dict = imdb.get_word_index()
word_dict = reuters.get_word_index()

def encode_sentence(text):
    result = []
    # text_to_word_sequence returns a list of words (like split)
    arr = text_to_word_sequence(text, lower=True, split=" ")
    for word in arr:
        w = encode_word(word)
        if w is not None:
            result.append(w)
    return result

def encode_word(word):
    if word not in word_dict:
        # encode_sentence skips words that map to None
        return None
    return word_dict[word]
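# Usage sketch for encode_sentence (the sentence text is illustrative):
# words found in the Reuters word index become their integer ids, and
# out-of-vocabulary words are silently dropped.
ids = encode_sentence("the company said it expects higher profits")
print(ids)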
import keras
from keras.datasets import reuters
# Using TensorFlow backend.

(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path="reuters_word_index.json")

print('# of Training Samples: {}'.format(len(x_train)))
print('# of Test Samples: {}'.format(len(x_test)))
num_classes = max(y_train) + 1
print('# of Classes: {}'.format(num_classes))
# of Training Samples: 8982
# of Test Samples: 2246
# of Classes: 46

index_to_word = {}
for key, value in word_index.items():
    index_to_word[value] = key

print(' '.join([index_to_word[x] for x in x_train[0]]))
print(y_train[0])

from keras.preprocessing.text import Tokenizer

max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(x_train[0])
theano.config.optimizer = 'fast_run'  # also 'fast_run' or 'None' for debugging
theano.config.linker = 'py'
theano.config.floatX = 'float32'

print('initialising...')
V = 1001
E = 12
total_trainset = 10000
total_iterations = 9000
train_x_entropy = 0

(X_train, y_train), (X_test, y_test) = reuters.load_data(
    path="reuters.pkl", nb_words=None, skip_top=0, maxlen=None,
    test_split=0.1, seed=10086)
word_map_tmp = reuters.get_word_index(path="reuters_word_index.pkl")
word_dict = dict((v, k) for k, v in word_map_tmp.items())
word_dict[0] = "<UNK>"

def real_words(l, eos):
    sent = []
    for word in l:
        if word == eos:
            sent.append("<EOS>")
        elif word > eos:
            sent.append(word_dict[0])
        else:
            sent.append(word_dict[word])
    return sent
from keras.datasets import reuters

(train_data, train_labels), (test_data, test_labels) = \
    reuters.load_data(num_words=10000)
# Load from Keras's built-in datasets, keeping only the 10000 most
# frequent words, split into training and test data.
# Each sample is a Python list of word indices of varying length, so the
# result is an object array rather than a rectangular array of arrays;
# the final model will require a proper numpy matrix instead.
# The length of the array gives the number of samples.

word_index = reuters.get_word_index()
# A dictionary mapping words to indices: keys are words, values are integers.
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])
# The standard way to invert a dictionary's key-value pairs.

# Indices are shifted back by 3 because the first three indices are
# reserved for special symbols.
decoded_newswire = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
# For each index, look up the corresponding word in the dictionary and
# join the words with spaces. This recovers the original text; the
# important part is knowing how the conversion works.

# Preparing the data
def main():
    from keras.datasets import reuters
    (train_data, train_labels), (test_data, test_labels) = \
        reuters.load_data(num_words=10000)
    print('\n\nFirst sample of the Reuters training dataset:\n', train_data[0])
    word_index = reuters.get_word_index()
    inverse_word_index = dict((v, k) for k, v in word_index.items())
    news_article_0 = ' '.join(
        [inverse_word_index.get(i - 3, '?') for i in train_data[0]])
    print('Corresponding news article:\n', news_article_0, '\n\n')
    train_data = vectorize(train_data)
    test_data = vectorize(test_data)
    train_labels = to_categorical(train_labels)
    test_labels = to_categorical(test_labels)
    model = build_model()
    model.compile(optimizer=RMSprop(lr=1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    validation_data = train_data[:1000]
    validation_labels = train_labels[:1000]
    partial_train_data = train_data[1000:]
    partial_train_labels = train_labels[1000:]
    history = model.fit(partial_train_data, partial_train_labels,
                        epochs=20, batch_size=512, verbose=0,
                        validation_data=(validation_data, validation_labels))
    plot_loss_and_accuracy(history)
    model = build_model()
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(train_data, train_labels,
              epochs=9, batch_size=512, verbose=0,
              validation_data=(test_data, test_labels))
    evaluation = model.evaluate(test_data, test_labels)
    print('Loss on test data: {evaluation[0]}\n'
          'Accuracy on test data: {evaluation[1]}\n'.format(**locals()))
    predictions = model.predict(test_data)
    print('Topic with highest probability for the first news article '
          'in the test dataset:\n', np.argmax(predictions[0]))