def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (X_train, y_train), (X_test, y_test) = imdb.load_data()
        (X_train, y_train), (X_test, y_test) = imdb.load_data(maxlen=40)
def test_imdb_load_does_not_affect_global_rng(fake_downloaded_imdb_path):
    np.random.seed(1337)
    before = np.random.randint(0, 100, size=10)
    np.random.seed(1337)
    imdb.load_data(path=fake_downloaded_imdb_path, seed=9876)
    after = np.random.randint(0, 100, size=10)
    assert np.array_equal(before, after)
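# One possible shape for the `fake_downloaded_imdb_path` fixture used above -- a guess for
# illustration only. It writes a tiny .npz archive with the keys imdb.load_data() reads
# (x_train/y_train/x_test/y_test); the real fixture in the test suite may differ.
import numpy as np
import pytest


@pytest.fixture
def fake_downloaded_imdb_path(tmp_path):
    path = tmp_path / "imdb.npz"
    x = np.array([[1, 2, 3], [2, 3, 4]], dtype=object)
    y = np.array([0, 1])
    np.savez(path, x_train=x, y_train=y, x_test=x, y_test=y)
    return str(path)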
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
def __init__(self, feature='tfidf', **kwargs): super(IMDB, self).__init__(**kwargs) if self.conf is not None: feature = self.conf.get('feature', 'tfidf') if feature.startswith('tfidf'): max_features = 5000 (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features) else: (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=None, skip_top=0, maxlen=None, seed=113, start_char=1, oov_char=2, index_from=3) X, y = self.get_data_by_imageset(X_train, y_train, X_test, y_test) print('data_set={}, Average sequence length: {}'.format(self.data_set, np.mean(list(map(len, X))))) #feature if feature == 'origin': maxlen = 400 X = sequence.pad_sequences(X, maxlen=maxlen) elif feature == 'tfidf': from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer(smooth_idf=False) #transformer = TfidfTransformer(smooth_idf=True) X_train_bin = np.zeros((len(X_train), max_features), dtype=np.int16) X_bin = np.zeros((len(X), max_features), dtype=np.int16) for i, X_i in enumerate(X_train): X_train_bin[i, :] = np.bincount(X_i, minlength=max_features) for i, X_i in enumerate(X): X_bin[i, :] = np.bincount(X_i, minlength=max_features) transformer.fit(X_train_bin) X = transformer.transform(X_bin) X = np.asarray(X.todense()) elif feature == 'tfidf_seq': from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer(smooth_idf=False) maxlen = 400 N = len(X) X_bin = np.zeros((N, max_features), dtype=np.int16) for i, X_i in enumerate(X): X_bin_i = np.bincount(X_i) X_bin[i, :len(X_bin_i)] = X_bin_i tfidf = transformer.fit_transform(X_bin) tfidf = np.asarray(tfidf.todense()) X_id = sequence.pad_sequences(X, maxlen=maxlen) X = np.zeros(X_id.shape, dtype=np.float32) for i in range(N): X[i, :] = tfidf[i][X_id[i]] else: raise ValueError('Unkown feature: ', feature) X = X[:,np.newaxis,:,np.newaxis] self.X = self.init_layout_X(X) self.y = self.init_layout_y(y)
def test_dan_original(): max_features = 20000 maxlen = 100 # cut texts after this number of words (among top max_features most common words) batch_size = 32 print("Loading data...") (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2) print(len(X_train), "train sequences") print(len(X_test), "test sequences") print("Pad sequences (samples x time)") X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print("X_train shape:", X_train.shape) print("X_test shape:", X_test.shape) model = dan_original(max_features) # try using different optimizers and different optimizer configs model.compile(loss="binary_crossentropy", optimizer="adagrad", class_mode="binary") print("Train...") model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3, validation_data=(X_test, y_test), show_accuracy=True) score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True) print("Test score:", score) print("Test accuracy:", acc)
def train():
    # load the dataset but only keep the top n words, zero the rest
    (X_train, Y_train), (X_test, Y_test) = imdb.load_data(nb_words=top_words)
    # truncate and pad input sequences
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
    # create the model
    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
    model.add(Dropout(0.2))
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=2, batch_size=64)
    # Final evaluation of the model
    scores = model.evaluate(X_test, Y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    model.save("imdb_%0.2f.pkl" % scores[1])
def main():
    top_words = 5000  # Keep only the 5,000 most frequent words in the dataset.
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
    # Keras requires the same length for every sequence (0 will mean no information).
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

    embedding_length = 32
    input_seq = Input(shape=(500,))
    a = Embedding(top_words, embedding_length, input_length=max_review_length)(input_seq)
    b, state_h, state_c = LSTM(100, return_state=True, return_sequences=True)(a)
    c = AttentionLayerV2(attention_depth=4)(b)
    d = Dropout(0.5)(c)
    e = Dense(1, activation='sigmoid')(d)
    model = Model(inputs=[input_seq], outputs=[e])
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    model.summary()
    # print(model.predict(np.ones((10, 500))))
    model.fit(X_train, y_train, epochs=5, batch_size=64)
    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    model.save_weights('model_weights.h5')
def run_keras_cnn_example(): # set parameters: max_features = 5000 maxlen = 100 batch_size = 32 embedding_dims = 100 nb_filter = 250 filter_length = 3 hidden_dims = 250 nb_epoch = 2 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print('Pad sequences (samples x time)') X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) print('Build model...') model = Sequential() # we start off with an efficient embedding layer which maps # our vocab indices into embedding_dims dimensions model.add(Embedding(max_features, embedding_dims, input_length=maxlen)) model.add(Dropout(0.25)) # we add a Convolution1D, which will learn nb_filter # word group filters of size filter_length: model.add(Convolution1D(nb_filter=nb_filter, filter_length=filter_length, border_mode='valid', activation='tanh', subsample_length=1)) # we use standard max pooling (halving the output of the previous layer): model.add(MaxPooling1D(pool_length=2)) # We flatten the output of the conv layer, # so that we can add a vanilla dense layer: model.add(Flatten()) # We add a vanilla hidden layer: model.add(Dense(hidden_dims)) model.add(Dropout(0.25)) model.add(Activation('tanh')) # We project onto a single unit output layer, and squash it with a sigmoid: model.add(Dense(1)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='rmsprop', class_mode='binary') model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, show_accuracy=True, validation_data=(X_test, y_test))
def imdb_test(): # set parameters: max_features = 5000 # number of vocabulary maxlen = 200 # padding batch_size = 16 nb_epoch = 10 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2) print("Pad sequences (samples x time)") X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) nb_classes = 2 y_train = np_utils.to_categorical(y_train, nb_classes) y_test = np_utils.to_categorical(y_test, nb_classes) model = imdb_cnn() plot(model, to_file='./images/imdb_model.png') # try using different optimizers and different optimizer configs # model.compile(loss='binary_crossentropy', optimizer='adagrad', class_mode="binary") model.compile(loss='categorical_crossentropy', optimizer='adagrad') print("Train...") early_stopping = EarlyStopping(monitor='val_loss', patience=5) model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(X_test, y_test), show_accuracy=True, callbacks=[early_stopping]) score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True) print('Test score:', score) print('Test accuracy:', acc)
def imdb_lstm():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,
              validation_data=(X_test, y_test))
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
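# A hedged usage sketch for load_data() above: decode the first training review back to text
# using the vocabulary_inv mapping it returns. Because start_char/oov_char/index_from are None,
# the stored indices line up directly with get_word_index(). The max_words and sequence_length
# values are assumed placeholders for the module-level settings the function expects.
max_words = 5000
sequence_length = 400

x_train, y_train, x_test, y_test, vocabulary_inv = load_data("keras_data_set")
first_review = " ".join(vocabulary_inv.get(idx, "?") for idx in x_train[0])
print(first_review[:200], "... label:", y_train[0])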
def data():
    np.random.seed(1337)  # for reproducibility
    max_features = 20000
    maxlen = 100
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    return X_train, X_test, y_train, y_test, maxlen, max_features
def load_imdb():
    """
    Load IMDB dataset
    Transform input data into an RDD of Sample
    """
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=20000)
    X_train = sequence.pad_sequences(X_train, maxlen=100)
    X_test = sequence.pad_sequences(X_test, maxlen=100)
    return X_train, y_train, X_test, y_test
def data():
    import numpy as np
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    np.random.seed(1337)  # for reproducibility
    max_features = 20000
    maxlen = 100
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    return X_train, X_test, y_train, y_test, maxlen, max_features
def data(): maxlen = 100 max_features = 20000 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print("Pad sequences (samples x time)") X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) return X_train, X_test, y_train, y_test, max_features, maxlen
def loadData(): (train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=100) p_train_data = train_data[0:10000] p_train_labels = train_labels[0:10000] val_data = train_data[20000:21000] val_labels = train_labels[20000:21000] x_train = p_train_data y_train = np.asarray(p_train_labels) x_train_arr = x_train.tolist() y_train_arr = y_train.tolist() x_val = val_data y_val = np.asarray(val_labels) x_val_arr = x_val.tolist() y_val_arr = y_val.tolist() return (x_train_arr, y_train_arr), (x_val_arr, y_val_arr)
def train_1layer_glove_wordembedding(hidden_dim,modelfile): train = {} test = {} dev = {} #embedded_train, train_labels = WordEmbeddingLayer.load_embedded_data(path="../data/",name="train",representation="glove.840B.300d") #embedded_dev, dev_labels = WordEmbeddingLayer.load_embedded_data(path="../data/",name="dev",representation="glove.840B.300d") #embedded_test, test_labels = WordEmbeddingLayer.load_embedded_data(path="../data/",name="test",representation="glove.840B.300d") #embedded_train, train_labels, word_to_index, index_to_word, labels_count = DataPrep.load_one_hot_sentiment_data("../data/sentiment/trainsentence_and_label_binary.txt") #embedded_dev, dev_labels= DataPrep.load_one_hot_sentiment_data_traind_vocabulary("../data/sentiment/devsentence_and_label_binary.txt",word_to_index,index_to_word,labels_count) #self.test["sentences"], self.test["sentiments"]= DataPrep.load_one_hot_sentiment_data_traind_vocabulary("../../data/sentiment/testsentence_and_label_binary.txt",self.word_to_index, self.index_to_word,self.labels_count) (X_train, train_labels), (X_test,dev_labels) = imdb.load_data(nb_words=20000,test_split=0.2) embedded_train = [] embedded_dev = [] train_labels = [np.eye(2)[l] for l in train_labels] dev_labels = [np.eye(2)[l] for l in dev_labels] one_hot_vocab = np.eye(20000,dtype=np.float32) for sent in X_train: sentence = [one_hot_vocab[term] for term in sent] embedded_train.append(sentence) for sent in X_test: sentence = [one_hot_vocab[term] for term in sent] embedded_dev.append(sentence) flstm = FullyConnectedLSTM(input_dim=len(embedded_train[0][0]),output_dim=2,number_of_layers=1, hidden_dims=[hidden_dim],dropout_p=0.1,learning_rate=0.01) flstm.build_model() #train_labels[train_labels == 0] = -1 #dev_labels[dev_labels == 0] = -1 flstm.train(embedded_train,train_labels,embedded_dev,dev_labels) flstm.save_model(modelfile)
def test_imdb_lstm(): max_features = 20000 maxlen = 80 # cut texts after this number of words (among top max_features most common words) batch_size = 32 print('Loading data...') (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) print(len(x_train), 'train sequences') print(len(x_test), 'test sequences') print('Pad sequences (samples x time)') x_train = sequence.pad_sequences(x_train, maxlen=maxlen) x_test = sequence.pad_sequences(x_test, maxlen=maxlen) print('x_train shape:', x_train.shape) print('x_test shape:', x_test.shape) print('Build model...') model = Sequential() model.add(Embedding(max_features, 128)) model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) model.add(Dense(1, activation='sigmoid')) # try using different optimizers and different optimizer configs model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print('Train...') model.fit(x_train, y_train, batch_size=batch_size, epochs=15, validation_data=(x_test, y_test)) score, acc = model.evaluate(x_test, y_test, batch_size=batch_size) print('Test score:', score) print('Test accuracy:', acc)
    model = Model([comment_seq], output)
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=['accuracy'])
    return model


if __name__ == '__main__':
    print('————————————————load data————————————————')
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    X_all = (list(X_train) + list(X_test))[0:1000]
    y_all = (list(y_train) + list(y_test))[0:1000]
    print(len(X_all), len(y_all))

    imdb_word2idx = imdb.get_word_index()
    imdb_idx2word = dict((idx, word) for (word, idx) in imdb_word2idx.items())
    X_all = [[imdb_idx2word.get(idx - 3, '?') for idx in sen][1:] for sen in X_all]

    w2vModel = train_W2V(X_all, in_path + 'w2vModel')
    word2idx, embedMatrix = build_word2idx_embedMatrix(w2vModel)  # build word2idx and embedMatrix
    X_all_idx = make_X_train_idx(X_all, word2idx, MAX_SEQ_LEN)
    y_all_idx = np.array(y_all)  # important: X_all and y_all must be np.array(), otherwise this raises an error
@author: Tim George Kabue @phone_number: +254706762054 @email: [email protected] """ from keras.datasets import imdb from keras import preprocessing, models, layers import numpy as np import matplotlib.pyplot as plt #Load dataset. no_of_tokens = 10000 (train_samples, train_labels), (test_samples, test_labels) = imdb.load_data(num_words=no_of_tokens) #Preprocess the data. max_len = 500 train_samples = preprocessing.sequence.pad_sequences(train_samples, maxlen=max_len) test_samples = preprocessing.sequence.pad_sequences(test_samples, maxlen=max_len) #Network architecture. model = models.Sequential() vector_size = 32 model.add(layers.Embedding(no_of_tokens, vector_size)) model.add(layers.SimpleRNN(32))
def load_data_set(type,max_len,vocab_size,batch_size): """ Loads the dataset. Keras Imdb dataset for binary classifcation. Keras reuters dataset for multiclass classification Args: type : {bool} 0 for binary classification returns imdb dataset. 1 for multiclass classfication return reuters set max_len: {int} timesteps used for padding vocab_size: {int} size of the vocabulary batch_size: batch_size Returns: train_loader: {torch.Dataloader} train dataloader x_test_pad : padded tokenized test_data for cross validating y_test : y_test word_to_id : {dict} words mapped to indices """ INDEX_FROM=3 if not bool(type): NUM_WORDS=vocab_size # only use top 1000 words # word index offset train_set,test_set = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM) x_train,y_train = train_set[0],train_set[1] x_test,y_test = test_set[0],test_set[1] word_to_id = imdb.get_word_index() word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()} word_to_id["<PAD>"] = 0 word_to_id["<START>"] = 1 word_to_id["<UNK>"] = 2 id_to_word = {value:key for key,value in word_to_id.items()} x = np.concatenate([x_train, x_test]) y = np.concatenate([y_train, y_test]) n_train = x.shape[0] - 1000 n_valid = 1000 x_train = x[:n_train] y_train = y[:n_train] x_test = x[n_train:n_train+n_valid] y_test = y[n_train:n_train+n_valid] #embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt",word_to_id,50) x_train_pad = pad_sequences(x_train,maxlen=max_len) x_test_pad = pad_sequences(x_test,maxlen=max_len) train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.DoubleTensor)) train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True) return train_loader,x_test_pad,y_test,word_to_id else: from keras.datasets import reuters train_set,test_set = reuters.load_data(path="reuters.npz",num_words=vocab_size,skip_top=0,index_from=INDEX_FROM) x_train,y_train = train_set[0],train_set[1] x_test,y_test = test_set[0],test_set[1] word_to_id = reuters.get_word_index(path="reuters_word_index.json") word_to_id = {k:(v+3) for k,v in word_to_id.items()} word_to_id["<PAD>"] = 0 word_to_id["<START>"] = 1 word_to_id["<UNK>"] = 2 word_to_id['<EOS>'] = 3 id_to_word = {value:key for key,value in word_to_id.items()} x_train_pad = pad_sequences(x_train,maxlen=max_len) x_test_pad = pad_sequences(x_test,maxlen=max_len) train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.LongTensor)) train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True) return train_loader,train_set,test_set,x_test_pad,word_to_id
output_dir = 'model_output/imdb_deep_net' epochs = 4 batch_size = 128 n_dim = 64 n_unique_words = 5000 n_words_to_skip = 50 max_review_length = 100 pad_type = trunc_type = 'pre' n_dense = 64 dropout = 0.5 (X_train, y_train), (X_valid, y_valid) = imdb.load_data(num_words=n_unique_words, skip_top=n_words_to_skip) word_index = keras.datasets.imdb.get_word_index() # v+3 so we push the words 3 positions. word_index = {k: (v + 3) for k, v in word_index.items()} # Now we fill in some keywords for the first 3 indexes as seen below. word_index['PAD'] = 0 word_index['START'] = 1 word_index['UNK'] = 2 index_word = {v: k for k, v in word_index.items()} review = ' '.join(index_word[id] for id in X_train[0]) print(review)
from keras.datasets import imdb from keras.layers import Embedding, Dense, LSTM from keras.models import Sequential from keras.preprocessing import sequence from matplotlib import pyplot as plt max_features = 10000 maxlen = 500 batch_size = 32 print('Loading data...') (input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features) print(len(input_train), 'train sequences') print(len(input_test), 'test sequences') print('Pad sequences (samples x time)') input_train = sequence.pad_sequences(input_train, maxlen=maxlen) input_test = sequence.pad_sequences(input_test, maxlen=maxlen) print('input train shape:', input_train.shape) print('input test shape:', input_test.shape) model = Sequential() model.add(Embedding(max_features, 32)) model.add(LSTM(32)) model.add(Dense(1, activation='sigmoid')) model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc']) history = model.fit(input_train, y_train, epochs=10,
import numpy as np from keras.datasets import imdb from keras.preprocessing.text import Tokenizer from keras import models from keras import layers import matplotlib.pyplot as plt np.random.seed(0) number_of_features = 10000 (data_train, target_train), (data_test, target_test) = imdb.load_data(num_words=number_of_features) tokenizer = Tokenizer(num_words=number_of_features) features_train = tokenizer.sequences_to_matrix(data_train, mode="binary") features_test = tokenizer.sequences_to_matrix(data_test, mode="binary") network = models.Sequential() network.add( layers.Dense(units=16, activation="relu", input_shape=(number_of_features, ))) network.add(layers.Dense(units=16, activation="relu")) network.add(layers.Dense(units=1, activation="sigmoid"))
from keras.datasets import imdb from keras.preprocessing import sequence from keras.layers import * from keras.optimizers import Adam from keras import Model from PositionEmbedding import SinusoidalPositionEmbedding from MultiHeadAttention import MultiHeadAttention from LayerNormalization import LayerNormalization max_words = 20000 maxlen = 100 embed_dim = 64 batch_size = 128 (x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=max_words) x_train = sequence.pad_sequences(x_train, maxlen=maxlen) x_val = sequence.pad_sequences(x_val, maxlen=maxlen) text_input = Input(shape=(maxlen, ), dtype='int32') x = Embedding(max_words, embed_dim)(text_input) x = SinusoidalPositionEmbedding()(x) def transformer_encoder(inputs, num_heads=4, dropout_rate=0.1): in_dim = K.int_shape(inputs)[-1] x = MultiHeadAttention(num_heads, in_dim)([inputs, inputs]) x = Dropout(dropout_rate)(x) x = add([inputs, x]) x1 = LayerNormalization()(x) x = Dense(in_dim * 2, activation='relu')(x1)
import numpy as np

# Number of samples, number of features, vector dimensionality, step size, and so on
train_reviews = 5000
valid_reviews = 100
max_features = 5000
embedding_size = 256
step_size = 5
batch_size = 32
index_from = 2
rnn_units = 128
epochs = 2
word_index_prev = {'<PAD>': 0, '<START>': 1, '<UNK>': 2}

# Load the IMDB data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features, index_from=index_from)

# Extract word information from the IMDB data
word_index = {
    word: (index + index_from)
    for word, index in imdb.get_word_index().items()
    if (index + index_from) < max_features
}
word_index.update(word_index_prev)

# Build an index-to-word dictionary from the word information
index_word = {index: word for word, index in word_index.items()}


# Function to print a sentence
def print_sentence(sentence):
# Tutorial: https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/ import numpy # Import IMDB reviews labeled dataset # Downloads it from https://s3.amazonaws.com/text-datasets/imdb.pkl (33M) on the first run # Or https://s3.amazonaws.com/text-datasets/imdb.npz from keras.datasets import imdb from keras.layers import Embedding from keras.preprocessing.sequence import pad_sequences from matplotlib import pyplot # load the dataset (X_train, y_train), (X_test, y_test) = imdb.load_data() X = numpy.concatenate((X_train, X_test), axis=0) y = numpy.concatenate((y_train, y_test), axis=0) # summarize size print("Training data: ") print(X.shape) print(y.shape) # Summarize number of classes print("Classes: ") print(numpy.unique(y)) # Summarize number of words print("Number of words: ") print(len(numpy.unique(numpy.hstack(X)))) # Summarize review length print("Review length: ")
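# A hedged sketch of how the "Review length" summary begun above might continue: per-review
# word counts plus a quick visual check. The specific statistics and plots are assumptions,
# not lines taken from the original tutorial.
result = [len(x) for x in X]
print("Mean %.2f words (stddev: %.2f)" % (numpy.mean(result), numpy.std(result)))
# plot the review length distribution
pyplot.subplot(121)
pyplot.boxplot(result)
pyplot.subplot(122)
pyplot.hist(result)
pyplot.show()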
# In[1]:

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

max_features = 20000
maxlen = 80
batch_size = 32

# Load the data and convert words to IDs; max_features caps the number of words used.
(trainX, trainY), (testX, testY) = imdb.load_data(num_words=max_features)
print(len(trainX), 'train sequences')
print(len(testX), 'test sequences')

# In natural language, every passage has a different length, but the RNN unrolls over a
# fixed number of steps, so all sequences are first padded/truncated to a fixed length.
trainX = sequence.pad_sequences(trainX, maxlen=maxlen)
testX = sequence.pad_sequences(testX, maxlen=maxlen)
print('trainX shape:', trainX.shape)
print('testX shape:', testX.shape)

# ### 2. Define the model.

# In[2]:
from keras.models import Sequential from keras.layers import Dense from keras.utils import np_utils, to_categorical from keras.datasets import imdb if not os.path.exists('output'): os.mkdir('output') f = open("output/testme.txt", 'w') f.write("hellllo") f.close() (train_data, train_labels), (test_data, test_labels) = imdb.load_data( path="imdb.npz", num_words=10000) print("train_data ", train_data.shape) print("train_labels ", train_labels.shape) print("_"*100) print("Going to sleep for 20 min") print("Going to sleep for 20 min") time.sleep(5) time.sleep(1200) print("test_data ", test_data.shape) print("test_labels ", test_labels.shape) print("_"*100) # See an actual review in words # Reverse from integers to words using the DICTIONARY (given by keras...need to do nothing to create it)
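# A minimal sketch of the decoding step described in the comment above: get_word_index() maps
# word -> integer, so reverse it and shift indices by the 3 reserved slots (padding, start of
# sequence, unknown). Illustrative only; not the original file's code.
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(reverse_word_index.get(i - 3, '?') for i in train_data[0])
print(decoded_review)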
# Load libraries
import numpy as np
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers

# Set the random seed
np.random.seed(0)

# Set the desired number of features
number_of_features = 1000

# Load the data and target vector from the movie-review data
(data_train, target_train), (data_test, target_test) = imdb.load_data(num_words=number_of_features)

# Convert the review data into one-hot encoded feature matrices
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train, mode='binary')
features_test = tokenizer.sequences_to_matrix(data_test, mode='binary')

# Create the neural network object
network = models.Sequential()

# Add a fully connected layer with a ReLU activation function
network.add(layers.Dense(units=16, activation="relu", input_shape=(number_of_features,)))

# Add a fully connected layer with a ReLU activation function
network.add(layers.Dense(units=16, activation="relu"))

# Add a fully connected layer with a sigmoid activation function
network.add(layers.Dense(units=1, activation="sigmoid"))
# coding=utf-8
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence

from textcnn_model import TextCNN

vocab_size = 5000
maxlen = 512
batch_size = 32
embedding_dims = 100
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = TextCNN(maxlen, vocab_size, embedding_dims).get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
# You can implement whatever callbacks you need here; in competitions these are usually custom callbacks
from keras.datasets import imdb
from keras import models, layers, regularizers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
print(train_data[0])
print(train_labels[0])


# vectorize train data to 0's and 1's
def vectorizeComment(sequences, dimensions=10000):
    results = np.zeros((len(sequences), dimensions))
    for i, sequence in enumerate(sequences):  # enumerate used to add a counter to the loop
        results[i, sequence] = 1
    return results


x_train = vectorizeComment(train_data)
# converting labels to an array and vectorizing them
y_train = np.asarray(train_labels).astype('float32')
# x_test = vectorizeComment(test_data)

model = models.Sequential()
# adding 3 layers
model.add(layers.Dense(16, kernel_regularizer=regularizers.l2(0.001), activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
img = image.load_img(pic, target_size=(224, 224)) x = image.img_to_array(img) x = np.expand_dims(x, axis=0) x = preprocess_input(x) preds = resnet50.predict(x) predict = decode_predictions(preds, top=2) print("Neural Net predicts this is a: ", predict[0][0]) print("Neural Net predicts this is a: ", predict[0][1]) ######## ######## #imdb data of 50k reviews. Train and test set are 25k each. Each set around 50% postiive and 50% negative reviews from keras.datasets import imdb (train_data, train_labels), (test_data, test_labels) = imdb.load_data( num_words=10000 ) #num_words is 10k most frequent words. All others will be discarded max([max(sequence) for sequence in train_data]) word_index = imdb.get_word_index() ###get the word to the index. #Subtract 3 because first 3 are reserved for 'padding','start of sequence', and 'unknown' reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) decoded_reivew = ' '.join( [reverse_word_index.get(i - 3, '?') for i in train_data[0]]) #PREP THE DATA
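# A hedged sketch of the "PREP THE DATA" step flagged above: multi-hot encode each review into
# a fixed-length 0/1 vector so it can feed a Dense network. The helper name and details are
# assumptions in the spirit of the surrounding example, not the author's exact code.
import numpy as np


def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        results[i, seq] = 1.  # mark every word index that appears in the review
    return results


x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')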
[ 2.71, 0.95, -4.85], [-2.82, -1.4 , 2.69]], dtype=float32), # biases for 3 output nodes array([ 0.21, -0.39, -1.22], dtype=float32) ] # test the model and your weights # model.fit(bin7, count3, epochs=1) # model.set_weights(myWeights) # predict3 = model.predict(bin7) # np.set_printoptions(suppress=True) # np.set_printoptions(precision=1) # print('prediction =', predict3) from keras.datasets import imdb (x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz", num_words=None, skip_top=0, maxlen=None, seed=113, start_char=1, oov_char=2, index_from=3) Examples = { 'count3' : [ bin7, count3, model, myWeights ], 'imdb' : [x_train, y_train, model, myWeights], }
import pyswarms as ps
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn import preprocessing, datasets, linear_model
from sklearn.model_selection import train_test_split

#####
##### Loading the dataset and bifurcating it into training and testing data for the IMDB dataset
#####
# maximum vocabulary
nb_words = 50000
# cut texts after this number of words
maxlen = 100

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=nb_words)
x = x_train
y = y_train

#####
##### Padding input sequences
#####
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

#####
##### Shaping the training and data input
#####
from keras.layers import Dense, Dropout, Activation from keras.preprocessing.text import Tokenizer import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') np.random.seed(42) # ## 1. Loading the data # This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment. # In[21]: # Loading the data (it's preloaded in Keras) (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000) print(x_train.shape) print(x_test.shape) # ## 2. Examining the data # Notice that the data has been already pre-processed, where all the words have numbers, and the reviews come in as a vector with the words that the review contains. For example, if the word 'the' is the first one in our dictionary, and a review contains the word 'the', then there is a 1 in the corresponding vector. # # The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative. # In[22]: print(x_train[0]) print(y_train[0])
# word embeddings differ from task to task; some work better for sentiment analysis, some for QA
# embeddings must carry meaning: similar words or even synonyms should be closer to each other
# they should have some sort of geometric representation, distances, vectors, for example:
# dog + wild_vector = wolf, cat + wild_vector = tiger; dog and cat are similar (small distance), both pets
# TODO: 186 + 187
embedding_layer = Embedding(1000, 64)  # (number of possible tokens, dimensionality)
# takes indices of words from a dict ({1: "hello", 2: "good", ...}) -> and transforms them to dense vectors

max_features = 10000  # 10k most common words
maxlen = 20  # take the first 20 words

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)  # samples, words, list of ints
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)  # 2D integer tensor
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# embedding sentences in sequences of vectors, flattening them, and training a Dense layer
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))  # 8-dimensional embedding
model.add(Flatten())  # 3D -> 2D tensor
# this model treats words as single units, not as groups of words
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
from keras.datasets import imdb def p(mess, obj): """Useful function for tracing""" if hasattr(obj, 'shape'): print(mess, type(obj), obj.shape, "\n", obj) else: print(mess, type(obj), "\n", obj) lexicon_size = 20000 (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=lexicon_size) word_index = imdb.get_word_index() reverse_word_index = dict([(value, key) for (key, value) in word_index.items()]) reviews = {} titles = {} for i, encoded_review in enumerate(train_data): decoded_review = ' '.join( [reverse_word_index.get(i - 3, '?') for i in encoded_review]) reviews[i] = decoded_review titles[i] = decoded_review[:20] for i, title in enumerate(reviews): print(reviews[title])
def test_text_classification(self): # This example demonstrates the use of Convolution1D for text classification. # This example is from Keras K.set_image_dim_ordering("th") import numpy as np np.random.seed(1337) # for reproducibility from keras.preprocessing import sequence from keras.models import Sequential from keras.layers import Dense, Dropout, Activation from keras.layers import Embedding from keras.layers import Convolution1D from keras.datasets import imdb # set parameters: max_features = 5000 maxlen = 400 batch_size = 32 embedding_dims = 50 nb_filter = 250 filter_length = 3 hidden_dims = 250 nb_epoch = 1 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print('Pad sequences (samples x time)') X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) print('Build model...') model = Sequential() model.add(Embedding( max_features, embedding_dims, input_length=maxlen)) # Exception if specify Dropout dropout=0.2 # we add a Convolution1D, which will learn nb_filter # word group filters of size filter_length: model.add( Convolution1D(nb_filter=nb_filter, filter_length=filter_length, border_mode='valid', activation='relu', subsample_length=1)) # we use max pooling: model.add(GlobalMaxPooling1D()) # model.add(GlobalAveragePooling1D()) # We add a vanilla hidden layer: model.add(Dense(hidden_dims)) model.add(Dropout(0.2)) model.add(Activation('relu')) # We project onto a single unit output layer, and squash it with a sigmoid: model.add(Dense(1)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) model = with_bigdl_backend(model) model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(X_test, y_test), is_distributed=True) # 2017-09-22 15:53:45 INFO DistriOptimizer$:657 # - Top1Accuracy is Accuracy(correct: 21557, count: 25000, accuracy: 0.86228) # this result is from GlobalAveragePooling not GlobalMaxPooling. model.predict(X_test, is_distributed=True) model.evaluate(X_test, y_test, is_distributed=True) print(model)
np.random.seed(1337) # for reproducibility from keras.preprocessing import sequence from keras.utils import np_utils from keras.models import Sequential from keras.layers.core import Dense, Dropout, Activation from keras.layers.embeddings import Embedding from keras.layers.recurrent import LSTM from keras.datasets import imdb max_features = 20000 maxlen = 100 # cut texts after this number of words (among top max_features most common words) batch_size = 32 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print('Pad sequences (samples x time)') X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) print('Build model...') model = Sequential() model.add(Embedding(max_features, 128, input_length=maxlen, dropout=0.5)) model.add(LSTM(128, dropout_W=0.5, dropout_U=0.1)) # try using a GRU instead, for fun model.add(Dropout(0.5)) model.add(Dense(1))
import keras from keras.models import Sequential from keras.layers import Embedding, LSTM, Dense from keras.datasets import imdb from keras.preprocessing import sequence VOCAB_SIZE = 5000 # Number of words to load INDEX_FROM = 3 # Start at 3 to account for padding/unknown, & start of sentence MAX_SEQ_LEN = 128 # Max input length EMBEDDING_DIM = 64 # Load and assign dataset (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE, index_from=INDEX_FROM) word_to_idx = imdb.get_word_index() idx_to_word = {v + INDEX_FROM: k for k, v in word_to_idx.items()} idx_to_word[0] = '<PAD>' idx_to_word[1] = '<START>' idx_to_word[2] = '<UNK>' # Fit sequences to max length X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQ_LEN) X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQ_LEN) print(' '.join([idx_to_word[idx] for idx in X_train[0]])) # Specify the model model = Sequential() model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQ_LEN))
import numpy from keras.datasets import imdb from keras.models import Sequential from keras.layers import Dense from keras.layers import Flatten from keras.layers.convolutional import Convolution1D from keras.layers.convolutional import MaxPooling1D from keras.layers.embeddings import Embedding from keras.preprocessing import sequence # fix random seed for reproducibility seed = 7 numpy.random.seed(seed) # load the dataset but only keep the top n words, zero the rest top_words = 5000 test_split = 0.33 (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, test_split=test_split) # pad dataset to a maximum review length in words max_words = 500 X_train = sequence.pad_sequences(X_train, maxlen=max_words) X_test = sequence.pad_sequences(X_test, maxlen=max_words) # create the model model = Sequential() model.add(Embedding(top_words, 32, input_length=max_words)) model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu')) model.add(MaxPooling1D(pool_length=2)) model.add(Flatten()) model.add(Dense(250, activation='relu')) model.add(Dense(1, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print(model.summary()) # Fit the model
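# A hedged continuation of the snippet above (it stops at "Fit the model"): the epoch count and
# batch size here are assumptions, not values given in the original source.
model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=2, batch_size=128)
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))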
from keras.datasets import imdb

vocabulary_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

print('---review---')
# a review is stored as a sequence of integers: word IDs that have been pre-assigned to individual words
print(X_train[6])
print('---label---')
# the label is an integer (0 for negative, 1 for positive)
print(y_train[6])

word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

print('Maximum review length: {}'.format(len(max((X_train + X_test), key=len))))
print('Minimum review length: {}'.format(len(min((X_train + X_test), key=len))))

from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size = 32
model = Sequential()
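# A hedged sketch of how the model begun above might be completed with the layers already
# imported; layer sizes and training settings are assumptions for illustration.
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)
print('Test accuracy: %.3f' % model.evaluate(X_test, y_test, verbose=0)[1])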
import numpy as np import matplotlib.pyplot as plt from keras.datasets import imdb from keras import models, layers def vectorized(alist, dim=10000): a = np.zeros((len(alist), dim)) for k, v in enumerate(alist): a[k, v] = 1 return a (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000) x_train = vectorized(train_data) x_test = vectorized(test_data) y_train = np.asarray(train_labels).astype('float32') y_test = np.asarray(test_labels).astype('float32') x_val = x_train[:10000] partial_x_val = x_train[10000:] y_val = y_train[:10000] partial_y_val = y_train[10000:] model = models.Sequential() model.add(layers.Dense(16, activation='relu', input_shape=(10000,))) model.add(layers.Dense(16, activation='relu')) model.add(layers.Dense(16, activation='relu')) model.add(layers.Dense(1, activation='sigmoid'))
import matplotlib as mpl mpl.use('Agg') import matplotlib.pyplot as plt import matplotlib.backends.backend_pdf import numpy as np import keras if __name__ == '__main__': from keras.datasets import imdb from keras.preprocessing import sequence max_features = 10000 maxlen = 500 batch_size = 32 print('Loading data...') (input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features) print(len(input_train), 'train sequences') print(len(input_test), 'test sequences') print('Pad sequences (samples x time)') input_train = sequence.pad_sequences(input_train, maxlen=maxlen) input_test = sequence.pad_sequences(input_test, maxlen=maxlen) print('input_train shape', input_train.shape) print('input_test shape', input_test.shape) from keras.layers import LSTM, Embedding, Dense from keras.models import Sequential model = Sequential() model.add(Embedding(max_features, 32)) model.add(LSTM(32)) model.add(Dense(1, activation='sigmoid')) model.summary()
from keras.datasets import imdb
import numpy as np


def vectorize(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))  # default dtype is float64
    # i is the counter, sequence the value -> iterate over the lists with both a counter and the value
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1  # set 1 at every index contained in sequence
    return results


# dataset restricted to the 10,000 most frequent words
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
x_train = vectorize(train_data)
x_test = vectorize(test_data)
y_train = train_labels.astype('float32')  # cast to float, since that is what the API operates on
y_test = test_labels.astype('float32')

from keras import models
from keras import layers
from keras import regularizers

model = models.Sequential()
# the first layer must be given the input shape it expects;
# it returns a tensor of shape (*, 16); the activation is an operation on that tensor
model.add(layers.Dense(8, activation='relu', input_shape=(10000,)))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(8, activation='relu'))
from keras.datasets import imdb

imdb.load_data(nb_words=5000)
def get_data():
    # num_words informs the function that we want only the 10000 most common words
    return imdb.load_data(num_words=10000)
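# A brief usage sketch for get_data(); it assumes `from keras.datasets import imdb` is in scope
# for the function, and the pad length of 256 is an assumed value.
from keras.preprocessing import sequence

(x_train, y_train), (x_test, y_test) = get_data()
x_train = sequence.pad_sequences(x_train, maxlen=256)
x_test = sequence.pad_sequences(x_test, maxlen=256)
print(x_train.shape, x_test.shape)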
def main(): ## set things up model_name = "SA_2.h5" SEED = 1337 # for reproducibility np.random.seed(SEED) max_features = 7000 maxlen = 70 # cut texts after this number of words (among top max_features most common words) batch_size = 128 dim = 32 drop_rate = .2 max_epochs = 2 ## prepare data print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') ## pad sequences print('Pad sequences (samples x time)') X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) ## build sentiment analysis model print('Build model...') model = Sequential() model.add( Embedding(max_features, dim, input_length=maxlen, dropout=drop_rate)) ## deep cnn approach model.add( Convolution1D(nb_filter=dim, filter_length=3, border_mode='same', activation='relu')) model.add(MaxPooling1D(pool_length=2)) model.add(Flatten()) model.add(Dense(250, activation='relu')) model.add(Dense(1, activation='sigmoid')) ## LSTM approach # model.add(LSTM(output_dim=dim, dropout_W=drop_rate, dropout_U=drop_rate)) # model.add(Dense(1)) # model.add(Activation('sigmoid')) ## get ready! model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) ## train it print('Train...') model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=max_epochs, validation_data=(X_test, y_test)) ## validate score, acc = model.evaluate(X_test, y_test, batch_size=batch_size) print('Test score:', score) print('Test accuracy:', acc) ## save it model.save(model_name)
from keras.datasets.imdb import load_data
from pickle import dump

if __name__ == '__main__':
    data = load_data(nb_words=20000, test_split=0.3)
    dump(data, open('imdb', 'wb'))
def load_and_preprocess_data():
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
    x_train = pad_sequences(x_train, maxlen=input_length)
    x_test = pad_sequences(x_test, maxlen=input_length)
    return (x_train, y_train), (x_test, y_test)
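# load_and_preprocess_data() above relies on module-level vocab_size and input_length (and the
# usual imdb / pad_sequences imports); a minimal usage sketch with assumed values:
vocab_size = 10000
input_length = 200

(x_train, y_train), (x_test, y_test) = load_and_preprocess_data()
print(x_train.shape, x_test.shape)  # both padded to (num_samples, input_length)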
#### 3.1.1 - 3.3.4
# Notes to be transcribed from handwritten later

#%% [markdown]
#### 3.4.1 The IMDB Dataset
# Working with IMDB data to classify reviews as positive or negative

#%%
# Downloading/loading the built-in imdb data
from keras.datasets import imdb

# Setting up train and test data
(trainData, trainLabels), (testData, testLabels) = imdb.load_data(
    num_words=10000  # Only keep top 10K words
)

#%%
# Decoding one of the reviews back to English, just to see how it's done
wordIndex = imdb.get_word_index()
reverseWordIndex = dict([(value, key) for (key, value) in wordIndex.items()])
decodedReview = ' '.join(
    [reverseWordIndex.get(i - 3, '?') for i in trainData[0]])
print(decodedReview)

#%% [markdown]
#### 3.4.2 Preparing the data
# "You can't feed a list of integers into a neural network. You have to turn them
# into tensors. There are two ways to do that:<br/><br/>
# 1. Pad your lists so they all have the same length, turn them into an integer
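#%%
# A hedged sketch of the second preparation approach the notes describe (one-hot/multi-hot
# encoding of the word indices); the helper name and dimension are assumptions, not the
# original notes' exact code.
import numpy as np


def vectorizeSequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, seq in enumerate(sequences):
        results[i, seq] = 1.0  # mark every word index that appears in the review
    return results


xTrain = vectorizeSequences(trainData)
xTest = vectorizeSequences(testData)
yTrain = np.asarray(trainLabels).astype('float32')
yTest = np.asarray(testLabels).astype('float32')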
__author__ = "Luke Liu" #encoding="utf-8" from keras.models import Sequential from keras.layers import Embedding, SimpleRNN from keras.datasets import imdb from keras.preprocessing import sequence from keras.preprocessing.text import Tokenizer maxlen = 500 bacth_size = 32 print("Loading the data..................................") (input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=2000) print(len(input_train), 'train sequences') print(len(input_test), 'test sequences') input_train = sequence.pad_sequences(input_train, maxlen=maxlen) input_test = sequence.pad_sequences(input_test, maxlen=maxlen) print('input_train shape:', input_train.shape) print('input_test shape:', input_test.shape) # build the model from keras.layers import Dense model = Sequential() model.add(Embedding(2000, 32)) model.add(SimpleRNN(32)) model.add(Dense(1, activation='sigmoid')) model.summary() model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc']) history = model.fit(input_train, y_train,
# LSTM for sequence classification in the IMDB dataset import numpy from keras.datasets import imdb from keras.models import Sequential from keras.layers import Dense from keras.layers import LSTM, Convolution1D, Flatten, Dropout from keras.layers.embeddings import Embedding from keras.preprocessing import sequence from keras.callbacks import TensorBoard import numpy as np # Using keras to load the dataset with the top_words top_words = 10000 (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words) path = '/home/santanu/Downloads/Mobile_App/' X_train = np.load(path + "aclImdb/X_train.npy") y_train = np.load(path + "aclImdb/y_train.npy") y_train = np.reshape(y_train,(-1,1)) X_test = np.load(path + "aclImdb/X_val.npy") y_test = np.load(path + "aclImdb/y_val.npy") y_test = np.reshape(y_test,(-1,1)) print(X_train[0]) # Pad the sequence to the same length max_review_length = 500 X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) # Using embedding from Keras
import numpy as np from keras.utils import to_categorical from keras import models, layers from keras.datasets import imdb def vectorize(sequences, dimension=10000): results = np.zeros((len(sequences), dimension)) for i, sequence in enumerate(sequences): results[i, sequence] = 1 return results (training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000) data = np.concatenate((training_data, testing_data), axis=0) data = vectorize(data) targets = np.concatenate((training_targets, testing_targets), axis=0) targets = np.array(targets).astype("float32") test_x = data[:10000] test_y = targets[:10000] train_x = data[10000:] train_y = targets[10000:] model = models.Sequential() # Input - Layer
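# A hedged sketch of how the network begun above might continue, in the style of a standard
# IMDB dense classifier; layer sizes and training settings are assumptions.
model.add(layers.Dense(50, activation="relu", input_shape=(10000,)))
# Hidden - Layer
model.add(layers.Dense(50, activation="relu"))
# Output - Layer
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(train_x, train_y, epochs=2, batch_size=500, validation_data=(test_x, test_y))
print("Test accuracy: %.3f" % model.evaluate(test_x, test_y, verbose=0)[1])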
new_list.append(token_indice[ngram]) new_sequences.append(new_list) return new_sequences # Set parameters: # ngram_range = 2 will add bi-grams features ngram_range = 1 max_features = 20000 maxlen = 400 batch_size = 32 embedding_dims = 50 nb_epoch = 5 print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int))) print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int))) if ngram_range > 1: print('Adding {}-gram features'.format(ngram_range)) # Create set of unique n-gram from the training set. ngram_set = set() for input_list in X_train: for i in range(2, ngram_range+1): set_of_ngram = create_ngram_set(input_list, ngram_value=i) ngram_set.update(set_of_ngram) # Dictionary mapping n-gram token to a unique integer.
from keras.datasets import imdb from keras import models from keras import layers import numpy as np import matplotlib.pyplot as plt # Take only the top 10000 most frequently occurring words. (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000) def explore(): print(train_data.shape) print(train_labels.shape) print(test_data.shape) print(test_labels.shape) print(train_data[0]) print(train_labels[0]) print(max(max(sequence) for sequence in train_data)) # word_index is a dictionary mapping: word -> int indices word_index = imdb.get_word_index() # reversing it, mapping becomes int indices -> word reversed_word_index = dict([(value, key) for (key, value) in word_index.items()]) decoded_review = ' '.join( reversed_word_index.get(i - 3, '?') for i in train_data[0]) print(decoded_review)
''' Train a Bidirectional LSTM on the IMDB sentiment classification task. GPU command: THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_bidirectional_lstm.py Output after 4 epochs on CPU: ~0.8146 Time per epoch on CPU (Core i7): ~150s. ''' max_features = 20000 maxlen = 100 # cut texts after this number of words (among top max_features most common words) batch_size = 32 print("Loading data...") (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2) print(len(X_train), 'train sequences') print(len(X_test), 'test sequences') print("Pad sequences (samples x time)") X_train = sequence.pad_sequences(X_train, maxlen=maxlen) X_test = sequence.pad_sequences(X_test, maxlen=maxlen) print('X_train shape:', X_train.shape) print('X_test shape:', X_test.shape) y_train = np.array(y_train) y_test = np.array(y_test) print('Build model...') model = Graph() model.add_input(name='input', input_shape=(1,), dtype=int) model.add_node(Embedding(max_features, 128, input_length=maxlen),
import numpy as np from keras.preprocessing import sequence from keras.models import Sequential from keras.layers import Embedding, Dense, LSTM, Dropout from keras.datasets import imdb max_features = 20000 max_length = 80 embedding_dim = 256 batch_size = 128 epochs = 10 modes = [0, 1, 2] print('Loading data...') (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) X_train = sequence.pad_sequences(X_train, max_length) X_test = sequence.pad_sequences(X_test, max_length) # Compile and train different models while meauring performance. results = [] for mode in modes: print('Testing mode: implementation={}'.format(mode)) model = Sequential() model.add(Embedding(max_features, embedding_dim, input_length=max_length)) model.add(Dropout(0.2)) model.add(LSTM(embedding_dim, dropout=0.2, recurrent_dropout=0.2,