Example No. 1
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (X_train, y_train), (X_test, y_test) = imdb.load_data()
        (X_train, y_train), (X_test, y_test) = imdb.load_data(maxlen=40)
Example No. 2
def test_imdb_load_does_not_affect_global_rng(fake_downloaded_imdb_path):
    np.random.seed(1337)
    before = np.random.randint(0, 100, size=10)

    np.random.seed(1337)
    imdb.load_data(path=fake_downloaded_imdb_path, seed=9876)
    after = np.random.randint(0, 100, size=10)

    assert np.array_equal(before, after)
Example No. 3
def test_imdb():
    # only run data download tests 20% of the time
    # to speed up frequent testing
    random.seed(time.time())
    if random.random() > 0.8:
        (x_train, y_train), (x_test, y_test) = imdb.load_data()
        (x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=40)
        assert len(x_train) == len(y_train)
        assert len(x_test) == len(y_test)
        word_index = imdb.get_word_index()
        assert isinstance(word_index, dict)
Example No. 4
    def __init__(self, feature='tfidf', **kwargs):
        super(IMDB, self).__init__(**kwargs)
        if self.conf is not None:
            feature = self.conf.get('feature', 'tfidf')
        if feature.startswith('tfidf'):
            max_features = 5000
            (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
        else:
            (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=None, 
                    skip_top=0, maxlen=None, seed=113, start_char=1, oov_char=2, index_from=3)
        X, y = self.get_data_by_imageset(X_train, y_train, X_test, y_test)
        print('data_set={}, Average sequence length: {}'.format(self.data_set, np.mean(list(map(len, X)))))

        #feature
        if feature == 'origin':
            maxlen = 400
            X = sequence.pad_sequences(X, maxlen=maxlen)
        elif feature == 'tfidf':
            from sklearn.feature_extraction.text import TfidfTransformer
            transformer = TfidfTransformer(smooth_idf=False)
            #transformer = TfidfTransformer(smooth_idf=True)
            X_train_bin = np.zeros((len(X_train), max_features), dtype=np.int16)
            X_bin = np.zeros((len(X), max_features), dtype=np.int16)
            for i, X_i in enumerate(X_train):
                X_train_bin[i, :] = np.bincount(X_i, minlength=max_features)
            for i, X_i in enumerate(X):
                X_bin[i, :] = np.bincount(X_i, minlength=max_features)
            transformer.fit(X_train_bin)
            X = transformer.transform(X_bin)
            X = np.asarray(X.todense())
        elif feature == 'tfidf_seq':
            from sklearn.feature_extraction.text import TfidfTransformer
            transformer = TfidfTransformer(smooth_idf=False)
            maxlen = 400
            N = len(X)
            X_bin = np.zeros((N, max_features), dtype=np.int16)
            for i, X_i in enumerate(X):
                X_bin_i = np.bincount(X_i)
                X_bin[i, :len(X_bin_i)] = X_bin_i
            tfidf = transformer.fit_transform(X_bin)
            tfidf = np.asarray(tfidf.todense())
            X_id = sequence.pad_sequences(X, maxlen=maxlen)
            X = np.zeros(X_id.shape, dtype=np.float32)
            for i in range(N):
                X[i, :] = tfidf[i][X_id[i]]
        else:
            raise ValueError('Unknown feature: {}'.format(feature))

        X = X[:,np.newaxis,:,np.newaxis]
        self.X = self.init_layout_X(X)
        self.y = self.init_layout_y(y)
Example No. 5
def test_dan_original():
    max_features = 20000
    maxlen = 100  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print("Loading data...")
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    print(len(X_train), "train sequences")
    print(len(X_test), "test sequences")

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    model = dan_original(max_features)

    # try using different optimizers and different optimizer configs
    model.compile(loss="binary_crossentropy", optimizer="adagrad", class_mode="binary")

    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3, validation_data=(X_test, y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print("Test score:", score)
    print("Test accuracy:", acc)
Example No. 6
def train():
    # load the dataset but only keep the top n words, zero the rest
    (X_train, Y_train), (X_test, Y_test) = imdb.load_data(nb_words=top_words)
    # truncate and pad input sequences
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

    # create the model
    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
    model.add(Dropout(0.2))
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=2, batch_size=64)

    # Final evaluation of the model
    scores = model.evaluate(X_test, Y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))
    model.save("imdb_%0.2f.pkl" % scores[1])
def main():
    top_words = 5000  # Keep only the 5,000 most frequent words in the dataset.
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

    # Keras requires same length (although 0 will mean no information).
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

    embedding_length = 32
    input_seq = Input(shape=(500,))
    a = Embedding(top_words, embedding_length,
                  input_length=max_review_length)(input_seq)
    b, state_h, state_c = LSTM(100, return_state=True,
                               return_sequences=True)(a)
    c = AttentionLayerV2(attention_depth=4)(b)
    d = Dropout(0.5)(c)
    e = Dense(1, activation='sigmoid')(d)
    model = Model(inputs=[input_seq], outputs=[e])
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    model.summary()
    # print(model.predict(np.ones((10, 500))))
    model.fit(X_train, y_train, epochs=5, batch_size=64)

    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1]*100))

    model.save_weights('model_weights.h5')
def run_keras_cnn_example():
	# set parameters:
	max_features = 5000
	maxlen = 100
	batch_size = 32
	embedding_dims = 100
	nb_filter = 250
	filter_length = 3
	hidden_dims = 250
	nb_epoch = 2

	print('Loading data...')
	(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
														  test_split=0.2)
	print(len(X_train), 'train sequences')
	print(len(X_test), 'test sequences')

	print('Pad sequences (samples x time)')
	X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
	X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
	print('X_train shape:', X_train.shape)
	print('X_test shape:', X_test.shape)

	print('Build model...')
	model = Sequential()

	# we start off with an efficient embedding layer which maps
	# our vocab indices into embedding_dims dimensions
	model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
	model.add(Dropout(0.25))

	# we add a Convolution1D, which will learn nb_filter
	# word group filters of size filter_length:
	model.add(Convolution1D(nb_filter=nb_filter,
							filter_length=filter_length,
							border_mode='valid',
							activation='tanh',
							subsample_length=1))
	# we use standard max pooling (halving the output of the previous layer):
	model.add(MaxPooling1D(pool_length=2))

	# We flatten the output of the conv layer,
	# so that we can add a vanilla dense layer:
	model.add(Flatten())

	# We add a vanilla hidden layer:
	model.add(Dense(hidden_dims))
	model.add(Dropout(0.25))
	model.add(Activation('tanh'))

	# We project onto a single unit output layer, and squash it with a sigmoid:
	model.add(Dense(1))
	model.add(Activation('sigmoid'))

	model.compile(loss='binary_crossentropy',
				  optimizer='rmsprop',
				  class_mode='binary')
	model.fit(X_train, y_train, batch_size=batch_size,
			  nb_epoch=nb_epoch, show_accuracy=True,
			  validation_data=(X_test, y_test))
Example No. 9
def imdb_test():
    # set parameters:
    max_features = 5000  # number of vocabulary
    maxlen = 200  # padding
    batch_size = 16
    nb_epoch = 10

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
                                                          test_split=0.2)

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    nb_classes = 2
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    model = imdb_cnn()
    plot(model, to_file='./images/imdb_model.png')

    # try using different optimizers and different optimizer configs
    # model.compile(loss='binary_crossentropy', optimizer='adagrad', class_mode="binary")
    model.compile(loss='categorical_crossentropy', optimizer='adagrad')

    print("Train...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch, validation_data=(X_test, y_test),
              show_accuracy=True, callbacks=[early_stopping])
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
Example No. 10
def imdb_lstm():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(type(X_train))
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,
                        validation_data=(X_test, y_test))
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)

        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")

        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)

        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]

    return x_train, y_train, x_test, y_test, vocabulary_inv
Example No. 12
def data():
    np.random.seed(1337)  # for reproducibility
    max_features = 20000
    maxlen = 100

    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    return X_train, X_test, y_train, y_test, maxlen, max_features
Example No. 13
def load_imdb():
    """
    Load IMDB dataset
    Transform input data into an RDD of Sample
    """
    from keras.preprocessing import sequence
    from keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=20000)
    X_train = sequence.pad_sequences(X_train, maxlen=100)
    X_test = sequence.pad_sequences(X_test, maxlen=100)
    return X_train, y_train, X_test, y_test
Example No. 14
def data():
    import numpy as np
    from keras.preprocessing import sequence
    from keras.datasets import imdb

    np.random.seed(1337)  # for reproducibility

    max_features = 20000
    maxlen = 100

    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    return X_train, X_test, y_train, y_test, maxlen, max_features
Example No. 15
def data():
    maxlen = 100
    max_features = 20000

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, X_test, y_train, y_test, max_features, maxlen
Example No. 16
def loadData():
    (train_data,train_labels),(test_data,test_labels) = imdb.load_data(num_words=100)
    p_train_data = train_data[0:10000]
    p_train_labels = train_labels[0:10000]
    val_data = train_data[20000:21000]
    val_labels = train_labels[20000:21000]
    x_train = p_train_data
    y_train = np.asarray(p_train_labels)
    x_train_arr = x_train.tolist()
    y_train_arr = y_train.tolist()

    x_val = val_data
    y_val = np.asarray(val_labels)
    x_val_arr = x_val.tolist()
    y_val_arr = y_val.tolist()
    return (x_train_arr, y_train_arr), (x_val_arr, y_val_arr)
    def train_1layer_glove_wordembedding(hidden_dim,modelfile):
        train = {}
        test = {}
        dev = {}

        #embedded_train, train_labels = WordEmbeddingLayer.load_embedded_data(path="../data/",name="train",representation="glove.840B.300d")
        #embedded_dev, dev_labels = WordEmbeddingLayer.load_embedded_data(path="../data/",name="dev",representation="glove.840B.300d")
        #embedded_test, test_labels = WordEmbeddingLayer.load_embedded_data(path="../data/",name="test",representation="glove.840B.300d")


        #embedded_train, train_labels, word_to_index, index_to_word, labels_count = DataPrep.load_one_hot_sentiment_data("../data/sentiment/trainsentence_and_label_binary.txt")
        #embedded_dev, dev_labels= DataPrep.load_one_hot_sentiment_data_traind_vocabulary("../data/sentiment/devsentence_and_label_binary.txt",word_to_index,index_to_word,labels_count)
        #self.test["sentences"], self.test["sentiments"]= DataPrep.load_one_hot_sentiment_data_traind_vocabulary("../../data/sentiment/testsentence_and_label_binary.txt",self.word_to_index, self.index_to_word,self.labels_count)


        (X_train, train_labels), (X_test,dev_labels) = imdb.load_data(nb_words=20000,test_split=0.2)
        embedded_train = []
        embedded_dev = []

        train_labels = [np.eye(2)[l] for l in train_labels]
        dev_labels = [np.eye(2)[l] for l in dev_labels]

        one_hot_vocab = np.eye(20000,dtype=np.float32)
        for sent in X_train:
            sentence = [one_hot_vocab[term]  for term in sent]
            embedded_train.append(sentence)

        for sent in X_test:
            sentence = [one_hot_vocab[term]  for term in sent]
            embedded_dev.append(sentence)

        flstm = FullyConnectedLSTM(input_dim=len(embedded_train[0][0]),output_dim=2,number_of_layers=1, hidden_dims=[hidden_dim],dropout_p=0.1,learning_rate=0.01)
        flstm.build_model()

        #train_labels[train_labels == 0] = -1
        #dev_labels[dev_labels == 0] = -1
        flstm.train(embedded_train,train_labels,embedded_dev,dev_labels)
        flstm.save_model(modelfile)
Example No. 18
def test_imdb_lstm():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train...')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=15,
              validation_data=(x_test, y_test))
    score, acc = model.evaluate(x_test, y_test,
                                batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
Example No. 19
    model = Model([comment_seq], output)

    adam = optimizers.Adam(lr=0.001,
                           beta_1=0.9,
                           beta_2=0.999,
                           epsilon=1e-08,
                           decay=0.0)
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=['accuracy'])
    return model


if __name__ == '__main__':
    print('————————————————load data————————————————')
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    X_all = (list(X_train) + list(X_test))[0:1000]
    y_all = (list(y_train) + list(y_test))[0:1000]
    print(len(X_all), len(y_all))

    imdb_word2idx = imdb.get_word_index()
    imdb_idx2word = dict((idx, word) for (word, idx) in imdb_word2idx.items())
    X_all = [[imdb_idx2word.get(idx - 3, '?') for idx in sen][1:]
             for sen in X_all]

    w2vModel = train_W2V(X_all, in_path + 'w2vModel')
    word2idx, embedMatrix = build_word2idx_embedMatrix(
        w2vModel)  # build word2idx and embedMatrix

    X_all_idx = make_X_train_idx(X_all, word2idx, MAX_SEQ_LEN)
    y_all_idx = np.array(y_all)  # note: X_all and y_all must be np.array, otherwise an error is raised
Example No. 20
@author: Tim George Kabue
@phone_number: +254706762054
@email: [email protected]
"""

from keras.datasets import imdb
from keras import preprocessing, models, layers
import numpy as np
import matplotlib.pyplot as plt

#Load dataset.
no_of_tokens = 10000
(train_samples,
 train_labels), (test_samples,
                 test_labels) = imdb.load_data(num_words=no_of_tokens)

#Preprocess the data.
max_len = 500
train_samples = preprocessing.sequence.pad_sequences(train_samples,
                                                     maxlen=max_len)
test_samples = preprocessing.sequence.pad_sequences(test_samples,
                                                    maxlen=max_len)

#Network architecture.
model = models.Sequential()

vector_size = 32
model.add(layers.Embedding(no_of_tokens, vector_size))

model.add(layers.SimpleRNN(32))
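# The listing is cut off here. A minimal, hedged sketch of how such a SimpleRNN
# classifier is typically finished (output layer, compile, fit), mirroring the
# SimpleRNN setup in Example No. 54 below; the optimizer, epochs, and batch size
# are assumptions, not part of the original example.
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(train_samples, train_labels,
                    epochs=10, batch_size=128, validation_split=0.2)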
def load_data_set(type,max_len,vocab_size,batch_size):
    """
        Loads the dataset. Keras Imdb dataset for binary classifcation. Keras reuters dataset for multiclass classification
 
        Args:
            type   : {bool} 0 for binary classification returns imdb dataset. 1 for multiclass classfication return reuters set
            max_len: {int} timesteps used for padding
			vocab_size: {int} size of the vocabulary
			batch_size: batch_size
        Returns:
            train_loader: {torch.Dataloader} train dataloader
            x_test_pad  : padded tokenized test_data for cross validating
			y_test      : y_test
            word_to_id  : {dict} words mapped to indices
 
      
        """
   
    INDEX_FROM = 3  # word index offset
    if not bool(type):
        NUM_WORDS = vocab_size  # only use the top vocab_size words
 
        train_set,test_set = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = imdb.get_word_index()
        word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
 
        id_to_word = {value:key for key,value in word_to_id.items()}
        x = np.concatenate([x_train, x_test])
        y = np.concatenate([y_train, y_test])
        n_train = x.shape[0] - 1000
        n_valid = 1000
 
        x_train = x[:n_train]
        y_train = y[:n_train]
        x_test = x[n_train:n_train+n_valid]
        y_test = y[n_train:n_train+n_valid]
 
 
        #embeddings = load_glove_embeddings("../../GloVe/glove.6B.50d.txt",word_to_id,50)
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.DoubleTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,x_test_pad,y_test,word_to_id
       
    else:
        from keras.datasets import reuters
 
        train_set,test_set = reuters.load_data(path="reuters.npz",num_words=vocab_size,skip_top=0,index_from=INDEX_FROM)
        x_train,y_train = train_set[0],train_set[1]
        x_test,y_test = test_set[0],test_set[1]
        word_to_id = reuters.get_word_index(path="reuters_word_index.json")
        word_to_id = {k:(v+3) for k,v in word_to_id.items()}
        word_to_id["<PAD>"] = 0
        word_to_id["<START>"] = 1
        word_to_id["<UNK>"] = 2
        word_to_id['<EOS>'] = 3
        id_to_word = {value:key for key,value in word_to_id.items()}
        x_train_pad = pad_sequences(x_train,maxlen=max_len)
        x_test_pad = pad_sequences(x_test,maxlen=max_len)
 
 
        train_data = data_utils.TensorDataset(torch.from_numpy(x_train_pad).type(torch.LongTensor),torch.from_numpy(y_train).type(torch.LongTensor))
        train_loader = data_utils.DataLoader(train_data,batch_size=batch_size,drop_last=True)
        return train_loader,train_set,test_set,x_test_pad,word_to_id
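# Hedged usage sketch for the loader above (the argument values are illustrative
# assumptions, not part of the original snippet): binary IMDB branch, 200 timesteps,
# a 5,000-word vocabulary and batches of 32.
train_loader, x_test_pad, y_test, word_to_id = load_data_set(0, max_len=200,
                                                             vocab_size=5000,
                                                             batch_size=32)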
output_dir = 'model_output/imdb_deep_net'

epochs = 4
batch_size = 128

n_dim = 64
n_unique_words = 5000
n_words_to_skip = 50
max_review_length = 100
pad_type = trunc_type = 'pre'

n_dense = 64
dropout = 0.5

(X_train, y_train), (X_valid,
                     y_valid) = imdb.load_data(num_words=n_unique_words,
                                               skip_top=n_words_to_skip)

word_index = keras.datasets.imdb.get_word_index()

# v+3 so we push the words 3 positions.
word_index = {k: (v + 3) for k, v in word_index.items()}

# Now we fill in some keywords for the first 3 indexes as seen below.
word_index['PAD'] = 0
word_index['START'] = 1
word_index['UNK'] = 2

index_word = {v: k for k, v in word_index.items()}

review = ' '.join(index_word[id] for id in X_train[0])
print(review)
Example No. 23
from keras.datasets import imdb
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from matplotlib import pyplot as plt

max_features = 10000
maxlen = 500
batch_size = 32

print('Loading data...')
(input_train, y_train), (input_test,
                         y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input train shape:', input_train.shape)
print('input test shape:', input_test.shape)

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(input_train,
                    y_train,
                    epochs=10,
Example No. 24
import numpy as np
from keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from keras import models
from keras import layers
import matplotlib.pyplot as plt

np.random.seed(0)

number_of_features = 10000

(data_train,
 target_train), (data_test,
                 target_test) = imdb.load_data(num_words=number_of_features)

tokenizer = Tokenizer(num_words=number_of_features)

features_train = tokenizer.sequences_to_matrix(data_train, mode="binary")
features_test = tokenizer.sequences_to_matrix(data_test, mode="binary")

network = models.Sequential()

network.add(
    layers.Dense(units=16,
                 activation="relu",
                 input_shape=(number_of_features, )))

network.add(layers.Dense(units=16, activation="relu"))

network.add(layers.Dense(units=1, activation="sigmoid"))
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.layers import *
from keras.optimizers import Adam
from keras import Model
from keras import backend as K

from PositionEmbedding import SinusoidalPositionEmbedding
from MultiHeadAttention import MultiHeadAttention
from LayerNormalization import LayerNormalization

max_words = 20000
maxlen = 100
embed_dim = 64
batch_size = 128

(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=max_words)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = sequence.pad_sequences(x_val, maxlen=maxlen)

text_input = Input(shape=(maxlen, ), dtype='int32')
x = Embedding(max_words, embed_dim)(text_input)
x = SinusoidalPositionEmbedding()(x)


def transformer_encoder(inputs, num_heads=4, dropout_rate=0.1):
    in_dim = K.int_shape(inputs)[-1]
    x = MultiHeadAttention(num_heads, in_dim)([inputs, inputs])
    x = Dropout(dropout_rate)(x)
    x = add([inputs, x])
    x1 = LayerNormalization()(x)
    x = Dense(in_dim * 2, activation='relu')(x1)
Example No. 26
import numpy as np

# Number of samples, number of features, vector dimensionality, number of steps, etc.
train_reviews = 5000
valid_reviews = 100
max_features = 5000
embedding_size = 256
step_size = 5
batch_size = 32
index_from = 2
rnn_units = 128
epochs = 2
word_index_prev = {'<PAD>': 0, '<START>': 1, '<UNK>': 2}

# Load the IMDB data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features,
                                                      index_from=index_from)

# Extract word information from the IMDB data
word_index = {
    word: (index + index_from)
    for word, index in imdb.get_word_index().items()
    if (index + index_from) < max_features
}
word_index.update(word_index_prev)

# Build a dictionary from the word information
index_word = {index: word for word, index in word_index.items()}


# Function to print a sentence
def print_sentence(sentence):
Example No. 27
# Tutorial: https://machinelearningmastery.com/predict-sentiment-movie-reviews-using-deep-learning/

import numpy
# Import IMDB reviews labeled dataset
# Downloads it from https://s3.amazonaws.com/text-datasets/imdb.pkl (33M) on the first run
# Or https://s3.amazonaws.com/text-datasets/imdb.npz
from keras.datasets import imdb
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences
from matplotlib import pyplot

# load the dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data()
X = numpy.concatenate((X_train, X_test), axis=0)
y = numpy.concatenate((y_train, y_test), axis=0)

# summarize size
print("Training data: ")
print(X.shape)
print(y.shape)

# Summarize number of classes
print("Classes: ")
print(numpy.unique(y))

# Summarize number of words
print("Number of words: ")
print(len(numpy.unique(numpy.hstack(X))))

# Summarize review length
print("Review length: ")
Example No. 28
# In[1]:


from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

max_features = 20000
maxlen = 80  
batch_size = 32

# Load the data and convert words to IDs; max_features caps the number of distinct words used.
(trainX, trainY), (testX, testY) = imdb.load_data(num_words=max_features)
print(len(trainX), 'train sequences')
print(len(testX), 'test sequences')

# In natural language, passages vary in length, but the recurrent network's unrolled length is fixed,
# so all passages must first be padded/truncated to a fixed length.
trainX = sequence.pad_sequences(trainX, maxlen=maxlen)
testX = sequence.pad_sequences(testX, maxlen=maxlen)
print('trainX shape:', trainX.shape)
print('testX shape:', testX.shape)


# ### 2. Define the model.

# In[2]:
Example No. 29
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils, to_categorical
 
from keras.datasets import imdb

import os
import time

if not os.path.exists('output'):
    os.mkdir('output')

f = open("output/testme.txt", 'w')
f.write("hellllo")
f.close()


(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
path="imdb.npz",
num_words=10000)

print("train_data ", train_data.shape)
print("train_labels ", train_labels.shape)
print("_"*100)
print("Going to sleep for 20 min")
print("Going to sleep for 20 min")
time.sleep(5)
time.sleep(1200)
print("test_data ", test_data.shape)
print("test_labels ", test_labels.shape)
print("_"*100)

# See an actual review in words
# Reverse from integers to words using the DICTIONARY (given by keras...need to do nothing to create it)
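# Hedged sketch of that decoding step (not part of the original snippet), using
# the conventional offset of 3 for the reserved indices, as in other examples in
# this listing.
word_index = imdb.get_word_index()
reverse_word_index = dict((value, key) for (key, value) in word_index.items())
decoded_review = ' '.join(reverse_word_index.get(i - 3, '?') for i in train_data[0])
print(decoded_review)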
# Load libraries
import numpy as np
from  keras.datasets import  imdb
from  keras.preprocessing.text import  Tokenizer
from  keras import  models
from keras import  layers

# Set the random seed
np.random.seed(0)

# Set the desired number of features
number_of_features =  1000

# Load the data and target vector from the movie review data
(data_train,target_train),(data_test,target_test)=imdb.load_data(num_words=number_of_features)
# Convert the review data into a one-hot encoded feature matrix
tokenizer = Tokenizer(num_words=number_of_features)
features_train = tokenizer.sequences_to_matrix(data_train,mode='binary')
features_test = tokenizer.sequences_to_matrix(data_test,mode='binary')

# Create the neural network object
network = models.Sequential()
# Add a fully connected layer with ReLU activation
network.add(layers.Dense(units=16,activation="relu",input_shape=(number_of_features,)))

# Add a fully connected layer with ReLU activation
network.add(layers.Dense(units=16,activation="relu"))
# Add a fully connected layer with sigmoid activation
network.add(layers.Dense(units=1,activation="sigmoid"))
Example No. 31
# coding=utf-8

from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence

from textcnn_model import TextCNN

vocab_size = 5000
maxlen = 512
batch_size = 32
embedding_dims = 100
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = TextCNN(maxlen, vocab_size, embedding_dims).get_model()
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
# You can implement whatever callbacks you need here; in competitions custom callbacks are the norm.
Example No. 32
from keras.datasets import imdb
from keras import models, layers, regularizers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
num_words=10000)

print(train_data[0])
print(train_labels[0])


#vectorize train data to 0's and 1's
def vectorizeComment(sequences,dimensions=10000):
	results=np.zeros((len(sequences),dimensions))
	for i,sequence in enumerate(sequences): #enumerate used to add counter to array
		results[i,sequence]=1
	return results

x_train=vectorizeComment(train_data)
#converting label as array and vectorize it
y_train=np.asarray(train_labels).astype('float32')

#x_test=vectorizeComment(test_data)
model=models.Sequential()
model.add(layers.Dense(16,kernel_regularizer=regularizers.l2(0.001),activation='relu',input_shape=(10000,))) #adding 3 layers
model.add(layers.Dense(16,kernel_regularizer=regularizers.l2(0.001),activation='relu'))
model.add(layers.Dense(1,activation='sigmoid'))
Example No. 33
img = image.load_img(pic, target_size=(224, 224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)

preds = resnet50.predict(x)
predict = decode_predictions(preds, top=2)
print("Neural Net predicts this is a: ", predict[0][0])
print("Neural Net predicts this is a: ", predict[0][1])

########
########
# IMDB data of 50k reviews.  Train and test sets are 25k each.  Each set is around 50% positive and 50% negative reviews
from keras.datasets import imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=10000
)  #num_words is 10k most frequent words.  All others will be discarded

max([max(sequence) for sequence in train_data])

word_index = imdb.get_word_index()

###get the word to the index.
#Subtract 3 because first 3 are reserved for 'padding','start of sequence', and 'unknown'
reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])

#PREP THE DATA
Example No. 34
       [ 2.71,  0.95, -4.85],
       [-2.82, -1.4 ,  2.69]], dtype=float32),
    # biases for 3 output nodes
    array([ 0.21, -0.39, -1.22], dtype=float32)
]

# test the model and your weights
# model.fit(bin7, count3, epochs=1)
# model.set_weights(myWeights)
# predict3 = model.predict(bin7)
# np.set_printoptions(suppress=True)
# np.set_printoptions(precision=1)
# print('prediction =', predict3)

from keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)


Examples = {
    'count3' : [ bin7, count3, model, myWeights ],
    'imdb' : [x_train, y_train, model, myWeights],
}
Example No. 35
import pyswarms as ps
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix
from sklearn import preprocessing, datasets, linear_model
from sklearn.model_selection import train_test_split

#####
#####  Loading the IMDB dataset and bifurcating it into training and testing data
#####

# maximum vocabulary
nb_words = 50000
# cut texts after this number of words
maxlen = 100

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=nb_words)
x = x_train
y = y_train

#####
#####  Padding input sequences
#####

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

#####
#####  Shaping the training and test data input
#####
Example No. 36
import numpy as np
from keras.datasets import imdb
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

np.random.seed(42)


# ## 1. Loading the data
# This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it at 1000, but feel free to experiment.

# In[21]:


# Loading the data (it's preloaded in Keras)
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=1000)

print(x_train.shape)
print(x_test.shape)


# ## 2. Examining the data
# Notice that the data has already been pre-processed: every word is mapped to a number, and each review comes in as a sequence of those word indices. For example, if the word 'the' is the first one in our dictionary, a review containing 'the' will include the integer 1 in its sequence.
# 
# The output comes as a vector of 1's and 0's, where 1 is a positive sentiment for the review, and 0 is negative.

# In[22]:


print(x_train[0])
print(y_train[0])
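# Hedged sketch (not an original cell): turning the integer sequences into
# fixed-size one-hot vectors with the Tokenizer imported above, as other
# examples in this listing do.
tokenizer = Tokenizer(num_words=1000)
x_train_onehot = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test_onehot = tokenizer.sequences_to_matrix(x_test, mode='binary')
print(x_train_onehot.shape)  # expected: (25000, 1000)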
Example No. 37
# word embeddings differ from task to task; some work better on sentiment analysis, some on QA

# embeddings must have meaning, similar words or even synonyms should be closer to each other
# they should have some sort of geometric representation, distances, vectors for example:
# dog+wild_vector=wolf, cat+wild_vector=tiger, dog and cat are similar (small distance), both pets

# TODO: 186 + 187

embedding_layer = Embedding(1000,
                            64)  # (number of possible tokens, dimensionality)
# takes indices of words from dict ({1: "hello", 2: "good", ...}) -> and transforms to dense vectors

max_features = 10000  # 10k most common words
maxlen = 20  # take first 20 words

(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=max_features)  # samples, words, list of ints
x_train = preprocessing.sequence.pad_sequences(
    x_train, maxlen=maxlen)  # 2D integer tensor
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

# embedding sentences in sequences of vectors, flattening them, and training a Dense layer

model = Sequential()

model.add(Embedding(10000, 8, input_length=maxlen))  # 8 dimensional embedding
model.add(Flatten())  # 3D -> 2D tensor
model.add(Dense(1, activation='sigmoid')
          )  # this model treats words as single units, not a group of words

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
from keras.datasets import imdb


def p(mess, obj):
    """Useful function for tracing"""
    if hasattr(obj, 'shape'):
        print(mess, type(obj), obj.shape, "\n", obj)
    else:
        print(mess, type(obj), "\n", obj)


lexicon_size = 20000
(train_data,
 train_labels), (test_data,
                 test_labels) = imdb.load_data(num_words=lexicon_size)

word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key)
                           for (key, value) in word_index.items()])

reviews = {}
titles = {}
for i, encoded_review in enumerate(train_data):
    decoded_review = ' '.join(
        [reverse_word_index.get(i - 3, '?') for i in encoded_review])
    reviews[i] = decoded_review
    titles[i] = decoded_review[:20]

for i, title in enumerate(reviews):
    print(reviews[title])
Example No. 39
    def test_text_classification(self):
        # This example demonstrates the use of Convolution1D for text classification.
        # This example is from Keras
        K.set_image_dim_ordering("th")
        import numpy as np
        np.random.seed(1337)  # for reproducibility

        from keras.preprocessing import sequence
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation
        from keras.layers import Embedding
        from keras.layers import Convolution1D
        from keras.datasets import imdb

        # set parameters:
        max_features = 5000
        maxlen = 400
        batch_size = 32
        embedding_dims = 50
        nb_filter = 250
        filter_length = 3
        hidden_dims = 250
        nb_epoch = 1

        print('Loading data...')
        (X_train, y_train), (X_test,
                             y_test) = imdb.load_data(nb_words=max_features)
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        print('Pad sequences (samples x time)')
        X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
        X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)

        print('Build model...')
        model = Sequential()

        model.add(Embedding(
            max_features, embedding_dims,
            input_length=maxlen))  # specifying dropout=0.2 here raises an exception

        # we add a Convolution1D, which will learn nb_filter
        # word group filters of size filter_length:
        model.add(
            Convolution1D(nb_filter=nb_filter,
                          filter_length=filter_length,
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1))
        # we use max pooling:
        model.add(GlobalMaxPooling1D())
        # model.add(GlobalAveragePooling1D())

        # We add a vanilla hidden layer:
        model.add(Dense(hidden_dims))
        model.add(Dropout(0.2))
        model.add(Activation('relu'))

        # We project onto a single unit output layer, and squash it with a sigmoid:
        model.add(Dense(1))
        model.add(Activation('sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model = with_bigdl_backend(model)

        model.fit(X_train,
                  y_train,
                  batch_size=batch_size,
                  nb_epoch=nb_epoch,
                  validation_data=(X_test, y_test),
                  is_distributed=True)
        # 2017-09-22 15:53:45 INFO  DistriOptimizer$:657
        # - Top1Accuracy is Accuracy(correct: 21557, count: 25000, accuracy: 0.86228)
        # this result is from GlobalAveragePooling not GlobalMaxPooling.
        model.predict(X_test, is_distributed=True)
        model.evaluate(X_test, y_test, is_distributed=True)
        print(model)
Example No. 40
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.datasets import imdb

max_features = 20000
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
                                                      test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, input_length=maxlen, dropout=0.5))
model.add(LSTM(128, dropout_W=0.5, dropout_U=0.1))  # try using a GRU instead, for fun
model.add(Dropout(0.5))
model.add(Dense(1))
Example No. 41
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.datasets import imdb
from keras.preprocessing import sequence

VOCAB_SIZE = 5000  # Number of words to load
INDEX_FROM = 3  # Start at 3 to account for padding/unknown, & start of sentence
MAX_SEQ_LEN = 128  # Max input length
EMBEDDING_DIM = 64

# Load and assign dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE,
                                                      index_from=INDEX_FROM)

word_to_idx = imdb.get_word_index()
idx_to_word = {v + INDEX_FROM: k for k, v in word_to_idx.items()}

idx_to_word[0] = '<PAD>'
idx_to_word[1] = '<START>'
idx_to_word[2] = '<UNK>'

# Fit sequences to max length
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQ_LEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQ_LEN)

print(' '.join([idx_to_word[idx] for idx in X_train[0]]))

# Specify the model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_SEQ_LEN))
Example No. 42
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
test_split = 0.33
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=top_words, test_split=test_split)
# pad dataset to a maximum review length in words
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
# create the model
model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# Fit the model
Example No. 43
from keras.datasets import imdb
vocabulary_size = 5000
(X_train, y_train),(X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

print('---review---')
print(X_train[6])  # review is stored as a sequence of integers. These are word IDs that have been pre-assigned to individual words
print('---label---')
print(y_train[6]) #label is an integer (0 for negative, 1 for positive).

word2id = imdb.get_word_index()
id2word = {i: word for word, i in word2id.items()}
print('---review with words---')
print([id2word.get(i, ' ') for i in X_train[6]])
print('---label---')
print(y_train[6])

print('Maximum review length: {}'.format(len(max((X_train + X_test), key=len))))

print('Minimum review length: {}'.format(len(min((X_train + X_test), key=len))))


from keras.preprocessing import sequence
max_words = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
embedding_size=32
model=Sequential()
Example No. 44
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import imdb
from keras import models, layers


def vectorized(alist, dim=10000):
    a = np.zeros((len(alist), dim))
    for k, v in enumerate(alist):
        a[k, v] = 1
    return a


(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

x_train = vectorized(train_data)
x_test = vectorized(test_data)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

x_val = x_train[:10000]
partial_x_val = x_train[10000:]
y_val = y_train[:10000]
partial_y_val = y_train[10000:]

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
Example No. 45
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.backends.backend_pdf

import numpy as np
import keras

if __name__ == '__main__':
	from keras.datasets import imdb
	from keras.preprocessing import sequence
	max_features = 10000
	maxlen = 500
	batch_size = 32
	print('Loading data...')
	(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
	print(len(input_train), 'train sequences')
	print(len(input_test), 'test sequences')
	print('Pad sequences (samples x time)')
	input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
	input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
	print('input_train shape', input_train.shape)
	print('input_test shape', input_test.shape)

	from keras.layers import LSTM, Embedding, Dense
	from keras.models import Sequential
	model = Sequential()
	model.add(Embedding(max_features, 32))
	model.add(LSTM(32))
	model.add(Dense(1, activation='sigmoid'))
	model.summary()
Example No. 46
from keras.datasets import imdb
import numpy as np


def vectorize(sequences, dimension=10000):
	results = np.zeros((len(sequences), dimension))	# default dtype is float64
	for i,sequence in enumerate(sequences):		# i = index, sequence = value; enumerate gives both while iterating over the lists
		results[i,sequence] = 1	# set 1 at the indices listed in sequence
	return results
	


(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000) # dataset of the 10,000 most frequent words

x_train = vectorize(train_data)
x_test = vectorize(test_data)

y_train = train_labels.astype('float32')	# cast to float, since that is what the API operates on
y_test = test_labels.astype('float32') 


from keras import models
from keras import layers
from keras import regularizers

model = models.Sequential()

model.add(layers.Dense(8,activation='relu', input_shape=(10000,))) # the first layer must be given the input shape it expects
# returns a tensor of shape (*, 8); the activation is an element-wise operation on that tensor
model.add(layers.Dropout(0.5))
model.add(layers.Dense(8, activation = 'relu'))
Example No. 47
from keras.datasets import imdb
imdb.load_data(nb_words=5000)
Example No. 48
def get_data():
    return imdb.load_data(
        num_words=10000
    )  # num_words informs the function that we want only the 10000 most common words
Example No. 49
    def test_text_classification(self):
        # This example demonstrates the use of Convolution1D for text classification.
        # This example is from Keras
        K.set_image_dim_ordering("th")
        import numpy as np
        np.random.seed(1337)  # for reproducibility

        from keras.preprocessing import sequence
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation
        from keras.layers import Embedding
        from keras.layers import Convolution1D
        from keras.datasets import imdb

        # set parameters:
        max_features = 5000
        maxlen = 400
        batch_size = 32
        embedding_dims = 50
        nb_filter = 250
        filter_length = 3
        hidden_dims = 250
        nb_epoch = 1

        print('Loading data...')
        (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')

        print('Pad sequences (samples x time)')
        X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
        X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)

        print('Build model...')
        model = Sequential()

        model.add(Embedding(max_features,
                            embedding_dims,
                            input_length=maxlen))  # specifying dropout=0.2 here raises an exception

        # we add a Convolution1D, which will learn nb_filter
        # word group filters of size filter_length:
        model.add(Convolution1D(nb_filter=nb_filter,
                                filter_length=filter_length,
                                border_mode='valid',
                                activation='relu',
                                subsample_length=1))
        # we use max pooling:
        model.add(GlobalMaxPooling1D())
        # model.add(GlobalAveragePooling1D())

        # We add a vanilla hidden layer:
        model.add(Dense(hidden_dims))
        model.add(Dropout(0.2))
        model.add(Activation('relu'))

        # We project onto a single unit output layer, and squash it with a sigmoid:
        model.add(Dense(1))
        model.add(Activation('sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model = with_bigdl_backend(model)

        model.fit(X_train, y_train,
                  batch_size=batch_size,
                  nb_epoch=nb_epoch,
                  validation_data=(X_test, y_test),
                  is_distributed=True)
        # 2017-09-22 15:53:45 INFO  DistriOptimizer$:657
        # - Top1Accuracy is Accuracy(correct: 21557, count: 25000, accuracy: 0.86228)
        # this result is from GlobalAveragePooling not GlobalMaxPooling.
        model.predict(X_test, is_distributed=True)
        model.evaluate(X_test, y_test, is_distributed=True)
        print(model)
Example No. 50
def main():
    ## set things up
    model_name = "SA_2.h5"
    SEED = 1337  # for reproducibility
    np.random.seed(SEED)

    max_features = 7000
    maxlen = 70  # cut texts after this number of words (among top max_features most common words)
    batch_size = 128
    dim = 32

    drop_rate = .2
    max_epochs = 2

    ## prepare data
    print('Loading data...')
    (X_train, y_train), (X_test,
                         y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    ## pad sequences
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    ## build sentiment analysis model
    print('Build model...')
    model = Sequential()
    model.add(
        Embedding(max_features, dim, input_length=maxlen, dropout=drop_rate))

    ## deep cnn approach
    model.add(
        Convolution1D(nb_filter=dim,
                      filter_length=3,
                      border_mode='same',
                      activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    ## LSTM approach
    # model.add(LSTM(output_dim=dim, dropout_W=drop_rate, dropout_U=drop_rate))
    # model.add(Dense(1))
    # model.add(Activation('sigmoid'))

    ## get ready!
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    ## train it
    print('Train...')
    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              nb_epoch=max_epochs,
              validation_data=(X_test, y_test))

    ## validate
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)

    ## save it
    model.save(model_name)
Example No. 51
File: load_db.py  Project: kleach/ML
from keras.datasets.imdb import load_data
from pickle import dump


if __name__ == '__main__':
    data = load_data(nb_words=20000, test_split=0.3)
    dump(data, open('imdb', 'wb'))
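
# A minimal sketch of reading the pickled dataset back, assuming the 'imdb'
# file written by the script above; load_data returned the usual
# (X_train, y_train), (X_test, y_test) tuple, so it unpickles the same way.
from pickle import load

with open('imdb', 'rb') as f:
    (X_train, y_train), (X_test, y_test) = load(f)
print(len(X_train), 'train sequences,', len(X_test), 'test sequences')
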
# `vocab_size` and `input_length` are module-level settings defined elsewhere in the original project.
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences

def load_and_preprocess_data():
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
    x_train = pad_sequences(x_train, maxlen=input_length)
    x_test = pad_sequences(x_test, maxlen=input_length)
    return (x_train, y_train), (x_test, y_test)
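
# Usage sketch; the two configuration values below are assumptions chosen
# only for illustration, not values from the original project.
vocab_size = 10000
input_length = 200

(x_train, y_train), (x_test, y_test) = load_and_preprocess_data()
print(x_train.shape, x_test.shape)  # (25000, 200) (25000, 200)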
#### 3.1.1 - 3.3.4

# Notes to be transcribed later from handwritten copies

#%% [markdown]
#### 3.4.1 The IMDB Dataset

# Working with IMDB data to classify reviews as positive or negative

#%%
# Downloading/loading the built-in imdb data
from keras.datasets import imdb

#Setting up train and test data
(trainData, trainLabels), (testData, testLabels) = imdb.load_data(
    num_words=10000  #Only keep top 10K words
)

#%%
#Decoding one of the reviews back to English, just to see how it's done
wordIndex = imdb.get_word_index()
reverseWordIndex = dict([(value, key) for (key, value) in wordIndex.items()])
# Indices are shifted by 3 because 0, 1 and 2 are reserved for
# padding, start-of-sequence and unknown tokens.
decodedReview = ' '.join(
    [reverseWordIndex.get(i - 3, '?') for i in trainData[0]])
print(decodedReview)

#%% [markdown]
#### 3.4.2 Preparing the data
#"You can't feed a list of integers into a neural network. You have to turn them
# into into tensors. There are two ways to do that:<br/><br/>
#       1. Pad your lists so they all have the same length, turn them into n integer
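
#%% [markdown]
# Both preparation strategies mentioned in the note above can be sketched
# briefly. This cell is only an illustration: the 10,000-word dimension
# follows the load_data call above, while maxlen=256 is an assumption.

#%%
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Option 1: pad the integer sequences to a common length so an Embedding
# layer can consume them as a (samples, maxlen) integer tensor.
paddedTrainData = pad_sequences(trainData, maxlen=256)
print(paddedTrainData.shape)  # (25000, 256)

# Option 2: multi-hot encode each review into a fixed-size 0/1 vector,
# which a stack of Dense layers can consume directly.
def multiHotEncode(sequences, dimension=10000):
    encoded = np.zeros((len(sequences), dimension))
    for i, indices in enumerate(sequences):
        encoded[i, indices] = 1.0
    return encoded

xTrain = multiHotEncode(trainData)
print(xTrain.shape)  # (25000, 10000)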
Example No. 54
0
__author__ = "Luke Liu"
#encoding="utf-8"

from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

maxlen = 500
batch_size = 32
print("Loading the data..................................")
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=2000)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

# build the model

from keras.layers import Dense
model = Sequential()
model.add(Embedding(2000, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(input_train,
                    y_train,
# LSTM for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
import numpy as np

# Using keras to load the dataset with the top_words
top_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
# The arrays returned by imdb.load_data above are immediately replaced by
# preprocessed train/validation splits loaded from disk.
path = '/home/santanu/Downloads/Mobile_App/'
X_train = np.load(path + "aclImdb/X_train.npy")
y_train = np.load(path + "aclImdb/y_train.npy")
y_train = np.reshape(y_train, (-1, 1))
X_test = np.load(path + "aclImdb/X_val.npy")
y_test = np.load(path + "aclImdb/y_val.npy")
y_test = np.reshape(y_test, (-1, 1))

print(X_train[0])

# Pad the sequence to the same length
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Using embedding from Keras
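
# The example window stops right after the comment above. A minimal sketch of
# the kind of Conv1D + LSTM classifier its imports suggest follows; the layer
# sizes (32 filters, kernel size 3, 100 LSTM units) are assumptions.
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length,
                    input_length=max_review_length))
model.add(Convolution1D(32, 3, activation='relu'))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.summary()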
Example No. 56
0
import numpy as np
from keras.utils import to_categorical
from keras import models, layers
from keras.datasets import imdb


def vectorize(sequences, dimension=10000):
    # Multi-hot encode: one row per review, with a 1 at every word index present.
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results


(training_data,
 training_targets), (testing_data,
                     testing_targets) = imdb.load_data(num_words=10000)

data = np.concatenate((training_data, testing_data), axis=0)
data = vectorize(data)

targets = np.concatenate((training_targets, testing_targets), axis=0)
targets = np.array(targets).astype("float32")

test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]

model = models.Sequential()

# Input - Layer
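
# The window above ends at the "Input - Layer" comment. A minimal sketch of
# the fully connected classifier this multi-hot setup typically feeds; the
# layer sizes, epoch count and batch size below are assumptions.
model.add(layers.Dense(50, activation="relu", input_shape=(10000,)))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
history = model.fit(train_x, train_y,
                    epochs=2, batch_size=512,
                    validation_data=(test_x, test_y))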
Example No. 57
0
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

# Set parameters:
# ngram_range = 2 will add bi-grams features
ngram_range = 1
max_features = 20000
maxlen = 400
batch_size = 32
embedding_dims = 50
nb_epoch = 5

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))

if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in X_train:
        for i in range(2, ngram_range+1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
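
# The window above is cut off here and references create_ngram_set without
# showing it. A sketch consistent with the standard Keras FastText IMDB
# example: collect every contiguous n-gram of the given length as a tuple.
def create_ngram_set(input_list, ngram_value=2):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))

# e.g. create_ngram_set([1, 4, 9, 4], ngram_value=2) -> {(1, 4), (4, 9), (9, 4)}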
Example No. 58
0
from keras.datasets import imdb
from keras import models
from keras import layers
import numpy as np
import matplotlib.pyplot as plt

# Take only the top 10000 most frequently occurring words.
(train_data, train_labels), (test_data,
                             test_labels) = imdb.load_data(num_words=10000)


def explore():
    print(train_data.shape)
    print(train_labels.shape)

    print(test_data.shape)
    print(test_labels.shape)

    print(train_data[0])
    print(train_labels[0])

    print(max(max(sequence) for sequence in train_data))

    # word_index is a dictionary mapping: word -> int indices
    word_index = imdb.get_word_index()
    # reversing it, mapping becomes int indices -> word
    reversed_word_index = dict([(value, key)
                                for (key, value) in word_index.items()])
    # i - 3 accounts for the reserved indices 0 (padding), 1 (start) and 2 (unknown).
    decoded_review = ' '.join(
        reversed_word_index.get(i - 3, '?') for i in train_data[0])
    print(decoded_review)
Example No. 59
0
'''
    Train a Bidirectional LSTM on the IMDB sentiment classification task.

    GPU command:
        THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32 python imdb_bidirectional_lstm.py

    Output after 4 epochs on CPU: ~0.8146
    Time per epoch on CPU (Core i7): ~150s.
'''

max_features = 20000
maxlen = 100  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print("Loading data...")
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
                                                      test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print("Pad sequences (samples x time)")
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
y_train = np.array(y_train)
y_test = np.array(y_test)

print('Build model...')
model = Graph()
model.add_input(name='input', input_shape=(1,), dtype=int)
model.add_node(Embedding(max_features, 128, input_length=maxlen),
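
# The window above is truncated mid-statement and uses the legacy Graph API,
# which was removed in later Keras releases. A minimal sketch of the same
# bidirectional LSTM with the Sequential API (the 64-unit LSTM and the 0.5
# dropout rate are assumptions):
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

bi_model = Sequential()
bi_model.add(Embedding(max_features, 128, input_length=maxlen))
bi_model.add(Bidirectional(LSTM(64)))
bi_model.add(Dropout(0.5))
bi_model.add(Dense(1, activation='sigmoid'))
bi_model.compile(loss='binary_crossentropy', optimizer='adam',
                 metrics=['accuracy'])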
import numpy as np

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM, Dropout
from keras.datasets import imdb

max_features = 20000
max_length = 80
embedding_dim = 256
batch_size = 128
epochs = 10
modes = [0, 1, 2]

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)
X_train = sequence.pad_sequences(X_train, max_length)
X_test = sequence.pad_sequences(X_test, max_length)

# Compile and train different models while measuring performance.
results = []
for mode in modes:
    print('Testing mode: implementation={}'.format(mode))

    model = Sequential()
    model.add(Embedding(max_features, embedding_dim,
                        input_length=max_length))
    model.add(Dropout(0.2))
    model.add(LSTM(embedding_dim,
                   dropout=0.2,
                   recurrent_dropout=0.2,