Example 1
def build_model_lstm_emb(filters, vocab_size, n_dim, embedding_matrix,
                         len_max_tweet, lr):
    """
    Build a LSTM network with a given embedding matrix. The model is built with a binary cross entropy loss
    and the Adam optimizer.

    :param dim_output: Dimension of the output in the LSTM layer
    :param embedding_layer: Embedding layer of type keras.layers.embedding
    :return: LSTM network
    """
    model = Sequential([
        Embedding(vocab_size,
                  n_dim,
                  weights=[embedding_matrix],
                  input_length=len_max_tweet,
                  trainable=False),
        LSTM(filters),
        Dense(1, activation='sigmoid')
    ])

    opt = optimizers.Adam(learning_rate=lr)

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])

    return model
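A hedged usage sketch (the tokenizer, pretrained vector lookup, and training arrays are assumptions, not part of the original): build the embedding matrix from a fitted Keras Tokenizer, then create and fit the model.
# Hedged usage sketch -- `tokenizer`, `word_vectors`, `X_train` and `y_train`
# are hypothetical objects assumed to exist in the calling code.
import numpy as np

n_dim = 200                                   # must match the pretrained vector size
vocab_size = len(tokenizer.word_index) + 1    # +1 because index 0 is reserved for padding
embedding_matrix = np.zeros((vocab_size, n_dim))
for word, idx in tokenizer.word_index.items():
    vector = word_vectors.get(word)           # dict-like lookup of pretrained vectors
    if vector is not None:
        embedding_matrix[idx] = vector

model = build_model_lstm_emb(filters=128, vocab_size=vocab_size, n_dim=n_dim,
                             embedding_matrix=embedding_matrix,
                             len_max_tweet=X_train.shape[1], lr=1e-3)
model.fit(X_train, y_train, epochs=3, batch_size=64)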
Example 2
    def build(vocab_size, max_length):
        embedding = 100
        model = Sequential()
        model.add(Embedding(vocab_size, embedding, input_length=max_length))
        model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

        return model
Example 3
def CNNmodel(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model
Example 4
def create_model():
    model = Sequential()
    model.add(
        Embedding(len(aa_tokenizer.word_index) + 1,
                  256,
                  input_length=max_length))
    model.add(
        Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.1)))
    model.add(
        TimeDistributed(
            Dense(len(dssp_tokenizer.word_index) + 1, activation='softmax')))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=[mask_acc])
    return model
Example 5
    def __init__(self, input_dim, input_len, hidden_state_size=100):
        """
        item_dim is a number of different values that can occur as unput. I.e. for utterance input_dim=vocab_size.
        """
        # self.model = Sequential([
        #     Flatten(input_dim=input_dim, output_dim=output_dim),
        #     Embedding(),
        #     LSTM(hidden_state_size)
        # ])

        self.model = Sequential([
            Embedding(input_dim=input_dim,
                      output_dim=hidden_state_size,
                      input_length=input_len),
        ])
        self.lstm = Sequential([
            LSTM(input_shape=(1, input_len * hidden_state_size),
                 units=hidden_state_size)
        ])
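    # A hedged sketch (not part of the original class) of how the two pieces above
    # might be chained: the embedded sequence is flattened into the single timestep
    # that self.lstm declares via input_shape=(1, input_len * hidden_state_size).
    def encode(self, batch):
        embedded = self.model.predict(batch)          # (batch_size, input_len, hidden_state_size)
        flat = embedded.reshape(len(batch), 1, -1)    # (batch_size, 1, input_len * hidden_state_size)
        return self.lstm.predict(flat)                # (batch_size, hidden_state_size)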
Example 6
def bidirectional_model():

    length_vocab, embedding_size = word2vec.shape

    model = Sequential()
    model.add(
        Embedding(length_vocab,
                  embedding_size,
                  input_length=parameters.max_length,
                  weights=[word2vec],
                  mask_zero=True,
                  name='embedding_layer'))

    for i in range(parameters.rnn_layers):
        bilstm = Bidirectional(
            LSTM(parameters.rnn_size,
                 return_sequences=True,
                 name='bilstm_layer_%d' % (i + 1)))
        model.add(bilstm)

    model.add(
        Lambda(simple_context,
               mask=lambda inputs, mask: mask[:, parameters.max_len_desc:],
               output_shape=lambda input_shape:
               (input_shape[0], parameters.max_len_head, 2 *
                (parameters.rnn_size - parameters.activation_rnn_size)),
               name='simple_context_layer'))

    vocab_size = word2vec.shape[0]
    model.add(TimeDistributed(Dense(vocab_size,
                                    name='time_distributed_layer')))

    model.add(Activation('softmax', name='activation_layer'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    K.set_value(model.optimizer.lr, np.float32(parameters.learning_rate))
    print(model.summary())

    return model
Example 7
def create_model():

    length_vocab, embedding_size = word2vec.shape
    print("shape of word2vec matrix ", word2vec.shape)

    model = Sequential()
    model.add(
        Embedding(length_vocab,
                  embedding_size,
                  input_length=parameters.max_length,
                  weights=[word2vec],
                  mask_zero=True,
                  name='embedding_layer'))

    for i in range(parameters.rnn_layers):
        gru = GRU(parameters.rnn_size,
                  return_sequences=True,
                  name='gru_layer_%d' % (i + 1))

        model.add(gru)

    model.add(
        Lambda(simple_context,
               mask=lambda inputs, mask: mask[:, parameters.max_len_desc:],
               output_shape=output_shape_simple_context_layer,
               name='simple_context_layer'))

    vocab_size = word2vec.shape[0]
    model.add(TimeDistributed(Dense(vocab_size,
                                    name='time_distributed_layer')))

    model.add(Activation('softmax', name='activation_layer'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    K.set_value(model.optimizer.lr, np.float32(parameters.learning_rate))
    print(model.summary())

    return model
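The helper output_shape_simple_context_layer is referenced but not shown; a hedged sketch, mirroring the inline output_shape lambda of the bidirectional variant in Example 6:
def output_shape_simple_context_layer(input_shape):
    # Assumed to match Example 6: keep the batch dimension and emit max_len_head
    # timesteps of size 2 * (rnn_size - activation_rnn_size).
    return (input_shape[0], parameters.max_len_head,
            2 * (parameters.rnn_size - parameters.activation_rnn_size))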
Example 8
# hash each word into an integer index within the voc_size-word vocabulary
onehot_repr = [one_hot(words, voc_size) for words in sent]
print(onehot_repr)

#word embedding representation
from tensorflow.python.keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences  #making sure the sentences are of equal size

from tensorflow.python.keras import Sequential  #needed for the embedding
import numpy as np

sent_length = 8  #set the max sent length
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

dim = 10  # embedding dimension: features learned per word

#adding embedding layer to the sequential model
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile()
model.summary()

#see how the words got converted
model.predict(embedded_docs).shape
embedded_docs[10]

model.predict(embedded_docs)[0]  # the 8 positions; for each one, a vector of `dim` floats
Example 9
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.python.keras.preprocessing import sequence

# truncate and pad the review sequences 
max_review_length = 250 
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length) 
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length) 
print(pd.DataFrame(X_train).head())
# create the model 
embedding_vector_length = 128

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import LSTM,Dense, Dropout
from tensorflow.python.keras.layers import SpatialDropout1D
from tensorflow.python.keras.layers import Embedding
model = Sequential()
model.add(Embedding(15001, embedding_vector_length,
                    input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
r=model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64) 
# Final evaluation of the model 
import tensorflow as tf 
filename = "my_model.h5"
model.save(filename)
model=tf.keras.models.load_model(filename)
scores = model.evaluate(X_test, y_test, verbose=0) 
plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()
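The evaluate call above returns the loss and the compiled accuracy metric, but the snippet never displays them; a small hedged follow-up:
print("Test loss: %.4f, test accuracy: %.4f" % (scores[0], scores[1]))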
Example 10
'''
Use the pretrained Word2Vec model from Google, but trim the vocabulary to the 50,000
most frequent words (the full GoogleNews pretrained model contains 3 million).
'''
w2vModel = word2vec.KeyedVectors.load_word2vec_format('D:/twittersentiment1/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True, limit=50000)
#Convert words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets_split)
X = tokenizer.texts_to_sequences(tweets_split)
# length of tweet to consider
maxlentweet = 10
#add padding
X = pad_sequences(X, maxlen=maxlentweet)
print(X.shape)
# create an embedding layer using the Google pre-trained word2vec vectors (50,000 words);
# .syn0 is the pre-gensim-4.0 attribute name for the vector matrix (newer gensim exposes .vectors)
embedding_layer = Embedding(input_dim=w2vModel.syn0.shape[0], output_dim=w2vModel.syn0.shape[1], weights=[w2vModel.syn0],
                            input_length=X.shape[1])
#create model
lstm_out = 80
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(units=lstm_out))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
#split dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, labels, test_size= 0.1, random_state = 24)
#fit model
batch_size = 1
model.fit(X_train, Y_train, epochs=10, verbose=1, batch_size=batch_size)
#analyze the results
score, acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size=batch_size)
Example 11
print('word counts:', token.word_counts)
print('document count:', token.document_count)
print('number of documents containing each word:', token.word_docs)
print('index assigned to each word:', token.word_index)

print()
# read the text and predict positive / negative classification

# Korean movie-review phrases: the first five are positive, the last five negative.
docs = ['너무 재밌네요', '최고에요','참 잘만든 영화예요','추천하고 싶은 영화네요','한번 더 보고싶네요',
        '글쎄요','별로네요','생각보다 지루합니다','연기가 좋지않아요','재미없어요']

import numpy as np 
classes = np.array([1,1,1,1,1,0,0,0,0,0])

token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)

word_size = len(token.word_index) + 1  # +1 because index 0 is reserved for padding

model = Sequential()
model.add(Embedding(word_size, 8, input_length=4))
#model.add(Flatten())
model.add(LSTM(32))
model.add(Dense(1,activation='sigmoid'))

print(model.summary())
model.compile(optimizer='adam',loss='binary_crossentropy')
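# Hedged completion (not in the original snippet): index and pad the ten phrases to
# the input_length of 4 used above, then fit on the labels defined earlier.
# Assumes pad_sequences is imported alongside Tokenizer.
x = token.texts_to_sequences(docs)
padded_x = pad_sequences(x, maxlen=4)
model.fit(padded_x, classes, epochs=20)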




Example 12
ytrain =np.array([0 for _ in range(900)] + [1 for _ in range(900)])

positive_docs = process_docs('/home/sreekesh/python/NLP/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('/home/sreekesh/python/NLP/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs

encoded_docs = tokenizer.texts_to_sequences(test_docs)
print(encoded_docs)
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

ytest =np.array([0 for _ in range(100)] + [1 for _ in range(100)])

v_size = len(tokenizer.word_index) + 1     

model = Sequential()
model.add(Embedding(v_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(Xtrain,ytrain,epochs=10)

_,accu = model.evaluate(Xtest,ytest)
print("accuracy : {}".format(accu*100))

from numpy import loadtxt 
Example 13
    def train_model(self,
                    sentences_pair,
                    is_similar,
                    embedding_meta_data,
                    model_save_directory='./'):
        """
        Train Siamese network to find similarity between sentences in `sentences_pair`
            Steps Involved:
                1. Pass the each from sentences_pairs  to bidirectional LSTM encoder.
                2. Merge the vectors from LSTM encodes and passed to dense layer.
                3. Pass the  dense layer vectors to sigmoid output layer.
                4. Use cross entropy loss to train weights
        Args:
            sentences_pair (list): list of tuple of sentence pairs
            is_similar (list): target value 1 if same sentences pair are similar otherwise 0
            embedding_meta_data (dict): dict containing tokenizer and word embedding matrix
            model_save_directory (str): working directory for where to save models

        Returns:
            return (best_model_path):  path of best model
        """
        tokenizer, embedding_matrix = embedding_meta_data[
            'tokenizer'], embedding_meta_data['embedding_matrix']

        train_data_x1, train_data_x2, train_labels, leaks_train, val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, sentences_pair, is_similar, self.max_sequence_length,
            self.validation_split_ratio)

        if train_data_x1 is None:
            print("++++ !! Failure: Unable to train model ++++")
            return None

        nb_words = len(tokenizer.word_index) + 1

        # Creating word embedding layer
        # embedding_layer = Embedding(nb_words, self.embedding_dim, weights=[embedding_matrix],
        #                             input_length=self.max_sequence_length, trainable=False)
        embedding_layer = Embedding(nb_words,
                                    self.embedding_dim,
                                    input_length=self.max_sequence_length,
                                    trainable=False)

        # Creating LSTM Encoder
        lstm_layer = Bidirectional(
            LSTM(self.number_lstm_units,
                 dropout=self.rate_drop_lstm,
                 recurrent_dropout=self.rate_drop_lstm))

        # Creating LSTM Encoder layer for First Sentence
        sequence_1_input = Input(shape=(self.max_sequence_length, ),
                                 dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        x1 = lstm_layer(embedded_sequences_1)

        # Creating LSTM Encoder layer for Second Sentence
        sequence_2_input = Input(shape=(self.max_sequence_length, ),
                                 dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        x2 = lstm_layer(embedded_sequences_2)

        # Creating leaks input
        leaks_input = Input(shape=(leaks_train.shape[1], ))
        leaks_dense = Dense(int(self.number_dense_units / 2),
                            activation=self.activation_function)(leaks_input)

        # Merge the two LSTM-encoded sentence vectors (plus the leaks features)
        # and pass them to dense layers with dropout and batch normalisation
        merged = concatenate([x1, x2, leaks_dense])
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units,
                       activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        preds = Dense(1, activation='sigmoid')(merged)
        model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input],
                      outputs=preds)
        model.compile(loss='binary_crossentropy',
                      optimizer='nadam',
                      metrics=['acc'])
        early_stopping = EarlyStopping(monitor='val_loss', patience=20)
        STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
            self.number_lstm_units, self.number_dense_units,
            self.rate_drop_lstm, self.rate_drop_dense)

        checkpoint_dir = model_save_directory + 'checkpoints/' + str(
            int(time.time())) + '/'

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        bst_model_path = checkpoint_dir + STAMP + '.h5'

        model_checkpoint = ModelCheckpoint(bst_model_path,
                                           save_best_only=True,
                                           save_weights_only=False)

        tensorboard = TensorBoard(log_dir=checkpoint_dir +
                                  "logs/{}".format(time.time()))

        model.fit([train_data_x1, train_data_x2, leaks_train],
                  train_labels,
                  validation_data=([val_data_x1, val_data_x2,
                                    leaks_val], val_labels),
                  epochs=200,
                  batch_size=64,
                  shuffle=True,
                  callbacks=[early_stopping, model_checkpoint, tensorboard])

        return bst_model_path
Example 14
else:
    embedding = np.random.uniform(-1.0 / 2.0 / G.embedding_dimension,
                                  1.0 / 2.0 / G.embedding_dimension,
                                  (G.vocab_size, G.embedding_dimension))
    np.save(aFile, embedding)

embeddingTwo = np.zeros((G.vocab_size, G.embedding_dimension))
# Creating CBOW model
# Model has 3 inputs
# Current word index, context words indexes and negative sampled word indexes
word_index = Input(shape=(1, ), name="word")
context = Input(shape=(context_size, ), name="context")
negative_samples = Input(shape=(G.vocab_size - 1, ), name="negative")
# All the inputs are processed through a common embedding layer
shared_embedding_layer = Embedding(input_dim=(G.vocab_size),
                                   output_dim=G.embedding_dimension,
                                   weights=[embedding])
shared_embedding_layer2 = Embedding(input_dim=(G.vocab_size),
                                    output_dim=G.embedding_dimension,
                                    weights=[embeddingTwo])

word_embedding = shared_embedding_layer(word_index)
word_embedding = Lambda(lambda x: x * 1)(word_embedding)
context_embeddings = shared_embedding_layer2(context)
negative_words_embedding = shared_embedding_layer(negative_samples)
negative_words_embedding = Lambda(lambda x: x * 1)(negative_words_embedding)

# Now the context words are averaged to get the CBOW vector
cbow = Lambda(lambda x: K.mean(x, axis=1),
              output_shape=(G.embedding_dimension, ))(context_embeddings)
# The context is multiplied (dot product) with current word and negative sampled words
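# A hedged sketch of the step described above (not part of the original snippet):
# score the CBOW vector against the target word and the negative samples with dot
# products (elementwise multiply, then sum over the embedding axis).
word_context_product = Lambda(
    lambda t: K.sum(t[0] * K.expand_dims(t[1], axis=1), axis=-1),
    name="word_context_product")([word_embedding, cbow])                    # (batch, 1)
negative_context_product = Lambda(
    lambda t: K.sum(t[0] * K.expand_dims(t[1], axis=1), axis=-1),
    name="negative_context_product")([negative_words_embedding, cbow])      # (batch, vocab_size - 1)
# Sigmoid scores on these products would then feed a negative-sampling style loss.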
Example 15
    def build_model(self):
        """Helper method for creating the model"""
        vocab = set()
        for story, q, answer in self.train_stories + self.test_stories:
            vocab |= set(story + q + [answer])
        vocab = sorted(vocab)

        # Reserve 0 for masking via pad_sequences
        vocab_size = len(vocab) + 1
        story_maxlen = max(
            len(x) for x, _, _ in self.train_stories + self.test_stories)
        query_maxlen = max(
            len(x) for _, x, _ in self.train_stories + self.test_stories)

        word_idx = {c: i + 1 for i, c in enumerate(vocab)}
        self.inputs_train, self.queries_train, self.answers_train = (
            vectorize_stories(word_idx, story_maxlen, query_maxlen,
                              self.train_stories))
        self.inputs_test, self.queries_test, self.answers_test = (
            vectorize_stories(word_idx, story_maxlen, query_maxlen,
                              self.test_stories))

        # placeholders
        input_sequence = Input((story_maxlen, ))
        question = Input((query_maxlen, ))

        # encoders
        # embed the input sequence into a sequence of vectors
        input_encoder_m = Sequential()
        input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
        input_encoder_m.add(Dropout(self.config.get("dropout", 0.3)))
        # output: (samples, story_maxlen, embedding_dim)

        # embed the input into a sequence of vectors of size query_maxlen
        input_encoder_c = Sequential()
        input_encoder_c.add(
            Embedding(input_dim=vocab_size, output_dim=query_maxlen))
        input_encoder_c.add(Dropout(self.config.get("dropout", 0.3)))
        # output: (samples, story_maxlen, query_maxlen)

        # embed the question into a sequence of vectors
        question_encoder = Sequential()
        question_encoder.add(
            Embedding(input_dim=vocab_size,
                      output_dim=64,
                      input_length=query_maxlen))
        question_encoder.add(Dropout(self.config.get("dropout", 0.3)))
        # output: (samples, query_maxlen, embedding_dim)

        # encode input sequence and questions (which are indices)
        # to sequences of dense vectors
        input_encoded_m = input_encoder_m(input_sequence)
        input_encoded_c = input_encoder_c(input_sequence)
        question_encoded = question_encoder(question)

        # compute a "match" between the first input vector sequence
        # and the question vector sequence
        # shape: `(samples, story_maxlen, query_maxlen)`
        match = dot([input_encoded_m, question_encoded], axes=(2, 2))
        match = Activation("softmax")(match)

        # add the match matrix with the second input vector sequence
        response = add([match, input_encoded_c
                        ])  # (samples, story_maxlen, query_maxlen)
        response = Permute(
            (2, 1))(response)  # (samples, query_maxlen, story_maxlen)

        # concatenate the match matrix with the question vector sequence
        answer = concatenate([response, question_encoded])

        # the original paper uses a matrix multiplication.
        # we choose to use a RNN instead.
        answer = LSTM(32)(answer)  # (samples, 32)

        # one regularization layer -- more would probably be needed.
        answer = Dropout(self.config.get("dropout", 0.3))(answer)
        answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
        # we output a probability distribution over the vocabulary
        answer = Activation("softmax")(answer)

        # build the final model
        model = Model([input_sequence, question], answer)
        return model
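    # build_model only defines the network; a hedged training sketch (method name and
    # hyperparameters are assumptions, not part of the original) using the vectorized
    # data prepared above:
    def train(self):
        model = self.build_model()
        # categorical_crossentropy assumes vectorize_stories one-hot encodes the answers;
        # switch to sparse_categorical_crossentropy if it returns integer indices.
        model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                      metrics=["accuracy"])
        model.fit([self.inputs_train, self.queries_train], self.answers_train,
                  batch_size=self.config.get("batch_size", 32),
                  epochs=self.config.get("epochs", 120),
                  validation_data=([self.inputs_test, self.queries_test],
                                   self.answers_test))
        return model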
Example 16
token = Tokenizer(num_words=2000)
token.fit_on_texts(comment_dic)
# print token.document_count
# print token.word_index

x_train_seq = token.texts_to_sequences(comment_dic)
x_train = sequence.pad_sequences(x_train_seq, maxlen=200, padding='post')

all_labels = [1] * len(comment_dic_neg) + [0] * len(comment_dic_pos)

print(len(comment_dic_neg), len(comment_dic_pos), len(comment_dic))

# build the network
model = Sequential()
model.add(Embedding(output_dim=20, input_dim=2000, input_length=200))
model.add(Dropout(0.1))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x=x_train,
          y=all_labels,
          batch_size=500,
          validation_split=0.2,
          epochs=5)
Example 17
from os.path import isfile
from read_tc import ReadTC
from constants import *

tc = ReadTC('train.csv', input_length, vocab_size, train_percent)

if isfile(filename):
    print("\nLOADING EXISTING NETWORK\n\n")
    model = load_model(filename)
else:
    print("\nBUILDING NEW NETWORK\n\n")
    model = Sequential()
    model.add(
        Embedding(vocab_size,
                  embed_size,
                  batch_input_shape=(batch_size, input_length)))

    model.add(Flatten())
    model.add(Dense(2, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    save_callback = ModelCheckpoint(filename)
    model.fit_generator(tc.get_train_data(batch_size),
                        batch_size,
                        num_epochs,
                        validation_data=tc.get_test_data(batch_size),
                        validation_steps=batch_size,
                        callbacks=[save_callback])
Example 18
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.layers.convolutional import Conv1D
from tensorflow.python.keras.layers.convolutional import MaxPooling1D

top_words = 3000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 300
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.001, seed=0))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train,
          y_train,
          validation_data=(X_test, y_test),
          epochs=5)
Example 19
    def _build_model(self):
        with tf.name_scope("inputs"):
            user_input = tf.keras.Input(shape=(self.history_length, 3))
            label_input = tf.keras.Input(shape=(self.history_length, 1))
            mask_input = tf.keras.Input(shape=(self.history_length, 1))

        with tf.name_scope("layers"):
            embedding = Embedding(input_dim=self.vocab_size,
                                  output_dim=self.embedding_dim,
                                  weights=[self.embedding_mx],
                                  trainable=False)
            session_cells = [
                GRUCell(units=self.num_units, name="sesion_rnn_01"),
                GRUCell(units=self.num_units, name="sesion_rnn_02")
                # GRUCell(units=self.num_units, name="sesion_rnn_03")
            ]
            user_cells = [
                GRUCell(units=self.num_units, name="user_rnn_01"),
                GRUCell(units=self.num_units, name="user_rnn_02")
                # GRUCell(units=self.num_units, name="user_rnn_03")
            ]
            cell = HierarchicalRNNCell(user_cells=user_cells,
                                       session_cells=session_cells,
                                       embedding_layer=embedding)
            recurrent = RNN(cell=cell,
                            return_sequences=True,
                            return_state=True)

        with tf.name_scope("loss"):

            loss = RankingLoss(num_units=self.num_units,
                               num_sampled=self.num_negatives,
                               num_classes=self.vocab_size - 1,
                               num_true=1,
                               history_length=self.history_length,
                               remove_accidental_hits=True)

            time_distributed = TimeDistributed(
                loss, input_shape=(self.history_length, self.num_units + 1))

        with tf.name_scope("model"):
            tensor = recurrent(inputs=user_input)
            outputs = tensor[0]
            outputs = tf.concat([outputs, label_input], axis=2)
            tensor = time_distributed(outputs)
            # loss
            loss = tf.gather(tensor, [0], axis=2)
            loss = tf.multiply(loss, mask_input, name="loss")
            # prediction
            prediction = tf.gather(tensor, [1], axis=2)
            prediction = tf.multiply(prediction, mask_input, name="prediction")
            # build the model
            model = tf.keras.Model(
                inputs=[user_input, label_input, mask_input],
                outputs=[loss, prediction])
            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                loss={
                    'tf_op_layer_loss': custom_loss,
                    'tf_op_layer_prediction': 'binary_crossentropy'
                },
                loss_weights={
                    'tf_op_layer_loss': 1.0,
                    'tf_op_layer_prediction': 0.0
                },
                metrics={'tf_op_layer_prediction': custom_acc})
        return model
Example 20
    for word, i in tokenizer.word_index.items():
        if i > vocabSize:
            continue
        if word in word2vecModel.wv.vocab.keys():
            embeddingWeights[i] = word2vecModel.wv.get_vector(word)

    XTrainTokens = tokenizer.texts_to_sequences(X_train)
    XTrainPad = pad_sequences(XTrainTokens, maxlen=maxLength, padding='post')
    XTestTokens = tokenizer.texts_to_sequences(X_test)
    XTestPad = pad_sequences(XTestTokens, maxlen=maxLength, padding='post')

    biGRU = Sequential()
    biGRU.add(
        Embedding(vocabSize,
                  embDim,
                  embeddings_initializer=Constant(embeddingWeights),
                  input_length=maxLength,
                  mask_zero=True))
    biGRU.add(Bidirectional(GRU(units=20, dropout=0.3)))
    biGRU.add(Dense(1))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    lossFunction = tf.keras.losses.MeanSquaredError()
    biGRU.compile(optimizer=optimizer, loss=lossFunction)

    print('\nTraining Deep Learning Model\n')
    biGRU.fit(XTrainPad, y_train, batch_size=256, epochs=20)

    # model.save('my_model.h5')

    preds = biGRU.predict(XTestPad)
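    # A hedged follow-up (not in the original): score the predictions with RMSE,
    # assuming y_test holds the continuous targets implied by the MSE loss and
    # that numpy is imported as np in this module.
    rmse = np.sqrt(np.mean((preds.flatten() - np.asarray(y_test)) ** 2))
    print('Test RMSE: %.4f' % rmse)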
Example 21
max_review_length = 150

mapped_list = sequence.pad_sequences(mapped_list, maxlen=max_review_length)

train_x, test_x, train_y, test_y = train_test_split(mapped_list,
                                                    varietal_list,
                                                    test_size=0.3)

embedding_vector_length = 64

model = Sequential()

model.add(
    Embedding(2500, embedding_vector_length, input_length=max_review_length))

model.add(Conv1D(50, 5))

model.add(Flatten())

model.add(Dense(100, activation='relu'))

model.add(Dense(max(varietal_list_o) + 1, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_x, train_y, epochs=3, batch_size=64)
Example 22
def build_embedding_network():

    inputs = []
    embeddings = []

    input_ps_ind_02_cat = Input(shape=(1, ))
    embedding = Embedding(5, 3, input_length=1)(input_ps_ind_02_cat)
    embedding = Reshape(target_shape=(3, ))(embedding)
    inputs.append(input_ps_ind_02_cat)
    embeddings.append(embedding)

    input_ps_ind_04_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_ind_04_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_ind_04_cat)
    embeddings.append(embedding)

    input_ps_ind_05_cat = Input(shape=(1, ))
    embedding = Embedding(8, 5, input_length=1)(input_ps_ind_05_cat)
    embedding = Reshape(target_shape=(5, ))(embedding)
    inputs.append(input_ps_ind_05_cat)
    embeddings.append(embedding)

    input_ps_car_01_cat = Input(shape=(1, ))
    embedding = Embedding(13, 7, input_length=1)(input_ps_car_01_cat)
    embedding = Reshape(target_shape=(7, ))(embedding)
    inputs.append(input_ps_car_01_cat)
    embeddings.append(embedding)

    input_ps_car_02_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_02_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_02_cat)
    embeddings.append(embedding)

    input_ps_car_03_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_03_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_03_cat)
    embeddings.append(embedding)

    input_ps_car_04_cat = Input(shape=(1, ))
    embedding = Embedding(10, 5, input_length=1)(input_ps_car_04_cat)
    embedding = Reshape(target_shape=(5, ))(embedding)
    inputs.append(input_ps_car_04_cat)
    embeddings.append(embedding)

    input_ps_car_05_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_05_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_05_cat)
    embeddings.append(embedding)

    input_ps_car_06_cat = Input(shape=(1, ))
    embedding = Embedding(18, 8, input_length=1)(input_ps_car_06_cat)
    embedding = Reshape(target_shape=(8, ))(embedding)
    inputs.append(input_ps_car_06_cat)
    embeddings.append(embedding)

    input_ps_car_07_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_07_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_07_cat)
    embeddings.append(embedding)

    input_ps_car_09_cat = Input(shape=(1, ))
    embedding = Embedding(6, 3, input_length=1)(input_ps_car_09_cat)
    embedding = Reshape(target_shape=(3, ))(embedding)
    inputs.append(input_ps_car_09_cat)
    embeddings.append(embedding)

    input_ps_car_10_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_10_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_10_cat)
    embeddings.append(embedding)

    input_ps_car_11_cat = Input(shape=(1, ))
    embedding = Embedding(104, 10, input_length=1)(input_ps_car_11_cat)
    embedding = Reshape(target_shape=(10, ))(embedding)
    inputs.append(input_ps_car_11_cat)
    embeddings.append(embedding)

    input_numeric = Input(shape=(24, ))
    embedding_numeric = Dense(16)(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)

    x = Concatenate()(embeddings)
    x = Dense(80, activation='relu')(x)
    x = Dropout(.35)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.15)(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(.15)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, output)

    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model
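A hedged usage sketch (the dataframes, targets, and column lists are assumptions inferred from the input names above): the model expects a list of thirteen single-column categorical arrays followed by one 24-column numeric array.
# Hedged usage sketch -- `train_df`, `valid_df`, `y_train`, `y_valid` are hypothetical.
cat_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
            'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
            'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_10_cat',
            'ps_car_11_cat']

def to_model_inputs(df, numeric_cols):
    # one array per categorical column, then the block of numeric columns
    return [df[c].values for c in cat_cols] + [df[numeric_cols].values]

numeric_cols = [c for c in train_df.columns if c not in cat_cols]  # assumed to be 24 columns
model = build_embedding_network()
model.fit(to_model_inputs(train_df, numeric_cols), y_train,
          epochs=5, batch_size=4096,
          validation_data=(to_model_inputs(valid_df, numeric_cols), y_valid))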
Example 23
def make_model():
    # load all training reviews
    positive_docs = process_docs('data/pos_train', vocab, True)
    negative_docs = process_docs('data/neg_train', vocab, True)
    train_docs = negative_docs + positive_docs
    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(train_docs)

    # pad sequences
    max_length = max([len(s.split()) for s in train_docs])
    print("\n\n maxlenght="+str(max_length))

    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # define training labels
    y = np.array([0 for _ in range(270)] + [1 for _ in range(270)])

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=42)
    '''
    # load all test reviews
    positive_docs = process_docs('data/pos_test', vocab, False)
    negative_docs = process_docs('data/neg_test', vocab, False)
    test_docs = negative_docs + positive_docs
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(test_docs)
    # pad sequences
    Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define test labels
    ytest = np.array([0 for _ in range(len(listdir("data/neg_test")))] + [1 for _ in range(len(listdir("data/pos_test")))])
    '''
    print("\n pad_sequences : ",Xtest)
    print("\n ytest : ",ytest)

    # define vocabulary size (largest integer value)
    vocab_size = len(tokenizer.word_index) + 1

    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    model.fit(Xtrain, ytrain, epochs=20, verbose=1)
    # evaluate
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy: %f' % (acc*100))

    model.save("relevancy_model_v2.0.1.h5")
    print("Done!")
Example 24
        _precision, _recall, _f1, _sample = precision_recall_fscore_support(
            y_test, y_val_pred)

        self.precisions.append(_precision)
        self.recalls.append(_recall)
        self.f1_scores.append(_f1)


metrics = ModelMetrics()

# ML model ----------------------------------------------

epochs = 10
ml_model1 = Sequential()

ml_model1.add(Embedding(max_features, 128, input_length=maxlen))
ml_model1.add(LSTM(128))
ml_model1.add(Dropout(0.5))
ml_model1.add(Dense(1))
ml_model1.add(Activation('sigmoid'))

ml_model1.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['mae', 'acc'])

## Splitting the test and train dataset
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)
        self.precisions.append(_precision)
        self.recalls.append(_recall)
        self.f1_scores.append(_f1)


metrics = ModelMetrics()

# ML model ----------------------------------------------

epochs = 10
#ml_model1 = Sequential()

### CNN Code
text_input = Input(shape=(maxlen, ), name='text_input')
x = Embedding(input_dim=max_features, input_length=maxlen,
              output_dim=128)(text_input)

conv_a = Conv1D(15, 2, activation='relu')(x)
conv_b = Conv1D(15, 3, activation='relu')(x)
conv_c = Conv1D(15, 4, activation='relu')(x)
conv_d = Conv1D(15, 5, activation='relu')(x)
conv_e = Conv1D(15, 6, activation='relu')(x)

pool_a = GlobalMaxPooling1D()(conv_a)
pool_b = GlobalMaxPooling1D()(conv_b)
pool_c = GlobalMaxPooling1D()(conv_c)
pool_d = GlobalMaxPooling1D()(conv_d)
pool_e = GlobalMaxPooling1D()(conv_e)

flattened = concatenate([pool_a, pool_b, pool_c, pool_d, pool_e])
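# The CNN branch stops at the concatenation; a hedged completion (not in the original
# snippet), mirroring the sigmoid output and compile settings used for ml_model1 above.
# Assumes Dense, Dropout and Model are imported from the same Keras namespace as the
# layers already in use.
x = Dropout(0.5)(flattened)
output = Dense(1, activation='sigmoid')(x)
cnn_model = Model(inputs=text_input, outputs=output)
cnn_model.compile(loss='binary_crossentropy', optimizer='rmsprop',
                  metrics=['mae', 'acc'])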
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(df['data'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(df['class']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(35))
model.add(LSTM(100))
model.add(Dense(9, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.005),
              metrics=['accuracy'])

epochs = 2
batch_size = 100

history = model.fit(X_train,
                    Y_train,
                    epochs=epochs,
                    batch_size=batch_size)