def build_model_lstm_emb(filters, vocab_size, n_dim, embedding_matrix,
                         len_max_tweet, lr):
    """
    Build an LSTM network on top of a pre-trained embedding matrix.
    The model is compiled with binary cross-entropy loss and the Adam optimizer.

    :param filters: Number of units in the LSTM layer
    :param vocab_size: Size of the vocabulary
    :param n_dim: Dimension of the word embeddings
    :param embedding_matrix: Pre-trained embedding weights of shape (vocab_size, n_dim)
    :param len_max_tweet: Maximum tweet length (input sequence length)
    :param lr: Learning rate for the Adam optimizer
    :return: Compiled LSTM network
    """
    model = Sequential([
        Embedding(vocab_size,
                  n_dim,
                  weights=[embedding_matrix],
                  input_length=len_max_tweet,
                  trainable=False),
        LSTM(filters),
        Dense(1, activation='sigmoid')
    ])
    opt = optimizers.Adam(learning_rate=lr)
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model
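# A minimal usage sketch for build_model_lstm_emb (not from the original source): a random
# matrix stands in for pre-trained embedding weights, and the values chosen for `filters`,
# `vocab_size`, `n_dim`, `len_max_tweet`, and `lr` are illustrative assumptions.
import numpy as np

vocab_size, n_dim, len_max_tweet = 5000, 50, 30
embedding_matrix = np.random.uniform(-0.05, 0.05, (vocab_size, n_dim))  # placeholder weights
model = build_model_lstm_emb(filters=64, vocab_size=vocab_size, n_dim=n_dim,
                             embedding_matrix=embedding_matrix,
                             len_max_tweet=len_max_tweet, lr=1e-3)
model.summary()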
def build(vocab_size, max_length):
    embedding = 100
    model = Sequential()
    model.add(Embedding(vocab_size, embedding, input_length=max_length))
    model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
def CNNmodel(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model
def create_model():
    model = Sequential()
    model.add(
        Embedding(len(aa_tokenizer.word_index) + 1, 256,
                  input_length=max_length))
    model.add(
        Bidirectional(LSTM(128, return_sequences=True,
                           recurrent_dropout=0.1)))
    model.add(
        TimeDistributed(
            Dense(len(dssp_tokenizer.word_index) + 1, activation='softmax')))
    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=[mask_acc])
    return model
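# `mask_acc` is used in the compile call above but not defined in this snippet. A minimal
# sketch of a masked categorical accuracy, assuming index 0 is the padding token produced
# by pad_sequences (an assumption, not the original implementation):
from tensorflow.keras import backend as K

def mask_acc(y_true, y_pred):
    # ignore timesteps whose true class is the padding index 0
    true_classes = K.argmax(y_true, axis=-1)
    pred_classes = K.argmax(y_pred, axis=-1)
    mask = K.cast(K.not_equal(true_classes, 0), K.floatx())
    matches = K.cast(K.equal(true_classes, pred_classes), K.floatx()) * mask
    return K.sum(matches) / K.maximum(K.sum(mask), 1.0)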
def __init__(self, input_dim, input_len, hidden_state_size=100):
    """
    input_dim is the number of different values that can occur as input,
    i.e. for utterances input_dim=vocab_size.
    """
    # self.model = Sequential([
    #     Flatten(input_dim=input_dim, output_dim=output_dim),
    #     Embedding(),
    #     LSTM(hidden_state_size)
    # ])
    self.model = Sequential([
        Embedding(input_dim=input_dim,
                  output_dim=hidden_state_size,
                  input_length=input_len),
    ])
    self.lstm = Sequential([
        LSTM(input_shape=(1, input_len * hidden_state_size),
             units=hidden_state_size)
    ])
def bidirectional_model():
    length_vocab, embedding_size = word2vec.shape
    model = Sequential()
    model.add(
        Embedding(length_vocab,
                  embedding_size,
                  input_length=parameters.max_length,
                  weights=[word2vec],
                  mask_zero=True,
                  name='embedding_layer'))
    for i in range(parameters.rnn_layers):
        bilstm = Bidirectional(
            LSTM(parameters.rnn_size,
                 return_sequences=True,
                 name='bilstm_layer_%d' % (i + 1)))
        model.add(bilstm)
    model.add(
        Lambda(simple_context,
               mask=lambda inputs, mask: mask[:, parameters.max_len_desc:],
               output_shape=lambda input_shape:
               (input_shape[0], parameters.max_len_head,
                2 * (parameters.rnn_size - parameters.activation_rnn_size)),
               name='simple_context_layer'))
    vocab_size = word2vec.shape[0]
    model.add(TimeDistributed(Dense(vocab_size,
                                    name='time_distributed_layer')))
    model.add(Activation('softmax', name='activation_layer'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    K.set_value(model.optimizer.lr, np.float32(parameters.learning_rate))
    print(model.summary())
    return model
def create_model():
    length_vocab, embedding_size = word2vec.shape
    print("shape of word2vec matrix ", word2vec.shape)
    model = Sequential()
    model.add(
        Embedding(length_vocab,
                  embedding_size,
                  input_length=parameters.max_length,
                  weights=[word2vec],
                  mask_zero=True,
                  name='embedding_layer'))
    for i in range(parameters.rnn_layers):
        gru = GRU(parameters.rnn_size,
                  return_sequences=True,
                  name='gru_layer_%d' % (i + 1))
        model.add(gru)
    model.add(
        Lambda(simple_context,
               mask=lambda inputs, mask: mask[:, parameters.max_len_desc:],
               output_shape=output_shape_simple_context_layer,
               name='simple_context_layer'))
    vocab_size = word2vec.shape[0]
    model.add(TimeDistributed(Dense(vocab_size,
                                    name='time_distributed_layer')))
    model.add(Activation('softmax', name='activation_layer'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    K.set_value(model.optimizer.lr, np.float32(parameters.learning_rate))
    print(model.summary())
    return model
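# `output_shape_simple_context_layer` is referenced above but not defined in this snippet.
# A minimal sketch, assuming it mirrors the inline output_shape lambda used in
# bidirectional_model (same `parameters` object, same simple_context semantics):
def output_shape_simple_context_layer(input_shape):
    # batch dimension, headline length, and twice the non-attention part of the RNN state
    return (input_shape[0], parameters.max_len_head,
            2 * (parameters.rnn_size - parameters.activation_rnn_size))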
# get each word's one-hot index out of the voc_size-word dictionary
onehot_repr = [one_hot(words, voc_size) for words in sent]
print(onehot_repr)

# word embedding representation
from tensorflow.python.keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences  # make the sentences equal length
from tensorflow.python.keras import Sequential  # needed for the embedding
import numpy as np

sent_length = 8  # maximum sentence length
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

dim = 10  # how many features per word

# add the embedding layer to the sequential model
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
model.compile()
model.summary()

# see how the words got converted
model.predict(embedded_docs).shape
embedded_docs[10]
model.predict(embedded_docs)[0]  # the 8 words; for each word, a vector of 10 floats
from numpy import array

# truncate and pad the review sequences
max_review_length = 250
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
print(pd.DataFrame(X_train).head())

# create the model
embedding_vector_length = 128
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import LSTM, Dense, Dropout
from tensorflow.python.keras.layers import SpatialDropout1D
from tensorflow.python.keras.layers import Embedding

model = Sequential()
model.add(Embedding(15001, embedding_vector_length, input_length=250))
model.add(LSTM(100))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
r = model.fit(X_train, y_train,
              validation_data=(X_test, y_test),
              epochs=3,
              batch_size=64)

# Final evaluation of the model
import tensorflow as tf
filename = "my_model.h5"
model.save(filename)
model = tf.keras.models.load_model(filename)
scores = model.evaluate(X_test, y_test, verbose=0)

plt.plot(r.history['loss'], label='loss')
plt.plot(r.history['val_loss'], label='val_loss')
plt.legend()
plt.show()
'''
Use the pretrained Word2Vec model from Google but trim the word list to 50,000
words, compared to the 3,000,000 in the original Google pretrained model.
'''
w2vModel = word2vec.KeyedVectors.load_word2vec_format(
    'D:/twittersentiment1/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=50000)

# convert words to integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets_split)
X = tokenizer.texts_to_sequences(tweets_split)

# length of tweet to consider
maxlentweet = 10
# add padding
X = pad_sequences(X, maxlen=maxlentweet)
print(X.shape)

# create an embedding layer using the Google pre-trained word2vec (50000 words)
embedding_layer = Embedding(input_dim=w2vModel.syn0.shape[0],
                            output_dim=w2vModel.syn0.shape[1],
                            weights=[w2vModel.syn0],
                            input_length=X.shape[1])

# create model
lstm_out = 80
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(units=lstm_out))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

# split dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, labels,
                                                    test_size=0.1,
                                                    random_state=24)

# fit model
batch_size = 1
model.fit(X_train, Y_train, epochs=10, verbose=1, batch_size=batch_size)

# analyze the results
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print('Word counts:', token.word_counts)
print('Document count:', token.document_count)
print('How many documents contain each word:', token.word_docs)
print('Index assigned to each word:', token.word_index)
print()

# read the texts and predict positive / negative sentiment
docs = ['너무 재밌네요', '최고에요', '참 잘만든 영화예요', '추천하고 싶은 영화네요', '한번 더 보고싶네요',
        '글쎄요', '별로네요', '생각보다 지루합니다', '연기가 좋지않아요', '재미없어요']

import numpy as np
classes = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
word_size = len(token.word_index) + 1  # vocabulary size (+1 for the padding index)

model = Sequential()
model.add(Embedding(word_size, 8, input_length=4))
#model.add(Flatten())
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
model.compile(optimizer='adam', loss='binary_crossentropy')
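# The Embedding above expects sequences padded to length 4, but the padding and training
# steps are not shown in this snippet. A minimal sketch, assuming keras pad_sequences and
# the `docs`/`classes` arrays defined above:
from keras.preprocessing.sequence import pad_sequences

x = token.texts_to_sequences(docs)
padded_x = pad_sequences(x, maxlen=4)  # pad/truncate each review to 4 tokens
model.fit(padded_x, classes, epochs=20)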
ytrain = np.array([0 for _ in range(900)] + [1 for _ in range(900)])

positive_docs = process_docs('/home/sreekesh/python/NLP/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('/home/sreekesh/python/NLP/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
encoded_docs = tokenizer.texts_to_sequences(test_docs)
print(encoded_docs)
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
ytest = np.array([0 for _ in range(100)] + [1 for _ in range(100)])

v_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(v_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, input_dim=10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(Xtrain, ytrain, epochs=10)
_, accu = model.evaluate(Xtest, ytest)
print("accuracy : {}".format(accu * 100))

from numpy import loadtxt
def train_model(self, sentences_pair, is_similar, embedding_meta_data,
                model_save_directory='./'):
    """
    Train a Siamese network to find the similarity between the sentences in
    `sentences_pair`.

    Steps involved:
        1. Pass each sentence from sentences_pair to the bidirectional LSTM encoder.
        2. Merge the vectors from the LSTM encoders and pass them to a dense layer.
        3. Pass the dense layer vectors to the sigmoid output layer.
        4. Use binary cross-entropy loss to train the weights.

    Args:
        sentences_pair (list): list of tuples of sentence pairs
        is_similar (list): target value, 1 if the sentences in a pair are similar, otherwise 0
        embedding_meta_data (dict): dict containing the tokenizer and word embedding matrix
        model_save_directory (str): working directory in which to save models

    Returns:
        best_model_path (str): path of the best model
    """
    tokenizer, embedding_matrix = embedding_meta_data[
        'tokenizer'], embedding_meta_data['embedding_matrix']

    train_data_x1, train_data_x2, train_labels, leaks_train, val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
        tokenizer, sentences_pair, is_similar, self.max_sequence_length,
        self.validation_split_ratio)

    if train_data_x1 is None:
        print("++++ !! Failure: Unable to train model ++++")
        return None

    nb_words = len(tokenizer.word_index) + 1

    # Creating word embedding layer
    # embedding_layer = Embedding(nb_words, self.embedding_dim, weights=[embedding_matrix],
    #                             input_length=self.max_sequence_length, trainable=False)
    embedding_layer = Embedding(nb_words,
                                self.embedding_dim,
                                input_length=self.max_sequence_length,
                                trainable=False)

    # Creating LSTM Encoder
    lstm_layer = Bidirectional(
        LSTM(self.number_lstm_units,
             dropout=self.rate_drop_lstm,
             recurrent_dropout=self.rate_drop_lstm))

    # Creating LSTM Encoder layer for First Sentence
    sequence_1_input = Input(shape=(self.max_sequence_length, ), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    # Creating LSTM Encoder layer for Second Sentence
    sequence_2_input = Input(shape=(self.max_sequence_length, ), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = lstm_layer(embedded_sequences_2)

    # Creating leaks input
    leaks_input = Input(shape=(leaks_train.shape[1], ))
    leaks_dense = Dense(int(self.number_dense_units / 2),
                        activation=self.activation_function)(leaks_input)

    # Merge the two LSTM-encoded sentence vectors and pass them to a dense layer,
    # applying dropout and batch normalisation
    merged = concatenate([x1, x2, leaks_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    merged = Dense(self.number_dense_units,
                   activation=self.activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input],
                  outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='nadam',
                  metrics=['acc'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=20)

    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (
        self.number_lstm_units, self.number_dense_units, self.rate_drop_lstm,
        self.rate_drop_dense)

    checkpoint_dir = model_save_directory + 'checkpoints/' + str(
        int(time.time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'

    model_checkpoint = ModelCheckpoint(bst_model_path,
                                       save_best_only=True,
                                       save_weights_only=False)

    tensorboard = TensorBoard(log_dir=checkpoint_dir +
                              "logs/{}".format(time.time()))

    model.fit([train_data_x1, train_data_x2, leaks_train],
              train_labels,
              validation_data=([val_data_x1, val_data_x2, leaks_val],
                               val_labels),
              epochs=200,
              batch_size=64,
              shuffle=True,
              callbacks=[early_stopping, model_checkpoint, tensorboard])

    return bst_model_path
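# A minimal usage sketch for train_model (not from the original source). `siamese`,
# `tokenizer`, and `embedding_matrix` are assumed to exist: `siamese` is a hypothetical
# instance of the class defining this method, and embedding_meta_data is shaped the way
# the docstring describes ({'tokenizer': ..., 'embedding_matrix': ...}).
sentences_pair = [("how do I learn keras", "how can I learn keras"),
                  ("what is an LSTM", "where is the nearest station")]
is_similar = [1, 0]
embedding_meta_data = {'tokenizer': tokenizer,
                       'embedding_matrix': embedding_matrix}
best_model_path = siamese.train_model(sentences_pair, is_similar,
                                      embedding_meta_data,
                                      model_save_directory='./')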
else:
    embedding = np.random.uniform(-1.0 / 2.0 / G.embedding_dimension,
                                  1.0 / 2.0 / G.embedding_dimension,
                                  (G.vocab_size, G.embedding_dimension))
    np.save(aFile, embedding)

embeddingTwo = np.zeros((G.vocab_size, G.embedding_dimension))

# Creating CBOW model
# The model has 3 inputs:
# current word index, context word indexes, and negative sampled word indexes
word_index = Input(shape=(1, ), name="word")
context = Input(shape=(context_size, ), name="context")
negative_samples = Input(shape=(G.vocab_size - 1, ), name="negative")

# All the inputs are processed through a common embedding layer
shared_embedding_layer = Embedding(input_dim=G.vocab_size,
                                   output_dim=G.embedding_dimension,
                                   weights=[embedding])
shared_embedding_layer2 = Embedding(input_dim=G.vocab_size,
                                    output_dim=G.embedding_dimension,
                                    weights=[embeddingTwo])

word_embedding = shared_embedding_layer(word_index)
word_embedding = Lambda(lambda x: x * 1)(word_embedding)
context_embeddings = shared_embedding_layer2(context)
negative_words_embedding = shared_embedding_layer(negative_samples)
negative_words_embedding = Lambda(lambda x: x * 1)(negative_words_embedding)

# Now the context words are averaged to get the CBOW vector
cbow = Lambda(lambda x: K.mean(x, axis=1),
              output_shape=(G.embedding_dimension, ))(context_embeddings)

# The context is multiplied (dot product) with the current word and the negative sampled words
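# The snippet ends at the comment above. A minimal sketch of how that dot-product step
# might be wired, assuming the usual CBOW-with-negative-sampling layout (sigmoid scores
# for the true word and each negative sample); this is an assumption, not the original code.
from tensorflow.keras.layers import Activation, Flatten, Reshape, dot
from tensorflow.keras.models import Model

cbow_expanded = Reshape((G.embedding_dimension, 1))(cbow)  # (batch, dim, 1)

word_context_product = dot([word_embedding, cbow_expanded], axes=(2, 1))
word_context_product = Flatten()(word_context_product)
word_context_product = Activation('sigmoid')(word_context_product)

negative_context_product = dot([negative_words_embedding, cbow_expanded], axes=(2, 1))
negative_context_product = Flatten()(negative_context_product)
negative_context_product = Activation('sigmoid')(negative_context_product)

cbow_model = Model(inputs=[word_index, context, negative_samples],
                   outputs=[word_context_product, negative_context_product])
cbow_model.compile(optimizer='adam', loss='binary_crossentropy')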
def build_model(self):
    """Helper method for creating the model"""
    vocab = set()
    for story, q, answer in self.train_stories + self.test_stories:
        vocab |= set(story + q + [answer])
    vocab = sorted(vocab)

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    story_maxlen = max(
        len(x) for x, _, _ in self.train_stories + self.test_stories)
    query_maxlen = max(
        len(x) for _, x, _ in self.train_stories + self.test_stories)

    word_idx = {c: i + 1 for i, c in enumerate(vocab)}
    self.inputs_train, self.queries_train, self.answers_train = (
        vectorize_stories(word_idx, story_maxlen, query_maxlen,
                          self.train_stories))
    self.inputs_test, self.queries_test, self.answers_test = (
        vectorize_stories(word_idx, story_maxlen, query_maxlen,
                          self.test_stories))

    # placeholders
    input_sequence = Input((story_maxlen, ))
    question = Input((query_maxlen, ))

    # encoders
    # embed the input sequence into a sequence of vectors
    input_encoder_m = Sequential()
    input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
    input_encoder_m.add(Dropout(self.config.get("dropout", 0.3)))
    # output: (samples, story_maxlen, embedding_dim)

    # embed the input into a sequence of vectors of size query_maxlen
    input_encoder_c = Sequential()
    input_encoder_c.add(
        Embedding(input_dim=vocab_size, output_dim=query_maxlen))
    input_encoder_c.add(Dropout(self.config.get("dropout", 0.3)))
    # output: (samples, story_maxlen, query_maxlen)

    # embed the question into a sequence of vectors
    question_encoder = Sequential()
    question_encoder.add(
        Embedding(input_dim=vocab_size,
                  output_dim=64,
                  input_length=query_maxlen))
    question_encoder.add(Dropout(self.config.get("dropout", 0.3)))
    # output: (samples, query_maxlen, embedding_dim)

    # encode the input sequence and questions (which are indices)
    # to sequences of dense vectors
    input_encoded_m = input_encoder_m(input_sequence)
    input_encoded_c = input_encoder_c(input_sequence)
    question_encoded = question_encoder(question)

    # compute a "match" between the first input vector sequence
    # and the question vector sequence
    # shape: `(samples, story_maxlen, query_maxlen)`
    match = dot([input_encoded_m, question_encoded], axes=(2, 2))
    match = Activation("softmax")(match)

    # add the match matrix with the second input vector sequence
    response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
    response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

    # concatenate the match matrix with the question vector sequence
    answer = concatenate([response, question_encoded])

    # the original paper uses a matrix multiplication.
    # we choose to use an RNN instead.
    answer = LSTM(32)(answer)  # (samples, 32)

    # one regularization layer -- more would probably be needed.
    answer = Dropout(self.config.get("dropout", 0.3))(answer)
    answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
    # we output a probability distribution over the vocabulary
    answer = Activation("softmax")(answer)

    # build the final model
    model = Model([input_sequence, question], answer)
    return model
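# build_model returns an uncompiled model. A minimal sketch of compiling and training it
# (an assumption, not from the original source): `qa` stands for a hypothetical instance
# of the class that defines build_model, and vectorize_stories is assumed to have
# populated qa.inputs_train, qa.queries_train, qa.answers_train, and the test variants.
# If the answers are one-hot vectors, categorical_crossentropy fits; if they are integer
# indices, use sparse_categorical_crossentropy instead.
net = qa.build_model()
net.compile(optimizer='rmsprop',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
net.fit([qa.inputs_train, qa.queries_train], qa.answers_train,
        batch_size=32, epochs=10,
        validation_data=([qa.inputs_test, qa.queries_test], qa.answers_test))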
token = Tokenizer(num_words=2000)
token.fit_on_texts(comment_dic)
# print token.document_count
# print token.word_index
x_train_seq = token.texts_to_sequences(comment_dic)
x_train = sequence.pad_sequences(x_train_seq, maxlen=200, padding='post')
all_labels = [1] * len(comment_dic_neg) + [0] * len(comment_dic_pos)
print(len(comment_dic_neg), len(comment_dic_pos), len(comment_dic))

# build the network
model = Sequential()
model.add(Embedding(output_dim=20, input_dim=2000, input_length=200))
model.add(Dropout(0.1))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x=x_train,
          y=all_labels,
          batch_size=500,
          validation_split=0.2,
          epochs=5)
from os.path import isfile
from read_tc import ReadTC
from constants import *

tc = ReadTC('train.csv', input_length, vocab_size, train_percent)

if isfile(filename):
    print("\nLOADING EXISTING NETWORK\n\n")
    model = load_model(filename)
else:
    print("\nBUILDING NEW NETWORK\n\n")
    model = Sequential()
    model.add(
        Embedding(vocab_size,
                  embed_size,
                  batch_input_shape=(batch_size, input_length)))
    model.add(Flatten())
    model.add(Dense(2, activation='sigmoid'))
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['acc'])

save_callback = ModelCheckpoint(filename)
model.fit_generator(tc.get_train_data(batch_size),
                    batch_size,
                    num_epochs,
                    validation_data=tc.get_test_data(batch_size),
                    validation_steps=batch_size,
                    callbacks=[save_callback])
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.layers import Flatten
from tensorflow.python.keras.layers.embeddings import Embedding
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.layers.convolutional import Conv1D
from tensorflow.python.keras.layers.convolutional import MaxPooling1D

top_words = 3000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

max_words = 300
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)

model = Sequential()
model.add(Embedding(top_words, 32, input_length=max_words))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.001, seed=0))
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5)
def _build_model(self):
    with tf.name_scope("inputs"):
        user_input = tf.keras.Input(shape=(self.history_length, 3))
        label_input = tf.keras.Input(shape=(self.history_length, 1))
        mask_input = tf.keras.Input(shape=(self.history_length, 1))

    with tf.name_scope("layers"):
        embedding = Embedding(input_dim=self.vocab_size,
                              output_dim=self.embedding_dim,
                              weights=[self.embedding_mx],
                              trainable=False)
        session_cells = [
            GRUCell(units=self.num_units, name="sesion_rnn_01"),
            GRUCell(units=self.num_units, name="sesion_rnn_02")
            # GRUCell(units=self.num_units, name="sesion_rnn_03")
        ]
        user_cells = [
            GRUCell(units=self.num_units, name="user_rnn_01"),
            GRUCell(units=self.num_units, name="user_rnn_02")
            # GRUCell(units=self.num_units, name="user_rnn_03")
        ]
        cell = HierarchicalRNNCell(user_cells=user_cells,
                                   session_cells=session_cells,
                                   embedding_layer=embedding)
        recurrent = RNN(cell=cell, return_sequences=True, return_state=True)

    with tf.name_scope("loss"):
        loss = RankingLoss(num_units=self.num_units,
                           num_sampled=self.num_negatives,
                           num_classes=self.vocab_size - 1,
                           num_true=1,
                           history_length=self.history_length,
                           remove_accidental_hits=True)
        time_distributed = TimeDistributed(
            loss, input_shape=(self.history_length, self.num_units + 1))

    with tf.name_scope("model"):
        tensor = recurrent(inputs=user_input)
        outputs = tensor[0]
        outputs = tf.concat([outputs, label_input], axis=2)
        tensor = time_distributed(outputs)

        # loss
        loss = tf.gather(tensor, [0], axis=2)
        loss = tf.multiply(loss, mask_input, name="loss")

        # prediction
        prediction = tf.gather(tensor, [1], axis=2)
        prediction = tf.multiply(prediction, mask_input, name="prediction")

        # build the model
        model = tf.keras.Model(inputs=[user_input, label_input, mask_input],
                               outputs=[loss, prediction])
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss={
                'tf_op_layer_loss': custom_loss,
                'tf_op_layer_prediction': 'binary_crossentropy'
            },
            loss_weights={
                'tf_op_layer_loss': 1.0,
                'tf_op_layer_prediction': 0.0
            },
            metrics={'tf_op_layer_prediction': custom_acc})
    return model
for word, i in tokenizer.word_index.items():
    if i > vocabSize:
        continue
    if word in word2vecModel.wv.vocab.keys():
        embeddingWeights[i] = word2vecModel.wv.get_vector(word)

XTrainTokens = tokenizer.texts_to_sequences(X_train)
XTrainPad = pad_sequences(XTrainTokens, maxlen=maxLength, padding='post')
XTestTokens = tokenizer.texts_to_sequences(X_test)
XTestPad = pad_sequences(XTestTokens, maxlen=maxLength, padding='post')

biGRU = Sequential()
biGRU.add(
    Embedding(vocabSize,
              embDim,
              embeddings_initializer=Constant(embeddingWeights),
              input_length=maxLength,
              mask_zero=True))
biGRU.add(Bidirectional(GRU(units=20, dropout=0.3)))
biGRU.add(Dense(1))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
lossFunction = tf.keras.losses.MeanSquaredError()
biGRU.compile(optimizer=optimizer, loss=lossFunction)

print('\nTraining Deep Learning Model\n')
biGRU.fit(XTrainPad, y_train, batch_size=256, epochs=20)
# model.save('my_model.h5')
preds = biGRU.predict(XTestPad)
max_review_length = 150
mapped_list = sequence.pad_sequences(mapped_list, maxlen=max_review_length)
train_x, test_x, train_y, test_y = train_test_split(mapped_list,
                                                    varietal_list,
                                                    test_size=0.3)

max_review_length = 150
embedding_vector_length = 64

model = Sequential()
model.add(
    Embedding(2500, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(50, 5))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(max(varietal_list_o) + 1, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(train_x, train_y, epochs=3, batch_size=64)
def build_embedding_network():
    inputs = []
    embeddings = []

    input_ps_ind_02_cat = Input(shape=(1, ))
    embedding = Embedding(5, 3, input_length=1)(input_ps_ind_02_cat)
    embedding = Reshape(target_shape=(3, ))(embedding)
    inputs.append(input_ps_ind_02_cat)
    embeddings.append(embedding)

    input_ps_ind_04_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_ind_04_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_ind_04_cat)
    embeddings.append(embedding)

    input_ps_ind_05_cat = Input(shape=(1, ))
    embedding = Embedding(8, 5, input_length=1)(input_ps_ind_05_cat)
    embedding = Reshape(target_shape=(5, ))(embedding)
    inputs.append(input_ps_ind_05_cat)
    embeddings.append(embedding)

    input_ps_car_01_cat = Input(shape=(1, ))
    embedding = Embedding(13, 7, input_length=1)(input_ps_car_01_cat)
    embedding = Reshape(target_shape=(7, ))(embedding)
    inputs.append(input_ps_car_01_cat)
    embeddings.append(embedding)

    input_ps_car_02_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_02_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_02_cat)
    embeddings.append(embedding)

    input_ps_car_03_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_03_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_03_cat)
    embeddings.append(embedding)

    input_ps_car_04_cat = Input(shape=(1, ))
    embedding = Embedding(10, 5, input_length=1)(input_ps_car_04_cat)
    embedding = Reshape(target_shape=(5, ))(embedding)
    inputs.append(input_ps_car_04_cat)
    embeddings.append(embedding)

    input_ps_car_05_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_05_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_05_cat)
    embeddings.append(embedding)

    input_ps_car_06_cat = Input(shape=(1, ))
    embedding = Embedding(18, 8, input_length=1)(input_ps_car_06_cat)
    embedding = Reshape(target_shape=(8, ))(embedding)
    inputs.append(input_ps_car_06_cat)
    embeddings.append(embedding)

    input_ps_car_07_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_07_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_07_cat)
    embeddings.append(embedding)

    input_ps_car_09_cat = Input(shape=(1, ))
    embedding = Embedding(6, 3, input_length=1)(input_ps_car_09_cat)
    embedding = Reshape(target_shape=(3, ))(embedding)
    inputs.append(input_ps_car_09_cat)
    embeddings.append(embedding)

    input_ps_car_10_cat = Input(shape=(1, ))
    embedding = Embedding(3, 2, input_length=1)(input_ps_car_10_cat)
    embedding = Reshape(target_shape=(2, ))(embedding)
    inputs.append(input_ps_car_10_cat)
    embeddings.append(embedding)

    input_ps_car_11_cat = Input(shape=(1, ))
    embedding = Embedding(104, 10, input_length=1)(input_ps_car_11_cat)
    embedding = Reshape(target_shape=(10, ))(embedding)
    inputs.append(input_ps_car_11_cat)
    embeddings.append(embedding)

    input_numeric = Input(shape=(24, ))
    embedding_numeric = Dense(16)(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)

    x = Concatenate()(embeddings)
    x = Dense(80, activation='relu')(x)
    x = Dropout(.35)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.15)(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(.15)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs, output)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model
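# build_embedding_network expects its inputs as a list: one single-column array per
# categorical feature, in the order the Input layers are defined above, followed by one
# array of 24 numeric columns. A minimal sketch of preparing that list from a pandas
# DataFrame `df` is shown below; `df` and the 'target' label column are assumptions.
cat_cols = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
            'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
            'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_10_cat',
            'ps_car_11_cat']
num_cols = [c for c in df.columns if c not in cat_cols + ['target']]  # assumed 24 numeric columns

model_inputs = [df[c].values for c in cat_cols] + [df[num_cols].values]

model = build_embedding_network()
model.fit(model_inputs, df['target'].values, epochs=10, batch_size=4096)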
def make_model():
    # load all training reviews
    positive_docs = process_docs('data/pos_train', vocab, True)
    negative_docs = process_docs('data/neg_train', vocab, True)
    train_docs = negative_docs + positive_docs

    # create the tokenizer
    tokenizer = Tokenizer()
    # fit the tokenizer on the documents
    tokenizer.fit_on_texts(train_docs)
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(train_docs)
    # pad sequences
    max_length = max([len(s.split()) for s in train_docs])
    print("\n\n max_length=" + str(max_length))
    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    X = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define training labels
    y = np.array([0 for _ in range(270)] + [1 for _ in range(270)])
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=42)
    '''
    # load all test reviews
    positive_docs = process_docs('data/pos_test', vocab, False)
    negative_docs = process_docs('data/neg_test', vocab, False)
    test_docs = negative_docs + positive_docs
    # sequence encode
    encoded_docs = tokenizer.texts_to_sequences(test_docs)
    # pad sequences
    Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
    # define test labels
    ytest = np.array([0 for _ in range(len(listdir("data/neg_test")))] +
                     [1 for _ in range(len(listdir("data/pos_test")))])
    '''
    print("\n pad_sequences : ", Xtest)
    print("\n ytest : ", ytest)

    # define vocabulary size (largest integer value)
    vocab_size = len(tokenizer.word_index) + 1

    # define model
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())

    # compile network
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # fit network
    model.fit(Xtrain, ytrain, epochs=20, verbose=1)
    # evaluate
    loss, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy: %f' % (acc * 100))
    model.save("relevancy_model_v2.0.1.h5")
    print("Done!")
_precision, _recall, _f1, _sample = precision_recall_fscore_support(
    y_test, y_val_pred)
self.precisions.append(_precision)
self.recalls.append(_recall)
self.f1_scores.append(_f1)


metrics = ModelMetrics()

# ML model ----------------------------------------------
epochs = 10

ml_model1 = Sequential()
ml_model1.add(Embedding(max_features, 128, input_length=maxlen))
ml_model1.add(LSTM(128))
ml_model1.add(Dropout(0.5))
ml_model1.add(Dense(1))
ml_model1.add(Activation('sigmoid'))
ml_model1.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['mae', 'acc'])

## Splitting the test and train dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
self.precisions.append(_precision)
self.recalls.append(_recall)
self.f1_scores.append(_f1)


metrics = ModelMetrics()

# ML model ----------------------------------------------
epochs = 10
#ml_model1 = Sequential()

### CNN Code
text_input = Input(shape=(maxlen, ), name='text_input')
x = Embedding(input_dim=max_features, input_length=maxlen,
              output_dim=128)(text_input)

conv_a = Conv1D(15, 2, activation='relu')(x)
conv_b = Conv1D(15, 3, activation='relu')(x)
conv_c = Conv1D(15, 4, activation='relu')(x)
conv_d = Conv1D(15, 5, activation='relu')(x)
conv_e = Conv1D(15, 6, activation='relu')(x)

pool_a = GlobalMaxPooling1D()(conv_a)
pool_b = GlobalMaxPooling1D()(conv_b)
pool_c = GlobalMaxPooling1D()(conv_c)
pool_d = GlobalMaxPooling1D()(conv_d)
pool_e = GlobalMaxPooling1D()(conv_e)

flattened = concatenate([pool_a, pool_b, pool_c, pool_d, pool_e])
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(df['data'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(df['class']).values
print('Shape of label tensor:', Y.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.25,
                                                    random_state=42)

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(35))
model.add(LSTM(100))
model.add(Dense(9, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.005),
              metrics=['accuracy'])

epochs = 2
batch_size = 100

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)
self.precisions.append(_precision)
self.recalls.append(_recall)
self.f1_scores.append(_f1)


metrics = ModelMetrics()

# ML model ----------------------------------------------
epochs = 10

ml_model1 = Sequential()

### CNN Code
text_input = Input(shape=(maxlen, ), name='text_input')
x = Embedding(input_dim=max_features, input_length=maxlen,
              output_dim=128)(text_input)

conv_a = Conv1D(15, 2, activation='relu')(x)
conv_b = Conv1D(15, 3, activation='relu')(x)
conv_c = Conv1D(15, 4, activation='relu')(x)
conv_d = Conv1D(15, 5, activation='relu')(x)
conv_e = Conv1D(15, 6, activation='relu')(x)

pool_a = GlobalMaxPooling1D()(conv_a)
pool_b = GlobalMaxPooling1D()(conv_b)
pool_c = GlobalMaxPooling1D()(conv_c)
pool_d = GlobalMaxPooling1D()(conv_d)
pool_e = GlobalMaxPooling1D()(conv_e)

flattened = concatenate([pool_a, pool_b, pool_c, pool_d, pool_e])