def load_data():
    x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
    y = y.argmax(axis=1)

    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    train_len = int(len(x) * 0.9)
    x_train = x[:train_len]
    y_train = y[:train_len]
    x_test = x[train_len:]
    y_test = y[train_len:]

    embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=300,
                                       min_word_count=1,
                                       context=10)
    x_train = np.array([[embedding_weights[j] for j in i] for i in x_train])
    x_test = np.array([[embedding_weights[j] for j in i] for i in x_test])
    return x_train, y_train, x_test, y_test, vocabulary_inv
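# Every snippet in this file relies on a project-specific w2v.train_word2vec
# helper whose implementation varies between the projects collected here. The
# following is a minimal, hypothetical sketch of the common pattern (train a
# gensim Word2Vec model on the index-encoded sentences and return a dict that
# maps each word index to its vector), not any one project's actual code.
import numpy as np
from gensim.models import word2vec

def train_word2vec_sketch(sentence_matrix, vocabulary_inv,
                          num_features=300, min_word_count=1, context=10):
    # Map index-encoded sentences back to token lists for gensim.
    sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
    model = word2vec.Word2Vec(sentences,
                              vector_size=num_features,  # `size=` on gensim < 4.0
                              min_count=min_word_count,
                              window=context)
    # Words dropped by min_count get small random vectors instead.
    return {key: model.wv[word] if word in model.wv
            else np.random.uniform(-0.25, 0.25, num_features)
            for key, word in vocabulary_inv.items()}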
def get_embedding_weights(train_x, test_x, vocabulary_inv, min_count=1, context=10):
    # Trains word2vec on the combined train/test matrix; w2c_len (the vector
    # size) is expected to be defined at module level.
    x = np.concatenate((train_x, test_x), axis=0)
    return train_word2vec(x, vocabulary_inv, w2c_len, min_count, context)
def create_model():
    final_paths = "files_path.txt"
    model_path = "models" + slash + "model_5E.model"
    embedding_weights = train_word2vec(vocabulary_inv,
                                       final_paths=final_paths,
                                       model_paths=model_path,
                                       option="load")
    model = make_configuration()
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    initialize_weights(model, embedding_weights)
    model.save("trained_models" + slash + "model_one")
    model.save_weights("trained_models" + slash + "model_one_weight")
    return model
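# The option="load" argument above suggests this project's train_word2vec
# caches a trained model on disk. A hypothetical sketch of that caching
# pattern with gensim (the real helper's signature clearly differs):
import os
from gensim.models import word2vec

def load_or_train(sentences, model_path, num_features=300, min_word_count=1, context=10):
    if os.path.exists(model_path):
        return word2vec.Word2Vec.load(model_path)  # reuse the cached model
    model = word2vec.Word2Vec(sentences, vector_size=num_features,
                              min_count=min_word_count, window=context)
    model.save(model_path)                         # cache for the next run
    return model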
import numpy as np
import keras
from keras.models import Sequential, Graph
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.recurrent import LSTM
from keras.utils import np_utils, generic_utils
import data_helpers
from w2v import train_word2vec
from sklearn.cross_validation import StratifiedKFold

cnn_train, lstm_train, Y_train, cnn_vocabulary, cnn_vocabulary_inv, lstm_vocabulary, lstm_vocabulary_inv = data_helpers.load_data()
cnn_embedding_weights, lstm_embedding_weights = train_word2vec(
    cnn_train, cnn_vocabulary_inv, lstm_train, lstm_vocabulary_inv)
# cnn_train = cnn_embedding_weights[0][cnn_train]
# lstm_train = lstm_embedding_weights[0][lstm_train]

shuffle_indices = np.random.permutation(np.arange(len(Y_train)))
cnn_shuffled = cnn_train[shuffle_indices]
lstm_shuffled = lstm_train[shuffle_indices]
Y_train = Y_train[shuffle_indices]
# Y_train_f = np_utils.to_categorical(Y_train, 27)

filter_sizes = (3, 4)
num_filters = 150
hidden_dims = 150

cnn_graph = Graph()
cnn_graph.add_input(name='input', input_shape=(32, 300))
for fsz in filter_sizes:
    conv = Convolution1D(nb_filter=num_filters,
                         filter_length=fsz,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)
from keras.layers import *

embedding_dim = 150       # The size of the word vectors
batch_size = 32           # Batch size for neural network training
num_epochs = 1            # Number of epochs to run training
val_split = 0.2           # Percentage of data to be used for validation
min_word_count = 1        # Minimum word count in sentences
context_window_size = 12  # Size of the context window

# Load training data and vocabulary from data_helpers
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
# Train or load weights for word vectors
embedding_weights = train_word2vec(x, vocabulary_inv, embedding_dim,
                                   min_word_count, context_window_size)

# Shuffle the data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)

print("Vocabulary Size: " + str(len(vocabulary)))

# Declare the model
model = Sequential()
model.add(Embedding(len(vocabulary), embedding_dim, input_length=119,
num_epochs = 100
val_split = 0.1

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10        # Context window size

# Data Preparation
# ==================================================
#
# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

if model_variation == 'CNN-non-static' or model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocabulary_inv, embedding_dim,
                                       min_word_count, context)
    if model_variation == 'CNN-static':
        x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)

print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Building model
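# In this older "CNN-static" variant, train_word2vec appears to return the
# embedding matrix as embedding_weights[0], so embedding_weights[0][x] uses
# NumPy fancy indexing to turn the (num_sentences, seq_len) index matrix into
# a (num_sentences, seq_len, embedding_dim) tensor. A self-contained toy
# demonstration of that indexing step (all names here are illustrative):
import numpy as np

W = np.random.uniform(-0.25, 0.25, size=(5, 4))  # vocab of 5 words, 4-dim vectors
idx = np.array([[1, 3, 2], [4, 0, 3]])           # two sentences of three tokens
x_static = W[idx]
print(x_static.shape)                            # (2, 3, 4)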
def create_model(x_train, event_x_train, ners_x_train, sent_x_train, y_train,
                 x_test, event_x_test, ners_x_test, sent_x_test, y_test,
                 vocabulary, vocabulary_inv):
    global sequence_length, events_seq_length, ners_seq_length, sent2vec_seq_length

    # Adjust sequence lengths
    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size: {:d}".format(
            x_test.shape[1]))
        sequence_length = x_test.shape[1]
    if events_seq_length != event_x_test.shape[1]:
        print("Adjusting event sequence length for actual size: {:d}".format(
            event_x_test.shape[1]))
        events_seq_length = event_x_test.shape[1]
    if ners_seq_length != ners_x_test.shape[1]:
        print("Adjusting ners sequence length for actual size: {:d}".format(
            ners_x_test.shape[1]))
        ners_seq_length = ners_x_test.shape[1]
    if sent2vec_seq_length != sent_x_test.shape[1]:
        print("Adjusting sent2vec sequence length for actual size: {:d}".format(
            sent_x_test.shape[1]))
        sent2vec_seq_length = sent_x_test.shape[1]

    # Prepare embedding layer weights and convert inputs for static model
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)

    # Build model
    input_shape = (sequence_length,)
    model_input = Input(shape=input_shape, name="Input_Layer")
    embed = Embedding(len(vocabulary_inv),
                      embedding_dim,
                      input_length=sequence_length,
                      name="embedding_layer")(model_input)
    embed = Dropout(dropout_prob[0], name="embedding_dropout_layer")(embed)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1,
                             name="conv_layer_" + str(sz))(embed)
        conv = MaxPooling1D(pool_size=2, name="conv_maxpool_layer_" + str(sz))(conv)
        conv = Flatten(name="conv_flatten_layer_" + str(sz))(conv)
        conv_blocks.append(conv)

    # Concatenate conv layers
    conv_layers = Concatenate(name="conv_concate_layer")(conv_blocks) \
        if len(conv_blocks) > 1 else conv_blocks[0]
    flated_conv_layers = Dropout(dropout_prob[1],
                                 name="concate_dropout_layer")(conv_layers)

    # Add events input layer
    events_input_layer = Input(shape=(events_seq_length,), name="event_input_layer")
    events_dense = Dense(int(events_seq_length / 2),
                         activation="relu",
                         name="event_dense_layer")(events_input_layer)

    # Add ners input layer
    ners_input_layer = Input(shape=(ners_seq_length,), name="ner_input_layer")
    ners_dense = Dense(int(ners_seq_length / 2),
                       activation="relu",
                       name="ners_dense_layer")(ners_input_layer)

    # Add sent2vec input layer
    sent2vec_input_layer = Input(shape=(700,), name="sent2vec_input_layer")
    sent2vec_dense_layer = Dense(350,
                                 activation="relu",
                                 name="sent2vec_dense_layer")(sent2vec_input_layer)

    # Merge all input layers
    merged = concatenate(
        [flated_conv_layers, events_dense, ners_dense, sent2vec_dense_layer],
        name="conv_event_ner_sent2vec_merge_layer")

    # Convolution layer for the concatenated features
    merged_reshaped = Reshape((6, 357))(merged)
    merged_conv = Convolution1D(filters=10,
                                kernel_size=3,
                                padding="valid",
                                activation="relu",
                                strides=1,
                                name="merged_conv_layer_3")(merged_reshaped)
    merged_conv = MaxPooling1D(pool_size=2,
                               name="merged_conv_maxpool_layer_3")(merged_conv)
    merged_conv = Flatten(name="merged_conv_flatten_layer_3")(merged_conv)

    # Dense layer
    dense = Dense(hidden_dims,
                  activation="relu",
                  name="conv_event_merge_dense_layer")(merged_conv)
    model_output = Dense(num_labels, activation="softmax",
                         name="Output_layer")(dense)

    # Create model
    model = Model([model_input, events_input_layer,
                   ners_input_layer, sent2vec_input_layer],
                  model_output)
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    return model, embedding_weights
def run(embedding_dim, filter_sizes, num_filters, dropout_prob, hidden_dims,
        batch_size, num_epochs, sequence_length, max_words, min_word_count,
        context):
    # Data Preparation
    print("Load data...")
    x_train, y_train, x_test, y_test, vocabulary_inv = load_data(
        data_source, max_words, sequence_length)
    print(x_test.shape[1])
    print(sequence_length)
    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size")
        sequence_length = x_test.shape[1]

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

    # Prepare embedding layer weights and convert inputs for static model
    print("Model type is", model_type)
    if model_type in ["CNN-non-static", "CNN-static"]:
        embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                           vocabulary_inv,
                                           num_features=embedding_dim,
                                           min_word_count=min_word_count,
                                           context=context)
        if model_type == "CNN-static":
            x_train = np.stack([np.stack([embedding_weights[word] for word in sentence])
                                for sentence in x_train])
            x_test = np.stack([np.stack([embedding_weights[word] for word in sentence])
                               for sentence in x_test])
            print("x_train static shape:", x_train.shape)
            print("x_test static shape:", x_test.shape)
    elif model_type == "CNN-rand":
        embedding_weights = None
    else:
        raise ValueError("Unknown model type")

    # Build model
    if model_type == "CNN-static":
        input_shape = (sequence_length, embedding_dim)
    else:
        input_shape = (sequence_length,)
    model_input = Input(shape=input_shape)

    # Static model does not have an embedding layer
    if model_type == "CNN-static":
        z = model_input
    else:
        z = Embedding(len(vocabulary_inv),
                      embedding_dim,
                      input_length=sequence_length,
                      name="embedding")(model_input)
    z = Dropout(dropout_prob[0])(z)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1)(z)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    z = Dropout(dropout_prob[1])(z)
    z = Dense(hidden_dims, activation="relu")(z)
    model_output = Dense(1, activation="sigmoid")(z)

    model = Model(model_input, model_output)
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    # Initialize weights with word2vec
    if model_type == "CNN-non-static":
        weights = np.array([v for v in embedding_weights.values()])
        print("Initializing embedding layer with word2vec weights, shape",
              weights.shape)
        embedding_layer = model.get_layer("embedding")
        embedding_layer.set_weights([weights])

    # Train the model
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=num_epochs,
              validation_data=(x_test, y_test),
              verbose=2)
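# A hypothetical invocation of run(); it assumes the module-level data_source
# and model_type globals are already set, and the hyperparameter values below
# mirror the ranges used across the snippets in this file rather than any
# project's actual defaults.
run(embedding_dim=50, filter_sizes=(3, 8), num_filters=10,
    dropout_prob=(0.5, 0.8), hidden_dims=50, batch_size=64, num_epochs=10,
    sequence_length=400, max_words=5000, min_word_count=1, context=10)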
def build_model(self):
    print("=" * 120)
    print("Model type is", self.model_type)
    if self.model_type in ["non-static", "static"]:
        embedding_weights = train_word2vec(
            self.data_source,
            np.vstack((self.x_train, self.x_test)),
            self.vocabulary_inv,
            num_features=self.embedding_dim,
            min_word_count=self.min_word_count,
            context=self.context)
        if self.model_type == "static":
            self.x_train = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in self.x_train
            ])
            self.x_test = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in self.x_test
            ])
            print("x_train static shape:", self.x_train.shape)
            print("x_test static shape:", self.x_test.shape)
    elif self.model_type == "rand":
        embedding_weights = None
    else:
        raise ValueError("Unknown model type")

    # Build the model
    if self.model_type == "static":
        input_shape = (self.sequence_length, self.embedding_dim)
    else:
        input_shape = (self.sequence_length,)
    model_input = Input(shape=input_shape)

    # Static model has no embedding layer
    if self.model_type == "static":
        z = model_input
    else:
        z = Embedding(len(self.vocabulary_inv),
                      self.embedding_dim,
                      input_length=self.sequence_length,
                      name="embedding")(model_input)
    z = Dropout(self.dropout_prob[0])(z)

    # Convolutional blocks
    conv_blocks = []
    # Bidirectional LSTM blocks
    lstm_blocks = []
    for sz in self.filter_sizes:
        convo = Convolution1D(filters=self.num_filters,
                              kernel_size=sz,
                              padding="valid",
                              activation="relu",
                              use_bias=True,
                              bias_initializer='zeros',
                              strides=1)(z)
        conv = MaxPooling1D(pool_size=2)(convo)
        lstm = Bidirectional(
            LSTM(self.hidden_dims, dropout=self.dropout_prob[1]),
            input_shape=(int((self.sequence_length - sz + 1) / 2),
                         self.num_filters))(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
        lstm_blocks.append(lstm)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    l = Concatenate()(lstm_blocks) if len(lstm_blocks) > 1 else lstm_blocks[0]

    z = Dropout(self.dropout_prob[2])(z)
    z = Dense(self.hidden_dims, activation="relu")(z)
    z = Concatenate()([z, l])
    # z = l
    model_output = Dense(1, activation="sigmoid")(z)

    self.model = Model(model_input, model_output)
    self.model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])

    # Initialize weights with word2vec
    if self.model_type == "non-static":
        weights = np.array([v for v in embedding_weights.values()])
        print("Initializing embedding layer with word2vec weights, shape",
              weights.shape)
        embedding_layer = self.model.get_layer("embedding")
        embedding_layer.set_weights([weights])
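# Why the LSTM input_shape above uses int((sequence_length - sz + 1) / 2):
# a "valid" Conv1D with kernel size sz shortens a length-L sequence to
# L - sz + 1 timesteps, and MaxPooling1D(pool_size=2) then halves that
# (with floor division). A worked example:
L, sz = 400, 3
conv_len = L - sz + 1       # 398 timesteps after the valid convolution
pooled_len = conv_len // 2  # 199 timesteps after pooling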
# Data Preparation
print("Load data...")
x_train, y_train, x_test, y_test, vocabulary_inv = load_data(data_source)

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)
    if model_type == "CNN-static":
        x_train = np.stack([np.stack([embedding_weights[word] for word in sentence])
                            for sentence in x_train])
        x_test = np.stack([np.stack([embedding_weights[word] for word in sentence])
                           for sentence in x_test])
        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)
elif model_type == "CNN-rand":
    embedding_weights = None
else:
    raise ValueError("Unknown model type")

# Build model
if model_type == "CNN-static":
    input_shape = (sequence_length, embedding_dim)
else:
save_parameters(vocabulary_inv, max_length)

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context,
                                       gensim_model=gensim_model)
    if model_type == "CNN-static":
        x_train = np.stack([np.stack([embedding_weights[word] for word in sentence])
                            for sentence in x_train])
        x_test = np.stack([np.stack([embedding_weights[word] for word in sentence])
                           for sentence in x_test])
        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)
elif model_type == "CNN-rand":
# loaded_model_json = json_file.read()
# json_file.close()
# model = model_from_json(loaded_model_json)
# # load weights into new model
# model.load_weights('model.h5')
# print('Loaded model from disk')
# model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

if True:
    print('Loading Data...')
    start = time.time()
    X, Y, vocab, vocab_inv, max_length, unique_answers = data_helpers.load_data()
    end = time.time()
    print('Loaded data successfully took ', (end - start), ' seconds')
    embedding_weights = w2v.train_word2vec(X, vocab_inv, embedding_dim,
                                           min_word_count, context)
    # vocab = json.load(open('json/vocab.json'))
    # vocab_inv = json.load(open('json/vocab_inv.json'))
    # unique_answers = json.load(open('json/answers.json'))
    # max_length = vocab["max_length"]
    # vocab.pop("max_length")
    # embedding_weights = w2v.load_model(vocab_inv, embedding_dim, min_word_count, context)

graph_in = Input(shape=(max_length, embedding_dim))
convolutions = []
for fsz in filter_sizes:
    convolution = Convolution1D(nb_filter=num_filters,
                                filter_length=fsz,
                                border_mode='valid',
                                activation='relu',
                                subsample_length=1)(graph_in)
    pool = MaxPooling1D(pool_length=2)(convolution)
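# The commented-out block above restores a model saved as a JSON architecture
# file plus HDF5 weights. For reference, the matching save side in standard
# Keras API would look like this, assuming a trained `model` object and the
# file names used in the comments above:
with open('model.json', 'w') as json_file:
    json_file.write(model.to_json())  # architecture only
model.save_weights('model.h5')        # weights only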
print(len(Xtest), 'test samples')
print("Xtrain shape:", Xtrain.shape)
print("Xtest shape:", Xtest.shape)
assert Xtrain.shape[1] == Xtest.shape[1]
if sequence_length != Xtrain.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = Xtrain.shape[1]

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
    embedding_weights = train_word2vec(X, vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)
    # The if-clause below does not concern us, since we use the CNN-non-static model
    if model_type == "CNN-static":
        x = np.stack([np.stack([embedding_weights[word] for word in sentence])
                      for sentence in x])
        # x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test])
        print("x static shape:", x.shape)
        # print("x_test static shape:", x_test.shape)
elif model_type == "CNN-rand":
    embedding_weights = None
else:
vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}

shuffle_indices = np.random.permutation(np.arange(len(y)))
x = x[shuffle_indices]
y = y[shuffle_indices]
train_len = int(len(x) * split)
x_train = x[:train_len]
y_train = y[:train_len]
x_test = x[train_len:]
y_test = y[train_len:]
print('Input data built! W2V')

### Train word2vec
from w2v import train_word2vec

min_word_count = 3
context = 2
embedding_dim = 30
embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                   'node_embeddings',
                                   vocabulary_inv,
                                   num_features=embedding_dim,
                                   min_word_count=min_word_count,
                                   context=context)

### NN
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding, ZeroPadding1D
from keras.layers.merge import Concatenate
from keras import callbacks
from sklearn import metrics

# Model Hyperparameters
filter_sizes = (3, 8)
num_filters = 20  # 10
dropout_prob = (0.5, 0.8)
hidden_dims = 30
batch_size = 64
num_epochs = 1
sequence_length = 3  # 400
print("Word index : \n", len(word_index)) if sequence_length != x_test.shape[1]: print("Adjusting sequence length for actual size") sequence_length = x_test.shape[1] print("x_train shape:", x_train.shape) print("x_test shape:", x_test.shape) print("Vocabulary Size: {:d}".format(len(vocabulary_inv))) # Prepare embedding layer weights and convert inputs for static model print("Model type is", model_type) if model_type in ["CNN-non-static", "CNN-static"]: embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, file_head=target, num_features=embedding_dim, min_word_count=min_word_count, context=context) if model_type == "CNN-static": x_train = np.stack([ np.stack([embedding_weights[word] for word in sentence]) for sentence in x_train ]) x_test = np.stack([ np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test ]) print("x_train static shape:", x_train.shape) print("x_test static shape:", x_test.shape)
    return x_train, y_train, x_test, y_test, vocabulary_inv


print("Loading Data")
x_train, y_train, x_test, y_test, vocabulary_inv = load_data(dataSource)

if sequenceLength != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequenceLength = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                   vocabulary_inv,
                                   num_features=embeddingDim,
                                   minWordCount=minWordCount,
                                   context=contextWindow)

input_shape = (sequenceLength,)
model_input = Input(shape=input_shape)
z = Embedding(len(vocabulary_inv),
              embeddingDim,
              input_length=sequenceLength,
              name="embedding")(model_input)
z = Dropout(dropoutProb[0])(z)

conv_blocks = []
for sz in filterSizes:
print("Stat file created !!") ### Part 2B: Network definition & word2vec training ### make sure to delete existing word2vec model if you want to udate it if sequence_length != x_test.shape[1]: print("Adjusting sequence length for actual size") sequence_length = x_test.shape[1] print("Vocabulary Size: {:d}".format(len(vocabulary_inv))) # Prepare embedding layer weights and convert inputs for static model print("Model type is", model_type) if model_type in ["CNN-non-static", "CNN-static"]: print('Initiating word2vec.') embedding_weights = train_word2vec(np.vstack((x_train, x_test, neutral_tweets)), settings['dataset_name'], vocabulary_inv, num_features=embedding_dim, min_word_count=min_word_count, context=context) print('Word2vec done.') if model_type == "CNN-static": x_train = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_train]) x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test]) print("x_train static shape:", x_train.shape) print("x_test static shape:", x_test.shape) elif model_type == "CNN-rand": embedding_weights = None else: raise ValueError("Unknown model type") # Build model if model_type == "CNN-static":
val_acc_nd = []
val_loss_nd = []
epoch_nr_nd = []

for run in range(0, 10, 1):
    # Data Preparation
    print("Load data...")
    x_train, y_train, x_test, y_test, vocabulary_inv = load_data(
        data_source, datalength)
    print("x_train[0]: ", x_train[0])

    # train_word2vec returns the embedding weights of the trained word2vec model (see w2v.py)
    embedding_weights = train_word2vec(datalength,
                                       np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)
    print("x_train static shape:", x_train.shape)
    print("x_test static shape:", x_test.shape)
    print("x_train[0]: ", x_train[0])

    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size")
        sequence_length = x_test.shape[1]

    model = create_model(sequence_length)
    history = model.fit(x_train, y_train,
# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10        # Context window size

# Data Preparation
# ==================================================
#
# Load data
print("Loading data...")
neg_train_path = './data/imdb_train.neg'
pos_train_path = './data/imdb_train.pos'
x, y, vocabulary, vocabulary_inv = data_helpers.load_data(pos_train_path,
                                                          neg_train_path)

if model_variation == 'CNN-non-static' or model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocabulary_inv, model_variation,
                                       embedding_dim, min_word_count, context)
    if model_variation == 'CNN-static':
        x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)

print("Vocabulary Size: {:d}".format(len(vocabulary)))
def create_model(x_train, x_test, vocabulary_inv, model_type, embedding_dim,
                 min_word_count, context, sequence_length, dropout_prob,
                 filter_sizes, num_filters, hidden_dims, optimizer="Adagrad"):
    # Prepare embedding layer weights and convert inputs for static model
    print("Model type is", model_type)
    if model_type in ["CNN-non-static", "CNN-static"]:
        embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                           vocabulary_inv,
                                           num_features=embedding_dim,
                                           min_word_count=min_word_count,
                                           context=context)
        if model_type == "CNN-static":
            x_train = np.stack([np.stack([embedding_weights[word] for word in sentence])
                                for sentence in x_train])
            x_test = np.stack([np.stack([embedding_weights[word] for word in sentence])
                               for sentence in x_test])
            print("x_train static shape:", x_train.shape)
            print("x_test static shape:", x_test.shape)
    elif model_type == "CNN-rand":
        embedding_weights = None
    else:
        raise ValueError("Unknown model type")

    # Build model
    if model_type == "CNN-static":
        input_shape = (sequence_length, embedding_dim)
    else:
        input_shape = (sequence_length,)
    model_input = Input(shape=input_shape)

    # Static model does not have an embedding layer
    if model_type == "CNN-static":
        z = model_input
    else:
        z = Embedding(len(vocabulary_inv),
                      embedding_dim,
                      input_length=sequence_length,
                      name="embedding")(model_input)
    z = Dropout(dropout_prob[0])(z)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1)(z)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    z = Dropout(dropout_prob[1])(z)
    z = Dense(hidden_dims, activation="relu")(z)
    model_output = Dense(1, activation="sigmoid")(z)

    model = Model(model_input, model_output)
    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])

    # Initialize weights with word2vec
    if model_type == "CNN-non-static":
        weights = np.array([v for v in embedding_weights.values()])
        print("Initializing embedding layer with word2vec weights, shape",
              weights.shape)
        embedding_layer = model.get_layer("embedding")
        embedding_layer.set_weights([weights])

    return model
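# A hypothetical end-to-end use of the create_model() above, assuming
# x_train, y_train, x_test, y_test, and vocabulary_inv come from a
# load_data() call like the ones earlier in this file; the hyperparameter
# values are illustrative, not this project's defaults.
model = create_model(x_train, x_test, vocabulary_inv,
                     model_type="CNN-non-static", embedding_dim=50,
                     min_word_count=1, context=10,
                     sequence_length=x_test.shape[1],
                     dropout_prob=(0.5, 0.8), filter_sizes=(3, 8),
                     num_filters=10, hidden_dims=50)
model.fit(x_train, y_train, batch_size=64, epochs=10,
          validation_data=(x_test, y_test), verbose=2)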