import os

import tensorflow as tf

import data_helpers


def create_model(session, y, vocab, config, path, logger):
    # Create the model, reusing parameters if a checkpoint exists.
    initializer = tf.random_uniform_initializer(-1 * config.init_scale,
                                                1 * config.init_scale)  # between (-1, 1)
    with tf.variable_scope("bi_rnn", reuse=None, initializer=initializer):
        bi_rnn = Bi_Lstm_Model(config=config, num_step=config.num_step,
                               num_classes=1, vocab_size=len(vocab),
                               is_training=0)  # 0 train, 1 valid, 2 predict
    with tf.variable_scope("bi_rnn", reuse=True, initializer=initializer):
        valid_bi_rnn = Bi_Lstm_Model(config=config, num_step=config.num_step,
                                     num_classes=1, vocab_size=len(vocab),
                                     is_training=1)
        test_bi_rnn = Bi_Lstm_Model(config=config, num_step=config.num_step,
                                    num_classes=1, vocab_size=len(vocab),
                                    is_training=1)
    ckpt = tf.train.get_checkpoint_state(os.path.join(path, "checkpoints")) if path else None
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        bi_rnn.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        logger.info("Created model with fresh parameters.")
        session.run(tf.global_variables_initializer())
        emb_weights = session.run(bi_rnn.embeddings.read_value())
        emb_weights = data_helpers.load_word2vec(config.word2vec_path, vocab,
                                                 config.embed_dim, emb_weights)
        session.run(bi_rnn.embeddings.assign(emb_weights))
        logger.info("Loaded pre-trained embeddings.")
    return bi_rnn, valid_bi_rnn, test_bi_rnn
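A minimal call-site sketch for create_model above; FLAGS.train_dir, config, vocab, y, and logger are placeholders from the surrounding training script, not part of this snippet:

with tf.Session() as session:
    # Builds the training model with fresh variables, then shares them with
    # the valid/test models through the reused "bi_rnn" variable scope.
    bi_rnn, valid_bi_rnn, test_bi_rnn = create_model(
        session, y, vocab, config, path=FLAGS.train_dir, logger=logger)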
def __init__(self, w2v_file, pooling_type='max_min'):
    """
    Args:
        w2v_file: word2vec text file
        pooling_type: [max_min | avg | all], default max_min
    """
    self.word2vec, self.vec_dim, _ = data_helpers.load_word2vec(w2v_file)
    if pooling_type == 'max_min':
        self.pooling = self.max_min_pooling
    elif pooling_type == 'avg':
        self.pooling = self.average_pooling
    else:
        self.pooling = self.all_pooling
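The three pooling methods are bound here but not shown. A hypothetical sketch of what they could look like, assuming each receives an (n_words, vec_dim) NumPy array of word vectors for one sentence:

import numpy as np

def max_min_pooling(self, vecs):
    # Element-wise max and min over words, concatenated -> (2 * vec_dim,)
    return np.concatenate([vecs.max(axis=0), vecs.min(axis=0)])

def average_pooling(self, vecs):
    # Element-wise mean over words -> (vec_dim,)
    return vecs.mean(axis=0)

def all_pooling(self, vecs):
    # Max, min, and mean concatenated -> (3 * vec_dim,)
    return np.concatenate([vecs.max(axis=0), vecs.min(axis=0), vecs.mean(axis=0)])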
import os

import data_helpers


def make_data(args, max_len, raw_input):
    vocab_file, _, _ = data_helpers.process_train_file(
        data_dir=args.data_dir,
        raw_input=raw_input,
        max_length=max_len,
        min_frequency=args.min_freq,
    )
    w2v, vec_dim, _ = data_helpers.load_word2vec(args.w2v_file)
    data_helpers.make_embedding_matrix(
        data_dir=args.data_dir,
        prefix=os.path.basename(raw_input),
        word2vec=w2v,
        vec_dim=vec_dim,
        vocab_file=vocab_file,
    )
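A usage sketch under assumed flag names taken from the attributes the function reads (data_dir, min_freq, w2v_file); the file paths and max_len value are placeholders:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', default='./data')
parser.add_argument('--min_freq', type=int, default=2)
parser.add_argument('--w2v_file', default='./data/word2vec.txt')
args = parser.parse_args()

# Build the vocabulary and embedding matrix for one raw input file.
make_data(args, max_len=30, raw_input='./data/queries.txt')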
def train(self, sess, x_text, y, split_no, FLAGS):
    # Build vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        self.max_document_length)
    topics = pickle.load(open("phrase3000_{}.pickle".format(split_no), "rb"))
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    y = np.array(y)
    # Save the vocabulary processor after fit_transform, so the learned
    # vocabulary is included in the pickle.
    pickle.dump(vocab_processor,
                open("vocabproc{}.pickle".format(split_no), "wb"))
    # vocab_processor = pickle.load(open("vocabproc{}.pickle".format(split_no), "rb"))

    bm25 = BM25()
    t = bm25.relevance(x_text, topics, split_no)

    # Shuffle data, keeping x, y, and relevance scores aligned
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    t_shuffled = t[shuffle_indices]

    # Keep the raw text aligned with the shuffled order
    text_x_shuffled = []
    for index in np.nditer(shuffle_indices):
        text_x_shuffled.append(x_text[index])

    # Hold out the last dev_sample_percentage of the shuffled data
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    t_train, t_dev = t_shuffled[:dev_sample_index], t_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled

    self.initW = data_helpers.load_word2vec(vocab_processor, FLAGS.embedding_dim)
    cnn = self.get_cnn(FLAGS, voca_size=len(vocab_processor.vocabulary_))
    self.train_nn(sess, cnn, x_train, x_dev, t_train, y_train, y_dev,
                  t_dev, topics)
import os
import pickle
from datetime import datetime

import numpy as np
from keras.layers import (Input, Embedding, Reshape, Conv2D, MaxPool2D,
                          Flatten, Concatenate, Dense, Dropout)
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, precision_score, recall_score


def main():
    current_time = str(datetime.now().strftime('%Y%m%d_%H%M%S'))
    print(current_time)

    print('Loading data')
    texts, labels = load_data_and_labels(pos_file, neg_file)

    if W2V_file_addr is not None:
        print('Loading Word2Vec')
        # embeddings_index, embedding_dim = load_word2vec_nonbinary(W2V_file_addr)
        embeddings_index, embedding_dim = load_word2vec(W2V_file_addr)
        print('Found %s word vectors.' % len(embeddings_index))
    else:
        embeddings_index = None
        embedding_dim = 300

    checkpoint_dir = os.path.join(keras_checkpoint_dir, file_tag, current_time)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # Vectorize the text samples into a 2D integer tensor
    print('Tokenizing the texts')
    tokenizer = Tokenizer(num_words=max_num_words, lower=False)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    tokenizer_addr = os.path.join(checkpoint_dir, 'tokenizer.pickle')
    with open(tokenizer_addr, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Tokenizer is saved as %s' % tokenizer_addr)

    print('Padding Sequences')
    data = pad_sequences(sequences, maxlen=max_sequence_length)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # Split the data into a training set and a validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    num_validation_samples = int(validation_split * data.shape[0])
    x_train = data[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]
    x_val = data[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    # Prepare embedding matrix
    num_words = min(max_num_words, len(word_index) + 1)
    if W2V_file_addr is not None:
        embedding_matrix = np.zeros((num_words, embedding_dim))
        for word, i in word_index.items():
            if i >= max_num_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix = np.zeros((num_words, embedding_dim))

    # Build model
    model_input = Input(shape=(max_sequence_length,), dtype='int32')
    z = Embedding(num_words,
                  embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_sequence_length,
                  trainable=False)(model_input)
    z = Reshape((max_sequence_length, embedding_dim, 1))(z)

    # Convolutional blocks
    conv_blocks = []
    for sz in filter_sizes:
        conv = Conv2D(num_filters,
                      kernel_size=(sz, embedding_dim),
                      padding='valid',
                      activation='relu')(z)
        conv = MaxPool2D(pool_size=(max_sequence_length - sz + 1, 1),
                         strides=(1, 1),
                         padding='valid')(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    # z = Dense(hidden_dims, kernel_regularizer=regularizers.l2(0.0001),
    #           activity_regularizer=regularizers.l1(0.0001), activation="relu")(z)
    z = Dense(hidden_dims, activation="relu")(z)
    z = Dropout(drop)(z)
    model_output = Dense(units=2, activation='softmax')(z)

    adam = Adam(lr=learning_rate, decay=1e-6)
    # adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.00001)
    # adam = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0001)

    model = Model(model_input, model_output)
    model.summary()
    model.compile(optimizer=adam, loss='binary_crossentropy',
                  metrics=['mse', 'acc'])  # <==<== biogpu12-1
    # model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    # model.compile(optimizer=adam, loss='mean_squared_error',
    #               metrics=['mse', 'acc'])  # <== best <== biogpu12-2

    # Log dir
    log_dir = os.path.join(checkpoint_dir, '../')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    f_log = open(os.path.join(log_dir, 'result_logs_' + file_tag + '.txt'), 'a')
    print('Log File: %s' % os.path.join(os.getcwd(), log_dir,
                                        'result_logs_' + file_tag + '.txt'))
    f_log.write("\n")
    f_log.write(current_time)
    f_log.write("\n")
    f_log.write('pos_file: %s\n' % pos_file)
    f_log.write('neg_file: %s\n' % neg_file)
    f_log.write('checkpoint_folder: %s\n' % checkpoint_dir)
    f_log.write('embedding_dim = %s\n' % embedding_dim)
    f_log.write('W2V_file_addr = %s\n' % W2V_file_addr)
    f_log.write('filter_sizes = %s\n' % filter_sizes)
    f_log.write('num_filters = %s\n' % num_filters)
    f_log.write('hidden_dims = %s\n' % hidden_dims)
    f_log.write('drop = %s\n' % drop)
    f_log.write('validation_split = %s\n' % validation_split)
    f_log.write('learning_rate = %s\n' % learning_rate)
    f_log.write('epochs = %s\n' % epochs)
    f_log.write('batch_size = %s\n' % batch_size)
    f_log.write('max_num_words = %s\n' % max_num_words)
    f_log.write('max_sequence_length = %s\n' % max_sequence_length)
    f_log.flush()

    checkpoint = ModelCheckpoint(
        os.path.join(checkpoint_dir,
                     file_tag + "_" + current_time +
                     '_weights.{epoch:03d}-{val_acc:.4f}.hdf5'),
        monitor='val_loss',
        verbose=1,
        save_best_only=True,
        mode='auto')

    print("Training Model...")
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              verbose=2,
              callbacks=[checkpoint],
              validation_data=(x_val, y_val))  # starts training

    pred_results = model.predict(x_val)
    f1_res = f1_score(y_val.argmax(axis=1), pred_results.argmax(axis=1))
    precision_res = precision_score(y_val.argmax(axis=1), pred_results.argmax(axis=1))
    recall_res = recall_score(y_val.argmax(axis=1), pred_results.argmax(axis=1))
    print("\n")
    print("F1:\t%s" % f1_res)
    print("Precision:\t%s" % precision_res)
    print("Recall:\t%s" % recall_res)
    f_log.write("F1:\t%s\n" % f1_res)
    f_log.write("Precision:\t%s\n" % precision_res)
    f_log.write("Recall:\t%s\n" % recall_res)
    f_log.write("\n\n")
    f_log.flush()
    f_log.close()

    model.save(os.path.join(checkpoint_dir, 'final_model.h5'))  # creates an HDF5 file
    del model  # deletes the existing model

    print("For evaluation, please use the following checkpoint: %s" % checkpoint_dir)
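A minimal inference sketch for the artifacts this script saves (tokenizer.pickle and final_model.h5); the checkpoint path and the max_sequence_length value are placeholders that must match the training run:

import os
import pickle
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Hypothetical path: substitute the checkpoint_dir printed by the training run.
checkpoint_dir = 'keras_checkpoints/my_tag/20240101_000000'
max_sequence_length = 400  # must match the value used at training time

with open(os.path.join(checkpoint_dir, 'tokenizer.pickle'), 'rb') as handle:
    tokenizer = pickle.load(handle)
model = load_model(os.path.join(checkpoint_dir, 'final_model.h5'))

texts = ["an example sentence to classify"]
seqs = tokenizer.texts_to_sequences(texts)
x = pad_sequences(seqs, maxlen=max_sequence_length)
probs = model.predict(x)        # shape (n_samples, 2), softmax outputs
labels = probs.argmax(axis=1)   # predicted class per sample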
# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
x_dev_text = text_x_shuffled[dev_sample_index:]
pickle.dump(x_dev_text, open("dev_x_text.pickle", "wb"))
del x, y, x_shuffled, y_shuffled
# print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
# print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

initW = data_helpers.load_word2vec(vocab_processor, FLAGS.embedding_dim)

# Training
# ==================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            batch_size=FLAGS.batch_size,
import sys
import pickle

import data_helpers

if __name__ == "__main__":
    w2v_file = sys.argv[1]

    print("Loading data ...")
    pos_file, neg_file = "data/rt-polarity.pos", "data/rt-polarity.neg"
    x_tokenized, y, vocab = data_helpers.load_data(pos_file, neg_file)
    print("Data loaded!")
    print("Vocabulary Size: {}".format(len(vocab)))
    print("Number of Samples: {}".format(len(y)))

    print("Load word2vec ...")
    w2v = data_helpers.load_word2vec(w2v_file, vocab)
    print("Word2vec loaded!")

    print("Add unknown words ...")
    data_helpers.add_unknown_words(w2v, vocab)
    print("Unknown words added!")

    print("Build pretrained embedding filter...")
    word2index, pretrained_embedding_filter = \
        data_helpers.get_pretrained_embedding_filter(w2v)
    x = data_helpers.index_data(x_tokenized, word2index)
    print("Pretrained embedding filter built!")

    pickle.dump([x, y, pretrained_embedding_filter, word2index],
                open("data.p", "wb"))
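add_unknown_words is not defined in this snippet; a hypothetical sketch following the common word2vec preprocessing idiom (small random vectors for in-vocabulary words with no pretrained embedding). The min_df and k=300 defaults, and vocab being a word-to-count dict, are assumptions:

import numpy as np

def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            # Uniform range chosen so random vectors have a variance
            # comparable to the pretrained ones
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)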
freply3 = "de_replies.txt" freply4 = "tfidf_replies.txt" freply5 = "true.txt" # # Path to word2vec weights fqword2vec = 'GoogleNews-vectors-negative300.txt' frword2vec = 'GoogleNews-vectors-negative300.txt' print("Processing training files") process_train_file(processed_data_dir, fquery, query_max_length) process_train_file(processed_data_dir, freply1, reply_max_length) process_train_file(processed_data_dir, freply2, reply_max_length) process_train_file(processed_data_dir, freply3, reply_max_length) process_train_file(processed_data_dir, freply4, reply_max_length) process_train_file(processed_data_dir, freply5, reply_max_length) fqvocab = '%s.vocab%d'%(fquery, query_max_length) frvocab1 = '%s.vocab%d'%(freply1, reply_max_length) frvocab2 = '%s.vocab%d'%(freply2, reply_max_length) frvocab3 = '%s.vocab%d'%(freply3, reply_max_length) frvocab4 = '%s.vocab%d'%(freply4, reply_max_length) frvocab5 = '%s.vocab%d'%(freply5, reply_max_length) word2vec, vec_dim, _ = load_word2vec(word2vec_dir, fqword2vec) make_embedding_matrix(processed_data_dir, fquery, word2vec, vec_dim, fqvocab) make_embedding_matrix(processed_data_dir, freply1, word2vec, vec_dim, frvocab1) make_embedding_matrix(processed_data_dir, freply2, word2vec, vec_dim, frvocab2) make_embedding_matrix(processed_data_dir, freply3, word2vec, vec_dim, frvocab3) make_embedding_matrix(processed_data_dir, freply4, word2vec, vec_dim, frvocab4) make_embedding_matrix(processed_data_dir, freply5, word2vec, vec_dim, frvocab5) pass
# Make sure the embed and vocab file paths are correct
raw_data_dir = "./data"

process_train_file(processed_train_dir, fquery_train, query_max_length)
process_train_file(processed_train_dir, sub_query, query_max_length)
process_train_file(processed_train_dir, freply_train, reply_max_length)
process_train_file(processed_train_dir, true_reply, reply_max_length)
process_train_file(processed_train_dir, sub_reply, reply_max_length)

fqvocab = '%s.vocab%d' % (fquery_train, query_max_length)
fqsvocab = '%s.vocab%d' % (sub_query, query_max_length)
frvocab = '%s.vocab%d' % (freply_train, reply_max_length)
frtvocab = '%s.vocab%d' % (true_reply, reply_max_length)
frsvocab = '%s.vocab%d' % (sub_reply, reply_max_length)

word2vec, vec_dim, _ = load_word2vec(raw_data_dir, fqword2vec)
make_embedding_matrix(processed_train_dir, fquery_train, word2vec, vec_dim, fqvocab)
make_embedding_matrix(processed_train_dir, sub_query, word2vec, vec_dim, fqsvocab)
make_embedding_matrix(processed_train_dir, freply_train, word2vec, vec_dim, frvocab)
# Pair each vocab file with its own source file
make_embedding_matrix(processed_train_dir, true_reply, word2vec, vec_dim, frtvocab)
make_embedding_matrix(processed_train_dir, sub_reply, word2vec, vec_dim, frsvocab)
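Neither load_word2vec nor make_embedding_matrix is defined in these two scripts. A hypothetical sketch of make_embedding_matrix under the positional signature used above (data dir, file prefix, word2vec dict, vector dim, vocab file); the one-token-per-line vocab format, the random OOV fallback, and the .embed.npy output name are all assumptions:

import os
import numpy as np

def make_embedding_matrix(data_dir, fname, word2vec, vec_dim, vocab_file):
    # Read the vocabulary, one token per line, in index order.
    with open(os.path.join(data_dir, vocab_file)) as f:
        vocab = [line.strip() for line in f]
    matrix = np.zeros((len(vocab), vec_dim), dtype=np.float32)
    for i, token in enumerate(vocab):
        if token in word2vec:
            matrix[i] = word2vec[token]
        else:
            # Small random vector for tokens missing from word2vec
            matrix[i] = np.random.uniform(-0.1, 0.1, vec_dim)
    # Save next to the processed data; row i corresponds to vocab line i.
    np.save(os.path.join(data_dir, '%s.embed' % fname), matrix)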