# Driver script: preprocessing -> word2vec -> news embeddings -> HAN.
from preprocess import Preprocess
from word2vec import Word2Vec
from news_embedding import NewsEmbeding
from han import HAN

if __name__ == '__main__':
    # Stage flags: set a flag to True once that stage's artifacts are already
    # cached on disk, so the expensive step is skipped on re-runs.
    PREPROCESSED = True
    EMBEDDING_TRAINED = False
    WORD_EMBEDDING_READY = False

    preprocess = Preprocess()
    if not PREPROCESSED:
        preprocess.preprocess()
    preprocess.load_data()

    word2vec = Word2Vec(preprocess.data_dict)
    if not EMBEDDING_TRAINED:
        print('training word2vec...')
        word2vec.train_model()
    word2vec.load_model()

    news_emb = NewsEmbeding(word2vec.model, preprocess.data_dict)
    if not WORD_EMBEDDING_READY:
        news_emb.embed()
    news_emb.load_embeddings()
    news_emb.get_max_corpus_date_count()
    news_emb.pad_embeddings()
    # print(news_emb.embedict_padded['AAPL'])

    # Use a lowercase name so the HAN class itself is not shadowed.
    han = HAN(news_emb.emb_node_num, news_emb.date_num)
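    # --- Not in the original script: a quick, hypothetical sanity check of
    # the padded embeddings. It assumes embedict_padded maps ticker symbols
    # to array-like values, as the commented-out 'AAPL' print above suggests.
    for ticker, emb in list(news_emb.embedict_padded.items())[:3]:
        print(ticker, getattr(emb, 'shape', type(emb)))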
# Excerpt from a TF 1.x training script; FLAGS, imdb, y_test, and the HAN
# class are assumed to be defined or imported earlier in the file.
import tensorflow as tf

if FLAGS.run_type == "train":
    print("Training...\n")
    # Create a new graph and set it as the default.
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        session_conf.gpu_options.allocator_type = "BFC"
        # Create a new session and set it as the default.
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Create the HAN model.
            han = HAN(max_seq_len=imdb.max_seq_len,
                      max_sent_len=imdb.max_sent_len,
                      num_classes=len(y_test[0]),
                      vocab_size=imdb.vocab_size,
                      embedding_size=FLAGS.embedding_dim,
                      max_grad_norm=FLAGS.max_grad_norm,
                      dropout_keep_proba=FLAGS.dropout_keep_proba,
                      learning_rate=FLAGS.learning_rate)

            global_step = tf.Variable(0, name="global_step", trainable=False)
            tvars = tf.trainable_variables()
            # Clip gradients by their global norm before applying them.
            grads, global_norm = tf.clip_by_global_norm(
                tf.gradients(han.loss, tvars), han.max_grad_norm)
            optimizer = tf.train.AdamOptimizer(han.learning_rate)
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 name="train_op",
                                                 global_step=global_step)

            # Checkpoint the model (the rest of this call's argument list
            # is not shown in the source; it is closed minimally here).
            saver = tf.train.Saver(tf.global_variables())
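            # --- Sketch of a typical continuation, not from the source:
            # initialize variables, then run train_op over batches. The
            # `batches` iterable and the feed keys `han.input_x`/`han.input_y`
            # are hypothetical names, not confirmed by the original code. ---
            sess.run(tf.global_variables_initializer())
            for batch_x, batch_y in batches:
                _, step = sess.run([train_op, global_step],
                                   feed_dict={han.input_x: batch_x,
                                              han.input_y: batch_y})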
# Excerpt from a second TF 1.x script; BatchFeeder and HAN are assumed to be
# imported earlier in the file.
import tensorflow as tf

EPOCH = 40

if __name__ == "__main__":
    bf = BatchFeeder('train', 64, 15, 50)
    tf.reset_default_graph()
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=True)
    with tf.device('/device:GPU:0'):
        # The +1 on the vocabulary size presumably reserves an id for padding.
        model = HAN(vocab_size=len(bf.vocab_encode_dict.keys()) + 1,
                    embedding_size=200,
                    classes=10,
                    word_cell=tf.nn.rnn_cell.GRUCell(50, name='word-gru'),
                    sentence_cell=tf.nn.rnn_cell.GRUCell(50, name='sentence-gru'),
                    word_context_size=100,
                    sentence_context_size=100)
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        writer = tf.summary.FileWriter('./han_graph', graph=tf.get_default_graph())
        sess.run(tf.global_variables_initializer())
        for i in range(EPOCH):
            for encoded_data, document_length_mask, sentence_length_mask, labels in bf:
                feed_dict = {
                    model.inputs: encoded_data,
                    model.word_length: sentence_length_mask,
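                    # --- The source truncates mid-dict here. The entries and
                    # ops below are an assumed completion: model.sentence_length,
                    # model.labels, model.train_op, and model.loss are
                    # hypothetical attribute names, not confirmed by the source. ---
                    model.sentence_length: document_length_mask,
                    model.labels: labels,
                }
                _, loss = sess.run([model.train_op, model.loss],
                                   feed_dict=feed_dict)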
# Excerpt from a Keras script; max_features, maxlen_sentence, maxlen_word,
# embedding_dims, batch_size, and the imports (imdb, sequence, EarlyStopping,
# HAN) are assumed to be defined earlier in the file.
epochs = 10

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x #sentence x #word)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen_sentence * maxlen_word)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen_sentence * maxlen_word)
# Reshape the flat padded sequences into (samples, sentences, words).
x_train = x_train.reshape((len(x_train), maxlen_sentence, maxlen_word))
x_test = x_test.reshape((len(x_test), maxlen_sentence, maxlen_word))
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = HAN(maxlen_sentence, maxlen_word, max_features, embedding_dims)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

print('Test...')
result = model.predict(x_test)
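# --- Not in the original: a possible follow-up that turns the sigmoid
# outputs into hard labels and reports test accuracy. The 0.5 threshold is
# an assumption for this binary IMDB task. ---
pred_labels = (result > 0.5).astype('int32').reshape(-1)
print('Test accuracy:', (pred_labels == y_test).mean())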
# Excerpt from a second Keras script; max_features, max_seqs, max_words,
# emb_dim, batch_size, epochs, logger, and the imports (imdb, sequence,
# EarlyStopping, HAN, load_data) are assumed to be defined earlier. The
# opening `try:` is cut off at the start of the excerpt and restored here.
try:
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
except Exception:
    # Fall back to a local loader if the Keras download fails.
    (x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)

logger.info('padding...')
x_train = sequence.pad_sequences(x_train, maxlen=max_seqs * max_words)
x_test = sequence.pad_sequences(x_test, maxlen=max_seqs * max_words)
x_train = x_train.reshape((len(x_train), max_seqs, max_words))
x_test = x_test.reshape((len(x_test), max_seqs, max_words))
logger.info('train data shape is: {}'.format(x_train.shape))
logger.info('test data shape is: {}'.format(x_test.shape))

logger.info('build model...')
model = HAN(max_features=max_features, max_words=max_words,
            max_seqs=max_seqs, emb_dim=emb_dim).build_model()
model.compile('adam', 'binary_crossentropy', ['acc'])

logger.info('training...')
earlystop = EarlyStopping(patience=3, monitor='val_acc', mode='max')
model.fit(x_train, y_train,
          callbacks=[earlystop],
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

logger.info('test...')
pred = model.predict(x_test)
logger.info(pred[:10])
logger.info(y_test[:10])
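# --- Not in the original: persisting the trained weights so the run can be
# reloaded later. The filename is an arbitrary choice for this sketch. ---
model.save_weights('han_imdb_weights.h5')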