def test_final(self):
    reader = Reader()
    soup = reader.readfile()
    threads = reader.makeobjectsfromxml(soup)
    tokenizer = Tokenizer(threads)
    collection_tokenized = tokenizer.tokenize()
    coll_model = CollectionModel(collection_tokenized)
    doc_model = DocumentModel(collection_tokenized)
    ret_model = RetrievalModel(collection_tokenized, doc_model, coll_model)
    ret_model.calculate_relevance()
def testtokenizerfromfile(self):
    reader = Reader()
    soup = reader.readfile()
    threads = reader.makeobjectsfromxml(soup)
    tokenizer = Tokenizer(threads)
    threads_tokenized = tokenizer.tokenize()
    for thread in threads_tokenized:
        print(thread._query._body)
        for document in thread._documents:
            print(document._text)
def testdocumentmodel(self):
    reader = Reader()
    soup = reader.readfile()
    threads = reader.makeobjectsfromxml(soup)
    tokenizer = Tokenizer(threads)
    threads_tokenized = tokenizer.tokenize()
    collection_model = CollectionModel(threads_tokenized)
    freq_collection = collection_model.calculate_frequency()
    print(freq_collection)
    document_model = DocumentModel(threads_tokenized)
    freq_document = document_model.calculate_frequency()
    print(freq_document)
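# A minimal sketch of collecting the three pipeline tests above into a test
# case, assuming they live in a unittest.TestCase subclass (the class name
# here is hypothetical):
import unittest


class RetrievalPipelineTest(unittest.TestCase):
    """Would hold test_final, testtokenizerfromfile and testdocumentmodel."""


if __name__ == '__main__':
    unittest.main()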
# NOTE: fpath, SOS_ID/EOS_ID and the helper functions (get_embedding,
# get_bi_encoder, get_unbi_decoder, get_unbi_train_decoder, get_loss,
# get_train_op, greedy_decode, beam_search_decode, pre_process, DataSet,
# Tokenizer) are defined elsewhere in the module.
class Seq2SeqModel:
    def __init__(self):
        with open(os.path.join(fpath, 'params.yml'), 'r') as f:
            self.params = yaml.load(f)
        self.tokenizer = Tokenizer(self.params)
        self.save_file = self.params["save_file"]
        self.num_epochs = self.params["num_epochs"]
        self.batch_size = self.params["batch_size"]
        self.vocab_size = self.params["vocab_size"]
        self.embedding_size = self.params["embedding_size"]
        self.hidden_size = self.params["hidden_size"]
        self.learning_rate = self.params["learning_rate"]
        self.keep_prob = self.params["keep_prob"]
        self.maximum_iterations = self.params['max_length'] - 1

    def build_graph(self):
        self.src = tf.placeholder(tf.int64, [self.batch_size, None], name="src")
        self.tgt = tf.placeholder(tf.int64, [self.batch_size, None], name="tgt")
        # Split the target sequence into decoder input (drop the last token)
        # and decoder output (drop the first token).
        self.tgt_in = tf.strided_slice(
            self.tgt, [0, 0],
            [tf.shape(self.tgt)[0], tf.shape(self.tgt)[1] - 1], [1, 1],
            name="tgt_in")
        self.tgt_out = tf.strided_slice(
            self.tgt, [0, 1],
            [tf.shape(self.tgt)[0], tf.shape(self.tgt)[1]], [1, 1],
            name="tgt_out")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.raw_src_len = tf.placeholder(tf.int64, [self.batch_size, 1])
        self.src_lengths = tf.reshape(self.raw_src_len, shape=[-1], name="src_len")
        self.raw_tgt_len = tf.placeholder(tf.int64, [self.batch_size, 1])
        self.tgt_lengths = tf.reshape(self.raw_tgt_len, shape=[-1], name="tgt_len")
        self.global_step = tf.get_variable(name="global_step", initializer=0,
                                           trainable=False)
        self.embeddings = tf.get_variable(
            name="embeddings",
            shape=[self.vocab_size, self.embedding_size],
            dtype=tf.float32)
        with tf.name_scope("output_projection"):
            self.projection_layer = tf.layers.Dense(
                self.vocab_size, use_bias=False, name="output_projection")
        self.src_embedding, self.tgt_in_embedding = get_embedding(
            self.embeddings, self.src, self.tgt_in)
        encoder_outputs, self.encoder_state = get_bi_encoder(
            self.src_embedding, self.hidden_size, self.src_lengths,
            self.dropout_keep_prob)
        self.decoder_cell = get_unbi_decoder(self.hidden_size,
                                             self.dropout_keep_prob)

    def train(self):
        init_train = self.params['init_train']
        g = tf.Graph()
        with g.as_default():
            self.build_graph()
            self.logits = get_unbi_train_decoder(
                self.tgt_in_embedding, self.encoder_state, self.decoder_cell,
                self.hidden_size, self.batch_size, self.maximum_iterations,
                self.projection_layer)
            self.loss = get_loss(self.logits, self.tgt_out, self.tgt_lengths,
                                 self.maximum_iterations)
            tf.summary.scalar('loss', self.loss)
            self.output = tf.argmax(self.logits, -1)
            self.train_op = get_train_op(self.learning_rate, self.loss,
                                         self.global_step)
            merged = tf.summary.merge_all()
            train_writer = tf.summary.FileWriter(fpath + '/train', g)
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            sess = tf.Session()
            with sess.as_default():
                if init_train:
                    sess.run(init)
                else:
                    self.saver.restore(sess, self.save_file)
                dataset_obj = DataSet(self.params)
                dataset = dataset_obj.prepare_dataset()
                iterator = dataset.make_one_shot_iterator()
                next_element = iterator.get_next()
                # Train until the dataset iterator is exhausted.
                while 1:
                    try:
                        batch = sess.run(next_element)
                        feed_dict = {
                            self.src: batch['src'],
                            self.tgt: batch['tgt'],
                            self.dropout_keep_prob: self.keep_prob,
                            self.raw_src_len: batch['src_len'],
                            self.raw_tgt_len: batch['tgt_len']
                        }
                        summary, tgt_out, output, _, loss, step = sess.run([
                            merged, self.tgt_out, self.output, self.train_op,
                            self.loss, self.global_step
                        ], feed_dict)
                        if step % 100 == 1:
                            # Periodically print decoded predictions next to
                            # the reference targets.
                            for i in range(len(tgt_out)):
                                print(self.tokenizer.decode(
                                    [int(x) for x in output[i]]))
                                print(self.tokenizer.decode(
                                    [int(x) for x in tgt_out[i]]))
                            # sys.exit(0)
                        train_writer.add_summary(summary, step)
                        print('train step:{} loss:{}'.format(step, loss))
                    except tf.errors.OutOfRangeError:
                        print('over')
                        break
                self.saver.save(sess, self.save_file)

    def infer(self):
        self.g = tf.Graph()
        with self.g.as_default():
            self.build_graph()
            self.decode_mode = self.params['decode_mode']
            sos_id = SOS_ID
            eos_id = EOS_ID
            # Decoding strategy is chosen via params: greedy or beam search.
            if self.decode_mode == 'greedy':
                self.output = greedy_decode(self.batch_size, sos_id, eos_id,
                                            self.embeddings, self.decoder_cell,
                                            self.encoder_state,
                                            self.projection_layer,
                                            self.maximum_iterations)
            elif self.decode_mode == 'beam_search':
                beam_width = self.params['beam_width']
                self.output = beam_search_decode(self.batch_size, sos_id,
                                                 eos_id, self.embeddings,
                                                 self.encoder_state,
                                                 self.decoder_cell, beam_width,
                                                 self.projection_layer,
                                                 self.maximum_iterations)
            self.saver = tf.train.Saver()
            self.sess = tf.Session()
            self.saver.restore(self.sess, self.save_file)

    def predict(self, sentence):
        keep_prob = 1.0
        beam_width = self.params['beam_width']
        with self.sess.as_default():
            # src_total, tgt_in_total, tgt_out_total, src_lengths, tgt_lengths
            ret, ret_len = self.tokenizer.encode(pre_process(sentence))
            feed_dict = {
                self.src: [ret],
                self.dropout_keep_prob: keep_prob,
                self.raw_src_len: [[ret_len]],
            }
            output = self.sess.run([self.output], feed_dict)
            # print(output[0], output[0][0].dtype)
            # logger.info("batch output:{}".format(output))
            if self.decode_mode == 'greedy':
                print(self.tokenizer.decode(
                    [int(i) for i in list(output[0][0])]))
            elif self.decode_mode == 'beam_search':
                for i in range(beam_width):
                    res = self.tokenizer.decode(
                        [int(x) for x in list(output[0][i])])
                    print(res)
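# A minimal usage sketch for Seq2SeqModel, assuming params.yml provides
# init_train, decode_mode and the other keys read in __init__; training and
# decoding each build their own graph and session, so they are typically run
# separately:
if __name__ == '__main__':
    model = Seq2SeqModel()
    model.train()  # streams batches from DataSet(params).prepare_dataset()

    # For decoding, restore the checkpoint once, then call predict repeatedly:
    # model.infer()
    # model.predict('your input sentence')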
import os

import yaml

# Point all data paths at the /test/ fixtures and run one epoch of the
# raw -> semi -> dataset pipeline.
model_type = 'rnn'
model_params_file = os.path.join('model', model_type, 'params.yml')
with open(model_params_file, 'r') as f:
    params = yaml.load(f)
params.update({
    'num_epochs': 1,
    'semi_dir': params['semi_dir'].replace('/data/', '/test/'),
    'raw_data_dir': params['raw_data_dir'].replace('/data/', '/test/'),
    'dataset_dir': params['dataset_dir'].replace('/data/', '/test/'),
    'src_file': params['src_file'].replace('/data/', '/test/'),
    'tgt_file': params['tgt_file'].replace('/data/', '/test/'),
    'dataset_file': params['dataset_file'].replace('/data/', '/test/'),
})
print(params)
produce_semi(params)
tokenizer = Tokenizer(params)
semi_to_dataset(params, tokenizer)
class TextRnn:
    def __init__(self):
        with open('./params.yml', 'r') as f:
            params = yaml.load(f)
        self.params = params
        self.tokenizer = Tokenizer(self.params)
        self.embedding_size = self.params["embedding_size"]
        self.vocab_size = self.params["vocab_size"]
        self.input_size = self.params["max_length"]
        self.num_classes = len(self.params['labels'])
        self.keep_prob = self.params["dropout_keep_prob"]
        self.learning_rate = self.params["learning_rate"]
        self.num_epochs = self.params["num_epochs"]
        self.batch_size = self.params["batch_size"]
        self.hidden_size = self.params["hidden_size"]
        self.mode = self.params["mode"]

    def build(self):
        self.src = tf.placeholder(tf.int32, [None, self.input_size], name="src")
        self.tgt_raw = tf.placeholder(tf.int64, [None, 1])
        self.tgt = tf.one_hot(tf.reshape(self.tgt_raw, shape=[-1]),
                              depth=self.num_classes, name='tgt')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.raw_seq_len = tf.placeholder(tf.int64, [None, 1])
        self.seq_lengths = tf.reshape(self.raw_seq_len, shape=[-1], name="src_len")
        self.global_step = tf.Variable(0, name='global_step', trainable=False)

        with tf.name_scope('embeddings'):
            self.embeddings = tf.get_variable(
                "embedding", [self.vocab_size, self.embedding_size],
                dtype=tf.float32)
            # The embedding lookup yields a 3-D tensor of shape
            # [None, input_size, embedding_size].
            self.src_embedding = tf.nn.embedding_lookup(self.embeddings, self.src)

        with tf.name_scope('rnn'):
            lstm_fw_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size)  # forward direction cell
            lstm_bw_cell = tf.nn.rnn_cell.BasicLSTMCell(self.hidden_size)  # backward direction cell
            lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_fw_cell, output_keep_prob=self.dropout_keep_prob)
            lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm_bw_cell, output_keep_prob=self.dropout_keep_prob)
            # Dynamic bidirectional RNN; each direction's output has shape
            # [batch_size, sequence_length, hidden_size].
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_fw_cell, lstm_bw_cell, self.src_embedding,
                sequence_length=self.seq_lengths, dtype=tf.float32)
            # Concatenate the two directions along dimension 2 (the hidden-size
            # axis): [batch_size, sequence_length, hidden_size * 2].
            output_rnn = tf.concat(outputs, axis=2)
            # Sum over dimension 1 (the sequence-length axis):
            # [batch_size, 2 * hidden_size].
            self.final_outputs = tf.reduce_sum(output_rnn, axis=1)

        with tf.name_scope('output'):
            W = tf.Variable(
                tf.truncated_normal([2 * self.hidden_size, self.num_classes]))
            b = tf.Variable(tf.constant(0.1, shape=[self.num_classes]))
            self.scores = tf.nn.xw_plus_b(self.final_outputs, W, b, name='scores')
            self.predictions = tf.argmax(self.scores, 1, name='predictions')

        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.scores, labels=self.tgt)
            self.loss = tf.reduce_mean(losses)

        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions,
                                           tf.argmax(self.tgt, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"),
                                           name="accuracy")

        if self.mode == "train":
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            # compute_gradients returns a list of (gradient, variable) pairs.
            grads_and_vars = optimizer.compute_gradients(self.loss)
            # global_step counts how many times train_op has been run.
            self.train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=self.global_step)

    def train(self, dataset):
        save_file = self.params["save_file"]
        init_train = self.params['init_train']
        g = tf.Graph()
        with g.as_default():
            self.build()
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            sess = tf.Session()
            with sess.as_default():
                if init_train:
                    sess.run(init)
                    print('start to first training')
                else:
                    self.saver.restore(sess, save_file)
                iterator = dataset.make_one_shot_iterator()
                next_element = iterator.get_next()
                while 1:
                    try:
                        batch = sess.run(next_element)
                        # if batch['src'].shape[0] != self.batch_size:
                        #     break
                        feed_dict = {
                            self.src: batch['src'],
                            self.tgt_raw: batch['tgt'],
                            self.dropout_keep_prob: self.keep_prob,
                            self.raw_seq_len: batch['src_len']
                        }
                        _, step, loss, accuracy = sess.run([
                            self.train_op, self.global_step, self.loss,
                            self.accuracy
                        ], feed_dict)
                        if step % 100 == 0:
                            print('train step:{} loss:{} accuracy:{}'.format(
                                step, loss, accuracy))
                    except tf.errors.OutOfRangeError:
                        print('over')
                        break
                self.saver.save(sess, save_file)

    def infer(self):
        save_file = self.params["save_file"]
        self.mode = 'infer'
        self.g = tf.Graph()
        with self.g.as_default():
            self.build()
            self.saver = tf.train.Saver()
            self.sess = tf.Session()
            self.saver.restore(self.sess, save_file)

    def predict(self, sentence):
        line_encode, line_len = self.tokenizer.encode(sentence, padding=True)
        with self.sess.as_default():
            feed_dict = {
                self.src: [line_encode],
                self.dropout_keep_prob: 1.0,
                self.raw_seq_len: [[line_len]]
            }
            prediction = self.sess.run(self.predictions, feed_dict)
            return self.params['labels'][int(prediction[0])]

    def test(self, sentence_list):
        res = [
            self.tokenizer.encode(sentence, padding=True)
            for sentence in sentence_list
        ]
        line_encode_list = [each[0] for each in res]
        line_len_list = [[each[1]] for each in res]
        with self.sess.as_default():
            feed_dict = {
                self.src: line_encode_list,
                self.dropout_keep_prob: 1.0,
                self.raw_seq_len: line_len_list
            }
            prediction = self.sess.run(self.predictions, feed_dict)
            return [
                self.params['labels'][int(prediction[i])]
                for i in range(len(prediction))
            ]

    def test_batch(self, dataset):
        save_file = self.params["save_file"]
        g = tf.Graph()
        with g.as_default():
            self.build()
            self.saver = tf.train.Saver()
            sess = tf.Session()
            with sess.as_default():
                self.saver.restore(sess, save_file)
                iterator = dataset.make_one_shot_iterator()
                next_element = iterator.get_next()
                while 1:
                    try:
                        batch = sess.run(next_element)
                        # if batch['src'].shape[0] != self.batch_size:
                        #     break
                        feed_dict = {
                            self.src: batch['src'],
                            self.tgt_raw: batch['tgt'],
                            self.dropout_keep_prob: 1.0,
                            self.raw_seq_len: batch['src_len']
                        }
                        loss, accuracy = sess.run([self.loss, self.accuracy],
                                                  feed_dict)
                        if accuracy != 1:
                            # Dump the misclassified examples in this batch,
                            # then stop.
                            predictions = sess.run([self.predictions], feed_dict)
                            predictions = list(predictions[0])
                            tgt = list(np.reshape(batch['tgt'], [-1]))
                            res = {
                                self.tokenizer.decode(
                                    [int(x) for x in batch['src'][i]]):
                                'real:' + self.params['labels'][int(tgt[i])] +
                                '/predict:' +
                                self.params['labels'][int(predictions[i])]
                                for i in range(len(tgt))
                                if int(predictions[i]) != int(tgt[i])
                            }
                            print(res)
                            break
                        print('loss:{} accuracy:{}'.format(loss, accuracy))
                    except tf.errors.OutOfRangeError:
                        print('over')
                        break
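# A minimal usage sketch for TextRnn, assuming './params.yml' is present and a
# tf.data pipeline (e.g. the project's DataSet / semi_to_dataset helpers)
# supplies batches with 'src', 'tgt' and 'src_len' features:
if __name__ == '__main__':
    model = TextRnn()
    # With params['mode'] == 'train':
    # model.train(DataSet(model.params).prepare_dataset())

    # For serving, restore the checkpoint and classify sentences:
    model.infer()
    print(model.predict('example sentence'))
    print(model.test(['first example', 'second example']))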
config = Config(
    project_name='poc',
    model_type='rnn',
    update_semi=True,
    update_dataset=True,
    max_length=30,
    is_tgt_label=True,
    update_vocab=True,
    # extra_reserved_tokens=['<BotName>'],
    n_observations=None)

if config.init_params['update_semi']:
    # raw to semi
    produce_semi(config.params)

tokenizer = Tokenizer(config.params)
config.set_params({'vocab_size': tokenizer.vocab_size})
config.set_params({'update_vocab': False})

if config.init_params['update_dataset']:
    # semi to tfrecord (including tokenization)
    n_observations = semi_to_dataset(config.params, tokenizer)
    if not config.init_params['n_observations']:
        config.set_params({'n_observations': n_observations})

print(config.params)
final_params_path = os.path.join(config.params['model_dir'], 'params.yml')
print(final_params_path)
with open(final_params_path, 'w') as f:
    yaml.dump(config.init_params, f)
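# The dumped params.yml is what the model classes read back at construction
# time (Seq2SeqModel joins fpath with 'params.yml', TextRnn opens
# './params.yml'), so a quick round-trip sanity check might look like this
# sketch:
with open(final_params_path, 'r') as f:
    reloaded = yaml.load(f)
print(reloaded.get('vocab_size'), reloaded.get('n_observations'))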