def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(
          tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output])
                      for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def decode():
  print("Decoding")
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.txt" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.tags" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    test_file_path = os.path.join(FLAGS.data_dir, "test_pos.txt")

    # Decode sentences read from the test file.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = ("He reckons the current account deficit will narrow to "
                "only # 1.8 billion in September .")
    print("Reading Test File from: " + test_file_path)
    read_test_file = open(test_file_path, "r")
    for sentence in read_test_file:
      # Skip blank lines.
      if len(sentence.strip()) == 0:
        continue
      print("\nSentence = " + sentence)
      tokenized_list = sentence.strip().split()
      print(tokenized_list)
      print("Length of Tokenized Words: " + str(len(tokenized_list)))
      print("Tokenized with Basic Tokenizer")
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
      if len(token_ids) == 0:
        continue
      # Which bucket does it belong to? Skip sentences longer than the
      # largest bucket.
      bucket_array = [b for b in xrange(len(_buckets))
                      if _buckets[b][0] > len(token_ids)]
      if len(bucket_array) == 0:
        continue
      bucket_id = min(bucket_array)
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      print("Final Output: ")
      print("______________")
      print(outputs)
      # Print out the tag sequence corresponding to outputs.
      print(" ".join([rev_fr_vocab[output] for output in outputs]))
      print("Total Length of Tags: " + str(len(outputs)))
      print("\n> ", end="\n")
def evaluate_sentence(model, sess):
  # Save the batch size so it can be restored after decoding. (Using a
  # distinct name avoids the original bug where the saved value was
  # clobbered by the bucket-search comprehension variable below.)
  original_batch_size = model.batch_size
  model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  en_vocab_path = os.path.join(FLAGS.data_dir,
                               "vocab%d.txt" % FLAGS.en_vocab_size)
  fr_vocab_path = os.path.join(FLAGS.data_dir,
                               "vocab%d.tags" % FLAGS.fr_vocab_size)
  en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

  sentence = ("He reckons the current account deficit will narrow to "
              "only # 1.8 billion in September .")
  print(sentence)
  # Get token-ids for the input sentence.
  token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
  # Which bucket does it belong to?
  bucket_id = min([b for b in xrange(len(_buckets))
                   if _buckets[b][0] > len(token_ids)])
  # Get a 1-element batch to feed the sentence to the model.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      {bucket_id: [(token_ids, [])]}, bucket_id)
  # Get output logits for the sentence.
  _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, True)
  # This is a greedy decoder - outputs are just argmaxes of output_logits.
  outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
  # If there is an EOS symbol in outputs, cut them at that point.
  if data_utils.EOS_ID in outputs:
    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
  print(outputs)
  # Print out the tag sequence corresponding to outputs.
  print(" ".join([rev_fr_vocab[output] for output in outputs]))
  model.batch_size = original_batch_size
def test():
  """Test the translation model."""
  nltk.download('punkt')
  with tf.Session() as sess:
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    src_lang_vocab_path = (PATH_TO_DATA_FILES + FLAGS.src_lang +
                           "_mapping%d.txt" % FLAGS.src_lang_vocab_size)
    dst_lang_vocab_path = (PATH_TO_DATA_FILES + FLAGS.dst_lang +
                           "_mapping%d.txt" % FLAGS.dst_lang_vocab_size)
    src_lang_vocab, _ = data_utils.initialize_vocabulary(src_lang_vocab_path)
    _, rev_dst_lang_vocab = data_utils.initialize_vocabulary(dst_lang_vocab_path)

    weights = [0.25, 0.25, 0.25, 0.25]
    first_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.src_lang))
    second_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.dst_lang))

    total_bleu_value = 0.0
    computing_bleu_iterations = 0
    for first_lang_raw in first_lang_file:
      second_lang_gold_raw = second_lang_file.readline()
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(
          tf.compat.as_bytes(first_lang_raw), src_lang_vocab)
      # Which bucket does it belong to? Skip sentences longer than the
      # largest bucket.
      try:
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
      except ValueError:
        continue
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Assemble the sentence corresponding to outputs.
      model_tran_res = " ".join(
          [tf.compat.as_str(rev_dst_lang_vocab[output]) for output in outputs])

      second_lang_gold_tokens = word_tokenize(second_lang_gold_raw)
      model_tran_res_tokens = word_tokenize(model_tran_res)
      try:
        current_bleu_value = sentence_bleu(
            [model_tran_res_tokens], second_lang_gold_tokens, weights)
        total_bleu_value += current_bleu_value
        computing_bleu_iterations += 1
      except ZeroDivisionError:
        pass
      # Guard against dividing by zero before the first successful BLEU score.
      if computing_bleu_iterations > 0 and computing_bleu_iterations % 10 == 0:
        print("BLEU value after %d iterations: %.2f"
              % (computing_bleu_iterations,
                 total_bleu_value / computing_bleu_iterations))
    final_bleu_value = total_bleu_value / computing_bleu_iterations
    print("Final BLEU value after %d iterations: %.2f"
          % (computing_bleu_iterations, final_bleu_value))
    return
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(
          tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      try:
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out French sentence corresponding to outputs.
        print(" ".join([tf.compat.as_str(rev_fr_vocab[output])
                        for output in outputs]))
      except ValueError:
        # min() raises ValueError when the sentence is longer than the
        # largest bucket.
        print("Exception: input too long")
      finally:
        print("> ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
def init_for_conversation(self):
  self.sess = tf.Session()
  # Create model and load parameters.
  self.model = self.create_model(self.sess, True)
  self.model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  self.source_vocab_path = os.path.join(FLAGS.data_dir,
                                        "vocab%d.source" % FLAGS.vocab_size)
  self.target_vocab_path = os.path.join(FLAGS.data_dir,
                                        "vocab%d.target" % FLAGS.vocab_size)
  self.source_vocab, _ = data_utils.initialize_vocabulary(self.source_vocab_path)
  _, self.rev_target_vocab = data_utils.initialize_vocabulary(self.target_vocab_path)
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(
          tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = len(_buckets) - 1
      for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
          bucket_id = i
          break
      else:
        logging.warning("Sentence truncated: %s", sentence)

      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits, hidden_states = model.step_with_states(
          sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output])
                      for output in outputs]))
      print(" ".join([summarise_state(state) for state in hidden_states]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def __init__(self, params):
  self.data_dir = params['data_dir']
  self.train_dir = params['train_dir']
  self.size = params['size']
  self.num_layers = params['n_layers']

  self.sess = tf.Session()
  # Create model and load parameters.
  self.model = self.create_model(self.sess, True)
  self.model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  en_vocab_path = os.path.join(self.data_dir,
                               "vocab%d.en" % FLAGS.en_vocab_size)
  fr_vocab_path = os.path.join(self.data_dir,
                               "vocab%d.fr" % FLAGS.fr_vocab_size)
  self.en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, self.fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
def evaluate_valid(model, session, dev_set, current_step, printed_size):
  # Load vocabularies.
  input_vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.input" % FLAGS.vocab_size)
  target_vocab_path = os.path.join(FLAGS.data_dir,
                                   "vocab%d.target" % FLAGS.vocab_size)
  _, rev_input_vocab = data_utils.initialize_vocabulary(input_vocab_path)
  _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)

  eval_datas = []
  for bucket_id in xrange(len(_buckets)):
    eval_datas_bucket = []
    if len(dev_set[bucket_id]) == 0:
      print(" eval: empty bucket %d" % (bucket_id))
      continue
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        dev_set, bucket_id)
    _, eval_loss, output_logits = model.step(session, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
    eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
    print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
    log_data("dev_loss", current_step, model.global_step.eval(), eval_loss,
             bucket_id, eval_ppx)
    _buckets_loss[bucket_id] = eval_loss

    valid_input = np.transpose(encoder_inputs)
    valid_decode = np.transpose(decoder_inputs)
    for i in range(len(valid_input)):
      # Greedy decoding: take the argmax of each output logit for sample i.
      outputs = [int(np.argmax(logit[i:i + 1], axis=1))
                 for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[0:outputs.index(data_utils.EOS_ID)]
      istr = ids2str(valid_input[i][::-1], rev_input_vocab)
      tstr = ids2str(valid_decode[i], rev_target_vocab)
      ostr = ids2str(outputs, rev_target_vocab)
      eval_datas_bucket.append([istr, tstr, ostr])
    for i in range(min(printed_size, len(valid_input))):
      print(" sampled valid (i,t,o)", eval_datas_bucket[i][0],
            eval_datas_bucket[i][1], eval_datas_bucket[i][2])
    eval_datas += eval_datas_bucket
    sys.stdout.flush()
  return eval_datas
def translate_add(self, sentence):
  # The session and model are created elsewhere and shared as module globals.
  global sess
  global model
  model.batch_size = 1  # We decode one sentence at a time.

  # Load vocabularies.
  qt_vocab_path = os.path.join(FLAGS.data_dir,
                               "vocab%d.qt" % FLAGS.qt_vocab_size)
  ans_vocab_path = os.path.join(FLAGS.data_dir,
                                "vocab%d.ans" % FLAGS.ans_vocab_size)
  qt_vocab, _ = data_utils.initialize_vocabulary(qt_vocab_path)
  _, rev_ans_vocab = data_utils.initialize_vocabulary(ans_vocab_path)

  # Get token-ids for the input sentence.
  token_ids = data_utils.sentence_to_token_ids(
      tf.compat.as_bytes(sentence), qt_vocab)
  # Which bucket does it belong to?
  bucket_id = min([b for b in xrange(len(_buckets))
                   if _buckets[b][0] > len(token_ids)])
  # Get a 1-element batch to feed the sentence to the model.
  encoder_inputs, decoder_inputs, target_weights = model.get_batch(
      {bucket_id: [(token_ids, [])]}, bucket_id)
  # Get output logits for the sentence.
  _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, True)
  # This is a greedy decoder - outputs are just argmaxes of output_logits.
  outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
  # If there is an EOS symbol in outputs, cut them at that point.
  if data_utils.EOS_ID in outputs:
    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
  # Return the answer sentence corresponding to outputs.
  result = " ".join(
      [tf.compat.as_str(rev_ans_vocab[output]) for output in outputs])
  print("Server sent data:%s" % result)
  return result
def create_model(session, forward_only):
  """Create translation model and initialize or load parameters in session."""
  model = seq2seq_model.Seq2SeqModel(
      FLAGS.en_vocab_size, FLAGS.fr_vocab_size, _buckets, FLAGS.size,
      FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
      FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
      forward_only=forward_only)
  ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
  if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
  else:
    print("Created model with fresh parameters.")
    session.run(tf.initialize_all_variables())

  # Load vocabularies.
  en_vocab_path = "n_data/prompt_vocab.txt"
  fr_vocab_path = "n_data/response_vocab.txt"
  global en_vocab
  global rev_fr_vocab
  en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
  return model
def gen_batches(rootdir=TEXT_PATH, batch_size=2, answer_size=5,
                dialog_size=4, vocab_path=VOCAB_PATH):
  finished = False
  generator = gen_dialogs(rootdir)
  vocab, rev_vocab = initialize_vocabulary(vocab_path)
  # Dialogs must contain an even number of turns (question/answer pairs).
  if dialog_size % 2 != 0:
    dialog_size += 1
  while not finished:
    padded_dialogs = []
    encoder_inputs = []
    decoder_inputs = []
    target_weights = []
    try:
      for i in range(0, batch_size):
        padded_dialogs.append(padded(next(generator),
                                     answer_size=answer_size,
                                     dialog_size=dialog_size))
    except StopIteration:
      finished = True
    if len(padded_dialogs) < batch_size:
      # Not enough dialogs left for a full batch; drop the partial batch
      # instead of indexing past the end of padded_dialogs.
      break
    for i in range(0, dialog_size):
      # Re-index turn i from (batch, position) to (position, batch).
      data = [[padded_dialogs[batch_idx][i][length_idx]
               for batch_idx in range(0, batch_size)]
              for length_idx in range(0, answer_size)]
      if i % 2 == 0:
        encoder_inputs.append(data)
      else:
        # Decoder inputs get a GO symbol prepended; PAD positions get zero weight.
        data = [[_GO] * batch_size] + data
        decoder_inputs.append(data)
        weights = np.ones((answer_size + 1, batch_size), dtype=np.float32)
        for k in range(0, answer_size + 1):
          for j in range(0, batch_size):
            if data[k][j] == _PAD:
              weights[k][j] = 0.0
        target_weights.append(weights)
    yield (to_ids(encoder_inputs, vocab),
           to_ids(decoder_inputs, vocab),
           target_weights)
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.
    mrlf = open(outmrlfilename, "w+")

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    def single_sentence_decoding(sentence):
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(
          tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      return decode_once(output_logits, rev_fr_vocab)

    if not FLAGS.demo:
      # Decode the whole test corpus.
      for mrl, sentence in testdataiterator():
        print("translating:" + str(sentence))
        # Stem the input sentence.
        sentence = MRL_Linearizer.stemNL(sentence)
        value, counter = single_sentence_decoding(sentence)
        print('Found at iteration: ' + str(counter))
        print(value)
        # Write the translations to a file.
        mrlf.write(str(counter) + "|||" + value + "|||"
                   + Delinearizer.delinearizer(value) + "\n")
        mrlf.flush()
      # Create a file containing only the MRLs.
      with open("out.txt") as f:
        with open("nmtout.mrl", "w+") as out:
          for line in f:
            tokens = line.split("|||")
            out.write(tokens[2].replace("$", ""))
    else:
      # Interactive session: decode from standard input.
      sys.stdout.write("> ")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
      sentence = MRL_Linearizer.stemNL(sentence)
      print("translating:" + str(sentence))
      value, counter = single_sentence_decoding(sentence)
      print('Found at iteration: ' + str(counter))
      print(value)
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def train():
  print("Preparing korpora data in %s" % FLAGS.data_dir)
  en_train, fr_train, en_dev, fr_dev, _, _ = prepare_korpora_data(
      FLAGS.data_dir, FLAGS.vocab_size)

  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print("Reading development and training data (limit: %d)."
          % FLAGS.max_train_data_size)
    dev_set = read_data(en_dev, fr_dev)
    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_buckets_scale = calc_buckects_scale(train_bucket_sizes)

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1
      log_data("step_loss", current_step, model.global_step.eval(), step_loss,
               bucket_id)

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
        print("global step %d learning rate %.4f step-time %.2f perplexity "
              "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                        step_time, perplexity))
        log_data("checkpoint_loss", current_step, model.global_step.eval(),
                 loss, model.learning_rate.eval(), step_time, perplexity)
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "korpora.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0

        # Run evals on development set and print their perplexity.
        input_vocab_path = os.path.join(FLAGS.data_dir,
                                        "vocab%d.input" % FLAGS.vocab_size)
        target_vocab_path = os.path.join(FLAGS.data_dir,
                                         "vocab%d.target" % FLAGS.vocab_size)
        _, rev_input_vocab = data_utils.initialize_vocabulary(input_vocab_path)
        _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)
        for i in range(min(10, len(np.transpose(encoder_inputs)))):
          print(" sampled input (i,t,o)",
                ids2str(np.transpose(encoder_inputs)[i][::-1], rev_input_vocab),
                ids2str(np.transpose(decoder_inputs)[i], rev_target_vocab))
        evaluate_valid(model, sess, dev_set, current_step, 10)
        # Recalculate bucket proportions based on dev error, in order to
        # focus on the worst-performing bucket.
        train_buckets_scale = calc_buckects_scale(train_bucket_sizes)