Example 1
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[: outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
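All of these decode() variants lean on the module-level scaffolding of the TensorFlow 1.x translate tutorial. As a minimal sketch (the bucket sizes and flag defaults here are assumptions, not taken from the snippets themselves), the setup they expect looks like this:

from __future__ import print_function

import os
import sys

import numpy as np
import tensorflow as tf
from six.moves import xrange  # Python 2/3 compatible range.

import data_utils  # The tutorial's vocabulary and tokenization helpers.

# Each bucket is (max source length, max target length).
_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string("data_dir", "/tmp", "Data directory.")
tf.app.flags.DEFINE_integer("en_vocab_size", 40000, "English vocabulary size.")
tf.app.flags.DEFINE_integer("fr_vocab_size", 40000, "French vocabulary size.")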
Example 2
def decode():
    print("Decoding")
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.txt" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.tags" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        test_file_path = os.path.join(FLAGS.data_dir, "test_pos.txt")

        # Decode sentences read from the test file rather than standard input.
        print("Reading Test File from: " + test_file_path)

        read_test_file = open(test_file_path, "r")
        for sentence in read_test_file:
            if len(sentence.strip()) == 0:
                continue
            print("\nSentence = " + sentence)
            tokenized_list = sentence.strip().split()
            print(tokenized_list)
            print("Length of Tokenized Words: " + str(len(tokenized_list)))
            print("Tokenized with Basic Tokenizer")
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
            # print (token_ids)
            if len(token_ids) == 0:
                continue
            # Which bucket does it belong to? Skip sentences longer than the largest bucket.
            bucket_array = [b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]
            if len(bucket_array) == 0:
                continue
            bucket_id = min(bucket_array)
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[: outputs.index(data_utils.EOS_ID)]
            print("Final Output: ")
            print("______________")
            print(outputs)
            # Print out French sentence corresponding to outputs.
            print(" ".join([rev_fr_vocab[output] for output in outputs]))
            print("Total Length of Tags: " + str(len(outputs)))

            print("\n> ", end="\n")
Example 3
def evaluate_sentence(model, sess):
    # Saved under a distinct name: the Python 2 list comprehension below would clobber a variable named b.
    original_batch_size = model.batch_size
    model.batch_size = 1  # We decode one sentence at a time.
    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.txt" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.tags" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    sentence = "He reckons the current account deficit will narrow to only # 1.8 billion in September ."
    print(sentence)
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[: outputs.index(data_utils.EOS_ID)]
    print(outputs)
    # Print out the tag sequence corresponding to outputs.
    print(" ".join([rev_fr_vocab[output] for output in outputs]))
    model.batch_size = original_batch_size
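evaluate_sentence takes the session and model as arguments so it can be called mid-training without rebuilding the graph. A hypothetical call site, assuming the tutorial's create_model helper:

with tf.Session() as sess:
    model = create_model(sess, True)  # forward_only=True: decode, don't train.
    evaluate_sentence(model, sess)    # Prints the greedy-decoded tag sequence.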
Example 4
def test():
  """Test the translation model."""
  nltk.download('punkt')
  with tf.Session() as sess:
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    src_lang_vocab_path = PATH_TO_DATA_FILES + FLAGS.src_lang + "_mapping%d.txt" % FLAGS.src_lang_vocab_size
    dst_lang_vocab_path = PATH_TO_DATA_FILES + FLAGS.dst_lang + "_mapping%d.txt" % FLAGS.dst_lang_vocab_size
    src_lang_vocab, _ = data_utils.initialize_vocabulary(src_lang_vocab_path)
    _, rev_dst_lang_vocab = data_utils.initialize_vocabulary(dst_lang_vocab_path)

    weights = [0.25, 0.25, 0.25, 0.25]

    first_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.src_lang))
    second_lang_file = open(generate_src_lang_sentences_file_name(FLAGS.dst_lang))

    total_bleu_value = 0.0
    computing_bleu_iterations = 0

    for first_lang_raw in first_lang_file:
      second_lang_gold_raw = second_lang_file.readline()
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(first_lang_raw), src_lang_vocab)
      # Which bucket does it belong to?
      try:
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
      except ValueError:
        continue
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out sentence corresponding to outputs.
      model_tran_res = " ".join([tf.compat.as_str(rev_dst_lang_vocab[output]) for output in outputs])
      second_lang_gold_tokens = word_tokenize(second_lang_gold_raw)
      model_tran_res_tokens = word_tokenize(model_tran_res)
      try:
        current_bleu_value = sentence_bleu([model_tran_res_tokens], second_lang_gold_tokens, weights)
        total_bleu_value += current_bleu_value
        computing_bleu_iterations += 1
      except ZeroDivisionError:
        pass
      if computing_bleu_iterations > 0 and computing_bleu_iterations % 10 == 0:
        print("BLEU value after %d iterations: %.2f"
              % (computing_bleu_iterations, total_bleu_value / computing_bleu_iterations))
    final_bleu_value = total_bleu_value / computing_bleu_iterations
    print("Final BLEU value after %d iterations: %.2f" % (computing_bleu_iterations, final_bleu_value))
    return
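Note that NLTK's signature is sentence_bleu(references, hypothesis, weights, ...), with the reference list first; Example 4 passes the model output as the reference, so double-check the argument order for your data. The ZeroDivisionError guard can also be replaced by a smoothing function. A sketch with made-up token lists:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method1  # Avoids zero n-gram counts blowing up.
reference = "he reckons the deficit will narrow".split()
hypothesis = "he thinks the deficit will narrow".split()
score = sentence_bleu([reference], hypothesis,
                      weights=(0.25, 0.25, 0.25, 0.25),
                      smoothing_function=smoothie)
print("BLEU: %.4f" % score)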
Example 5
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?

            try:
                bucket_id = min([
                    b for b in xrange(len(_buckets))
                    if _buckets[b][0] > len(token_ids)
                ])
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, bucket_id,
                                                 True)
                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [
                    int(np.argmax(logit, axis=1)) for logit in output_logits
                ]
                # If there is an EOS symbol in outputs, cut them at that point.
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]

                # Print out French sentence corresponding to outputs.
                print(" ".join([
                    tf.compat.as_str(rev_fr_vocab[output])
                    for output in outputs
                ]))
            except ValueError:
                # Raised by min() on an empty list: input longer than any bucket.
                print("Exception: input too long")
            finally:
                print("> ", end="")
                sys.stdout.flush()
                sentence = sys.stdin.readline()
Example 6
    def init_for_conversation(self):
        self.sess = tf.Session()

        # Create model and load parameters.
        self.model = self.create_model(self.sess, True)
        self.model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        self.source_vocab_path = os.path.join(
            FLAGS.data_dir, "vocab%d.source" % FLAGS.vocab_size)
        self.target_vocab_path = os.path.join(
            FLAGS.data_dir, "vocab%d.target" % FLAGS.vocab_size)
        self.source_vocab, _ = data_utils.initialize_vocabulary(self.source_vocab_path)
        _, self.rev_target_vocab = data_utils.initialize_vocabulary(self.target_vocab_path)
Example 7
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Which bucket does it belong to?
      bucket_id = len(_buckets) - 1
      for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
          bucket_id = i
          break
      else:
        logging.warning("Sentence truncated: %s", sentence) 

      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits, hidden_states = model.step_with_states(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print(" ".join([summarise_state(state) for state in hidden_states]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
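The examples handle over-long inputs differently: Example 1 would raise ValueError from min() on an empty list, Example 2 skips the sentence, Example 5 catches the exception, and Example 7 falls back to the largest bucket with a warning. A self-contained sketch of the fallback variant (the bucket sizes are assumptions):

_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

def pick_bucket(token_ids, buckets=_buckets):
    """Smallest bucket whose source side fits, else the largest bucket."""
    for i, (source_size, _) in enumerate(buckets):
        if source_size >= len(token_ids):
            return i
    return len(buckets) - 1  # Longer than every bucket; caller should truncate.

assert pick_bucket([0] * 3) == 0
assert pick_bucket([0] * 12) == 2
assert pick_bucket([0] * 99) == 3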
Example 8
    def __init__(self, params):
        self.data_dir = params['data_dir']
        self.train_dir = params['train_dir']
        self.size = params['size']
        self.num_layers = params['n_layers']

        self.sess = tf.Session()
        # Create model and load parameters.
        self.model = self.create_model(self.sess, True)
        self.model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(self.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(self.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        self.en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, self.fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)  # Reversed vocab (id -> token).
Example 9
def evaluate_valid(model, session, dev_set, current_step, printed_size):
    # Load vocabularies.
    input_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.input" % FLAGS.vocab_size)
    target_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.target" % FLAGS.vocab_size)
    _, rev_input_vocab = data_utils.initialize_vocabulary(input_vocab_path)
    _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)
    eval_datas = []
    for bucket_id in xrange(len(_buckets)):
        eval_datas_bucket = []
        if len(dev_set[bucket_id]) == 0:
            print("  eval: empty bucket %d" % (bucket_id))
            continue
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            dev_set, bucket_id)
        _, eval_loss, output_logits = model.step(session, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
        eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
        print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
        log_data("dev_loss", current_step, model.global_step.eval(), eval_loss, bucket_id, eval_ppx)
        _buckets_loss[bucket_id] = eval_loss
        valid_input = np.transpose(encoder_inputs)
        valid_decode = np.transpose(decoder_inputs)
        for i in range(len(valid_input)):
            # Greedy outputs for the i-th batch element at every decoder time step.
            outputs = [int(np.argmax(logit[i:i + 1], axis=1)) for logit in output_logits]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[0:outputs.index(data_utils.EOS_ID)]
            # Encoder inputs are stored reversed, so un-reverse them for printing.
            istr = ids2str(valid_input[i][::-1], rev_input_vocab)
            tstr = ids2str(valid_decode[i], rev_target_vocab)
            ostr = ids2str(outputs, rev_target_vocab)
            eval_datas_bucket.append([istr, tstr, ostr])

        for i in range(min(printed_size, len(valid_input))):
            print("  sampled valid (i,t,o)", eval_datas_bucket[i][0],
                  eval_datas_bucket[i][1], eval_datas_bucket[i][2])
        eval_datas += eval_datas_bucket
        sys.stdout.flush()
    return eval_datas
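Several of these snippets convert a cross-entropy loss to perplexity with the same overflow guard; isolated, the rule is just:

import math

def perplexity(loss):
    # exp() overflows for large losses, so clamp to infinity as above.
    return math.exp(float(loss)) if loss < 300 else float("inf")

print(perplexity(2.3))    # ~9.97
print(perplexity(500.0))  # inf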
Example 10
def translate_add(self, sentence):
    global sess
    global model
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    qt_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.qt" % FLAGS.qt_vocab_size)
    ans_vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.ans" % FLAGS.ans_vocab_size)
    qt_vocab, _ = data_utils.initialize_vocabulary(qt_vocab_path)
    _, rev_ans_vocab = data_utils.initialize_vocabulary(ans_vocab_path)

    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 qt_vocab)
    # Which bucket does it belong to?
    bucket_id = min(
        [b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Print out answer sentence corresponding to outputs.
    result = " ".join(
        [tf.compat.as_str(rev_ans_vocab[output]) for output in outputs])
    print("Server sent data:%s" % result)
    return result
Example 11
def create_model(session, forward_only):
  """Create translation model and initialize or load parameters in session."""
  model = seq2seq_model.Seq2SeqModel(
      FLAGS.en_vocab_size, FLAGS.fr_vocab_size, _buckets,
      FLAGS.size, FLAGS.num_layers, FLAGS.max_gradient_norm, FLAGS.batch_size,
      FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
      forward_only=forward_only)
  ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
  if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
  else:
    print("Created model with fresh parameters.")
    session.run(tf.initialize_all_variables())
  # Load vocabularies.
  en_vocab_path = "n_data/prompt_vocab.txt"
  fr_vocab_path = "n_data/response_vocab.txt"

  global en_vocab
  global rev_fr_vocab
  en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
  return model
Example 12
def gen_batches(rootdir=TEXT_PATH,
                batch_size=2,
                answer_size=5,
                dialog_size=4,
                vocab_path=VOCAB_PATH):
    finished = False
    generator = gen_dialogs(rootdir)
    vocab, rev_vocab = initialize_vocabulary(vocab_path)

    if dialog_size % 2 != 0:
        dialog_size += 1

    while not finished:
        padded_dialogs = []
        encoder_inputs = []
        decoder_inputs = []
        target_weights = []

        try:
            for i in range(0, batch_size):
                padded_dialogs.append(
                    padded(next(generator),
                           answer_size=answer_size,
                           dialog_size=dialog_size))
        except StopIteration:
            finished = True
            if len(padded_dialogs) < batch_size:
                break  # The generator ran dry mid-batch; drop the incomplete batch.

        for i in range(0, dialog_size):
            data = [[
                padded_dialogs[batch_idx][i][length_idx]
                for batch_idx in range(0, batch_size)
            ] for length_idx in range(0, answer_size)]
            if i % 2 == 0:
                encoder_inputs.append(data)
            else:
                data = [[_GO] * batch_size] + data
                decoder_inputs.append(data)

                weights = np.ones((answer_size + 1, batch_size),
                                  dtype=np.float32)
                for t in range(0, answer_size + 1):
                    for j in range(0, batch_size):
                        if data[t][j] == _PAD:
                            weights[t][j] = 0.0

                target_weights.append(weights)

        yield to_ids(encoder_inputs, vocab), to_ids(decoder_inputs,
                                                    vocab), target_weights
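The target weights built above zero out the loss wherever the decoder target is padding. The same masking in a standalone numpy sketch (the special-token ids are assumptions matching the tutorial's data_utils, where PAD_ID = 0 and GO_ID = 1):

import numpy as np

_PAD, _GO = 0, 1  # Assumed special-token ids.

# (answer_size + 1) time steps x a batch of 2, GO-prefixed decoder targets.
data = np.array([[_GO, _GO],
                 [5, 7],
                 [6, _PAD],
                 [_PAD, _PAD]])
weights = (data != _PAD).astype(np.float32)  # 0.0 at every PAD position.
print(weights)
# [[1. 1.]
#  [1. 1.]
#  [1. 0.]
#  [0. 0.]]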
Example 13
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.
        mrlf = open(outmrlfilename, "w+")

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        #sys.stdout.write("> ")
        #sys.stdout.flush()
        #sentence = sys.stdin.readline()
        #sentence = MRL_Linearizer.stemNL(sentence)

        def single_sentence_decoding(sentence):

            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)

            return decode_once(output_logits, rev_fr_vocab)

        # Interactive session?
        if not FLAGS.demo:
            for mrl, sentence in testdataiterator():

                #decoding the whole test corpus

                print("translating:" + str(sentence))

                #stemming input sentence
                sentence = MRL_Linearizer.stemNL(sentence)

                value, counter = single_sentence_decoding(sentence)
                print('Found at iteration: ' + str(counter))
                print(value)

                #writing the translations on a file
                mrlf.write(
                    str(counter) + "|||" + value + "|||" +
                    Delinearizer.delinearizer(value) + "\n")
                mrlf.flush()

                #create file containing only the mrls
                with open("out.txt") as f:
                    with open("nmtout.mrl", "w+") as out:
                        for line in f:
                            tokens = line.split("|||")
                            out.write(tokens[2].replace("$", ""))
        else:
            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            sentence = MRL_Linearizer.stemNL(sentence)
            print("translating:" + str(sentence))
            value, counter = single_sentence_decoding(sentence)
            print('Found at iteration: ' + str(counter))
            print(value)
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example 14
def train():
  print("Preparing korpora data in %s" % FLAGS.data_dir)
  en_train, fr_train, en_dev, fr_dev, _, _ = prepare_korpora_data(
      FLAGS.data_dir, FLAGS.vocab_size)

  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    dev_set = read_data(en_dev, fr_dev)
    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_buckets_scale = calc_buckects_scale(train_bucket_sizes)

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1
      log_data("step_loss", current_step, model.global_step.eval(), step_loss, bucket_id)

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        log_data("checkpoint_loss", current_step, model.global_step.eval(), loss, 
                 model.learning_rate.eval(),step_time,perplexity)
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "korpora.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        
        # Run evals on development set and print their perplexity.
        input_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.input" % FLAGS.vocab_size)
        target_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.target" % FLAGS.vocab_size)
        _, rev_input_vocab = data_utils.initialize_vocabulary(input_vocab_path)
        _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)
        for i in range(min(10, len(np.transpose(encoder_inputs)))):
          print("  sampled input (i,t,o)",
                ids2str(np.transpose(encoder_inputs)[i][::-1], rev_input_vocab),
                ids2str(np.transpose(decoder_inputs)[i], rev_target_vocab))

        evaluate_valid(model, sess, dev_set, current_step, 10)
        # Recalculate bucket proportions based on dev error, to focus on bad buckets.
        train_buckets_scale = calc_buckects_scale(train_bucket_sizes)
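The learning-rate schedule above decays whenever the latest checkpoint loss fails to beat all of the last three. That plateau rule in isolation:

def should_decay(previous_losses, loss, window=3):
    """True when loss is worse than everything in the recent window."""
    return len(previous_losses) >= window and loss > max(previous_losses[-window:])

history = [4.2, 4.0, 3.9]
print(should_decay(history, 4.3))  # True: worse than all of the last three.
print(should_decay(history, 3.5))  # False: still improving.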