Example #1
def interactive():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    w = sys.stdin.readline()
    word = " ".join(list(w))
    while word:
      gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
      if not gr_absent:
        res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
        print(res_phoneme_seq)
      else:
        print("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )
      print("> ", end="")
      sys.stdout.flush()
      w = sys.stdin.readline()
      word = " ".join(list(w))
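
All of the examples on this page unpack data_utils.initialize_vocabulary(path) into a forward vocabulary (token to id) and a reversed vocabulary (id to token). The helper itself is never shown, so the following is only a minimal sketch of the contract the examples appear to assume, including an assumed file format of one token per line:

def initialize_vocabulary(vocabulary_path):
    # Assumed contract: return (vocab, rev_vocab), where vocab maps
    # token -> id and rev_vocab is a list indexed by id.
    with open(vocabulary_path, "r", encoding="utf-8") as f:
        rev_vocab = [line.strip() for line in f]           # id -> token
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}  # token -> id
    return vocab, rev_vocab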
Example #2
def decode_from_stdin(show_all_n_best=False, FLAGS=None, buckets=None):

    assert FLAGS is not None
    assert buckets is not None

    # with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # Create model and load parameters.
        model = create_seq2seq_model(sess, True, FLAGS, buckets, translate=True)

        # Load vocabularies.
        source_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.source_lang)

        target_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.target_lang)

        src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
        _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:

            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)

            # Get output logits for the sentence.
            output_hypotheses, output_scores = model.translation_step(sess, token_ids, beam_size=FLAGS.beam_size, dump_remaining=False)

            outputs = []

            for x in output_hypotheses:
                try:
                    outputs.append(x[:x.index(data_utils.EOS_ID)])
                except ValueError:
                    pass

            output_hypotheses = outputs

            # print translations
            if show_all_n_best:
                for x in xrange(len(outputs)):
                    out = outputs[x]
                    # Print out French sentence corresponding to outputs.
                    print(str(numpy.exp(-output_scores[x])) + "\t" + " ".join([rev_tgt_vocab[output] for output in out]))
            else:
                out = outputs[0]
                # Print out French sentence corresponding to outputs.
                print(str(numpy.exp(-output_scores[0])) + "\t" + " ".join([rev_tgt_vocab[output] for output in out]))

            # wait for a new sentence to translate
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir, "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch({bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[: outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([rev_fr_vocab[output] for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
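
The bucket lookup in decode() above, repeated in most of the later examples, assumes a module-level _buckets list of (source_length, target_length) pairs; the min(...) expression picks the smallest bucket whose source side is longer than the tokenized input and raises ValueError when no bucket fits. A hedged sketch of that selection logic with an explicit fallback to the last bucket (the bucket sizes below are an assumption, mirroring the classic translate tutorial):

_buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]  # assumed bucket sizes

def select_bucket(token_ids, buckets=_buckets):
    # Smallest bucket whose source side can hold the input,
    # falling back to the last bucket for over-long sentences.
    candidates = [b for b in range(len(buckets)) if buckets[b][0] > len(token_ids)]
    return min(candidates) if candidates else len(buckets) - 1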
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.tgt" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.src" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    sentence = map(lambda x:x.decode('utf-8'), ['こんにちは']).pop()
    with open('./narou/narou_dev.src.txt', 'r')  as f:
        lines = f.read().split('\n')
        print(lines)
    #while sentence:
    for sentence in lines:
      print(sentence)
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      print(token_ids)
      # Which bucket does it belong to?
      try:
        bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      except:
        continue
        # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]

      # Print out French sentence corresponding to outputs.
      print("ANS:>", " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      continue
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def eval():
    # Load vocabularies.
    nl_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.nl" % FLAGS.nl_vocab_size)
    cm_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.cm.ast" % FLAGS.cm_vocab_size)
    nl_vocab, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path)
    cm_vocab, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path)

    train_set, dev_set, _ = load_data()
    model = knn.KNNModel()
    model.train(train_set)
    eval_tools.eval_set(model_name, dev_set, rev_nl_vocab, FLAGS)
def evaluate(filename):
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    writer = open('pred.txt', 'w')
    count = 0
    with open(filename, 'r') as reader:
      for sentence in reader:
        count += 1
        if count % 1000 == 0:
          print (count)        
        chunks = parser(sentence)
        #print (chunks)
        # Get token-ids for the input sentence.
        for sen in chunks:
          token_ids = data_utils.sentence_to_token_ids(sen.strip('\n'), en_vocab)
          # Which bucket does it belong to?
          bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
          # Get a 1-element batch to feed the sentence to the model.
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
          # Get output logits for the sentence.
          _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          # This is a greedy decoder - outputs are just argmaxes of output_logits.
          outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
          # If there is an EOS symbol in outputs, cut them at that point.
#          print ("previous: ")
#          print (outputs)
          if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
#          print ("after: ")
#          print (outputs)
          output = (" ".join([rev_fr_vocab[output] for output in outputs]))
          #print (rev_fr_vocab)
          #print ("output: %s" % output)
          writer.write(output.split()[0]+'\n')
    writer.close()
def inter_decode():
  if not (FLAGS.inter_decode_sent and FLAGS.inter_decode_position and FLAGS.inter_decode_map):
    raise ValueError(" Invalid argument, please set inter_decode setting! ")
  with tf.Session() as sess:
    # Load dictionary
    srce_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.srce" % FLAGS.srce_vocab_min)
    trgt_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.trgt" % FLAGS.trgt_vocab_min)
    srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
    trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

    # Create model
    model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Decode from standard input.  ---> interactive decoding
    # sys.stdout.write("> ")
    # sys.stdout.flush()
    # sentence = sys.stdin.readline()
    sentence = FLAGS.inter_decode_sent

    # read supplement input: children, weight.
    # init_pos = eval(sys.stdin.readline())
    # mapp = eval(sys.stdin.readline())
    init_pos = eval(FLAGS.inter_decode_position)
    mapp = eval(FLAGS.inter_decode_map)


    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    # pdb.set_trace()
    encoder_input, decoder_input, target_weight, pos, maps = model.get_batch(
        {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits, attentions, env, out_pos = model.step(sess, encoder_input, decoder_input, target_weight, bucket_id, True, 
                decoder_inputs_positions=pos, decoder_inputs_maps=maps)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
      outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    
    final_pos = out_pos[0].tolist()
    for l in xrange(len(outputs)-1):
      final_pos.extend(out_pos[l+1].tolist())

    return final_pos
def decode():
    # Load vocabularies.
    nl_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.nl" % FLAGS.nl_vocab_size)
    cm_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.cm" % FLAGS.cm_vocab_size)
    nl_vocab, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path)
    cm_vocab, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path)

    train_set, dev_set, _ = load_data()
    model = knn.KNNModel()
    model.train(train_set)

    decode_set(model, dev_set, rev_nl_vocab, rev_cm_vocab)
Example #9
def get_vocabs():
  """Initialize and return vocabularies and paths to them.

  Returns:
    gr_vocab: Graphemes vocabulary;
    rev_ph_vocab: Reversed phonemes vocabulary;
    gr_vocab_path: Path to the graphemes vocabulary;
    ph_vocab_path: Path to the phonemes vocabulary.
  """
  # Initialize vocabularies
  gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
  ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
  gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
  _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)
  return (gr_vocab, rev_ph_vocab, gr_vocab_path, ph_vocab_path)
  def __init__(self):
    self.sess = tf.Session()
    self.download_trained_if_not_exists()

    # Create model and load parameters.
    self.model = create_model(self.sess, True)
    self.model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.fr_vocab_size)
    self.en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, self.rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
Example #11
def init_session(sess, conf='seq2seq.ini'):
    """
    DOCSTRING
    """
    global gConfig
    gConfig = get_config(conf)
    model = create_model(sess, True)
    model.batch_size = 1
    enc_vocab_path = os.path.join(gConfig['working_directory'],
                                  "vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],
                                  "vocab%d.dec" % gConfig['dec_vocab_size'])
    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
    return sess, model, enc_vocab, rev_dec_vocab
Example #12
def decode():
    input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        in_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.in" % FLAGS.in_vocab_size)
        out_vocab_path = os.path.join(FLAGS.data_dir,
                                      "vocab%d.out" % FLAGS.out_vocab_size)
        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)

        # Decode from standard input.
        with gfile.GFile("test.txt", "r") as f:
            sentence = f.readline()
        for _ in range(1):
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)
            # Which bucket does it belong to?
            bucket_id = len(_buckets) - 1
            for i, bucket in enumerate(_buckets):
                if bucket[0] >= len(token_ids):
                    bucket_id = i
                    break
            else:
                logging.warning("Sentence truncated: %s", sentence)

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            with gfile.GFile("output.txt", "w") as f:
                f.write(" ".join([rev_out_vocab[output]
                                  for output in outputs]))
Example #13
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.en" % FLAGS.en_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.fr" % FLAGS.fr_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([
                tf.compat.as_str(rev_fr_vocab[output]) for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
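
Several of these decoders share the same greedy idiom: output_logits is a per-step list of logit arrays of shape [batch_size, vocab_size] (batch size 1 here), the output token at each step is the argmax, and the sequence is cut at the first EOS_ID. A tiny self-contained illustration with fabricated logits; the special-symbol ids and vocabulary below are assumptions, not taken from the examples:

import numpy as np

EOS_ID = 2  # assumed special-symbol id (_PAD=0, _GO=1, _EOS=2, _UNK=3)
rev_vocab = ["_PAD", "_GO", "_EOS", "_UNK", "hello", "world"]

# Fabricated per-step logits, shape [1, vocab_size] each.
output_logits = [
    np.array([[0.1, 0.0, 0.2, 0.0, 2.5, 0.3]]),  # argmax -> 4 ("hello")
    np.array([[0.1, 0.0, 0.2, 0.0, 0.3, 2.5]]),  # argmax -> 5 ("world")
    np.array([[0.1, 0.0, 3.0, 0.0, 0.2, 0.3]]),  # argmax -> 2 (EOS)
]

outputs = [int(np.argmax(logit, axis=1)[0]) for logit in output_logits]
if EOS_ID in outputs:
    outputs = outputs[:outputs.index(EOS_ID)]
print(" ".join(rev_vocab[o] for o in outputs))  # prints: hello world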
Example #14
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_enc.txt" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d_dec.txt" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)



    # Decode sentence and store it
    with open(gConfig["test_enc"], 'r') as test_enc:
        with open(gConfig["output"], 'w') as predicted_headline:
            sentence_count = 0
            for sentence in test_enc:
                # Get token-ids for the input sentence.
                token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
                # Which bucket does it belong to? Place the sentence in the last bucket if its token length is larger than the largest bucket.
                bucket_id = min([b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)] + [len(_buckets)-1])
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                           target_weights, bucket_id, True)

                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

                # If there is an EOS symbol in outputs, cut them at that point.
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Write predicted headline corresponding to article.
                predicted_headline.write(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])+'\n')
                sentence_count += 1
                if sentence_count % 100 == 0:
                    print("predicted data line %d" % sentence_count)
                    sys.stdout.flush()

        predicted_headline.close()
    test_enc.close()

    print("Finished decoding and stored predicted results in %s!" % gConfig["output"])
Example #15
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = FLAGS.batch_size  # Decode FLAGS.batch_size sentences at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.fr" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.en" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    #sys.stdout.write("> ")
    #sys.stdout.flush()
    predif = open(FLAGS.predifname).readlines()
    predof = open(FLAGS.predofname, 'w')
    #sentence = predif.readline()
    #count = 0
    batch_decode = []
    for predin in predif:
        token_ids = data_utils.sentence_to_token_ids(predin, en_vocab)
        # Which bucket does it belong to?
        bucket_id = 0#min([b for b in xrange(len(_buckets))
                    #   if _buckets[b][0] > len(token_ids)])
        batch_decode.append((token_ids, []))
        if len(batch_decode) == FLAGS.batch_size:

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              {bucket_id: batch_decode}, bucket_id)

            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
            #embed()
            outputs = np.transpose(np.array(output_logits), (1, 0, 2))
            outputs = np.argmax(outputs, axis=2)
            # If there is an EOS symbol in outputs, cut them at that point.
            for ii, out in enumerate(outputs):
                idxx = np.where(out == data_utils.EOS_ID)[0]
                if len(idxx)>0:
                    out = out[:idxx[0]]
                predo = " ".join([rev_fr_vocab[word] for word in out])
                print (predo)
                predof.write(predo + '\n')
            batch_decode = []
def testBLEU():
  source = sys.argv[1]
  target = sys.argv[2]
  with tf.Session() as sess:
    model = create_model(sess, True, True)
    model.batch_size = 1  
    s_vocab_path = os.path.join(FLAGS.data_dir,
                                "vocab%d.%s" % (FLAGS.s_vocab_size, source))
    t_vocab_path = os.path.join(FLAGS.data_dir,
                                "vocab%d.%s" % (FLAGS.t_vocab_size, target))
    s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path)
    _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path)
    BLEUscore = {0:[], 1:[], 2:[], 3:[]}
    s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source)
    t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target)
    f_s = open(s_test_path, 'r')
    f_t = open(t_test_path, 'r')
    step = 0
    for sentence in f_s:
      print(step)
      
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab)
      bucket_id = len(_buckets) - 1
      for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
          bucket_id = i
          break
      else:
        logging.warning("Sentence truncated: %s", sentence) 

      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs]
      reference = f_t.readline().split(' ')
      try:
        temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
      except:
        temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate, weights=(.5, .5))
      BLEUscore[bucket_id].append(temp_score)
      step += 1
      print(temp_score)
    for key, val in BLEUscore.items():
      print(key, ": ", np.mean(val))
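
The try/except in testBLEU() above guards the call to nltk's sentence-level BLEU: with the default 4-gram weights the score can fail or degenerate for candidates shorter than four tokens, so it retries with bigram-only weights. A small standalone sketch of the same fallback on a made-up reference/candidate pair:

from nltk.translate.bleu_score import sentence_bleu

reference = "the cat sat on the mat".split()
candidate = "the cat".split()  # too short for the default 4-gram weights

try:
    score = sentence_bleu([reference], candidate)
except Exception:
    # Retry with bigram-only weights, as testBLEU() does.
    score = sentence_bleu([reference], candidate, weights=(.5, .5))
print(score)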
Example #17
def init_session(sess, conf='seq2seq.ini'):
    global gConfig
    gConfig = get_config(conf)
 
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    return sess, model, enc_vocab, rev_dec_vocab
Example #18
def chat():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, forward_only=True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.in" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sentence = re.sub(u'[^\u4e00-\u9fa5,。;:?!‘’“”、]', '',
                          sentence.decode('utf-8'))
        sentence = re.sub(u'(?P<chinese>[\u4e00-\u9fa5,。;:?!‘’“”、])',
                          add_space, sentence)

        while sentence:
            predicted_sentence = get_predicted_sentence(
                sentence, vocab, rev_vocab, model, sess)
            print(predicted_sentence)
            print("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
            sentence = re.sub(u'[^\u4e00-\u9fa5,。;:?!‘’“”、]', '',
                              sentence.decode('utf-8'))
            sentence = re.sub(u'(?P<chinese>[\u4e00-\u9fa5,。;:?!‘’“”、])',
                              add_space, sentence)
Example #19
def predict():
    def _get_test_dataset():
        with open(TEST_DATASET_PATH) as test_fh:
            test_sentences = [s.strip() for s in test_fh.readlines()]
        return test_sentences

    results_filename = '_'.join([
        'results',
        str(FLAGS.num_layers),
        str(FLAGS.size),
        str(FLAGS.vocab_size)
    ])
    results_path = os.path.join(FLAGS.results_dir, results_filename)

    with tf.Session() as sess, open(results_path, 'w') as results_fh:
        # Create model and load parameters.
        model = create_model(sess, forward_only=True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.in" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        test_dataset = _get_test_dataset()

        for sentence in test_dataset:
            # Get token-ids for the input sentence.
            predicted_sentence = get_predicted_sentence(
                sentence, vocab, rev_vocab, model, sess)
            print(sentence + ' -> ' + predicted_sentence)

            results_fh.write(predicted_sentence + '\n')
Example #20
 def __init__(self):
     chatbot.FLAGS.train_dir = 'tmp'
     chatbot.FLAGS.data_dir = 'tmp'
     self.sess = tf.InteractiveSession()
     self.model = chatbot.create_model(self.sess, True)
     self.model.batch_size = 1
     data_dir = 'tmp'
     input_vocab_size = 40000
     output_vocab_size = 40000
     input_vocab_path = os.path.join(data_dir,
                                     "vocab%d.in" % input_vocab_size)
     output_vocab_path = os.path.join(data_dir,
                                      "vocab%d.out" % output_vocab_size)
     self.in_vocab, _ = data_utils.initialize_vocabulary(input_vocab_path)
     _, self.rev_out_vocab = data_utils.initialize_vocabulary(
         output_vocab_path)
Example #21
    def __init__(self):
        self.VOCAB_SIZE = 1000
        self.SEQ_LEN = 20

        vocab_path = 'data/sequence/vocab{}'.format(self.VOCAB_SIZE + 4)
        if os.path.exists(vocab_path):
            vocab, self.rev_vocab = data_utils.initialize_vocabulary(
                vocab_path)

        def gen_data(f, num_data, test=False):
            for _ in range(num_data):
                inp_init = np.random.randint(self.VOCAB_SIZE)
                inp_len = np.random.randint(1, high=self.SEQ_LEN + 1)
                inp, out_init = self.compute(inp_init, inp_len)
                buf = ' '.join(str(i) for i in inp)
                buf += '\n'
                f.write(buf)
                if not test:
                    out_len = np.random.randint(1, high=self.SEQ_LEN + 1)
                    out, _ = self.compute(out_init, out_len)
                    buf = ' '.join(str(i) for i in out)
                    buf += '\n'
                    f.write(buf)

        if not os.path.exists('data/sequence/train_sequence.txt'):
            with open('data/sequence/train_sequence.txt', 'w') as f:
                gen_data(f, 100000)
            with open('data/sequence/dev_sequence.txt', 'w') as f:
                gen_data(f, 10000)
            with open('data/sequence/test_sequence.txt', 'w') as f:
                gen_data(f, 10000, test=True)
Example #22
def read_data(src_path, vocab_path):
    data_set = []
    max_length1, max_length2 = 0, 0
    from_vocab, rev_from_vocab = data_utils.initialize_vocabulary(vocab_path)
    with tf.gfile.GFile(src_path, mode="r") as src_file:
        src = src_file.readline()
        counter = 0
        while src:
            if counter % 100000 == 0:
                print("  reading data line %d" % counter)
                sys.stdout.flush()
            # if counter > 100000:
            #      break
            sentences = []
            s = []
            for x in src.split(" "):
                id = int(x)
                if id != -1:
                    s.append(id)
                else:
                    if len(s) > max_length1:
                        max_length1 = len(s)
                    if len(s) > 25:
                        s = s[:25]
                    sentences.append(s)
                    s = []
            data_set.append(sentences)
            counter += 1
            src = src_file.readline()
    print(counter)
    print(max_length1)
    return data_set
Example #23
def predict():
    with tf.Session() as sess:
        model_obj = model.Seq2SeqModel(config, 'decode')
        model_obj.batch_size = 1
        model_obj.model_restore(sess)

        vocab_path = config.source_vocabulary
        vocab, vocab_list = data_utils.initialize_vocabulary(vocab_path)

        while True:
            question = input("输入:")
            if question == "" or question == 'exit':
                break
            sentence = " ".join(list(jieba.cut(question)))
            token_ids_sentence = data_utils.sentence_to_token_ids(
                sentence, vocab)
            if config.beam_with > 1:
                predicted_sentence = model_obj.predict_beam_search(
                    sess, np.array([token_ids_sentence]),
                    np.array([len(token_ids_sentence)]), vocab_list)
            else:
                predicted_sentence = model_obj.predict(
                    sess, np.array([token_ids_sentence]),
                    np.array([len(token_ids_sentence)]), vocab_list)
            print("输出:", predicted_sentence)
Example #24
def read_chat_data(data_path, vocabulary_path, max_size=None):
    counter = 0
    vocab, _ = initialize_vocabulary(vocabulary_path)
    print("size of vocab: %s" % len(vocab))
    print("max size: %s" % max_size)
    data_set = [[] for _ in _buckets]
    with codecs.open(data_path, "rb") as fi:
        for line in fi.readlines():
            counter += 1
            if max_size != 0 and counter > max_size:
                break
            if counter % 10000 == 0:
                print("  reading data line %d" % counter)
                sys.stdout.flush()
            entities = line.decode().lower().split("\t")
            # print entities
            if len(entities) == 2:
                source = entities[0]
                target = entities[1]
                source_ids = [
                    int(x) for x in sentence_to_token_ids(source, vocab)
                ]
                target_ids = [
                    int(x) for x in sentence_to_token_ids(target, vocab)
                ]
                target_ids.append(EOS_ID)
                for bucket_id, (source_size,
                                target_size) in enumerate(_buckets):
                    if len(source_ids) < source_size and len(
                            target_ids) < target_size:
                        data_set[bucket_id].append([source_ids, target_ids])
                        break
    return data_set
Example #25
    def __init__(self):
        self.SEQ_LEN = 10

        vocab_path = 'data/addition/vocab{}'.format(10 + 4)
        if os.path.exists(vocab_path):
            _, self.rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        def gen_data(f, num_data, test=False):
            for _ in range(num_data):
                seq_len = np.random.randint(2, high=self.SEQ_LEN + 1)
                seq = np.random.randint(10, size=seq_len)
                inp_seq = ' '.join(str(i) for i in seq)
                inp_seq += '\n'
                f.write(inp_seq)
                if not test:
                    flag = np.random.randint(1, high=seq_len)
                    num1 = ''.join(str(i) for i in seq[:flag])
                    num1 = int(num1)
                    num2 = ''.join(str(i) for i in seq[flag:])
                    num2 = int(num2)
                    out = str(num1 + num2)
                    out_seq = ' '.join(i for i in out)
                    out_seq += '\n'
                    f.write(out_seq)

        if not os.path.exists('data/addition/train_addition.txt'):
            with open('data/addition/train_addition.txt', 'w') as f:
                gen_data(f, 100000)
            with open('data/addition/dev_addition.txt', 'w') as f:
                gen_data(f, 10000)
            with open('data/addition/test_addition.txt', 'w') as f:
                gen_data(f, 10000, test=True)
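
To make the generated addition data concrete, this is the pair of lines gen_data() above writes for one training sample; the digit sequence and split point below are fabricated for illustration:

import numpy as np

seq = np.array([1, 2, 3, 4])  # fabricated digit sequence
flag = 2                      # fabricated split point
num1 = int(''.join(str(i) for i in seq[:flag]))  # 12
num2 = int(''.join(str(i) for i in seq[flag:]))  # 34
print(' '.join(str(i) for i in seq))  # input line:  "1 2 3 4"
print(' '.join(str(num1 + num2)))     # target line: "4 6"  (12 + 34 = 46)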
Example #26
    def __init__(self):
        self.UPBOUND = 9
        self.SEQ_LEN = 10
        vocab_path = 'data/counting/vocab{}'.format(self.UPBOUND + 1 + 4)
        vocab, self.rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        self.number_rev_vocab = tf.string_to_number(
            tf.constant(self.rev_vocab[4:]), tf.int32)

        def gen_data(f, num_data, test=False):
            for _ in range(num_data):
                inp_len = np.random.randint(1, high=self.SEQ_LEN)
                inp = np.random.randint(self.UPBOUND + 1, size=inp_len)
                buf = ' '.join(str(i) for i in inp)
                buf += '\n'
                f.write(buf)
                if not test:
                    #out_flags_num = np.random.randint(inp_len + 1)
                    out_flag = np.random.randint(inp_len)
                    out = [out_flag, inp[out_flag], len(inp) - out_flag - 1]
                    buf = ' '.join(str(i) for i in out)
                    buf += '\n'
                    f.write(buf)

        if not os.path.exists('data/counting/train_counting.txt'):
            with open('data/counting/train_counting.txt', 'w') as f:
                gen_data(f, 100000)
            with open('data/counting/dev_counting.txt', 'w') as f:
                gen_data(f, 10000)
            with open('data/counting/test_counting.txt', 'w') as f:
                gen_data(f, 10000, test=True)
Example #27
def prepare_data(config):
    train_path = os.path.join(config.train_dir, "chitchat.train")
    data_path_list = [train_path + ".answer", train_path + ".query"]
    vocab_path = os.path.join(config.train_dir,
                              "vocab%d.all" % config.vocab_size)
    data_utils.create_vocabulary(vocab_path, data_path_list, config.vocab_size)
    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
    #
    # if os.path.isfile(config.dev_set) and os.path.isfile(config.train_set):
    #     dev_set_file = open(config.dev_set, "rb")
    #     dev_set = pickle.load(dev_set_file)
    #     dev_set_file.close()
    #
    #     train_set_file = open(config.train_set, "rb")
    #     train_set = pickle.load(train_set_file)
    #     train_set_file.close()
    # else:
    print("Prepare Chitchat data in %s" % config.train_dir)
    train_query, train_answer, dev_query, dev_answer = data_utils.prepare_chitchat_data(
        config.train_dir, vocab, config.vocab_size)

    print("Reading development and training data (limit: %d)." %
          config.max_train_data_size)
    dev_set = read_data(config, dev_query, dev_answer)
    train_set = read_data(config, train_query, train_answer)

    # dev_set_file = open(config.dev_set, "wb")
    # pickle.dump(dev_set, dev_set_file)
    # dev_set_file.close()
    #
    # train_set_file = open(config.train_set, "wb")
    # pickle.dump(train_set, train_set_file)
    # train_set_file.close()

    return vocab, rev_vocab, dev_set, train_set
Example #28
def chat(args):
    with tf.Session() as sess:
        # Create model and load parameters.
        args.batch_size = 1  # We decode one sentence at a time.
        model = create_model(sess, args)

        # Load vocabularies.
        vocab_path = os.path.join(args.data_dir,
                                  "vocab%d.in" % args.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            predicted_sentence = get_predicted_sentence(
                args, sentence, vocab, rev_vocab, model, sess)
            # print(predicted_sentence)
            if isinstance(predicted_sentence, list):
                for sent in predicted_sentence:
                    print("  (%s) -> %s" % (sent['prob'], sent['dec_inp']))
            else:
                print(sentence, ' -> ', predicted_sentence)

            sys.stdout.write("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #29
def init_session(sess, conf='seq2seq.ini'):
    global gConfig
    gConfig = get_config(conf)

    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    return sess, model, enc_vocab, rev_dec_vocab
Example #30
def decode():
    '''
  Manually input sentence interactively and the headline will be printed out
  '''
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = FLAGS.batch_size  # Repeat a single sentence FLAGS.batch_size times as one batch.

        # Load vocabularies.
        vocab_path = os.path.join(FLAGS.data_dir, "vocab")
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        # Decode from standard input interactively
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            sentence = SeqSentence(sentence)
            if (len(sentence.strip('\n')) == 0):
                sys.stdout.flush()
                sentence = sys.stdin.readline()
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab)
            # print (token_ids) # print token ids
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in xrange(len(buckets))
                if buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            # print ("current bucket id" + str(bucket_id))
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)

            # Get output logits for the sentence.
            _, _, output_logits_batch = model.step(sess, encoder_inputs,
                                                   decoder_inputs,
                                                   target_weights, bucket_id,
                                                   True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            output_logits = []
            for item in output_logits_batch:
                output_logits.append(item[0])

            #print (output_logits)
            #print (len(output_logits))
            #print (output_logits[0])

            outputs = [int(np.argmax(logit)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join(
                [tf.compat.as_str(rev_vocab[output]) for output in outputs]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #31
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, rev_en_vocab = data_utils.initialize_vocabulary(
            en_vocab_path)
        #_, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        #sys.stdout.write("> ")
        #sys.stdout.flush()
        #sentence = sys.stdin.readline()
        #while sentence:
        id_counts = 0
        with open('./wikisql_in_nmt/dev.seq') as f, open(
                'tmp.eval.ids.true', 'w') as ft:
            lines = f.readlines()
            for l in tqdm(lines, total=len(lines)):
                # Get token-ids for the input sentence.
                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(l), en_vocab)
                token_ids.append(data_utils.EOS_ID)
                # Which bucket does it belong to?
                bucket_id = len(_buckets) - 1
                for i, bucket in enumerate(_buckets):
                    if bucket[0] >= len(token_ids):
                        bucket_id = i
                        break
                else:
                    logging.warning("Sentence truncated: %s", l)

                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights, target_id, sent_id = model.get_batch(
                    {bucket_id: [(token_ids, [], 1)]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs,
                                                 decoder_inputs,
                                                 target_weights, target_id,
                                                 bucket_id, True)
                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [
                    int(np.argmax(logit, axis=1)) for logit in output_logits
                ]
                # If there is an EOS symbol in outputs, cut them at that point.
                #if data_utils.EOS_ID in outputs:
                #  outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Print out French sentence corresponding to outputs.
                ft.write(" ".join([
                    tf.compat.as_str(rev_en_vocab[int(encoder_inputs[output])])
                    for output in outputs
                ]) + "|" + str(id_counts) + '\n')
                id_counts += 1
Example #32
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        source_vocab_path = os.path.join(FLAGS.data_dir,
                                         ("vocab%d." + FLAGS.source_ext) % FLAGS.source_vocab_size)
        target_vocab_path = os.path.join(FLAGS.data_dir,
                                         ("vocab%d." + FLAGS.target_ext) % FLAGS.target_vocab_size)
        source_vocab, _ = data_utils.initialize_vocabulary(source_vocab_path)
        _, rev_target_vocab = data_utils.initialize_vocabulary(target_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), source_vocab)
            # Which bucket does it belong to?
            bucket_id = min([b for b in xrange(len(_buckets))
                             if _buckets[b][0] > len(token_ids)])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out target language sentence corresponding to outputs.
            out_sentence = " ".join([tf.compat.as_str(rev_target_vocab[output]) for output in outputs])
            print(out_sentence)
            if FLAGS.translation_file != "":
                with gfile.GFile(FLAGS.translation_file, mode="ab") as fw:
                    fw.write(FLAGS.source_ext + "> " + sentence)
                    fw.write(FLAGS.target_ext + "> " + out_sentence + b"\n\n")
                    fw.flush()
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #33
def decode_input():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d_enc.txt" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d_dec.txt" % gConfig['dec_vocab_size'])

        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()

        while sentence:
            token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
            bucket_id = min([
                b for b in range(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ] + [len(_buckets) - 1])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]

            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([
                tf.compat.as_str(rev_dec_vocab[output]) for output in outputs
            ]))

            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def decode():

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
  config = tf.ConfigProto(gpu_options=gpu_options)

  with tf.Session(config=config) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # bucket_belong???
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get the required batch.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output_logits
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)

      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      final_output = " ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])

      if('_UNK' in final_output ):
          final_output = "I didn\'t learn how to respond to that."
      print(final_output)
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def original():
    # test_vec = [26, 12, 10, 11, 15, 17, 28, 171, 18, 339]

    # print "[command] ", decode_vec_to_str(test_vec, nl_dictionary)

    # find k nearest neighbor
    # knn = find_k_nearest_neighbor(test_vec, nl_vec_list, 1)

    # for p in knn:
    #  print "[nn vec] ", p
    # print the decoding result of these filters
    #  print "[nearest neighbor] ", decode_vec_to_str(p[0], nl_dictionary)

    sys.stdout = open('result.txt', 'w')

    # Load vocabularies.
    nl_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.nl" % FLAGS.nl_vocab_size)
    cm_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.cm" % FLAGS.cm_vocab_size)
    nl_vocab, rev_nl_vocab = data_utils.initialize_vocabulary(nl_vocab_path)
    cm_vocab, rev_cm_vocab = data_utils.initialize_vocabulary(cm_vocab_path)
    # the file containing traning nl vectors and cmd vectors
    train_set, dev_set, _ = load_data()

    model = knn.KNNModel()
    model.train(train_set)

    test_cmd_vec_list = [cmd_vec for _, _, _, cmd_vec in dev_set]
    test_nl_vec_list = [nl_vec for _, _, nl_vec, _ in dev_set]

    for i in range(len(test_nl_vec_list)):
        test_vec = test_nl_vec_list[i]
        cmd_vec = test_cmd_vec_list[i]

        nl, cmd, score = model.test(test_vec, 1)

        print("[text-case ", i, "] =========================================================")
        print("  [original-pair]")
        print("     ", knn.decode_vec_to_str(test_vec, rev_nl_vocab))
        print("     ", knn.decode_vec_to_str(cmd_vec, rev_cm_vocab))
        print("  [new-pair]")
        print("     ", knn.decode_vec_to_str(nl, rev_nl_vocab))
        print("     ", knn.decode_vec_to_str(cmd, rev_cm_vocab))
        print(knn.decode_vec_to_str(cmd, rev_cm_vocab))
Example #36
def get_mem_s2t():
    slines = open("./data/train.ids30000.src")
    tlines = open("./data/train.ids30000.trg")
    mlines = open("./data/aligns")
    mem = {}
    for sline, tline, mline in zip(slines, tlines, mlines):
        zh_words = sline.strip().split(' ')
        en_words = tline.strip().split(' ')
        maps = mline.strip().split(' ')
        for m in maps:
            zhid, enid = m.split('-')
            zh_word = zh_words[int(zhid)]
            if int(zh_word) == 3:
                continue
            en_word = en_words[int(enid)]
            if int(en_word) == 3:
                continue
            if int(zh_word) not in mem:
                mem[int(zh_word)] = []
            mem[int(zh_word)].append(int(en_word))

    # Turn raw counts into (target id, relative frequency) pairs, most frequent first.
    for m in mem:
        l = len(mem[m])
        words = Counter(mem[m])
        words = sorted(words.items(), key=lambda x: x[1], reverse=True)
        mem[m] = map(lambda x: (x[0], x[1] / float(l)), words)

    del slines
    del tlines
    del mlines

    en_vocab_path = "./data/vocab30000.src"
    fr_vocab_path = "./data/vocab30000.trg"
    en_vocab, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)
    fr_vocab, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    for i, word in enumerate(rev_en_vocab):
        if i not in mem:
            if word in fr_vocab:
                mem[i] = [(fr_vocab[word], 1.0), (fr_vocab['_NULL'], 0.0)]
            else:
                mem[i] = [(fr_vocab['_NULL'], 0.0), (fr_vocab['_NULL'], 0.0)]

    f = open("./data/mems2t.pkl", 'wb')
    pkl.dump(mem, f)
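get_mem_s2t ends up with mem mapping each source token id to a list of (target token id, relative frequency) pairs sorted by frequency, serialized to ./data/mems2t.pkl. A small usage sketch for the resulting file (the helper name is illustrative, not from the snippet):

import pickle as pkl

with open("./data/mems2t.pkl", "rb") as f:
    mem = pkl.load(f)

def top_aligned_target(src_id):
    """Return the most frequently aligned target id for a source id, or None."""
    candidates = mem.get(src_id)
    return candidates[0][0] if candidates else None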
Exemple #37
0
def decode():

  # Only allocate part of the gpu memory when predicting.
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
  config = tf.ConfigProto(gpu_options=gpu_options)

  with tf.Session(config=config) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print out French sentence corresponding to outputs.
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Exemple #38
0
def decode_tester(sess, model):
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.from" % FLAGS.from_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab%d.to" % FLAGS.to_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    # sys.stdout.write("> ")
    # sys.stdout.flush()
    sentence = "Who is the president of the United States?"
    # print(" input: " + sentence)
    # while sentence:
    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 en_vocab)
    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("Sentence truncated: %s", sentence)

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    # Print out French sentence corresponding to outputs.
    print("\toutput: " + " ".join(
        [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
    sys.stdout.flush()
Exemple #39
0
def decode():
    """
    Decode sentences interactively from standard input with the trained model.
    """
    gpu_options = tensorflow.GPUOptions(per_process_gpu_memory_fraction=0.2)
    config = tensorflow.ConfigProto(gpu_options=gpu_options)
    with tensorflow.Session(config=config) as sess:
        model = create_model(sess, True)
        model.batch_size = 1
        enc_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['working_directory'],
            "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            token_ids = data_utils.sentence_to_token_ids(
                tensorflow.compat.as_bytes(sentence), enc_vocab)
            bucket_id = min([
                b for b in range(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(numpy.argmax(logit, axis=1)) for logit in output_logits
            ]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            print(" ".join([
                tensorflow.compat.as_str(rev_dec_vocab[output])
                for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Exemple #40
0
def decode(input):
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from passed input variable
        sentence = input

        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(
            tf.compat.as_bytes(sentence), en_vocab)
        # Which bucket does it belong to?
        bucket_id = len(_buckets) - 1
        for i, bucket in enumerate(_buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        else:
            logging.warning("Sentence truncated: %s", sentence)

        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out French sentence corresponding to outputs.
        #-- print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
        return " ".join(
            [tf.compat.as_str(rev_fr_vocab[output]) for output in outputs])
def generate(middle_sentence, forwards_sentence, backwards_sentence):
    '''
    Generates forwards and backwards sentences given a middle sentence.
    Args:
      middle_sentence: middle sentence (not tokenized)
      forwards_sentence: preceding sentence (not tokenized)
      backwards_sentence: following sentence (not tokenized)
    '''
    train_path, vocab_path, train_ids_path = data_utils.prepare_skip_thought_data(
        FLAGS.data_dir, FLAGS.train_data_name, FLAGS.vocab_size)

    with tf.Session() as sess:
        m = SkipThoughtModel(FLAGS.vocab_size, max_sentence_len=FLAGS.max_sentence_len,
                             batch_size=FLAGS.batch_size,
                             learning_rate=FLAGS.learning_rate,
                             learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
                             encoder_cell_size=FLAGS.encoder_cell_size,
                             word_embedding_size=FLAGS.word_embedding_size,
                             decoder_cell_size=FLAGS.decoder_cell_size,
                             max_gradient_norm=FLAGS.max_gradient_norm,
                             initial_decoder_state=None)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            m.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("No model found")
            return

        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        tokenized_middle_sentence = data_utils.sentence_to_token_ids(
            middle_sentence, vocab)
        tokenized_forwards_sentence = data_utils.sentence_to_token_ids(
            forwards_sentence, vocab)
        tokenized_backwards_sentence = data_utils.sentence_to_token_ids(
            " ".join(reversed(backwards_sentence.split())), vocab)

        forwards_batch_logits, backwards_batch_logits = m.step(sess, [m.forwards_batch_logits_tensor, m.backwards_batch_logits_tensor], *m.prep_data(
            [tokenized_middle_sentence], [tokenized_forwards_sentence], [tokenized_backwards_sentence]))

        forwards_logits = forwards_batch_logits[:, 0, :]
        backwards_logits = backwards_batch_logits[:, 0, :]
        print(forwards_logits)
        print(forwards_logits.shape)

        forwards_sentence = map(
            lambda x: rev_vocab[x], map(np.argmax, forwards_logits))
        backwards_sentence = map(
            lambda x: rev_vocab[x], map(np.argmax, backwards_logits))

        print("Generated Forwards Sentence")
        print(" ".join(forwards_sentence))
        print("Generated Backwards Sentence")
        print(" ".join(backwards_sentence))
def decode():
    with tf.Session(config=config) as sess:
        #print ("Hello!!")
        model = create_model(sess, True)
        model.batch_size = 1

        in_vocab_path = os.path.join(FLAGS.data_dir, "vocab_in.txt")
        out_vocab_path = os.path.join(FLAGS.data_dir, "vocab_out.txt")

        in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)
        _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)

        print("Hello!!")
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            sentence = wakati(sentence)
            token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)

            bucket_id = min([
                b for b in xrange(len(_buckets))
                if _buckets[b][0] > len(token_ids)
            ])

            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)

            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)

            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]

            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]

            print("".join([rev_out_vocab[output] for output in outputs]))
            print("\n> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
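The wakati tokenizer called above is not shown in this listing; it presumably segments a Japanese sentence into space-separated tokens before the ids are looked up. One common way to do that uses MeCab's wakati output, sketched here under the assumption that the mecab-python bindings are installed:

import MeCab

_tagger = MeCab.Tagger("-Owakati")

def wakati(sentence):
    """Segment a Japanese sentence into space-separated tokens (wakati-gaki)."""
    return _tagger.parse(sentence).strip()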
Exemple #43
0
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        first_vocab_path = os.path.join(
            FLAGS.train_dir, "vocab%d.first" % FLAGS.first_vocab_size)
        last_vocab_path = os.path.join(FLAGS.train_dir,
                                       "vocab%d.last" % FLAGS.last_vocab_size)
        first_vocab, _ = data_utils.initialize_vocabulary(first_vocab_path)
        _, rev_last_vocab = data_utils.initialize_vocabulary(last_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = FLAGS.input
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(sentence, first_vocab)
        # Which bucket does it belong to?
        bucket_id = min([
            b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)
        ])
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out French sentence corresponding to outputs.
        result = (" ".join([rev_last_vocab[output] for output in outputs]))
        print(result)
        output = os.path.join(FLAGS.output_dir, str(int(time.time())) + ".txt")
        with open(output, "w") as text_file:
            text_file.write(result)
        print(output)
        sys.stdout.flush()
Exemple #44
0
def decode():

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
  config = tf.ConfigProto(gpu_options=gpu_options)

  with tf.Session(config=config) as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    enc_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.enc" % gConfig['enc_vocab_size'])
    dec_vocab_path = os.path.join(gConfig['working_directory'],"vocab%d.dec" % gConfig['dec_vocab_size'])

    enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
    _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      # Get token-ids for the input sentence.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])
      # Get the required batch. 
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output_logits
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
      
      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      
      print(" ".join([tf.compat.as_str(rev_dec_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
Exemple #45
0
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one sentence at a time.

    # Load vocabularies.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from input file.
    graphemes = open(FLAGS.decode).readlines()

    output_file_path = FLAGS.output

    if output_file_path:
      with gfile.GFile(output_file_path, mode="w") as output_file:
        for w in graphemes:
          word = " ".join(list(w))
          gr_absent = [gr for gr in w if gr not in gr_vocab]
          if not gr_absent:
            res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
            output_file.write(w.replace('\n',' '))
            output_file.write(res_phoneme_seq)
            output_file.write('\n')
          else:
            raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )
    else:
      for w in graphemes:
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
        if not gr_absent:
          res_phoneme_seq = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
          print(w.replace('\n',' ') + res_phoneme_seq)
          sys.stdout.flush()
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )
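Exemples #1 and #45 both call a decode_word helper that is not included in this listing. Judging from the other snippets here, it is presumably the usual one-sentence decode flow applied to a space-separated grapheme string; a hedged sketch, assuming the same module-level _buckets, np import, and tutorial-style data_utils constants (UNK_ID, EOS_ID) as the surrounding code:

def decode_word(word, sess, model, gr_vocab, rev_ph_vocab):
    """Greedy-decode one grapheme string ("h e l l o") into a phoneme string (sketch only)."""
    token_ids = [gr_vocab.get(g, data_utils.UNK_ID) for g in word.split()]
    bucket_id = min(b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids))
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    return " ".join(rev_ph_vocab[o] for o in outputs)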
Exemple #46
0
 def __init__(self, save_dir, source='10.0.2.32', is_local=False):
     super().__init__(source, is_local)
     self.client = MongoClient(source)
     self.corpus = 'cornell-corpus'
     self.col = 'dialogs'
     self.open()
     self.buckets = _buckets
     self.vocabfileA = os.path.join(save_dir, self.corpus + '_vocabfileA')
     self.vocabfileB = os.path.join(save_dir, self.corpus + '_vocabfileB')
     if not os.path.isfile(self.vocabfileA):
         self.create_vocab("A", self.vocabfileA)
     if not os.path.isfile(self.vocabfileB):
         self.create_vocab("B", self.vocabfileB)
     print("initializing vocab")
     self.vocabA, self.vocabA_rev = data_utils.initialize_vocabulary(
         self.vocabfileA)
     self.vocabB, self.vocabB_rev = data_utils.initialize_vocabulary(
         self.vocabfileB)
     print("vocab initialized")
Exemple #47
0
def decode():
  with tf.Session() as sess:
    print ("Hello!!")
    model = create_model(sess, True)                         
    model.batch_size = 1  
    
    in_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab_in.txt")     
    out_vocab_path = os.path.join(FLAGS.data_dir,
                                 "vocab_out.txt" )
                                                                        
    in_vocab, _ = data_utils.initialize_vocabulary(in_vocab_path)        
    _, rev_out_vocab = data_utils.initialize_vocabulary(out_vocab_path)    

    
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()    
    while sentence:

      token_ids = data_utils.sentence_to_token_ids(sentence, in_vocab)   
      
      bucket_id = min([b for b in xrange(len(_buckets))
                       if _buckets[b][0] > len(token_ids)])               

      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
    
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,      
                                       target_weights, bucket_id, True)

      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]       

      if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]                      

      print(" ".join([rev_out_vocab[output] for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()                                             
def inter_decode(sent, position, mapp):
  with tf.Session() as sess:
    # Load dictionary
    srce_vocab_path = os.path.join(data_dir, "train", "vocab%d.srce" % 2)
    trgt_vocab_path = os.path.join(data_dir, "train", "vocab%d.trgt" % 0)
    srce_vocab, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
    trgt_vocab, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

    # Create model
    model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
    # model.batch_size = 1  # We decode one sentence at a time.

    sentence = sent
    init_pos = eval(position)
    mapp = eval(mapp)

    # Get token-ids for the input sentence.
    token_ids = data_utils.sentence_to_token_ids(sentence, srce_vocab)
    # Which bucket does it belong to?
    bucket_id = min([b for b in xrange(len(_buckets))
                     if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, target_weight, pos, maps = model.get_batch(
        {bucket_id: [(token_ids, [], init_pos, mapp)]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits, attentions, env, out_pos = model.step(sess, encoder_input, decoder_input, target_weight, bucket_id, True, 
                decoder_inputs_positions=pos, decoder_inputs_maps=maps)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
      outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    
    final_pos = out_pos[0].tolist()
    for l in xrange(len(outputs)-1):
      final_pos.extend(out_pos[l+1].tolist())

    return final_pos
Exemple #49
0
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    model = create_model(sess, True)
    model.batch_size = 1  # At prediction time we decode one sentence at a time.

    # Load vocabularies.
    en_vocab_path = os.path.join(FLAGS.data_dir,"vocab%d.en" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(FLAGS.data_dir,"vocab%d.fr" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Translate: read English sentences from the console.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    # Decode (translate) the input.
    while sentence:
      # First convert the input words into token ids.
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab)
      # Pick the bucket according to the sentence length.
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])

      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get the output logits.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
      # Take the argmax (highest-probability) word at each output step.
      outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

      if data_utils.EOS_ID in outputs:  # If EOS_ID appears in the output, keep only the words before it.
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
      # Print the result.
      print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
      print("> ", end="")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    if FLAGS.beam_size > 0:
        use_beamsearch = True
    else:
        use_beamsearch = False
    model = create_model(sess, True, use_beamsearch=use_beamsearch)
    model.batch_size = 1  # We decode one sentence at a time.
    if FLAGS.use_ori:
      tokenizer = useori_tokenizer
      vocab_data_dir = os.path.join(FLAGS.data_dir, 'ori')
    else:
      tokenizer = cut_tokenizer
      vocab_data_dir = os.path.join(FLAGS.data_dir, 'cut')

    # Load vocabularies.
    en_vocab_path = os.path.join(vocab_data_dir,
                                 "vocab%d.q" % FLAGS.en_vocab_size)
    fr_vocab_path = os.path.join(vocab_data_dir,
                                 "vocab%d.a" % FLAGS.fr_vocab_size)
    en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
    _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

    # Decode from standard input.
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
      try:
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), en_vocab, tokenizer=tokenizer)
        if len(token_ids) >= _buckets[-1][0]:
            token_ids = token_ids[0:(_buckets[-1][0]-1)]
        print(token_ids)
        # Which bucket does it belong to?
        bucket_id = min([b for b in xrange(len(_buckets))
                         if _buckets[b][0] > len(token_ids)])
        # TODO: indeed can produce longer answers, but with some repeat parts consequently
        #bucket_id = len(_buckets) - 1

        if FLAGS.beam_size > 0:
            def cal_function(decoder_token_ids, idx):
                print('decoder_token_ids:', decoder_token_ids)
                # Get a 1-element batch to feed the sentence to the model.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, decoder_token_ids)]}, bucket_id)
                # Get output logits for the sentence.
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                #print(np.shape(output_logits[idx-1]))
                #print(output_logits[idx-1])
                fake_logits = output_logits[idx-1].reshape([-1])
                return log_sigmoid(inputs=fake_logits)
            beam_search = BeamSearch(beam_size=FLAGS.beam_size)
            beam_search.run(max_step=model.buckets[bucket_id][1], cal_function=cal_function)
            final_token_paths = beam_search.get_final_token_paths()
            for outputs in final_token_paths:
                print(outputs)
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                # Print out French sentence corresponding to outputs.
                print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
                print('done')
        else:
            model.use_beamsearch = False
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            print(outputs)
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
            print('done')
      except ValueError as e:
        print(e)
        print("Bad input! Try again:")
      finally:
        print("> ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
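The beam-search snippet above depends on a BeamSearch class and a log_sigmoid scorer that are not part of this listing. As an assumption about the scorer only, a numerically stable element-wise log(sigmoid(x)) over the flattened logits could look like:

import numpy as np

def log_sigmoid(inputs):
    """Element-wise log(sigmoid(x)), computed stably as -log(1 + exp(-x))."""
    x = np.asarray(inputs, dtype=np.float64)
    return -np.logaddexp(0.0, -x)

Scoring hypotheses with log-sigmoid of the logits is what this particular snippet does; the more conventional choice would be a log-softmax over the vocabulary dimension.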
Exemple #51
0
import tensorflow as tf
import numpy as np
from model import create_model, buildSentence, respond
from config.config import FLAGS, _buckets, name
import data_utils
import os.path

sess = tf.Session()
# Create model and load parameters.
model = create_model(sess, True)
model.batch_size = 1  # We decode one sentence at a time.

# Load vocabularies.
vocab_path = os.path.join(FLAGS.data_dir,
                             "vocab%d.in" % FLAGS.vocab_size)
vocab, vocab_rev = data_utils.initialize_vocabulary(vocab_path)

print '%s: %s' % (name, respond('hi.', sess, model, vocab, vocab_rev))

print '%s: %s' % (name, respond('hello.', sess, model, vocab, vocab_rev))

print '%s: %s' % (name, respond('hey.', sess, model, vocab, vocab_rev))

print '%s: %s' % (name, respond('how are you?', sess, model, vocab, vocab_rev))

print '%s: %s' % (name, respond('what is the meaning of life?', sess, model, vocab, vocab_rev))

print '%s: %s' % (name, respond('you are a machine.', sess, model, vocab, vocab_rev))

print '%s: %s' % (name, respond('you\'re a machine.', sess, model, vocab, vocab_rev))
Exemple #52
0
def translate_file(source_path=dev_code_file, target_path=translated_dev_code): 
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        code_vocab_path = os.path.join(data_dir,
           "vocab%d.code" % FLAGS.code_vocab_size)
        en_vocab_path = os.path.join(data_dir,
           "vocab%d.en" % FLAGS.en_vocab_size)
        code_vocab, _ = data_utils.initialize_vocabulary(code_vocab_path)
        _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)

        with tf.gfile.GFile(source_path, mode="r") as source_file:
            with tf.gfile.GFile(target_path, mode="w") as translated_file:
            
                sentence = source_file.readline()
                counter = 0
                print(" Translating file %s " % source_path)
                
                while sentence:
                    # Get token-ids for the input sentence.
                    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), code_vocab)

                    buckets = [b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)]
                    if buckets:
                        bucket_id = min(buckets)
                    else:
                        # print ("line %d with tokens %d" % (counter, len(token_ids)))
                        translated_file.write("_UNK \n")
                        sentence = source_file.readline()
                        continue
                    
                    # Which bucket does it belong to?
                    # bucket_id = min([b for b in xrange(len(_buckets))
                                    # if _buckets[b][0] > len(token_ids)])
                                    
                    # Get a 1-element batch to feed the sentence to the model.
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        {bucket_id: [(token_ids, [])]}, bucket_id)

                    # Get output logits for the sentence.
                    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                     target_weights, bucket_id, True)
                                                                    
                                                                    
                    # This is a greedy decoder - outputs are just argmaxes of output_logits.
                    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

                    # If there is an EOS symbol in outputs, cut them at that point.
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        
                    # Write translated sentence to translation file.
                    translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n")
                    
                    # print ("> %s" % sentence)
                    # print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]))
                    
                    # Get next sentence and print checkpoints.
                    counter +=1
                    sentence = source_file.readline()
                    if counter % 500 == 0:
                        print(" Line %d translated" % counter)
                    
                print (" File translated")
Exemple #53
0
def decode_from_file(files, model_path=None, use_best=False, get_ids=True, FLAGS=None, buckets=None):

    assert FLAGS is not None
    assert buckets is not None

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:

        # load model parameters.
        model = create_seq2seq_model(sess, model_path=model_path, forward_only=True,
                                     use_best=use_best, FLAGS=FLAGS, buckets=buckets,
                                     translate=True)

        # Load vocabularies.
        source_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.src_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.source_lang)

        target_vocab_file = FLAGS.data_dir + \
                            (FLAGS.train_data % str(FLAGS.tgt_vocab_size)) + \
                            ('.vocab.%s' % FLAGS.target_lang)

        src_vocab, _ = data_utils.initialize_vocabulary(source_vocab_file)
        _, rev_tgt_vocab = data_utils.initialize_vocabulary(target_vocab_file)

        start_total_time = time.time()
        total_sentence_count = 0

        for file_path in files:

            print("Translating file %s\n" % file_path)

            sentence_count = 0

            # Decode from file.
            with gfile.GFile(file_path, mode='r') as source:
                with gfile.GFile(file_path + '.trans', mode='w') as destiny:
                    sentence = source.readline()

                    start_time = time.time()
                    while sentence:

                        sentence_count += 1
                        print("Translating sentence %d" % sentence_count)

                        if get_ids:

                            # Get token-ids for the input sentence.
                            token_ids = data_utils.sentence_to_token_ids(sentence, src_vocab)

                        else:

                            # if sentence is already converted, just split the ids
                            token_ids = [int(ss) for ss in sentence.strip().split()]

                        # Get output logits for the sentence.
                        output_hypotheses, output_scores = model.translation_step(sess,
                                                                                  token_ids,
                                                                                  FLAGS.beam_size,
                                                                                  normalize=True,
                                                                                  dump_remaining=True)

                        outputs = output_hypotheses[0]

                        # Print out sentence corresponding to outputs.
                        destiny.write(" ".join([rev_tgt_vocab[output] for output in outputs]))
                        destiny.write("\n")
                        sentence = source.readline()

                    end_time = time.time() - start_time

                    print("\nDone file %s" % file_path)
                    print("Avg. %.3f sentences/sec" % (sentence_count / end_time))

            total_sentence_count += sentence_count

        end_total_time = time.time() - start_total_time

        print("\nDone!")
        print("Avg. %.3f sentences/sec" % (total_sentence_count / end_total_time))
def test_BLEU():
    # Perform BLEU score testing here
    with tf.Session() as sess:
      model = create_model(sess, True, False)
      source = sys.argv[1]
      target = sys.argv[2]
      model.batch_size = 1  # We decode one sentence at a time.

      # Load vocabularies.
      s_vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.%s" % (FLAGS.s_vocab_size, source))
      t_vocab_path = os.path.join(FLAGS.data_dir,
                                  "vocab%d.%s" % (FLAGS.t_vocab_size, target))
      s_vocab, _ = data_utils.initialize_vocabulary(s_vocab_path)
      _, rev_t_vocab = data_utils.initialize_vocabulary(t_vocab_path)

      # Decode from standard input.
      BLEUscore = {0:[], 1:[], 2:[], 3:[]}
      s_test_path = os.path.join(FLAGS.data_dir, "test.%s" % source)
      t_test_path = os.path.join(FLAGS.data_dir, "test.%s" % target)
      f_s = open(s_test_path, 'r')
      f_t = open(t_test_path, 'r')
      # print(f_s.readline())
      step = 0
      for sentence in f_s:
        print(step)
        # sentence = f_ja.readline()
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), s_vocab)
        # Which bucket does it belong to?
        bucket_id = len(_buckets) - 1
        for i, bucket in enumerate(_buckets):
          if bucket[0] >= len(token_ids):
            bucket_id = i
            break
        else:
          logging.warning("Sentence truncated: %s", sentence) 

        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, True)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out Japanese sentence corresponding to outputs.
        candidate = [tf.compat.as_str(rev_t_vocab[output]) for output in outputs]
        reference = f_t.readline().split(' ')
        print(candidate, reference)
        try:
          temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate)
        except:
          temp_score = nltk.translate.bleu_score.sentence_bleu([reference], candidate, weights=(.5, .5))
        BLEUscore[bucket_id].append(temp_score)
        step += 1
        print(temp_score)
      for key,val in BLEUscore.iteritems():
        print(key, ": ", np.mean(val))
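The bare except above retries sentence_bleu with bigram-only weights when the default 4-gram computation fails; short candidates with no higher-order n-gram matches are the usual culprit. nltk also exposes an explicit smoothing API that addresses the same problem, and may be a cleaner way to express the intent; a self-contained sketch of that alternative:

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = "the cat is on the mat".split()
candidate = "the cat sat on the mat".split()
smoother = SmoothingFunction().method1  # add-epsilon smoothing for zero n-gram counts
print(sentence_bleu([reference], candidate, smoothing_function=smoother))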
def decode():
  with tf.Session() as sess:
    # Create model and load parameters.
    # The second argument means the model is built for inference, not training.
    model = create_model(sess, True)
    # We decode one sentence at a time.
    model.batch_size = 1

    # Load vocabularies.
    vocab_path = os.path.join(FLAGS.data_dir, "Word_map.txt")
    vocab, Q_vocab = data_utils.initialize_vocabulary(vocab_path)

    while 1:
      # Get token-ids for the input sentence.
      sys.stdout.write("Input >> ")
      sys.stdout.flush()
      sentence = sys.stdin.readline()
      token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence), vocab)
      # Which bucket does it belong to?
      bucket_id = min([b for b in xrange(len(_buckets)) if _buckets[b][0] > len(token_ids)])

      # Get a 1-element batch to feed the sentence to the model.
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          {bucket_id: [(token_ids, [])]}, bucket_id)
      # Get output logits for the sentence.
      _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)

      # This is a greedy decoder - outputs are just argmaxes of output_logits.
      # (An alternative, sketched after this function, samples each output token
      # from the softmax distribution instead of taking the argmax.)
      max_outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]

      # If there is an EOS symbol in outputs, cut them at that point.
      if data_utils.EOS_ID in max_outputs:
        max_outputs = max_outputs[:max_outputs.index(data_utils.EOS_ID)]

      print("output >>")
      print(" ".join([tf.compat.as_str(Q_vocab[output]) for output in max_outputs]))
      print("=====================")
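As noted in the comment above, sampling each output token from the softmax distribution is an alternative to the greedy argmax. A standalone sketch of that idea (the helper name and the temperature parameter are ours; pass data_utils.EOS_ID as eos_id):

import numpy as np

def sample_outputs(output_logits, eos_id, temperature=1.0):
    """Sample one token id per decoder step from softmax(logits / temperature)."""
    outputs = []
    for logit in output_logits:
        scaled = np.asarray(logit, dtype=np.float64).reshape(-1) / temperature
        probs = np.exp(scaled - scaled.max())
        probs /= probs.sum()
        token = int(np.random.choice(len(probs), p=probs))
        if token == eos_id:
            break
        outputs.append(token)
    return outputs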
def decode():
  with tf.Session() as sess:
    # load dictionary
    srce_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.srce" % FLAGS.srce_vocab_min)
    trgt_vocab_path = os.path.join(FLAGS.data_dir, "train", "vocab%d.trgt" % FLAGS.trgt_vocab_min)
    
    _, re_srce_vocab = data_utils.initialize_vocabulary(srce_vocab_path)
    _, re_trgt_vocab = data_utils.initialize_vocabulary(trgt_vocab_path)

    # Load test data.
    if FLAGS.decode_test:
      srce_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids%d.srce" % FLAGS.srce_vocab_min)
      trgt_test_ids_path = os.path.join(FLAGS.data_dir, "test", "ids.trgt")
      srce_test_data_path = os.path.join(FLAGS.data_dir, "test/data.srce")
      trgt_test_data_path = os.path.join(FLAGS.data_dir, "test/data.trgt")

      # Prepare test data
      data_utils.data_to_token_ids(srce_test_data_path, srce_test_ids_path, srce_vocab_path)
      data_utils.data_to_token_ids(trgt_test_data_path, trgt_test_ids_path, trgt_vocab_path)
      trgt_test_pos = os.path.join(FLAGS.data_dir, "test", "positions.trgt")
      trgt_test_map = os.path.join(FLAGS.data_dir, "test", "map.srce")
      test_set = read_data(srce_test_ids_path, trgt_test_ids_path, trgt_test_pos, trgt_test_map)

    elif FLAGS.decode_dev:
      srce_dev_ids_path = os.path.join(FLAGS.data_dir, "dev", "ids%d.srce" % FLAGS.srce_vocab_min)
      trgt_dev_ids_path = os.path.join(FLAGS.data_dir, "dev", "ids%d.trgt" % FLAGS.trgt_vocab_min)
      trgt_dev_pos = os.path.join(FLAGS.data_dir, "dev", "positions.trgt")
      trgt_dev_map = os.path.join(FLAGS.data_dir, "dev", "map.srce")
      test_set = read_data(srce_dev_ids_path, trgt_dev_ids_path, trgt_dev_pos, trgt_dev_map)

    else:
      raise ValueError(" Please set decode_test or decode_dev to True! ")

    # Create model and load parameters.
    model = create_model(sess, len(re_srce_vocab), len(re_trgt_vocab), True)
    model.batch_size = 1  # We decode one sentence at a time.

    # Decode test data.  ---> read from files

    decode_result_path = os.path.join(FLAGS.data_dir, ("result/result_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob)))
    decode_data_path = os.path.join(FLAGS.data_dir, ("result/gold_size%d_dropout%.2f" % (FLAGS.size, FLAGS.keep_prob)))
    
    test_bucket_sizes = [len(test_set[b]) for b in xrange(len(_buckets))]
    print ("test bucket size: ", test_bucket_sizes)

    count = 0
    correct = 0

    with open(decode_result_path, 'w') as fpred:
      with open(decode_data_path, 'w') as fgold: # note that the test data has been sorted by bucket size
        for b in xrange(len(_buckets)):
          print ("bucket%d:" % b)
          
          if len(test_set[b]) == 0: # empty bucket
            continue
          
          for sent in test_set[b]:
            
            encoder_input, decoder_input, target_weight, pos, maps = model.get_batch({b: [sent]}, b)
            # get output_logits
            _, _, output_logits, _, _, _= model.step(sess, encoder_input, decoder_input, target_weight, b, True, 
                  decoder_inputs_positions=pos, decoder_inputs_maps=maps)
            # greedy decoder: outputs are argmax of output_logits
            outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
              outputs = outputs[:outputs.index(data_utils.EOS_ID)]

            # write to file
            fpred.write(data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n')
            gold = sent[1]
            if data_utils.EOS_ID in sent[1]:
              gold = sent[1][:sent[1].index(data_utils.EOS_ID)]
            fgold.write(data_utils.token_ids_to_sentence(gold, re_trgt_vocab) + '\n')

            if gold == outputs:
              correct += 1
            # else:
            #   print ("source: ", data_utils.token_ids_to_sentence(sent[0], re_srce_vocab), '\t', pos, '\t', maps)
            #   print ("target: ", data_utils.token_ids_to_sentence(gold, re_trgt_vocab))
            #   print ("predict: ", data_utils.token_ids_to_sentence(outputs, re_trgt_vocab) + '\n')

            count += 1
    print("count = %d, correct = %d, accuracy = %f" % (count, correct, float(correct)/count))
Exemple #57
0
def evaluate():
  with tf.Session() as sess:
    # Create model and load parameters.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab_size = data_utils.get_vocab_size(gr_vocab_path)
    ph_vocab_size = data_utils.get_vocab_size(ph_vocab_path)
    model = create_model(sess, True, gr_vocab_size, ph_vocab_size)
    model.batch_size = 1  # We decode one word at a time.

    # Load vocabularies.
    gr_vocab_path = os.path.join(FLAGS.model, "vocab.grapheme")
    ph_vocab_path = os.path.join(FLAGS.model, "vocab.phoneme")
    gr_vocab, _ = data_utils.initialize_vocabulary(gr_vocab_path)
    _, rev_ph_vocab = data_utils.initialize_vocabulary(ph_vocab_path)

    # Decode from input file.
    test = open(FLAGS.evaluate).read().split('\n')
    test_graphemes = []
    test_phonemes = []

    for line in test:
      lst = line.split()
      if len(lst)>=2:
        test_graphemes.append(lst[0])
        test_phonemes.append(" ".join(lst[1:]))

    duplicates = {}
    total_dupl_num = 0
    for i, gr in enumerate(test_graphemes):
      if test_graphemes.count(gr) > 1:
        total_dupl_num += test_graphemes.count(gr) - 1
        if gr in duplicates:
          duplicates[gr].append(test_phonemes[i])
        else:
          duplicates[gr] = [test_phonemes[i]]

    errors = 0
    counter = 0
    dupl_error_calculated = []
    for i, w in enumerate(test_graphemes):
      if w not in duplicates:
        counter += 1
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
        if not gr_absent:
          model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab) 
          if model_assumption != test_phonemes[i]:
            errors += 1
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) ) 
      elif w not in dupl_error_calculated:
        counter += 1
        dupl_error_calculated.append(w)
        word = " ".join(list(w))
        gr_absent = [gr for gr in w.replace('\n','') if gr not in gr_vocab]
        if not gr_absent:
          model_assumption = decode_word(word, sess, model, gr_vocab, rev_ph_vocab)
          if model_assumption not in duplicates[w]:
            errors += 1
        else:
          raise ValueError("Symbols: %s not in trained model's vocabulary" % ",".join(gr_absent) )

    print("WER : ", float(errors) / counter)
    print("Accuracy : ", 1 - float(errors) / counter)
def train():
  print ('Applying Parameters:')
  for k,v in FLAGS.__dict__['__flags'].iteritems():
    print ('%s: %s' % (k, str(v)))
  print("Preparing data in %s" % FLAGS.data_dir)
  vocab_path = ''
  tag_vocab_path = ''
  label_vocab_path = ''
  in_seq_train, out_seq_train, label_train, in_seq_dev, out_seq_dev, label_dev, in_seq_test, out_seq_test, label_test, vocab_path, tag_vocab_path, label_vocab_path = data_utils.prepare_multi_task_data(
    FLAGS.data_dir, FLAGS.in_vocab_size, FLAGS.out_vocab_size)     
     
  result_dir = FLAGS.train_dir + '/test_results'
  if not os.path.isdir(result_dir):
      os.makedirs(result_dir)

  current_taging_valid_out_file = result_dir + '/tagging.valid.hyp.txt'
  current_taging_test_out_file = result_dir + '/tagging.test.hyp.txt'

  vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
  tag_vocab, rev_tag_vocab = data_utils.initialize_vocabulary(tag_vocab_path)
  label_vocab, rev_label_vocab = data_utils.initialize_vocabulary(label_vocab_path)
    
  with tf.Session() as sess:
    # Create model.
    print("Max sequence length: %d." % _buckets[0][0])
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    
    model, model_test = create_model(sess, len(vocab), len(tag_vocab), len(label_vocab))
    print ("Creating model with source_vocab_size=%d, target_vocab_size=%d, and label_vocab_size=%d." % (len(vocab), len(tag_vocab), len(label_vocab)))

    # Read data into buckets and compute their sizes.
    print ("Reading train/valid/test data (training set limit: %d)."
           % FLAGS.max_train_data_size)
    dev_set = read_data(in_seq_dev, out_seq_dev, label_dev)
    test_set = read_data(in_seq_test, out_seq_test, label_test)
    train_set = read_data(in_seq_train, out_seq_train, label_train)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0

    best_valid_score = 0
    best_test_score = 0
    while model.global_step.eval() < FLAGS.max_training_steps:
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, tags, tag_weights, batch_sequence_length, labels = model.get_batch(train_set, bucket_id)
      if task['joint'] == 1:
        _, step_loss, tagging_logits, classification_logits = model.joint_step(sess, encoder_inputs, tags, tag_weights, labels,
                                   batch_sequence_length, bucket_id, False)
      elif task['tagging'] == 1:
        _, step_loss, tagging_logits = model.tagging_step(sess, encoder_inputs, tags, tag_weights,
                                   batch_sequence_length, bucket_id, False)
      elif task['intent'] == 1:
        _, step_loss, classification_logits = model.classification_step(sess, encoder_inputs, labels,
                                   batch_sequence_length, bucket_id, False)                                   

      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d step-time %.2f. Training perplexity %.2f" 
            % (model.global_step.eval(), step_time, perplexity))
        sys.stdout.flush()
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0 
        
        def run_valid_test(data_set, mode): # mode: Eval, Test
        # Run evals on development/test set and print the accuracy.
            word_list = list() 
            ref_tag_list = list() 
            hyp_tag_list = list()
            ref_label_list = list()
            hyp_label_list = list()
            correct_count = 0
            accuracy = 0.0
            tagging_eval_result = dict()
            for bucket_id in xrange(len(_buckets)):
              eval_loss = 0.0
              count = 0
              for i in xrange(len(data_set[bucket_id])):
                count += 1
                encoder_inputs, tags, tag_weights, sequence_length, labels = model_test.get_one(
                  data_set, bucket_id, i)
                tagging_logits = []
                classification_logits = []
                if task['joint'] == 1:
                  _, step_loss, tagging_logits, classification_logits = model_test.joint_step(sess, encoder_inputs, tags, tag_weights, labels,
                                             sequence_length, bucket_id, True)
                elif task['tagging'] == 1:
                  _, step_loss, tagging_logits = model_test.tagging_step(sess, encoder_inputs, tags, tag_weights,
                                             sequence_length, bucket_id, True)
                elif task['intent'] == 1:
                  _, step_loss, classification_logits = model_test.classification_step(sess, encoder_inputs, labels,
                                             sequence_length, bucket_id, True) 
                eval_loss += step_loss / len(data_set[bucket_id])
                hyp_label = None
                if task['intent'] == 1:
                  ref_label_list.append(rev_label_vocab[labels[0][0]])
                  hyp_label = np.argmax(classification_logits[0],0)
                  hyp_label_list.append(rev_label_vocab[hyp_label])
                  if labels[0] == hyp_label:
                    correct_count += 1
                if task['tagging'] == 1:
                  word_list.append([rev_vocab[x[0]] for x in encoder_inputs[:sequence_length[0]]])
                  ref_tag_list.append([rev_tag_vocab[x[0]] for x in tags[:sequence_length[0]]])
                  hyp_tag_list.append([rev_tag_vocab[np.argmax(x)] for x in tagging_logits[:sequence_length[0]]])

            accuracy = float(correct_count)*100/count
            if task['intent'] == 1:
              print("  %s accuracy: %.2f %d/%d" % (mode, accuracy, correct_count, count))
              sys.stdout.flush()
            if task['tagging'] == 1:
              if mode == 'Eval':
                  taging_out_file = current_taging_valid_out_file
              elif mode == 'Test':
                  taging_out_file = current_taging_test_out_file
              tagging_eval_result = conlleval(hyp_tag_list, ref_tag_list, word_list, taging_out_file)
              print("  %s f1-score: %.2f" % (mode, tagging_eval_result['f1']))
              sys.stdout.flush()
            return accuracy, tagging_eval_result
            
        # valid
        valid_accuracy, valid_tagging_result = run_valid_test(dev_set, 'Eval')        
        if task['tagging'] == 1 and valid_tagging_result['f1'] > best_valid_score:
          best_valid_score = valid_tagging_result['f1']
          # save the best output file
          subprocess.call(['mv', current_taging_valid_out_file, current_taging_valid_out_file + '.best_f1_%.2f' % best_valid_score])
        # test, run test after each validation for development purpose.
        test_accuracy, test_tagging_result = run_valid_test(test_set, 'Test')        
        if task['tagging'] == 1 and test_tagging_result['f1'] > best_test_score:
          best_test_score = test_tagging_result['f1']
          # save the best output file
          subprocess.call(['mv', current_taging_test_out_file, current_taging_test_out_file + '.best_f1_%.2f' % best_test_score])
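The evaluation helper above hands word_list, ref_tag_list and hyp_tag_list to conlleval together with an output file. Below is a minimal sketch of how such lists could be dumped in the usual CoNLL evaluation layout (one "word gold-tag predicted-tag" line per token, sentences separated by a blank line); the function name, the example tags and the output path are hypothetical and not taken from the example itself.

def write_conll(word_list, ref_tag_list, hyp_tag_list, out_path):
    # One sentence per list entry; one "word gold predicted" line per token.
    with open(out_path, 'w') as f:
        for words, refs, hyps in zip(word_list, ref_tag_list, hyp_tag_list):
            for w, r, h in zip(words, refs, hyps):
                f.write('%s %s %s\n' % (w, r, h))
            f.write('\n')  # blank line marks the end of a sentence

# Hypothetical usage with a single two-token sentence:
write_conll([['to', 'boston']],
            [['O', 'B-toloc.city_name']],
            [['O', 'B-toloc.city_name']],
            '/tmp/tagging.hyp.txt')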
Exemple #59
0
def train():
  """Train a en->fr translation model using WMT data."""
  # Prepare WMT data.
  print("Preparing WMT data in %s" % FLAGS.data_dir)
  en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
      FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
  fr_vocab_path = os.path.join(FLAGS.data_dir,
                               "vocab%d.fr" % FLAGS.fr_vocab_size)
  #en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
  _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
  with tf.Session(config=tf.ConfigProto(device_count={'GPU': 1}, gpu_options=gpu_options)) as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print ("Reading development and training data (limit: %d)."
           % FLAGS.max_train_data_size)
    dev_set = read_data(en_dev, fr_dev)
    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(loss) if loss < 300 else float('inf')
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        print ("step loss:%.4f", step_loss)
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt"+str(loss))
        model.saver = tf.train.Saver(tf.all_variables(), max_to_keep=0)
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
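Both WMT training loops above choose a bucket by drawing a uniform random number and scanning train_buckets_scale, the running cumulative fraction of training examples per bucket. Here is a self-contained sketch of just that sampling step; the bucket sizes are made up, only the mechanics mirror the loops above.

import random

bucket_sizes = [10000, 30000, 40000, 20000]  # invented sizes for illustration
total = float(sum(bucket_sizes))
buckets_scale = [sum(bucket_sizes[:i + 1]) / total for i in range(len(bucket_sizes))]
# buckets_scale == [0.1, 0.4, 0.8, 1.0]

def sample_bucket():
    r = random.random()  # uniform in [0, 1)
    # Smallest index whose cumulative fraction exceeds r, as in the training loop.
    return min(i for i in range(len(buckets_scale)) if buckets_scale[i] > r)

# Drawing many samples should roughly reproduce the 10/30/40/20 split.
counts = [0] * len(bucket_sizes)
for _ in range(100000):
    counts[sample_bucket()] += 1
print([c / 100000.0 for c in counts])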
Exemple #60
0
def orig_train():
  """Train a en->fr translation model using WMT data."""
  # Prepare WMT data.
  print("Preparing WMT data in %s" % FLAGS.data_dir)
  en_train, fr_train, en_dev, fr_dev, en_vocab_path, fr_vocab_path = data_utils.prepare_wmt_data(
      FLAGS.data_dir, FLAGS.en_vocab_size, FLAGS.fr_vocab_size)
  _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)
  

  with tf.Session() as sess:
    # Create model.
    print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
    model = create_model(sess, False)

    # Read data into buckets and compute their sizes.
    print ("Reading development  training data (limit: %d)."
           % FLAGS.max_train_data_size)
    dev_set = read_data(en_dev, fr_dev)
    train_set = read_data(en_train, fr_train, FLAGS.max_train_data_size)
    train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))

    # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
    # to select a bucket. Length of [scale[i], scale[i+1]] is proportional to
    # the size of the i-th training bucket, as used later.
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                           for i in xrange(len(train_bucket_sizes))]

    # This is the training loop.
    step_time, loss = 0.0, 0.0
    current_step = 0
    previous_losses = []
    while True:
      # Choose a bucket according to data distribution. We pick a random number
      # in [0, 1] and use the corresponding interval in train_buckets_scale.
      random_number_01 = np.random.random_sample()
      bucket_id = min([i for i in xrange(len(train_buckets_scale))
                       if train_buckets_scale[i] > random_number_01])

      # Get a batch and make a step.
      start_time = time.time()
      encoder_inputs, decoder_inputs, target_weights = model.get_batch(
          train_set, bucket_id)
      _, step_loss, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                   target_weights, bucket_id, False)
      step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
      loss += step_loss / FLAGS.steps_per_checkpoint
      current_step += 1

      # Once in a while, we save checkpoint, print statistics, and run evals.
      if current_step % FLAGS.steps_per_checkpoint == 0:
        # Print statistics for the previous epoch.
        perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
        print ("global step %d learning rate %.4f step-time %.2f perplexity "
               "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                         step_time, perplexity))
        # Decrease learning rate if no improvement was seen over last 3 times.
        if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
          sess.run(model.learning_rate_decay_op)
        previous_losses.append(loss)
        # Save checkpoint and zero timer and loss.
        checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)
        step_time, loss = 0.0, 0.0
        # output train/hyp
        out_batch_size = output_logits[0].shape[0]
        rand_idx = np.random.randint(0, out_batch_size)
        inputs = [int(x[rand_idx]) for x in decoder_inputs]
        outputs = [int(np.argmax(logit[rand_idx])) for logit in output_logits]
        if data_utils.EOS_ID in inputs:
          inputs = inputs[:inputs.index(data_utils.EOS_ID)]
        if data_utils.EOS_ID in outputs:
          outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        print("  trg = " + " ".join([tf.compat.as_str(rev_fr_vocab[input]) for input in inputs]))
        print("  hyp = " + " ".join([tf.compat.as_str(rev_fr_vocab[output]) for output in outputs]))
        
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(_buckets)):
          if len(dev_set[bucket_id]) == 0:
            print("  eval: empty bucket %d" % (bucket_id))
            continue
          encoder_inputs, decoder_inputs, target_weights = model.get_batch(
              dev_set, bucket_id)
          _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                       target_weights, bucket_id, True)
          eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float(
              "inf")
          print("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
        sys.stdout.flush()
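The orig_train example also prints a greedy read-out of one random batch element: each entry of output_logits is a [batch, vocab]-shaped array for one decoder step, the per-step argmax gives a token id, and the sequence is cut at the first end-of-sentence id. The following NumPy-only sketch isolates that read-out; the logit values and the EOS id are invented for illustration and do not come from the example.

import numpy as np

EOS_ID = 2  # assumed id, standing in for data_utils.EOS_ID
output_logits = [np.array([[0.1, 0.2, 0.0, 0.9]]),   # step 1: argmax -> 3
                 np.array([[0.0, 0.8, 0.1, 0.2]]),   # step 2: argmax -> 1
                 np.array([[0.1, 0.0, 0.9, 0.3]])]   # step 3: argmax -> 2 (EOS)

# Greedy read-out for batch element 0, then truncate at the first EOS.
outputs = [int(np.argmax(logit[0])) for logit in output_logits]
if EOS_ID in outputs:
    outputs = outputs[:outputs.index(EOS_ID)]
print(outputs)  # [3, 1]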