Example #1
def get_predicted_sentence(input_sentence, vocab, rev_vocab, model, sess):
    input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab)

    bucket_id = min([
        b for b in range(len(BUCKETS)) if BUCKETS[b][0] > len(input_token_ids)
    ])
    outputs = []

    feed_data = {bucket_id: [(input_token_ids, outputs)]}
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        feed_data, bucket_id)

    _, _, output_logits = model.step(sess,
                                     encoder_inputs,
                                     decoder_inputs,
                                     target_weights,
                                     bucket_id,
                                     forward_only=True)

    outputs = []
    for logit in output_logits:
        selected_token_id = int(np.argmax(logit, axis=1))

        if selected_token_id == data_utils.EOS_ID:
            break
        else:
            outputs.append(selected_token_id)

    output_sentence = ' '.join([rev_vocab[output] for output in outputs])

    return output_sentence
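Note that the min(...) over matching buckets raises a ValueError when the input sentence is longer than the largest bucket. A minimal sketch of a safer bucket lookup, assuming BUCKETS is the usual list of (encoder_length, decoder_length) pairs (the values below are only illustrative):

BUCKETS = [(5, 10), (10, 15), (20, 25), (40, 50)]  # illustrative values only

def safe_bucket_id(input_token_ids, buckets=BUCKETS):
    """Smallest bucket whose encoder side exceeds the input length,
    falling back to the last bucket instead of raising ValueError."""
    matches = [b for b in range(len(buckets))
               if buckets[b][0] > len(input_token_ids)]
    return min(matches) if matches else len(buckets) - 1

print(safe_bucket_id([1, 2, 3]))        # -> 0
print(safe_bucket_id(list(range(60))))  # -> 3 (no bucket fits, use the last one)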
Example #2
def interactive_comparison():
    """Compare two sentences separated by a semi-colon"""
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("(1) > ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        contexts = []
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            print("tokenids:", token_ids)
            # Which bucket does it belong to?
            bucket_id = get_bucket(en_vocab, sentence)

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)

            # Get the output context vector
            output_context = model.step_context(sess, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                bucket_id)

            # Append the context so we can compute the dot product
            contexts.append(output_context)

            # Display the output
            print("bucket_id: ", bucket_id)
            print("output_context", output_context)

            # Now we compute similarity metrics
            if len(contexts) == 2:
                cosine_distance = cosine_similarity(*contexts)
                euclid_distance = np.linalg.norm(contexts[1] - contexts[0])
                print('cosine_similarity', cosine_distance)
                print('euclid_distance', euclid_distance)
                print('-------------------------------')
                contexts = []

            # Prompt for the next sentence (1 or 2, depending on position in the pair).
            prompt_number = len(contexts) + 1
            print("(%i) > " % prompt_number, end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
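cosine_similarity is used above but not defined in this snippet. A minimal NumPy sketch, assuming each context is a vector (or something flattenable to one), could be:

import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two (non-zero) context vectors."""
    a, b = np.ravel(a), np.ravel(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity([1.0, 0.0], [2.0, 0.0]))  # -> 1.0 (same direction)
print(cosine_similarity([1.0, 0.0], [0.0, 3.0]))  # -> 0.0 (orthogonal)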
Example #3
def decode():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        en_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.from" % FLAGS.from_vocab_size)
        fr_vocab_path = os.path.join(FLAGS.data_dir,
                                     "vocab%d.to" % FLAGS.to_vocab_size)
        en_vocab, _ = data_utils.initialize_vocabulary(en_vocab_path)
        _, rev_fr_vocab = data_utils.initialize_vocabulary(fr_vocab_path)

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), en_vocab)
            print("tokenids:", token_ids)
            # Which bucket does it belong to?
            bucket_id = get_bucket(en_vocab, sentence)

            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            # Get the output context vector
            output_context = model.step_context(sess, encoder_inputs,
                                                decoder_inputs, target_weights,
                                                bucket_id)
            # Display the output
            print("bucket_id: ", bucket_id)

            print("output_context", output_context)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            # Print out French sentence corresponding to outputs.
            print(" ".join([
                tf.compat.as_str(rev_fr_vocab[output]) for output in outputs
            ]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
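Once the logits are available, the greedy decoding above is plain NumPy. A stand-alone illustration of the argmax-and-cut-at-EOS step, with made-up logits and the tutorial's usual EOS_ID of 2 (an assumption here):

import numpy as np

EOS_ID = 2  # assumed; the real value comes from data_utils

# Three decoder steps, each a (1, vocab_size) array of logits.
output_logits = [np.array([[0.1, 0.9, 0.0, 0.2]]),
                 np.array([[0.0, 0.1, 0.2, 0.9]]),
                 np.array([[0.1, 0.1, 0.8, 0.0]])]  # EOS wins at the last step

outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
if EOS_ID in outputs:
    outputs = outputs[:outputs.index(EOS_ID)]
print(outputs)  # -> [1, 3]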
Example #4
def get_bucket(en_vocab, sentence):
    """
  Return the bucket_id that the sentence belongs to
  """
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 en_vocab)
    # Which bucket does it belong to?
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    else:
        logging.warning("Sentence truncated: %s", sentence)
    return bucket_id
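The else branch on the for loop above only runs when the loop finishes without hitting break, i.e. when no bucket is wide enough for the sentence. A quick self-contained illustration of that behaviour with a hypothetical _buckets list:

import logging

_buckets = [(5, 10), (10, 15), (20, 25)]  # hypothetical (encoder, decoder) sizes

def bucket_for_length(num_tokens):
    bucket_id = len(_buckets) - 1
    for i, bucket in enumerate(_buckets):
        if bucket[0] >= num_tokens:
            bucket_id = i
            break
    else:
        # Reached only when no bucket fits; the sentence will be truncated.
        logging.warning("Sentence of %d tokens will be truncated", num_tokens)
    return bucket_id

print(bucket_for_length(4))   # -> 0
print(bucket_for_length(8))   # -> 1
print(bucket_for_length(99))  # -> 2, after the truncation warning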
Example #5
def get_context(sess, model, en_vocab, sentence):
    """
  Return the context vector for the sentence
  """
    token_ids = data_utils.sentence_to_token_ids(tf.compat.as_bytes(sentence),
                                                 en_vocab)
    # Which bucket does it belong to?
    bucket_id = get_bucket(en_vocab, sentence)

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)

    # Get the output context vector
    return model.step_context(sess, encoder_inputs, decoder_inputs,
                              target_weights, bucket_id)
Example #6
def get_sentence_to_context_map(sentences):
    """
  Process all of the sentences with the model
  Return a map between sentence text and the context vectors
  The order of the map is undefined due to the bucketing process
  """
    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)

    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, []).append(sentence)

    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode BATCH_SIZE sentences at a time.
        # Iterate over each bucket
        for bucket_id, bucket_sentences in bucketed.items():
            for batch in chunker(bucket_sentences, BATCH_SIZE):
                data = []
                # Tokenize each sentence
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                # Use the model to obtain contexts for each sentence in the batch
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs,
                                              decoder_inputs, target_weights,
                                              bucket_id)
                features = np.hstack(contexts)
                print('Encoded {0} sentences into {1}-dimensional vectors'.format(
                    *features.shape))
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
    return mapped
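chunker is called above but not defined in the snippet; a minimal sketch that yields fixed-size slices of a sequence could be:

def chunker(seq, size):
    """Yield successive chunks of at most `size` items from `seq`."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

print(list(chunker(list("abcdefg"), 3)))
# -> [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]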
Example #7
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
    """Tokenize data file and turn into token-ids using given vocabulary file.

    This function loads data line-by-line from data_path, calls the above
    sentence_to_token_ids, and saves the result to target_path. See comment
    for sentence_to_token_ids on the details of token-ids format.

    Args:
      data_path: path to the data file in one-sentence-per-line format.
      target_path: path where the file with token-ids will be created.
      vocabulary_path: path to the vocabulary file.
      tokenizer: a function to use to tokenize each sentence;
        if None, basic_tokenizer will be used.
      normalize_digits: Boolean; if true, all digits are replaced by 0s.
    """
    if not gfile.Exists(target_path):
        print("Tokenizing data in %s" % data_path)
        vocab, _ = data_utils.initialize_vocabulary(vocabulary_path)
        with gfile.GFile(data_path, mode="rb") as data_file:
            with gfile.GFile(target_path, mode="w") as tokens_file:
                counter = 0
                for line in data_file:
                    counter += 1
                    if counter % 100000 == 0:
                        print("  tokenizing line %d" % counter)

                    utterances = line.split(b"\t")

                    tokenized_utterances = []
                    for utterance in utterances:
                        token_ids = data_utils.sentence_to_token_ids(
                            tf.compat.as_bytes(utterance), vocab,
                            tokenizer, normalize_digits)
                        tokenized_utterances.append(
                            " ".join([str(tok) for tok in token_ids]))

                    tokens_file.write("\t".join(tokenized_utterances) + "\n")
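For context, data_utils.sentence_to_token_ids roughly tokenizes the sentence and maps each token to its vocabulary id, falling back to the unknown-word id. A simplified sketch (whitespace tokenization only, ignoring digit normalization; UNK_ID of 3 is the tutorial's usual default, assumed here):

UNK_ID = 3  # assumed default id for out-of-vocabulary tokens

def simple_sentence_to_token_ids(sentence, vocabulary):
    """Rough, simplified stand-in for data_utils.sentence_to_token_ids."""
    words = sentence.strip().split()
    return [vocabulary.get(word, UNK_ID) for word in words]

vocab = {"hello": 10, "world": 11}
print(simple_sentence_to_token_ids("hello there world", vocab))  # -> [10, 3, 11]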
Example #8
def get_predicted_sentence(args,
                           input_sentence,
                           vocab,
                           rev_vocab,
                           model,
                           sess,
                           debug=False,
                           return_raw=False):
    def model_step(enc_inp, dec_inp, dptr, target_weights, bucket_id):
        _, _, logits = model.step(sess,
                                  enc_inp,
                                  dec_inp,
                                  target_weights,
                                  bucket_id,
                                  forward_only=True)
        prob = softmax(logits[dptr][0])

        return prob

    def greedy_dec(output_logits, rev_vocab):
        selected_token_ids = [
            int(np.argmax(logit, axis=1)) for logit in output_logits
        ]
        if data_utils.EOS_ID in selected_token_ids:
            eos = selected_token_ids.index(data_utils.EOS_ID)
            selected_token_ids = selected_token_ids[:eos]
        output_sentence = ' '.join(
            [dict_lookup(rev_vocab, t) for t in selected_token_ids])

        return output_sentence

    input_token_ids = data_utils.sentence_to_token_ids(input_sentence, vocab)

    # Which bucket does it belong to?
    bucket_id = min([
        b for b in range(len(args.buckets))
        if args.buckets[b][0] > len(input_token_ids)
    ])
    outputs = []
    feed_data = {bucket_id: [(input_token_ids, outputs)]}

    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        feed_data, bucket_id)
    if debug:
        print("\n[get_batch]\n", encoder_inputs, decoder_inputs,
              target_weights)

    # Original greedy decoding
    if args.beam_size == 1:
        _, _, output_logits = model.step(sess,
                                         encoder_inputs,
                                         decoder_inputs,
                                         target_weights,
                                         bucket_id,
                                         forward_only=True)

        return [{"dec_inp": greedy_dec(output_logits, rev_vocab), 'prob': 1}]

    # Get output logits for the sentence.
    # Initialize beams as (prob, tie-breaker, candidate dict).
    beams, new_beams, results = [(1, 0, {
        'eos': 0,
        'dec_inp': decoder_inputs,
        'prob': 1,
        'prob_ts': 1,
        'prob_t': 1,
    })], [], []
    dummy_encoder_inputs = [
        np.array([data_utils.PAD_ID]) for _ in range(len(encoder_inputs))
    ]

    for dptr in range(len(decoder_inputs) - 1):
        if dptr > 0:
            target_weights[dptr] = [1.]
            beams, new_beams = new_beams[:args.beam_size], []
        if debug:
            print("=====[beams]=====", beams)
        heapq.heapify(beams)  # since we will remove something
        for prob, _, cand in beams:
            if cand['eos']:
                results += [(prob, 0, cand)]
                continue

            # normal seq2seq
            if debug:
                print(
                    cand['prob'], " ".join(
                        [dict_lookup(rev_vocab, w) for w in cand['dec_inp']]))

            all_prob_ts = model_step(encoder_inputs, cand['dec_inp'], dptr,
                                     target_weights, bucket_id)
            if args.antilm:
                # anti-lm
                all_prob_t = model_step(dummy_encoder_inputs, cand['dec_inp'],
                                        dptr, target_weights, bucket_id)
                # adjusted probability
                all_prob = all_prob_ts - args.antilm * all_prob_t  # + args.n_bonus * dptr + random() * 1e-50
            else:
                all_prob_t = [0] * len(all_prob_ts)
                all_prob = all_prob_ts

            # suppress copy-cat (respond the same as input)
            if dptr < len(input_token_ids):
                all_prob[input_token_ids[dptr]] = all_prob[
                    input_token_ids[dptr]] * 0.01

            # for debug use
            if return_raw:
                return all_prob, all_prob_ts, all_prob_t

            # beam search
            for c in np.argsort(all_prob)[::-1][:args.beam_size]:
                new_cand = {
                    'eos': (c == data_utils.EOS_ID),
                    'dec_inp': [(np.array([c]) if i == (dptr + 1) else k)
                                for i, k in enumerate(cand['dec_inp'])],
                    'prob_ts': cand['prob_ts'] * all_prob_ts[c],
                    'prob_t': cand['prob_t'] * all_prob_t[c],
                    'prob': cand['prob'] * all_prob[c],
                }
                # Stuff in a random tie-breaker so heapq never compares the dicts.
                new_cand = (new_cand['prob'], random(), new_cand)

                try:
                    if (len(new_beams) < args.beam_size):
                        heapq.heappush(new_beams, new_cand)
                    elif (new_cand[0] > new_beams[0][0]):
                        heapq.heapreplace(new_beams, new_cand)
                except Exception as e:
                    print("[Error]", e)
                    print("-----[new_beams]-----\n", new_beams)
                    print("-----[new_cand]-----\n", new_cand)

    results += new_beams  # flush last cands

    # post-process results
    res_cands = []
    for prob, _, cand in sorted(results, reverse=True):
        cand['dec_inp'] = " ".join(
            [dict_lookup(rev_vocab, w) for w in cand['dec_inp']])
        res_cands.append(cand)

    return res_cands[:args.beam_size]
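softmax and dict_lookup are referenced in this example but not shown. Minimal sketches consistent with how they are used above (softmax over a 1-D logit vector, dict_lookup mapping a token id, possibly a 1-element array, back to a word) might be:

import numpy as np

def softmax(logits):
    """Numerically stable softmax over a 1-D array of logits."""
    logits = np.asarray(logits, dtype=np.float64)
    exps = np.exp(logits - np.max(logits))
    return exps / np.sum(exps)

def dict_lookup(rev_vocab, token_id):
    """Map a token id (or a 1-element array holding one) back to its word."""
    token_id = int(np.asarray(token_id).ravel()[0])
    if 0 <= token_id < len(rev_vocab):
        word = rev_vocab[token_id]
        return word.decode() if isinstance(word, bytes) else word
    return "_UNK"

print(softmax([1.0, 2.0, 3.0]))  # probabilities that sum to 1.0
print(dict_lookup(["_PAD", "_GO", "_EOS", "_UNK", "hello"], np.array([4])))  # -> hello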