Example #1
def handle_incoming_messages():
    data = request.json
    sender = data['entry'][0]['messaging'][0]['sender']['id']
    message = data['entry'][0]['messaging'][0]['message']['text']
    # Character-level segmentation: space-separate every character.
    sentence = ' '.join(message)
    token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence),
                                            vocab_dict, False)
    bucket_id = len(buckets) - 1
    for i, bucket in enumerate(buckets):
        if bucket[0] >= len(token_ids):
            bucket_id = i
            break
    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, weight = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
    outputs = [int(np.argmax(logit, axis=1)) for logit in output]
    # If there is an EOS symbol in outputs, cut them at that point.
    if data_utils.EOS_ID in outputs:
        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
    message = "".join(
        [tf.compat.as_str(vocab_list[output]) for output in outputs])
    message = sub_words(message)
    message = qulify_sentence(message)
    reply(sender, message)
    return "ok"
Example #2
def test(filename):
    # jieba is needed whenever either side uses word-level segmentation.
    if FLAGS.src_word_seg == 'word' or FLAGS.trg_word_seg == 'word':
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    #model.decoder_max_len = None

    #sources = ["你是誰","你是誰"]
    #targets = ["你是不是想人家","我是說你是我老婆"]
    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])
    scores = []
    for source, target in zip(sources, targets):
        if FLAGS.src_word_seg == 'word':
            source = ' '.join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = ' '.join(source)
        if FLAGS.trg_word_seg == 'word':
            target = ' '.join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = ' '.join(target)
        src_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(source),
                                                    src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(target),
                                                    trg_vocab_dict, False)
        trg_len = len(trg_token_ids)
        bucket_id = len(buckets) - 1  # default when no bucket is long enough
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(src_token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)[:trg_len]
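        # Score each reference token: take its logit at the matching time step.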
        output = [o[0][t] for t, o in zip(trg_token_ids, output)]
        output = np.mean(output)
        scores.append(output)
    scores = np.mean(scores)
    return scores
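The examples here all repeat the same bucket-selection idiom: choose the smallest bucket whose encoder length fits the tokenized input, defaulting to the largest bucket. Isolated as a helper (a sketch; the function name is hypothetical):

def select_bucket(token_ids, buckets):
    # Each bucket is (encoder_size, decoder_size); pick the first that fits.
    for i, (encoder_size, _) in enumerate(buckets):
        if encoder_size >= len(token_ids):
            return i
    # Input longer than every bucket: fall back to the largest one.
    return len(buckets) - 1

# e.g. with buckets = [(5, 10), (10, 15), (20, 25)], a 7-token input maps to bucket 1.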
Example #3
def test():
    if FLAGS.src_word_seg == 'word':
        import jieba
        jieba.load_userdict('dict_fasttext.txt')
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    _, trg_vocab_dict = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
        sentence = ' '.join(jieba.lcut(sentence))
        print('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
        sentence = ' '.join(sentence)
    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence),
                                                src_vocab_dict, False)
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.

        inference(model, output, src_vocab_dict, trg_vocab_dict)
        # Print out the reply sentence corresponding to the outputs.
        #print("System reply: " + "".join([tf.compat.as_str(trg_vocab_dict[output]) for output in outputs]))
        print("User input  : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        if FLAGS.src_word_seg == 'word':
            sentence = ' '.join(jieba.lcut(sentence))
            print('sentence: ', sentence)
        elif FLAGS.src_word_seg == 'char':
            sentence = ' '.join(sentence)
Example #4
def test():
    sess = tf.Session()
    vocab_dict, vocab_list = data_utils.read_map(FLAGS.source_data_dir + '.' +
                                                 str(FLAGS.vocab_size) +
                                                 '.mapping')
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1

    sys.stdout.write("Input sentence: ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()

    while sentence:
        token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence),
                                                vocab_dict, False)
        bucket_id = len(buckets) - 1
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(token_ids):
                bucket_id = i
                break
        # Get a 1-element batch to feed the sentence to the model.
        encoder_input, decoder_input, weight = model.get_batch(
            {bucket_id: [(token_ids, [])]}, bucket_id)
        # Get output logits for the sentence.
        output = model.run(sess, encoder_input, decoder_input, weight,
                           bucket_id)
        # This is a greedy decoder - outputs are just argmaxes of output_logits.
        outputs = [int(np.argmax(logit, axis=1)) for logit in output]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        # Print out the reply sentence corresponding to the outputs.
        print("System reply: " + " ".join(
            [tf.compat.as_str(vocab_list[output]) for output in outputs]))
        print("User input  : ", end="")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
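The read-decode loop above implements plain greedy decoding: one argmax per decoder time step, truncated at the first EOS. As a standalone sketch (hypothetical helper name; the real EOS id comes from data_utils):

import numpy as np

def greedy_decode(step_logits, eos_id):
    # step_logits: one (1, vocab_size) logit array per decoder time step.
    ids = [int(np.argmax(logit, axis=1)) for logit in step_logits]
    # Cut the sequence at the first end-of-sentence symbol, if any.
    if eos_id in ids:
        ids = ids[:ids.index(eos_id)]
    return ids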
Example #5
    def run(self,
            sess,
            encoder_inputs,
            decoder_inputs,
            target_weights,
            bucket_id,
            forward_only=False,
            X=None,
            Y=None):

        if self.mode == 'TEST':
            encoder_size = self.buckets[bucket_id][0]
            decoder_size = self.buckets[-1][-1]
            # TEST decodes one sentence: tile the first decoder input (the GO
            # symbol) and its weight across the longest bucket's decoder length.
            decoder_inputs = np.reshape(
                np.repeat(decoder_inputs[0], decoder_size), (-1, 1))
            target_weights = np.reshape(
                np.repeat(target_weights[0], decoder_size), (-1, 1))
            #print('decoder_inputs: ',len(decoder_inputs))
        else:
            encoder_size, decoder_size = self.buckets[bucket_id]
        #print('bucket_id: ',bucket_id)
        #print('encoder_size: ',encoder_size)
        #print('decoder_size: ',decoder_size)

        input_feed = {}
        for l in range(encoder_size):
            input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
        for l in range(decoder_size):
            input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
            input_feed[self.target_weights[l].name] = target_weights[l]

        # Targets are decoder inputs shifted by one, so feed a zero
        # placeholder for the final target position.
        last_target = self.decoder_inputs[decoder_size].name
        input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)

        if self.mode == 'MLE':
            if forward_only:
                output_feed = [self.outputs[bucket_id], self.losses[bucket_id]]
                outputs = sess.run(output_feed, input_feed)
                return outputs
            else:
                output_feed = [
                    self.outputs[bucket_id], self.losses[bucket_id],
                    self.update[bucket_id]
                ]
                outputs = sess.run(output_feed, input_feed)
                return outputs

        elif self.mode == 'TEST':
            output_feed = [self.outputs[bucket_id]]
            outputs = sess.run(output_feed, input_feed)
            return outputs
        elif self.mode == 'RL':
            # check mode: sample or from decoder input
            # True for sample and False for from decoder input
            input_feed[self.loop_or_not] = True
            # input_feed == {..., <tf.Tensor 'Placeholder:0' shape=<unknown> dtype=bool>: True}

            # step 1: get seq2seq sampled output
            output_feed = [self.RL_index[bucket_id]]
            # output_feed == [<tf.Tensor 'concat:0' shape=(?, 10) dtype=int64>]
            outputs = sess.run(output_feed, input_feed)
            # outputs == batch_size list of token
            # sentence_rl is a batch sized list of sampled decoded natural sentence
            #sentence_rl = self.token2word_RL(outputs[0])
            #for a in sentence_rl:
            #  print(a)
            # step 2: get rewards according to some rules
            reward = np.ones((self.batch_size), dtype=np.float32)
            new_data = []
            for i in range(self.batch_size):
                token_ids = list(outputs[0][i])
                # token_ids are the tokens sampled via tf.multinomial
                if data_utils.EOS_ID in token_ids:
                    token_ids = token_ids[:token_ids.index(data_utils.EOS_ID)]
                new_data.append(([], token_ids + [data_utils.EOS_ID], "", ""))

                # in this case, X is language model score
                # reward 1: ease of answering
                temp_reward = [
                    self.prob(
                        token_ids,
                        data_utils.convert_to_token(tf.compat.as_bytes(sen),
                                                    self.trg_vocab_dict, False)
                        + [data_utils.EOS_ID], X, bucket_id) / float(len(sen))
                    for sen in self.dummy_reply
                ]

                # Negated: replies that make the dull dummy answers likely are
                # penalized, favoring replies that invite a real response.
                r1 = -np.mean(temp_reward)

                # reward 2: semantic coherence
                r_input = list(reversed([o[i] for o in encoder_inputs]))
                if data_utils.EOS_ID in r_input:
                    r_input = r_input[:r_input.index(data_utils.EOS_ID)]

                if not r_input:
                    r_input = [data_utils.EOS_ID]

                r2 = self.prob(r_input, token_ids, X, bucket_id) / float(
                    len(token_ids)) if len(token_ids) != 0 else 0

                # reward 3: sentiment analysis score
                word_token = [
                    self.trg_vocab_list[token].decode('utf-8')
                    for token in token_ids
                ]
                r3 = Y(word_token, np.array([len(token_ids)],
                                            dtype=np.int32))[0]

                reward[i] = r1 * self.r1 + r2 * self.r2 + r3 * self.r3

            #print(reward)
            # Advantage: subtract the batch-mean baseline to reduce variance.
            reward = reward - np.mean(reward)
            #print(reward)
            _, decoder_inputs, target_weights = self.get_batch(
                {bucket_id: new_data}, bucket_id, rand=False)

            # step 3: update seq2seq model
            for l in range(decoder_size):
                input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
                input_feed[self.target_weights[l].name] = target_weights[l]

            input_feed[self.reward] = reward
            input_feed[self.loop_or_not] = False
            output_feed = [
                self.outputs[bucket_id], self.losses[bucket_id],
                self.update[bucket_id]
            ]
            #output_feed = [self.losses[bucket_id]]
            outputs = sess.run(output_feed, input_feed)

            return outputs
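In the RL branch, each sampled reply earns three reward signals: r1, ease of answering (the negated, length-normalized score of a set of dummy replies); r2, semantic coherence with the reversed source; and r3, a sentiment score. They are combined with the weights self.r1/self.r2/self.r3 and mean-centered into an advantage. A numpy sketch of just that combination step (function name hypothetical):

import numpy as np

def combine_rewards(r1, r2, r3, w1, w2, w3):
    # Weighted sum of the three per-sample reward signals.
    reward = w1 * np.asarray(r1) + w2 * np.asarray(r2) + w3 * np.asarray(r3)
    # Subtract the batch mean so the policy gradient sees an advantage,
    # which lowers variance without changing the expected gradient.
    return reward - reward.mean()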
Example #6
def test():
  if FLAGS.src_word_seg == 'word':
    import jieba
    jieba.initialize()
  sess = tf.Session()
  src_vocab_dict, _ = data_utils.read_map(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping')
  _, trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping')
  model = create_seq2seq(sess, 'TEST')
  model.batch_size = 1
  
  sys.stdout.write("Input sentence: ")
  sys.stdout.flush()
  sentence = sys.stdin.readline()
  if FLAGS.src_word_seg == 'word':
    sentence = ' '.join(jieba.lcut(sentence))
    print('sentence: ', sentence)
  elif FLAGS.src_word_seg == 'char':
    sentence = ' '.join(sentence)
  while sentence:
    token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), src_vocab_dict, False)
    bucket_id = len(buckets) - 1
    for i, bucket in enumerate(buckets):
      if bucket[0] >= len(token_ids):
        bucket_id = i
        break
    # Get a 1-element batch to feed the sentence to the model.
    encoder_input, decoder_input, weight = model.get_batch({bucket_id: [(token_ids, [], "", "")]}, bucket_id)
    # Get output logits for the sentence.
    output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    
    # beam search all
    if model.beam_search:
        if FLAGS.debug:
            # Collect one logit sequence per beam, time step by time step.
            outs = [[] for _ in range(model.beam_size)]
            for out in output:
                for i, o in enumerate(out):
                    outs[i].append(o)
            outs = np.array(outs)
            #print('outs: ',outs.shape)
            outputss = []
            for out in outs:
                #print('out: ',out.shape)
                outputs = [int(np.argmax(logit)) for logit in out]
                outputss.append(outputs)
    
            for i, outputs in enumerate(outputss):
                sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
                sys_reply = data_utils.sub_words(sys_reply)
                sys_reply = qulify_sentence(sys_reply)
                if i == 0:
                    print(colored("System reply (bs best): " + sys_reply, "red"))
                else:
                    print("System reply (bs all): " + sys_reply)
        else:
            # `output` from the run above already holds the decoded logits;
            # there is no need to run the model a second time.
            outputs = [int(np.argmax(logit, axis=1)) for logit in output]
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
            sys_reply = data_utils.sub_words(sys_reply)
            sys_reply = qulify_sentence(sys_reply)
            print("System reply (bs best): " + sys_reply)
            

    # MLE
    else:
        # `output` from the run above already holds the greedy logits.
        print(output)
        print('output: ', len(output), output[0].shape)
        outputs = [int(np.argmax(logit, axis=1)) for logit in output]
        # If there is an EOS symbol in outputs, cut them at that point.
        if data_utils.EOS_ID in outputs:
            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
        sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])
        sys_reply = data_utils.sub_words(sys_reply)
        sys_reply = qulify_sentence(sys_reply)
        print("Syetem reply(MLE): " + sys_reply)


    # Print out the reply sentence corresponding to the outputs.
    #print("System reply: " + "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs]))
    print("User input  : ", end="")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    if FLAGS.src_word_seg == 'word':
      sentence = ' '.join(jieba.lcut(sentence))
      print('sentence: ', sentence)
    elif FLAGS.src_word_seg == 'char':
      sentence = ' '.join(sentence)
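In the debug branch of Example #6, output is time-major (one entry per decoder step, each holding beam_size rows), and the nested loop transposes it into one logit sequence per beam before the per-beam argmax. The same reshuffle as a compact sketch (hypothetical name; shapes assumed from the surrounding code):

import numpy as np

def beam_major(output, beam_size):
    # output: list over time steps; each step holds beam_size logit rows.
    outs = [[] for _ in range(beam_size)]
    for step in output:
        for i, row in enumerate(step):
            outs[i].append(row)
    # Result: beam_size sequences, each a list of per-step logit rows.
    return np.array(outs)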