def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    # Get token-ids for the input sentence.
    token_ids = prepareData.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
    # Which bucket does it belong to?
    bucket_id = min(
        [b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)])
    # Get a 1-element batch to feed the sentence to the model.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Get output logits for the sentence.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # This is a greedy decoder - outputs are just argmaxes of output_logits.
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If there is an EOS symbol in outputs, cut them at that point.
    if prepareData.EOS_ID in outputs:
        outputs = outputs[:outputs.index(prepareData.EOS_ID)]
    return " ".join(
        [tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
def decode_line(sess, model, enc_vocab, rev_dec_vocab, sentence):
    # Convert the sentence into a list of token ids.
    # tf.compat.as_bytes() encodes the sentence as utf-8, whether or not the
    # incoming sentence has already been encoded.
    token_ids = prepareData.sentence_to_token_ids(tf.compat.as_bytes(sentence), enc_vocab)
    # Decide which bucket the question belongs to; take the smallest one that fits.
    bucket_id = min(
        [b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)])
    # Turn the token ids into the inputs the model expects.
    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
        {bucket_id: [(token_ids, [])]}, bucket_id)
    # Run the model to get its prediction.
    _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                     target_weights, bucket_id, True)
    # Convert the outputs into word ids (greedy argmax over the logits).
    outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
    # If an EOS symbol was generated, cut off everything after it.
    if prepareData.EOS_ID in outputs:
        outputs = outputs[:outputs.index(prepareData.EOS_ID)]
    return " ".join(
        [tf.compat.as_str(rev_dec_vocab[output]) for output in outputs])
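# Sketch (not from the original code): the bucket lookup above raises
# ValueError when the question is longer than the largest bucket, because
# min() is called on an empty list. A minimal guard, assuming _buckets is the
# same list of (encoder_length, decoder_length) pairs used above, could clamp
# to the last bucket and let the caller truncate token_ids if needed.
def pick_bucket(token_ids, buckets):
    """Return the smallest bucket whose encoder length fits token_ids."""
    candidates = [b for b in range(len(buckets)) if buckets[b][0] > len(token_ids)]
    if candidates:
        return min(candidates)
    return len(buckets) - 1  # fall back to the largest bucket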
def test2():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(
            gConfig['data_directory'], "vocab%d.enc" % gConfig['enc_vocab_size'])
        dec_vocab_path = os.path.join(
            gConfig['data_directory'], "vocab%d.dec" % gConfig['dec_vocab_size'])
        enc_vocab, _ = prepareData.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = prepareData.initialize_vocabulary(dec_vocab_path)

        # Decode from standard input.
        sys.stdout.write("ask> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            # Get token-ids for the input sentence.
            sentence = sentence.strip('\n')
            token_ids = prepareData.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            # print("question token ids:", token_ids)
            # Which bucket does it belong to?
            bucket_id = min([
                b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)
            ])
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if prepareData.EOS_ID in outputs:
                outputs = outputs[:outputs.index(prepareData.EOS_ID)]
            # Print out the answer sentence corresponding to outputs.
            # print("answer token ids:", outputs)
            result = "".join([
                tf.compat.as_str(rev_dec_vocab[output]) for output in outputs
            ])
            print("answer> " + result)
            print("ask> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
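# Illustration (not part of the chatbot): why int(np.argmax(logit, axis=1))
# yields one token id per decoder step. Each entry of output_logits is an
# array of shape [batch_size, vocab_size]; with model.batch_size = 1 the
# argmax over axis=1 is a length-1 array that int() collapses to a plain id.
# The logits below are made-up numbers for a toy 4-word vocabulary.
def _greedy_argmax_demo():
    import numpy as np
    fake_output_logits = [
        np.array([[0.1, 2.5, 0.3, 0.0]]),  # step 0 -> token id 1
        np.array([[0.0, 0.2, 3.1, 0.4]]),  # step 1 -> token id 2
        np.array([[4.0, 0.1, 0.2, 0.3]]),  # step 2 -> token id 0
    ]
    return [int(np.argmax(logit, axis=1)) for logit in fake_output_logits]  # [1, 2, 0]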
def test2():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(FLAGS.data_directory,
                                      "vocab%d.enc" % FLAGS.enc_vocab_size)
        dec_vocab_path = os.path.join(FLAGS.data_directory,
                                      "vocab%d.dec" % FLAGS.dec_vocab_size)
        enc_vocab, _ = prepareData.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = prepareData.initialize_vocabulary(dec_vocab_path)

        test_data_path = os.path.join(FLAGS.data_directory, "test.enc")
        with open(test_data_path, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                # test.enc is space-separated; drop the spaces and the trailing
                # newline to recover the raw question sentence.
                x_list = line.strip().split(' ')
                sentence = "".join(x_list)
                token_ids = prepareData.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence), enc_vocab)
                bucket_id = min([
                    b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)
                ])
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: [(token_ids, [])]}, bucket_id)
                _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                # This is a greedy decoder - outputs are just argmaxes of output_logits.
                outputs = [
                    int(np.argmax(logit, axis=1)) for logit in output_logits
                ]
                # If there is an EOS symbol in outputs, cut them at that point.
                if prepareData.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(prepareData.EOS_ID)]
                result = "".join([
                    tf.compat.as_str(rev_dec_vocab[output]) for output in outputs
                ])
                with open(FLAGS.result_path, 'a', encoding='utf-8') as ff:
                    ff.write("ask: " + sentence + "\n")
                    ff.write("answer: " + result + "\n")
                    ff.write("\n")
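# For reference, each line of test.enc produces one blank-line-separated block
# in FLAGS.result_path, of the form (sentences shown as placeholders):
#
#   ask: <question with the token spaces removed>
#   answer: <greedy-decoded reply>
#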
def test():
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(FLAGS.data_directory,
                                      "vocab%d.enc" % FLAGS.enc_vocab_size)
        dec_vocab_path = os.path.join(FLAGS.data_directory,
                                      "vocab%d.dec" % FLAGS.dec_vocab_size)
        enc_vocab, _ = prepareData.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = prepareData.initialize_vocabulary(dec_vocab_path)

        sys.stdout.write("me> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            sentence = sentence.strip('\n')
            token_ids = prepareData.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), enc_vocab)
            bucket_id = min([
                b for b in range(len(_buckets)) if _buckets[b][0] > len(token_ids)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(token_ids, [])]}, bucket_id)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
            # This is a greedy decoder - outputs are just argmaxes of output_logits.
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            # If there is an EOS symbol in outputs, cut them at that point.
            if prepareData.EOS_ID in outputs:
                outputs = outputs[:outputs.index(prepareData.EOS_ID)]
            result = "".join([
                tf.compat.as_str(rev_dec_vocab[output]) for output in outputs
            ])
            print("AI> " + result)
            print("me> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
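# Sketch (reuse suggestion, not the original code): test() repeats the
# decoding steps that decode_line above already wraps. Assuming the
# decode_line, create_model, FLAGS and prepareData defined in this module,
# the interactive loop can simply delegate to it. decode_line joins tokens
# with spaces while test() joins them directly, so the spaces are stripped
# before printing.
def test_with_decode_line():
    with tf.Session() as sess:
        model = create_model(sess, True)
        model.batch_size = 1  # decode one sentence at a time
        enc_vocab_path = os.path.join(FLAGS.data_directory,
                                      "vocab%d.enc" % FLAGS.enc_vocab_size)
        dec_vocab_path = os.path.join(FLAGS.data_directory,
                                      "vocab%d.dec" % FLAGS.dec_vocab_size)
        enc_vocab, _ = prepareData.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = prepareData.initialize_vocabulary(dec_vocab_path)

        sys.stdout.write("me> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            reply = decode_line(sess, model, enc_vocab, rev_dec_vocab,
                                sentence.strip('\n'))
            print("AI> " + reply.replace(" ", ""))
            print("me> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()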