Example no. 1
    def __init__(self):
        self.size = 8  # number of LSTM units
        self.GO_ID = 1  # start-of-output-sequence marker
        self.EOS_ID = 2  # end-of-sequence marker
        self.PAD_ID = 0  # padding value
        self.min_freq = 1  # a word enters the vocabulary only if its sample frequency exceeds this value
        self.epochs = 20000  # number of training steps
        self.batch_num = 1000  # number of question-answer pairs used in training
        self.input_seq_len = 25  # input sequence length
        self.output_seq_len = 50  # output sequence length
        self.init_learning_rate = 0.5  # initial learning rate
        self.wordToken = word_token.WordToken()

        # Kept at instance scope so that num_encoder_symbols and num_decoder_symbols can be computed dynamically
        self.max_token_id = self.wordToken.load_file_list(
            ['./dialog/question', './dialog/answer'], self.min_freq)
        self.num_encoder_symbols = self.max_token_id + 5  # +5 reserves ids for PAD, GO, EOS plus spares
        self.num_decoder_symbols = self.max_token_id + 5

        self.sess = tf.Session()
        encoder_inputs, decoder_inputs, target_weights, outputs, loss, update, saver, learning_rate_decay_op, learning_rate = self.get_model(
            feed_previous=True)
        saver.restore(self.sess, './model/' + str(self.epochs) + '/demo_')

        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.target_weights = target_weights
        self.outputs = outputs
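
As a hedged sketch of how the restored tensors above could drive inference (assumptions, not shown in this excerpt: TF 1.x one-placeholder-per-step feeds, left-padded encoder inputs, greedy argmax decoding; predict is a hypothetical method name):

    def predict(self, question_ids):
        """Hypothetical helper: greedy decode with the restored graph (assumes `import numpy as np`)."""
        # Left-pad the question ids up to input_seq_len, one placeholder per time step.
        pads = [self.PAD_ID] * (self.input_seq_len - len(question_ids))
        feed = {}
        for step, wid in enumerate(pads + question_ids):
            feed[self.encoder_inputs[step].name] = np.array([wid], dtype=np.int32)
        # With feed_previous=True only the first decoder input (GO) is consumed;
        # later steps reuse the previous prediction.
        feed[self.decoder_inputs[0].name] = np.array([self.GO_ID], dtype=np.int32)
        logits = self.sess.run(self.outputs, feed)  # list of [1, vocab]-shaped arrays
        answer_ids = []
        for step_logits in logits:
            wid = int(np.argmax(step_logits, axis=1)[0])
            if wid == self.EOS_ID:
                break
            answer_ids.append(wid)
        return answer_ids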
Example no. 2
PAD_ID = 0
# start-of-output-sequence marker
GO_ID = 1
# end-of-sequence marker
EOS_ID = 2
# number of LSTM units
size = 8
# initial learning rate
init_learning_rate = 1
# a word enters the vocabulary only if its frequency in the samples exceeds this value
min_freq = 10

# number of training rounds
train_round = 10000

wordToken = word_token.WordToken()

# Kept at module scope so that num_encoder_symbols and num_decoder_symbols can be computed dynamically
max_token_id = wordToken.load_file_list(
    ['samples/question.txt', 'samples/answer.txt'], min_freq)
num_encoder_symbols = max_token_id + 5  # number of distinct integer word ids in encoder_inputs (+5 reserves PAD/GO/EOS and spares)
num_decoder_symbols = max_token_id + 5
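
To make the reserved ids concrete, a minimal sketch (the helper and its layout are assumptions, not from the source) of how GO_ID, EOS_ID and PAD_ID typically frame one decoder training sequence:

def build_decoder_input(answer_ids, seq_len, go_id=GO_ID, eos_id=EOS_ID, pad_id=PAD_ID):
    """Hypothetical helper: GO + answer + EOS, right-padded with PAD up to seq_len."""
    seq = [go_id] + answer_ids + [eos_id]
    return seq + [pad_id] * (seq_len - len(seq))

# e.g. build_decoder_input([7, 8, 9], 8) -> [1, 7, 8, 9, 2, 0, 0, 0]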


def get_id_list_from(sentence):
    """
    获取输入句子的分词对应id列表
    """
    sentence_id_list = []
    seg_list = jieba.cut(sentence)
    for word in seg_list:  # renamed from `str` to avoid shadowing the builtin
        # Assumption: WordToken exposes word2id(), returning None for unknown words.
        word_id = wordToken.word2id(word)
        if word_id:
            sentence_id_list.append(word_id)
    return sentence_id_list
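
A quick usage sketch (illustrative only; the actual ids depend on the vocabulary loaded from samples/question.txt and samples/answer.txt):

# jieba tokenizes the sentence, then each token is mapped to its vocabulary id
ids = get_id_list_from('今天天气怎么样')
print(ids)  # e.g. [105, 233, 87]; unknown words are skipped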
Example no. 3
from tensorflow.contrib.legacy_seq2seq.python.ops import seq2seq
from aip import AipNlp  # needed for the AipNlp client below (baidu-aip package)
import word_token
import jieba
import random
import bm25_fitness_data
size = 8               # number of LSTM units
GO_ID = 1              # start-of-output-sequence marker
EOS_ID = 2             # end-of-sequence marker
PAD_ID = 0             # padding value
min_freq = 1           # a word enters the vocabulary only if its sample frequency exceeds this value
epochs = 20000         # number of training steps
batch_num = 1000       # number of question-answer pairs used in training
input_seq_len = 25     # input sequence length
output_seq_len = 50    # output sequence length
init_learning_rate = 0.5     # initial learning rate
wordToken = word_token.WordToken()   # bag-of-words vocabulary helper

max_token_id = wordToken.load_file_list(['./samples/question', './samples/answer'], min_freq)
num_encoder_symbols = max_token_id + 5    # +5 accounts for the padding, end-of-sequence and start-of-output markers (plus spares)
num_decoder_symbols = max_token_id + 5
APP_ID = '21290378'
API_KEY = 'ZoKi9QNvTdPseK1jOSVWvGZK'
SECRET_KEY = 'jjHqdb1SDwQBecELrc7SaWIHXnYpg8HB'
client = AipNlp(APP_ID, API_KEY, SECRET_KEY)  # Baidu word/sentence similarity API client
options = {}
options["model"] = "bert"

q_list = []     # questions
a_list = []     # answers
type_list = []  # illness type of each question
with open('question.txt', encoding="utf-8") as f:
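
The excerpt cuts off inside the file-reading block, which is left as shown. For context, a hedged sketch of how the AipNlp client configured above is typically queried for sentence similarity (simnet and the score field follow the baidu-aip Python SDK as commonly documented; treat the exact names as assumptions):

# Hedged sketch, not from the source: score the similarity between a user
# question and a candidate question using the client and options set up above.
result = client.simnet('脖子疼怎么办', '颈椎疼痛如何处理', options)
if 'score' in result:
    print(result['score'])  # similarity score in [0, 1]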