    result file: chat_parsed.txt
2. Read chat_parsed.txt, parse each session, merge consecutive q/a turns,
   extract order information, etc.
    result file: session_parsed.txt
================================================================================
"""
import re

from zb.tools.logger import create_logger
from data_pre.config import BaseConf
from jddc_utils import file_op

base_conf = BaseConf()
logger = create_logger(base_conf.log_file, name='pre', cmd=True)
logger.info("Logger created successfully, log file is %s" % base_conf.log_file)


def _init_res_file():
    """Empty the result files (QAQAQ questions and answers) before writing."""
    file_qaqaq = base_conf.file_qaqaq
    file_a = base_conf.file_a
    file_op.empty_file(file_qaqaq)
    file_op.empty_file(file_a)
    return file_qaqaq, file_a


# ------------------------------------------------------------------------------

def _update_nums(sess_info, line_cols):
    """Count how many turns the user and the customer-service agent each have in a session."""
    if line_cols['waiter_send'] == '0':
        sess_info['q_nums'] += 1
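# ------------------------------------------------------------------------------
# Illustrative sketch (not part of this module): the docstring above mentions
# merging consecutive q/a turns. Assuming each parsed turn is a dict with
# hypothetical 'waiter_send' and 'content' fields, the merge step could look like:
def merge_consecutive_turns(turns):
    """Merge adjacent turns by the same speaker into a single utterance (sketch)."""
    merged = []
    for turn in turns:
        if merged and merged[-1]['waiter_send'] == turn['waiter_send']:
            # Same speaker as the previous turn: concatenate the content.
            merged[-1]['content'] += ' ' + turn['content']
        else:
            merged.append(dict(turn))
    return merged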
# coding=utf-8
import os

from zb.tools.logger import create_logger
from zb.tools.file_tools import read_file

from .utils import JiebaSeg, ApiSeg, LacSeg
from .similarity import SentenceSimilarity
from .config import BaseConf

conf = BaseConf()
logger = create_logger(conf.log_file, name='tfidf', cmd=conf.cmd_log)

if conf.refresh_model:
    logger.info("refresh model, prepare to delete old model file.")
    pkl_file = os.path.join(conf.model_path, "corpus_and_dictionary.pkl")
    file_model = os.path.join(conf.model_path, "tfidf.model")
    file_index1 = os.path.join(conf.model_path, 'index.index')
    file_index2 = os.path.join(conf.model_path, 'index.index.index.npy')
    if os.path.exists(pkl_file):
        os.remove(pkl_file)
    if os.path.exists(file_model):
        os.remove(file_model)
    if os.path.exists(file_index1):
        os.remove(file_index1)
    if os.path.exists(file_index2):
        os.remove(file_index2)
    logger.info("old model file deleted.")


def run_prediction(input_file_path, output_file_path):
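# ------------------------------------------------------------------------------
# Illustrative sketch, separate from the module above (an assumption, since
# SentenceSimilarity's internals are not shown here): the file names
# corpus_and_dictionary.pkl, tfidf.model and index.index / index.index.index.npy
# suggest a gensim TF-IDF model with a MatrixSimilarity index. A minimal version
# of that pipeline:
from gensim import corpora, models, similarities

texts = [["订单", "什么", "时候", "发货"], ["可以", "开", "发票", "吗"]]  # segmented questions (sample data)
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

tfidf = models.TfidfModel(corpus)                        # would be saved as tfidf.model
index = similarities.MatrixSimilarity(tfidf[corpus],
                                      num_features=len(dictionary))
# index.save("index.index") also writes index.index.index.npy for the dense matrix.

query_bow = dictionary.doc2bow(["发货", "时间"])
scores = index[tfidf[query_bow]]                         # cosine similarity per candidate question
best_match = scores.argmax()                             # index of the most similar question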
from zb.tools.logger import create_logger
from jddc_utils import file_op

# Configuration parameters
# ------------------------------------------------------------------
from data_pre.config import BaseConf

base_conf = BaseConf()
conf = {
    "chat": base_conf.file_chat,
    "chat_pred": base_conf.file_chat_pred,
    "log_file": base_conf.log_file,
    "cmd_log": base_conf.cmd_log,
}

logger = create_logger(conf['log_file'], name='chat_pre', cmd=conf['cmd_log'])


# ------------------------------------------------------------------

def transform_text(text):
    """Convert special characters and dataset placeholders into plain words."""
    str_tf = {
        "#E-s[数字x]": "微笑",    # smiley emoticon placeholder -> "smile"
        "#E-j[数字x]": "愤怒",    # angry emoticon placeholder -> "anger"
        " ": " ",
        "[数字x]%": "比例",       # "[number]%" placeholder -> "ratio"
        "[金额x]%": "比例",       # "[amount]%" placeholder -> "ratio"
        "%": " ",
        "#": " ",
        "&": " ",
def run_prediction(input_file_path, output_file_path):
    log_file = os.path.join(base_conf.log_path, 'prediction.log')
    logger = create_logger(log_file, name='predictor', cmd=True)
    logger.info('run prediction ...')
    test_conf = TestConf()
    with tf.Session() as sess:
        # Create model structure and load parameters.
        model = create_model(sess, True, model_path=test_conf.model)
        model.batch_size = 1  # We decode one sentence at a time.

        # Load vocabularies.
        enc_vocab_path = os.path.join(test_conf.work_path,
                                      "vocab%d.enc" % test_conf.enc_vocab_size)
        dec_vocab_path = os.path.join(test_conf.work_path,
                                      "vocab%d.dec" % test_conf.dec_vocab_size)
        enc_vocab, _ = data_utils.initialize_vocabulary(enc_vocab_path)
        _, rev_dec_vocab = data_utils.initialize_vocabulary(dec_vocab_path)

        # Decode the input file, one sentence per line.
        test_path = input_file_path
        result_path = output_file_path
        empty_file(result_path)
        with codecs.open(test_path, mode='r', encoding='utf-8') as rf:
            with codecs.open(result_path, mode='a', encoding='utf-8') as wf:
                try:
                    sentence = rf.readline()
                    while sentence:
                        # Strip the trailing newline and any '<s>' end marker
                        # (str.rstrip('<s>') would strip characters, not the suffix).
                        sentence = sentence.strip()
                        if sentence.endswith('<s>'):
                            sentence = sentence[:-len('<s>')]
                        # Get token-ids for the input sentence.
                        logger.info("current sentence: " + sentence)
                        token_ids = data_utils.sentence_to_token_ids(sentence, enc_vocab)
                        logger.info("token_ids: " + ' '.join([str(i) for i in token_ids]))
                        # Which bucket does it belong to?
                        bucket_id = min([
                            b for b in range(len(_buckets))
                            if _buckets[b][0] > len(token_ids)
                        ])
                        logger.info('bucket_id: ' + str(bucket_id))
                        # Get a 1-element batch to feed the sentence to the model.
                        encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                            {bucket_id: [(token_ids, [])]}, bucket_id)
                        # The logits can be understood as the unnormalized scores before
                        # softmax, i.e. the output-layer values fed into softmax.
                        # Get output logits for the sentence.
                        _, _, output_logits = model.step(sess, encoder_inputs,
                                                         decoder_inputs, target_weights,
                                                         bucket_id, True)
                        # This is a greedy decoder - outputs are just argmaxes of output_logits.
                        outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
                        # If there is an EOS symbol in outputs, cut them at that point.
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        # Join the decoded tokens into the response, dropping commas and _UNK.
                        result = "".join([
                            tf.compat.as_str(rev_dec_vocab[output])
                            for output in outputs
                            if tf.compat.as_str(rev_dec_vocab[output]) not in [",", "_UNK"]
                        ])
                        wf.write(result + '\n')
                        logger.info("result: " + result)
                        sentence = rf.readline()
                except Exception as e:
                    traceback.print_exc()
                    logger.error("run prediction fail: %s" % e)
    logger.info('run prediction end!')
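# ------------------------------------------------------------------------------
# Worked example (illustrative, not part of the module): with the _buckets list
# defined in this file, a 60-token question selects the smallest bucket whose
# source size exceeds its length. Inputs longer than the largest bucket would
# leave min() with an empty sequence and raise ValueError.
example_buckets = [(50, 50), (80, 60), (150, 70), (500, 90)]  # (source_size, target_size)
example_token_ids = list(range(60))  # a hypothetical 60-token input
example_bucket_id = min(b for b in range(len(example_buckets))
                        if example_buckets[b][0] > len(example_token_ids))
print(example_bucket_id, example_buckets[example_bucket_id])  # -> 1 (80, 60)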
def train():
    train_conf = TrainConf()
    log_file = os.path.join(train_conf.log_path, 'train.log')
    logger = create_logger(log_file, name='Train', cmd=True)
    logger.info("start training")

    # Prepare dataset.
    logger.info("prepare dataset ...")
    enc_train, dec_train, enc_dev, dec_dev, _, _ = data_utils.prepare_custom_data(
        train_conf.work_path, train_conf.train_enc, train_conf.train_dec,
        train_conf.dev_enc, train_conf.dev_dec,
        train_conf.enc_vocab_size, train_conf.dec_vocab_size)
    logger.info("dataset prepared!")
    logger.info("enc_train: %s; dec_train: %s;" % (enc_train, dec_train))
    logger.info("enc_dev: %s; dec_dev: %s;" % (enc_dev, dec_dev))

    # Set up config to use the BFC allocator.
    config = tf.ConfigProto(
        device_count={"CPU": 8},
        inter_op_parallelism_threads=1,
        intra_op_parallelism_threads=1,
    )
    config.gpu_options.allocator_type = 'BFC'

    with tf.Session(config=config) as sess:
        # Create model.
        logger.info("Creating %d layers of %d units." %
                    (train_conf.num_layers, train_conf.layer_size))
        model = create_model(sess, False)

        logger.info("Read data into buckets and compute their sizes.")
        dev_set = read_data(enc_dev, dec_dev)
        train_set = read_data(enc_train, dec_train, train_conf.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
        # to select a bucket. The length of [scale[i], scale[i+1]] is proportional
        # to the size of the i-th training bucket, as used later.
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to the data distribution. We pick a random
            # number in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / train_conf.steps_per_checkpoint
            loss += step_loss / train_conf.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save a checkpoint, print statistics, and run evals.
            if current_step % train_conf.steps_per_checkpoint == 0:
                # Print statistics for the previous checkpoint interval.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                logger.info(
                    "global step %d learning rate %.4f step-time %.2f perplexity "
                    "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                              step_time, perplexity))
                # Decrease the learning rate if no improvement was seen over the last 3 checkpoints.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                # Save the checkpoint and reset the step timer and loss.
                checkpoint_path = os.path.join(train_conf.work_path, "seq2seq.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
                # Run evals on the development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        logger.info("  eval: empty bucket %d" % bucket_id)
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, True)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    logger.info("  eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
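# ------------------------------------------------------------------------------
# Worked example (illustrative, not part of the module): how train_buckets_scale
# maps a uniform random number to a bucket with probability proportional to the
# bucket's size. The bucket sizes below are hypothetical.
import numpy as np  # already imported at the top of this module; repeated so the sketch stands alone

train_bucket_sizes = [1000, 3000, 4000, 2000]
train_total_size = float(sum(train_bucket_sizes))
train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                       for i in range(len(train_bucket_sizes))]
print(train_buckets_scale)  # -> [0.1, 0.4, 0.8, 1.0]

# A random number in [0, 1) falls into one of the intervals
# [0, 0.1), [0.1, 0.4), [0.4, 0.8), [0.8, 1.0): larger buckets are picked more often.
random_number_01 = np.random.random_sample()
bucket_id = min(i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01)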
import os
import math
import time
import codecs
import logging
import traceback

import numpy as np
import tensorflow as tf

from zb.tools.logger import create_logger
from seq2seq import data_utils
from seq2seq.seq2seq_model import Seq2SeqModel
from seq2seq.config import BaseConf, TestConf, TrainConf

base_conf = BaseConf()
log_file = os.path.join(base_conf.log_path, 'execute.log')
logger = create_logger(log_file, name='exec', cmd=True)

# We use a number of buckets and pad to the closest one for efficiency.
# See seq2seq_model.Seq2SeqModel for details of how they work.
# (source_size, target_size)
_buckets = [(50, 50), (80, 60), (150, 70), (500, 90)]


def empty_file(file):
    """Truncate the file so that results can be appended from scratch."""
    with codecs.open(file, mode='w', encoding='utf-8') as f:
        f.truncate()


def read_data(source_path, target_path, max_size=None):
    """Read data from source and target files and put into buckets.
=============================================================================
"""
import requests
import grequests

from zb.tools.logger import create_logger
from jddc_utils import file_op

# Configuration parameters
# ------------------------------------------------------------------------------
from data_pre.config import BaseConf

conf = BaseConf()
logger = create_logger(conf.log_file, name="tokenize", cmd=conf.cmd_log)


def get_text_tokenize(text):
    """Call JD's word-segmentation (lexeme) API."""
    token = conf.api_token
    inner = conf.inner
    headers = {
        "User-Agent": 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 '
                      'Mobile Safari/537.36',
    }
    url_outer = "http://jdialog-lexeme.jd.com/lexeme?token={token}&text={text}"
    url_inner = "http://jdialog-lexeme-stage.jd.com/lexeme?token={token}&text={text}"
    if inner:
        url = url_inner.format(token=token, text=text)
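# ------------------------------------------------------------------------------
# Illustrative sketch (not part of this module): the rest of get_text_tokenize
# is not shown above. A minimal way to issue such a GET request with requests;
# the response structure (a JSON body with a 'tokenizedText' list) is an
# assumption and should be checked against the real API.
import requests  # already imported at the top of this module


def call_lexeme_api(url, headers, timeout=5):
    """Issue the GET request and return the segmented words (sketch)."""
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    # 'tokenizedText' and 'word' are hypothetical field names; adjust to the actual schema.
    return [item.get("word", "") for item in data.get("tokenizedText", [])]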
class Config(object):
    # base path
    base_path = "/submitwork/data"
    file_stopwords = os.path.join(base_path, 'stopwords.txt')

    # data path
    data_path = os.path.join(base_path, 'prepared')
    # Switch the QA dataset here; different datasets affect the score considerably.
    file_questions = os.path.join(data_path, 'questions.txt')
    file_answers = os.path.join(data_path, 'answers.txt')

    # conf for results
    res_path = os.path.join(base_path, 'BM25')
    insure_folder_exists(res_path)
    file_questions_segs = os.path.join(res_path, 'questions_segs.txt')
    log_file = os.path.join(res_path, 'bm25_implement.log')

    # other
    top = 5
    cmd_log = True
    for_search = True
    api_token = '9fb9785b4ea044e5871a8cbdae354e03'
    inner = False


# ------------------------------------------------------------------------------------------------------
conf = Config()
logger = create_logger(name='bm25', log_file=conf.log_file, cmd=conf.cmd_log)
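# ------------------------------------------------------------------------------
# Illustrative sketch (not part of this module): the BM25 retrieval served by the
# config above is not shown here. For reference, a self-contained version of the
# standard Okapi BM25 score over segmented questions; k1=1.5 and b=0.75 are the
# usual defaults, not values taken from this repo.
import math
from collections import Counter


def bm25_scores(query_tokens, docs_tokens, k1=1.5, b=0.75):
    """Score every document against the query with Okapi BM25.

    `docs_tokens` is a list of token lists (e.g. segmented candidate questions).
    Returns one score per document; higher means more similar.
    """
    n_docs = len(docs_tokens)
    avgdl = sum(len(d) for d in docs_tokens) / float(n_docs)
    # Document frequency of each term.
    df = Counter()
    for d in docs_tokens:
        df.update(set(d))
    scores = []
    for d in docs_tokens:
        tf = Counter(d)
        score = 0.0
        for t in set(query_tokens):
            if t not in tf:
                continue
            idf = math.log((n_docs - df[t] + 0.5) / (df[t] + 0.5) + 1.0)
            score += idf * tf[t] * (k1 + 1) / (tf[t] + k1 * (1 - b + b * len(d) / avgdl))
        scores.append(score)
    return scores

# Usage sketch: rank candidate questions and keep the conf.top best matches.
# best = sorted(range(len(docs)), key=lambda i: scores[i], reverse=True)[:conf.top]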