def get_entityType_pinyin(entity_type):
    entity_info_dict = {}
    entity_file = os.path.join(entity_folder, "%s.txt" % entity_type)
    with open(entity_file, "r") as fr:
        lines = fr.readlines()
    priority = 3
    if entity_type in ["song"]:
        priority -= 0.5
    print(curLine(), "get %d %s from %s, priority=%f" %
          (len(lines), entity_type, entity_file, priority))
    for line in lines:
        raw_entity = line.strip()
        add_pinyin(raw_entity, entity_info_dict, priority, entity_type)
    # TODO entities mined from the annotated corpus
    entity_file = os.path.join(entity_files_folder, "%s.json" % entity_type)
    with open(entity_file, "r") as fr:
        current_entity_dict = json.load(fr)
    print(curLine(), "get %d %s from %s, priority=%f" %
          (len(current_entity_dict), entity_type, entity_file, priority))
    for entity_before, entity_after_times in current_entity_dict.items():
        entity_after = entity_after_times[0]
        priority = 4
        if entity_type in ["song"]:
            priority -= 0.5
        add_pinyin(entity_after, entity_info_dict, priority, entity_type)
    return entity_info_dict
def pinyin_similar_word_noduoyin(entity_info_dict, word):
    if word in entity_info_dict:  # the entity already exists, no correction needed
        return 1.0, word
    best_similar_word = None
    top_similar_score = 0
    try:
        all_combination = ["".join(lazy_pinyin(word))]  # get_pinyin_combination(entity=word)
        for current_combination in all_combination:  # each candidate pronunciation of the word
            if len(current_combination) == 0:
                print(curLine(), "word:", word)
                continue
            similar_word = None
            current_distance = 10000
            for entity, (com, pri) in entity_info_dict.items():
                char_ratio = 0.0
                d = distance(com, current_combination) * (1.0 - char_ratio) + \
                    distance(entity, word) * char_ratio
                if d < current_distance:
                    current_distance = d
                    similar_word = entity
                    # if d <= 2.5:
                    #     print(curLine(), com, current_combination, distance(com, current_combination), distance(entity, word))
                    #     print(curLine(), word, entity, similar_word, "current_distance=", current_distance)
            current_similar_score = 1.0 - float(current_distance) / len(current_combination)
            # print(curLine(), "current_combination:%s, %f" % (current_combination, current_similar_score), similar_word, current_distance)
            if current_similar_score > top_similar_score:
                # print(curLine(), current_similar_score, top_similar_score, best_similar_word, similar_word)
                best_similar_word = similar_word
                top_similar_score = current_similar_score
    except Exception as error:
        print(curLine(), "error:", error)
    return top_similar_score, best_similar_word
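# The function above corrects a possibly mis-transcribed word by comparing pinyin
# strings against a known entity list with an edit distance. The following is a
# minimal, self-contained sketch of that idea only: `lazy_pinyin` comes from the
# pypinyin package, `levenshtein` is a stand-in for the `distance` helper used in
# this project (not shown here), and the entity dictionary and query word are made up.
from pypinyin import lazy_pinyin


def levenshtein(a, b):
    """Plain dynamic-programming edit distance between two strings."""
    dp = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, cb in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (ca != cb))
    return dp[-1]


if __name__ == "__main__":
    # hypothetical entity dictionary: entity -> (pinyin string, priority)
    entity_info_dict = {"周杰伦": ("zhoujielun", 3), "林俊杰": ("linjunjie", 3)}
    word = "周结论"  # a mis-recognized singer name with the same pronunciation
    query_pinyin = "".join(lazy_pinyin(word))
    best = min(entity_info_dict.items(),
               key=lambda kv: levenshtein(kv[1][0], query_pinyin))
    print(best[0])  # expected to recover 周杰伦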
def http_post(sources_batch):
    parameter = {'text_list': sources_batch}
    headers = {'Content-type': 'application/json'}
    status = -1
    output = None
    try:
        r = requests.post(url, data=json.dumps(parameter), headers=headers, timeout=10.5)
        if r.status_code == 200:
            result = r.json()
            # print(curLine(), result)
            status = result['status']
            version = result['version']
            if status == 0:
                data = result["data"]
                output = data['output']
            else:
                print(curLine(), "version:%s, status=%d, message:%s" %
                      (version, status, result['message']))
        else:
            print("%srequest failed, status_code: " % (curLine()), r.status_code)
    except Exception as e:
        print(curLine(), Exception, ' : ', e)
        input(curLine())
    return status, output
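# A usage sketch for http_post, assuming the module-level `url` points at a running
# prediction service and that the service returns the {status, version, data}
# structure handled above, with one output per input text (the queries are made up).
if __name__ == "__main__":
    sources_batch = ["帮我导航到宝安机场", "播放周杰伦的歌"]
    status, output = http_post(sources_batch)
    if status == 0:
        for src, pred in zip(sources_batch, output):
            print(src, "->", pred)
    else:
        print("server returned status", status)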
def split(corpus_list, save_folder, trainRate=0.8):
    corpusNum = len(corpus_list)
    shuffle_indices = list(np.random.permutation(range(corpusNum)))
    indexTrain = int(trainRate * corpusNum)
    # indexDev = int((trainRate + devRate) * corpusNum)
    corpusTrain = []
    for i in shuffle_indices[:indexTrain]:
        corpusTrain.append(corpus_list[i])
    save_file = os.path.join(save_folder, "train.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpusTrain)
    print(curLine(), "have save %d to %s" % (len(corpusTrain), save_file))

    corpusDev = []
    for i in shuffle_indices[indexTrain:]:  # TODO all corpus
        corpusDev.append(corpus_list[i])
    save_file = os.path.join(save_folder, "tune.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpusDev)
    print(curLine(), "have save %d to %s" % (len(corpusDev), save_file))

    save_file = os.path.join(save_folder, "test.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpusDev)
    print(curLine(), "have save %d to %s" % (len(corpusDev), save_file))
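# A usage sketch for split, with made-up corpus lines and a hypothetical output
# folder. Note that with the current code tune.txt and test.txt receive the same
# held-out 20% (see the TODO above); lines already carry trailing newlines because
# split() writes them with writelines().
import os

if __name__ == "__main__":
    corpus_list = [
        "今天天气怎么样\t今天天气如何\t0.80\n",
        "帮我导航到机场\t导航去机场\t0.75\n",
        "播放一首周杰伦的歌\t放周杰伦的歌\t0.70\n",
    ]
    os.makedirs("data/rephrase_corpus", exist_ok=True)  # hypothetical folder
    split(corpus_list, save_folder="data/rephrase_corpus", trainRate=0.8)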
def get_entityType_pinyin(entity_type):
    entity_info_dict = {}
    entity_file = os.path.join(entity_folder, "%s.txt" % entity_type)
    with open(entity_file, "r") as fr:
        lines = fr.readlines()
    pri = 3
    if entity_type in ["song"]:
        pri -= 0.5
    print(curLine(), "get %d %s from %s, pri=%f" %
          (len(lines), entity_type, entity_file, pri))
    for line in lines:
        entity = line.strip()
        for k, v in number_map.items():
            entity = entity.replace(k, v)  # str.replace returns a new string, so reassign
        # for combination in all_combination:
        if entity not in entity_info_dict:  # new entity
            # default behaviour: characters without pinyin are returned unchanged
            # (errors="ignore" would drop them instead)
            combination = "".join(lazy_pinyin(entity))
            if len(combination) < 2:
                print(curLine(), "warning:", entity, "combination:", combination)
            entity_info_dict[entity] = (combination, pri)
        else:
            combination, old_pri = entity_info_dict[entity]
            if pri > old_pri:
                entity_info_dict[entity] = (combination, pri)
    return entity_info_dict
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    flags.mark_flag_as_required('input_file')
    flags.mark_flag_as_required('input_format')
    flags.mark_flag_as_required('output_file')
    flags.mark_flag_as_required('label_map_file')
    flags.mark_flag_as_required('vocab_file')
    flags.mark_flag_as_required('saved_model')
    label_map = utils.read_label_map(FLAGS.label_map_file)
    converter = tagging_converter.TaggingConverter(
        tagging_converter.get_phrase_vocabulary_from_label_map(label_map),
        FLAGS.enable_swap_tag)
    builder = bert_example.BertExampleBuilder(label_map, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case, converter)
    predictor = predict_utils.LaserTaggerPredictor(
        tf.contrib.predictor.from_saved_model(FLAGS.saved_model), builder,
        label_map)
    print(colored("%s input file:%s" % (curLine(), FLAGS.input_file), "red"))
    sources_list = []
    target_list = []
    with tf.gfile.GFile(FLAGS.input_file) as f:
        for line in f:
            sources, target, lcs_rate = line.rstrip('\n').split('\t')
            sources_list.append([sources])
            target_list.append(target)
    number = len(sources_list)  # total number of samples
    predict_batch_size = min(64, number)
    batch_num = math.ceil(float(number) / predict_batch_size)
    start_time = time.time()
    num_predicted = 0
    with tf.gfile.Open(FLAGS.output_file, 'w') as writer:
        writer.write(f'source\tprediction\ttarget\n')
        for batch_id in range(batch_num):
            sources_batch = sources_list[batch_id * predict_batch_size:
                                         (batch_id + 1) * predict_batch_size]
            prediction_batch = predictor.predict_batch(sources_batch=sources_batch)
            assert len(prediction_batch) == len(sources_batch)
            num_predicted += len(prediction_batch)
            for id, [prediction, sources] in enumerate(
                    zip(prediction_batch, sources_batch)):
                target = target_list[batch_id * predict_batch_size + id]
                writer.write(f'{"".join(sources)}\t{prediction}\t{target}\n')
            if batch_id % 20 == 0:
                cost_time = (time.time() - start_time) / 60.0
                print("%s batch_id=%d/%d, predict %d/%d examples, cost %.2fmin." %
                      (curLine(), batch_id + 1, batch_num, num_predicted, number, cost_time))
    cost_time = (time.time() - start_time) / 60.0
    logging.info(
        f'{curLine()} {num_predicted} predictions saved to:{FLAGS.output_file}, '
        f'cost {cost_time} min, ave {cost_time / num_predicted} min.')
def main(corpus_folder, save_folder):
    fileList = os.listdir(corpus_folder)
    corpus_list_total = []
    for raw_file_name in fileList:
        corpus_list = process(corpus_folder, raw_file_name)
        print(curLine(), raw_file_name, len(corpus_list))
        corpus_list_total.extend(corpus_list)
    save_file = os.path.join(save_folder, "baoan_airport_from_xlsx.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpus_list_total)
    print(curLine(), "have save %d to %s" % (len(corpus_list_total), save_file))
def get_slot_info(query, domain):
    useEntityTypeList = domain2entity_map[domain]
    entityTypeMap = get_all_entity(query, useEntityTypeList=useEntityTypeList)
    entity_list_all = []  # collect all entities
    for entity_type, entity_list in entityTypeMap.items():
        for entity in entity_list:
            entity_before = entity['before']
            ignore_flag = False
            if entity_type != "song" and len(entity_before) < 2 and \
                    entity_before not in ["家", "妈"]:
                ignore_flag = True
            if entity_type == "song" and len(entity_before) < 2 and \
                    entity_before not in {"鱼", "云", "逃", "退", "陶", "美", "图", "默"}:
                ignore_flag = True
            if entity_before in {"什么歌", "一首", "小花", "叮当", "傻逼", "给你听", "现在", "当我"}:
                ignore_flag = True
            if ignore_flag:
                if entity_before not in "好点没走伤":
                    print(curLine(), "ignore entity_type:%s, entity:%s, query:%s" %
                          (entity_type, entity_before, query))
            else:
                entity_list_all.append((entity_type, entity_before,
                                        entity['after'], entity['priority']))
    # sort so that longer entities, and within the same length higher priority, come first
    entity_list_all = sorted(entity_list_all,
                             key=lambda item: len(item[1]) * 100 + item[3],
                             reverse=True)
    slot_info = query
    exist_entityType_set = set()
    replace_mask = [0] * len(query)
    for entity_type, entity_before, entity_after, priority in entity_list_all:
        if entity_before not in query:
            continue
        if entity_type in exist_entityType_set:
            continue  # this type has already been filled, ignore  # TODO
        start_location = slot_info.find(entity_before)
        if start_location > -1:
            exist_entityType_set.add(entity_type)
            if entity_after == entity_before:
                entity_info_str = "<%s>%s</%s>" % (entity_type, entity_after, entity_type)
            else:
                entity_info_str = "<%s>%s||%s</%s>" % (entity_type, entity_before,
                                                       entity_after, entity_type)
            slot_info = slot_info.replace(entity_before, entity_info_str)
            query = query.replace(entity_before, "")
        else:
            print(curLine(), replace_mask, slot_info, "entity_type:", entity_type, entity_before)
    return slot_info
def process(corpus_folder, raw_file_name, save_folder):
    corpus_list = []
    for name in raw_file_name:
        raw_file = os.path.join(corpus_folder, name)
        with open(raw_file, "r") as fr:
            lines = fr.readlines()
        for i, line in enumerate(lines):
            source, target, label = line.strip().split("\t")
            if label == "0" or source == target:
                continue
            if label != "1":
                input(curLine() + line.strip())
            length = float(len(source) + len(target))
            source_length = len(source)
            if source_length > 8 and source_length < 38 and (i + 1) % 2 > 0:
                # apply a swap operation to 50% of the long sentences
                rand = random.uniform(0.4, 0.9)
                source_pre = source
                swag_location = int(source_length * rand)
                source = "%s%s" % (source[swag_location:], source[:swag_location])
                lcs1 = _compute_lcs(source, target)
                lcs_rate = len(lcs1) / length
                if (lcs_rate < 0.4):  # too different, swap back
                    source = source_pre
                else:
                    print(curLine(), "source_pre:%s, source:%s, lcs_rate=%f" %
                          (source_pre, source, lcs_rate))
            lcs1 = _compute_lcs(source, target)
            lcs_rate = len(lcs1) / length
            if (lcs_rate < 0.2):
                continue  # the change is too large, ignore
            # if (lcs_rate < 0.4):
            #     continue  # the change is too large, ignore
            # if len(source) * 1.15 < len(target):
            #     new_t = source
            #     source = target
            #     target = new_t
            # print(curLine(), source, target, ",lcs1:", lcs1, ",lcs_rate=", lcs_rate)
            corpus = "%s\t%s\t%f\n" % (source, target, lcs_rate)
            corpus_list.append(corpus)
        print(curLine(), len(corpus_list), "from %s" % raw_file)
    save_file = os.path.join(save_folder, "lcqmc.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpus_list)
    print(curLine(), "have save %d to %s" % (len(corpus_list), save_file))
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    flags.mark_flag_as_required('input_file')
    flags.mark_flag_as_required('input_format')
    flags.mark_flag_as_required('output_tfrecord_train')
    flags.mark_flag_as_required('output_tfrecord_dev')
    flags.mark_flag_as_required('vocab_file')
    builder = bert_example.BertExampleBuilder({}, FLAGS.vocab_file,
                                              FLAGS.max_seq_length,
                                              FLAGS.do_lower_case)
    num_converted = 0
    num_ignored = 0
    with tf.python_io.TFRecordWriter(FLAGS.output_tfrecord_train) as writer_train:
        for input_file in [FLAGS.input_file]:
            print(curLine(), "input_file:", input_file)
            for i, (sources, target) in enumerate(
                    utils.yield_sources_and_targets(input_file, FLAGS.input_format)):
                logging.log_every_n(
                    logging.INFO,
                    f'{i} examples processed, {num_converted} converted to tf.Example.',
                    10000)
                if len(sources[-1]) > FLAGS.max_seq_length:  # TODO ignore samples whose question is too long
                    num_ignored += 1
                    print(curLine(), "ignore num_ignored=%d, question length=%d" %
                          (num_ignored, len(sources[-1])))
                    continue
                example1, _ = builder.build_bert_example(sources, target)
                example = example1.to_tf_example().SerializeToString()
                writer_train.write(example)
                num_converted += 1
    logging.info(f'Done. {num_converted} examples converted to tf.Example, '
                 f'num_ignored {num_ignored} examples.')
    for output_file in [FLAGS.output_tfrecord_train, FLAGS.output_tfrecord_dev]:
        count_fname = _write_example_count(num_converted, output_file=output_file)
        logging.info(f'Wrote:\n{output_file}\n{count_fname}')
    with open(FLAGS.label_map_file, "w") as f:
        json.dump(builder._label_map, f, ensure_ascii=False, indent=4)
    print(curLine(), "save %d to %s" % (len(builder._label_map), FLAGS.label_map_file))
def main():
    argv = sys.argv
    host_name = argv[2]
    model_id = argv[3]
    print(curLine(), "argv:", argv)
    arg_groups = params.parse(argv[1], host_name, mode="test")
    args, config = arg_groups[0]
    args.output_dir = "/home/%s/Mywork/model/qa_model_dir/on_test/block1-layer1-hidden100-acc=85.31" % (host_name)  # TODO
    args.output_dir = "/home/%s/Mywork/model/qa_model_dir/part_chatcorpus_model/block1-layer1-hidden100-normal-acc80.57" % host_name
    args.data_dir = os.path.join("/home/%s/Mywork/corpus/Chinese_QA" % host_name, args.data_dir)
    checkpoint_dir = os.path.join(args.output_dir, model_id)
    if len(argv) == 5:
        args.eval_file = argv[4]
    demoer = Demoer(args, checkpoint_dir)
    sample = {
        'text1': "请问谁有狂三这张高清的电影资源?",
        'text2': '这张高清图,谁有狂三这张高清的请问谁有狂三这张高清的电影资源?'
    }
    predictions, probabilities, inference_time = demoer.serve(dev=[sample])
    test(args, config, demoer)  # batch test
def build_model(self, sess):
    states = {}
    interface = Interface(self.args, self.log)
    self.log(f'#classes: {self.args.num_classes}; #vocab: {self.args.num_vocab}')
    if self.args.seed:
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        tf.set_random_seed(self.args.seed)
    model = Model(self.args, sess)
    sess.run(tf.global_variables_initializer())
    embeddings = interface.load_embeddings()
    model.set_embeddings(sess, embeddings)
    self.log(f'trainable params: {model.num_parameters():,d}')
    self.log(f'trainable params (exclude embeddings): '
             f'{model.num_parameters(exclude_embed=True):,d}')
    validate_params(self.args)
    file = os.path.join(self.args.summary_dir, 'args.json5')
    print(curLine(), "save to %s" % file)
    with open(file, 'w') as f:
        args = {k: v for k, v in vars(self.args).items() if not k.startswith('_')}
        json5.dump(args, f, indent=2)
    self.log(pformat(vars(self.args), indent=2, width=120))
    return model, interface, states
def process(corpus_folder, raw_file_name, save_folder):
    raw_file = os.path.join(corpus_folder, raw_file_name)
    with open(raw_file, "r") as fr:
        lines = fr.readlines()
    corpus_list = []
    for line in lines:
        sent_list = line.strip().split("&&")
        sent_num = len(sent_list)
        for i in range(1, sent_num, 2):
            source = sent_list[i - 1]
            target = sent_list[i]
            length = float(len(source) + len(target))
            lcs1 = _compute_lcs(source, target)
            lcs_rate = len(lcs1) / length
            if (lcs_rate < 0.3):
                continue  # the change is too large, ignore
            if len(source) * 1.15 < len(target):
                new_t = source
                source = target
                target = new_t
            corpus = "%s\t%s\t%f\n" % (source, target, lcs_rate)
            corpus_list.append(corpus)
    save_file = os.path.join(save_folder, "baoan_airport.txt")
    with open(save_file, "w") as fw:
        fw.writelines(corpus_list)
    print(curLine(), "have save %d to %s" % (len(corpus_list), save_file))
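# Several preprocessing functions here score a (source, target) pair by
# len(_compute_lcs(source, target)) / (len(source) + len(target)) and drop pairs
# whose overlap is too small. _compute_lcs itself is not shown in this section;
# a standard character-level longest-common-subsequence implementation along the
# lines below would fit the way it is used (the return value supports len()).
# This is a sketch of one possible implementation, not necessarily the project's.
def _compute_lcs(source, target):
    """Character-level longest common subsequence (one possible implementation)."""
    m, n = len(source), len(target)
    # dp[i][j] = length of the LCS of source[:i] and target[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if source[i - 1] == target[j - 1]:
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    # backtrack to recover one LCS string
    i, j, chars = m, n, []
    while i > 0 and j > 0:
        if source[i - 1] == target[j - 1]:
            chars.append(source[i - 1])
            i -= 1
            j -= 1
        elif dp[i - 1][j] >= dp[i][j - 1]:
            i -= 1
        else:
            j -= 1
    return "".join(reversed(chars))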
def my_pinyin(char):
    shengmu = pinyin(char, style=Style.INITIALS, strict=True)[0][0]
    yunmu = pinyin(char, style=Style.FINALS, strict=True)[0][0]
    total_pinyin = lazy_pinyin(char, errors='default')[0]
    if shengmu + yunmu != total_pinyin:
        print(curLine(), "char:", char,
              ",shengmu:%s, yunmu:%s" % (shengmu, yunmu), total_pinyin)
    return shengmu, yunmu, total_pinyin
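# my_pinyin splits a single character's pronunciation into its initial (shengmu)
# and final (yunmu) with pypinyin, and warns when the two parts do not concatenate
# back to the full pinyin. A small standalone check of that behaviour; the example
# characters are arbitrary, and with strict=True characters whose pinyin starts
# with y/w (such as 英) are a case where the parts do not add back up.
from pypinyin import pinyin, lazy_pinyin, Style

if __name__ == "__main__":
    for ch in ["中", "英"]:
        shengmu = pinyin(ch, style=Style.INITIALS, strict=True)[0][0]
        yunmu = pinyin(ch, style=Style.FINALS, strict=True)[0][0]
        full = lazy_pinyin(ch)[0]
        print(ch, shengmu, yunmu, full)
    # roughly: 中 -> zh + ong == zhong, while 英 -> '' + ing != ying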
def test():
    """
    This is a test function: run it from another network while ServerDemo.py is
    running on a server in a different network.
    """
    sources_list = []
    target_list = []
    output_file = "/home/cloudminds/Mywork/corpus/rephrase_corpus/pred.tsv"
    input_file = "/home/cloudminds/Mywork/corpus/rephrase_corpus/test.txt"
    with tf.io.gfile.GFile(input_file) as f:
        for line in f:
            sources, target, lcs_rate = line.rstrip('\n').split('\t')
            sources_list.append(sources)  # [sources])
            target_list.append(target)
    number = len(target_list)  # total number of samples
    predict_batch_size = min(64, number)  # TODO
    batch_num = math.ceil(float(number) / predict_batch_size)
    num_predicted = 0
    with open(output_file, 'w') as writer:
        writer.write(f'source\tprediction\ttarget\n')
        start_time = time.time()
        for batch_id in range(batch_num):
            sources_batch = sources_list[batch_id * predict_batch_size:
                                         (batch_id + 1) * predict_batch_size]
            # prediction_batch = predictor.predict_batch(sources_batch=sources_batch)
            status, prediction_batch = http_post(sources_batch)
            assert len(prediction_batch) == len(sources_batch)
            num_predicted += len(prediction_batch)
            for id, [prediction, sources] in enumerate(
                    zip(prediction_batch, sources_batch)):
                target = target_list[batch_id * predict_batch_size + id]
                writer.write(f'{"".join(sources)}\t{prediction}\t{target}\n')
            if batch_id % 20 == 0:
                cost_time = (time.time() - start_time) / 60.0
                print("%s batch_id=%d/%d, predict %d/%d examples, cost %.2fmin." %
                      (curLine(), batch_id + 1, batch_num, num_predicted, number, cost_time))
    cost_time = (time.time() - start_time) / 60.0
    print(curLine(), "%d predictions saved to %s, cost %f min, ave %f min." %
          (num_predicted, output_file, cost_time, cost_time / num_predicted))
def test(args, config, demoer):
    dev = loader.load_data(args.data_dir, args.eval_file)
    targets = []
    for sample in dev:
        targets.append(int(sample['target']))
    predictions, probabilities, inference_time = demoer.serve(dev=dev, batch_size=384)
    if "train" in args.eval_file:  # save the model's confidence scores to a file
        with open(os.path.join(args.data_dir, "%s.txt" % args.eval_file), "r") as fr:
            lines = fr.readlines()
        assert len(lines) == len(probabilities), \
            'number of lines is %d, number of probabilities is %d' % (len(lines), len(probabilities))
        save_file = os.path.join(args.data_dir, "%s_score.txt" % args.eval_file)
        with open(save_file, "w") as writer:
            for line, prediction, prob in zip(lines, predictions, probabilities):
                writer.write("%s\t%f\n" % (line.strip(), prob[1]))
        print(curLine(), "save %d results to %s" % (len(probabilities), save_file))
    outputs = {
        'target': targets,
        'prob': probabilities,
        'pred': predictions,
        'args': args,
    }
    # total_loss = sum(losses[:-1]) / (len(losses) - 1) if len(losses) > 1 else sum(losses)
    states = {'inference_time': inference_time / len(targets)}
    for metric in args.watch_metrics:
        if metric not in states:  # multiple metrics could be computed by the same function
            states.update(metrics[metric](outputs))
    print(curLine(), "stats:", states)
    with open('%s/log.jsonl' % args.output_dir, 'a') as f:
        f.write(json5.dumps({
            'data': os.path.basename(args.data_dir),
            'params': config,
            'state': states
        }))
        f.write('\n')
def main():
    argv = sys.argv
    print(curLine(), "argv:", argv)
    host_name = sys.argv[2]
    if len(argv) == 3:
        arg_groups = params.parse(sys.argv[1], host_name, mode="train")
        test_score_sum = 0.0
        max_test_score = 0.0
        experiment_times = 0
        eval_score_list = []
        best_experiment_times = None
        for args, config in arg_groups:
            if not os.path.exists(args.summary_dir):
                os.makedirs(args.summary_dir)
            args.pretrained_embeddings = os.path.join(
                "/home/%s/Word2Vector/Chinese" % host_name, args.pretrained_embeddings)
            # print(curLine(), "args.data_dir:%s, args.output_dir:%s" % (args.data_dir, args.output_dir))
            trainer = Trainer(args)
            states, best_eval_score = trainer.train(experiment_times)
            eval_score_list.append(best_eval_score)
            test_score_sum += best_eval_score
            if max_test_score < best_eval_score:
                max_test_score = best_eval_score
                best_experiment_times = experiment_times
            experiment_times += 1
            print(curLine(),
                  "experiment_times=%d/%d, best_experiment_times=%d, ave_test_score=%f, max_test_score=%f" %
                  (experiment_times, len(arg_groups), best_experiment_times,
                   test_score_sum / experiment_times, max_test_score))
            with open('%s/log.jsonl' % args.output_dir, 'a') as f:
                f.write(json5.dumps({
                    'data': os.path.basename(args.data_dir),
                    'params': config,
                    'state': states,
                }))
                f.write('\n')
        print(curLine(), "eval_score_list:", eval_score_list,
              eval_score_list.index(max_test_score), "\n")
    else:
        print(curLine(), 'Usage: "python train.py configs/xxx.json5 host_name"')
def get_slot(param):
    slot = []
    if "<" not in param:
        return slot
    if ">" not in param:
        print(curLine(), "param:", param)
        return slot
    if "</" not in param:
        return slot
    start_segment = re.findall(r"<[\w_]*>", param)
    end_segment = re.findall(r"</[\w_]*>", param)
    if len(start_segment) != len(end_segment):
        print(curLine(), "start_segment:", start_segment)
        print(curLine(), "end_segment:", end_segment)
    search_location = 0
    for s, e in zip(start_segment, end_segment):
        entityType = s[1:-1]
        assert "</%s>" % entityType == e
        start_index = param[search_location:].index(s) + len(s)
        end_index = param[search_location:].index(e)
        entity_info = param[search_location:][start_index:end_index]
        search_location += end_index + len(e)
        before, after = entity_info, entity_info
        if "||" in entity_info:
            before, after = entity_info.split("||")
        if before in before2after:
            after = before2after[before]
        if before not in all_entity_dict[entityType]:
            all_entity_dict[entityType][before] = [after, 1]
        else:
            if after != all_entity_dict[entityType][before][0]:
                print(curLine(), entityType, before, after, all_entity_dict[entityType][before])
            assert after == all_entity_dict[entityType][before][0]
            all_entity_dict[entityType][before][1] += 1
        if before != after:
            before = after
            if before not in all_entity_dict[entityType]:
                all_entity_dict[entityType][before] = [after, 1]
            else:
                assert after == all_entity_dict[entityType][before][0]
                all_entity_dict[entityType][before][1] += 1
def read_data(path, lowercase):
    """Reads data from a prediction TSV file.

    Each line should contain 8 tab-separated columns:
    sessionId, query, predDomain, predIntent, predSlot, domain, intent, Slot.

    Args:
        path: Path to the prediction file.
        lowercase: Whether to lowercase the data (to compute case-insensitive scores).

    Returns:
        Tuple (predDomain_list, predIntent_list, domain_list,
               right_intent_num, right_slot_num, exact_num).
    """
    sources = []
    predDomain_list = []
    predIntent_list = []
    domain_list = []
    right_intent_num = 0
    right_slot_num = 0
    exact_num = 0
    with tf.gfile.GFile(path) as f:
        for line in f:
            if "sessionId" in line and "pred" in line:
                continue
            sessionId, query, predDomain, predIntent, predSlot, domain, intent, Slot = \
                line.rstrip('\n').split('\t')
            # if lowercase:
            #     source = normal_transformer(source.lower())
            #     pred = normal_transformer(pred.lower())
            #     targets = [normal_transformer(t.lower()) for t in targets]
            # sources.append(source)
            if predIntent == intent:
                right_intent_num += 1
                if predSlot == Slot:
                    exact_num += 1
            # else:
            #     print(curLine(), predIntent, "intent:", intent)
            if predSlot == Slot:
                right_slot_num += 1
            # else:
            #     print(curLine(), predSlot, "Slot:", Slot, "predDomain:%s, domain:%s" % (predDomain, domain))
            if predDomain != domain:
                print(curLine(), "predDomain:%s, domain:%s" % (predDomain, domain),
                      predSlot, "Slot:", Slot)
            predDomain_list.append(predDomain)
            predIntent_list.append(predIntent)
            domain_list.append(domain)
    return predDomain_list, predIntent_list, domain_list, right_intent_num, right_slot_num, exact_num
def get_slot_info(query, domain):
    useEntityTypeList = domain2entity_map[domain]
    entityTypeMap = get_all_entity(query, useEntityTypeList=useEntityTypeList)
    if "phone_num" in useEntityTypeList:
        token_numbers = re_phoneNum.findall(query)
        for number in token_numbers:
            entityTypeMap["phone_num"].append({'before': number, 'after': number, 'priority': 2})
    # print(curLine(), "entityTypeMap", entityTypeMap)
    # for entity_type, entity_info_list in entityTypeMap.items():
    #     for entity_info in entity_info_list:
    #         entity_before = entity_info['before']
    #         priority = entity_info['priority']
    #         if len(entity_before) < 2 and entity_before not in ["家", "妈"]:
    #             continue
    #         entity_map[entity_before] = (entity_type, entity_info['after'], priority)  # TODO song should get a lower priority
    #         # if entity_before not in entity_map or (priority > entity_map[entity_before][2]):
    #         #     entity_map[entity_before] = (entity_type, entity_info['after'], priority)
    # print(curLine(), len(entity_map), "entity_map", entity_map)
    # if "phone_num" in useEntityTypeList:
    #     token_numbers = re_phoneNum.findall(query)
    #     for number in token_numbers:
    #         entity_map[number] = ("phone_num", number, 2)
    entity_list_all = []  # collect all entities
    for entity_type, entity_list in entityTypeMap.items():
        for entity in entity_list:
            entity_before = entity['before']
            if len(entity_before) < 2 and entity_before not in ["家", "妈"]:
                continue
            entity_list_all.append((entity_type, entity_before,
                                    entity['after'], entity['priority']))
    entity_list_all = sorted(entity_list_all,
                             key=lambda item: len(item[1]) * 100 + item[3],
                             reverse=True)
    slot_info = query
    exist_entityType_set = set()
    replace_mask = [0] * len(query)
    for entity_type, entity_before, entity_after, priority in entity_list_all:
        if entity_before not in query:
            continue
        if entity_type in exist_entityType_set:
            continue  # this type has already been filled, ignore  # TODO
        start_location = slot_info.find(entity_before)
        if start_location > -1:  # exists
            exist_entityType_set.add(entity_type)
            if entity_after == entity_before:
                entity_info_str = "<%s>%s</%s>" % (entity_type, entity_after, entity_type)
            else:
                entity_info_str = "<%s>%s||%s</%s>" % (entity_type, entity_before,
                                                       entity_after, entity_type)
            slot_info = slot_info.replace(entity_before, entity_info_str)
            query = query.replace(entity_before, "")
        else:
            print(curLine(), replace_mask, slot_info, "entity_type:", entity_type, entity_before)
    return slot_info
def bleu(hyps, refs_list):
    """ calculate bleu1, bleu2 and their average """
    bleu_1 = []
    bleu_2 = []
    for hyp, refs in zip(hyps, refs_list):
        if len(hyp) <= 1:
            # print("ignore hyp:%s, refs:" % hyp, refs)
            bleu_1.append(0.0)
            bleu_2.append(0.0)
            continue
        score = bleu_score.sentence_bleu(
            refs, hyp,
            smoothing_function=None,  # bleu_score.SmoothingFunction().method7,
            weights=[1, 0, 0, 0])
        # input(curLine())
        if score > 1.0:
            print(curLine(), refs, hyp)
            print(curLine(), "score=", score)
            input(curLine())
        bleu_1.append(score)
        score = bleu_score.sentence_bleu(
            refs, hyp,
            smoothing_function=None,  # bleu_score.SmoothingFunction().method7,
            weights=[0.5, 0.5, 0, 0])
        bleu_2.append(score)
    bleu_1 = np.average(bleu_1)
    bleu_2 = np.average(bleu_2)
    bleu_average_score = (bleu_1 + bleu_2) * 0.5
    print("bleu_1=%f, bleu_2=%f, bleu_average_score=%f" %
          (bleu_1, bleu_2, bleu_average_score))
    return bleu_average_score
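# bleu() averages BLEU-1 and BLEU-2 over a corpus with NLTK's sentence_bleu.
# For Chinese, hypotheses and references are most naturally passed as character
# lists. A small usage sketch with made-up data; it assumes nltk and numpy are
# installed, as the function above already relies on them.
from nltk.translate import bleu_score

if __name__ == "__main__":
    refs_list = [[list("帮我导航到宝安机场")]]  # one list of references (as character lists) per hypothesis
    hyps = [list("导航到宝安机场")]
    score = bleu(hyps, refs_list)
    print("average of BLEU-1 and BLEU-2:", score)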
def process(source_file, train_file, dev_file):
    dev_lines = []
    train_num = 0
    intent_distribution = defaultdict(dict)
    with open(source_file, "r") as f, open(train_file, "w") as f_train:
        reader = csv.reader(f)
        train_write = csv.writer(f_train, dialect='excel')
        for row_id, line in enumerate(reader):
            if row_id == 0:
                continue  # skip the header row
            (sessionId, raw_query, domain_intent, param) = line
            get_slot(param)
            if domain_intent == other_tag:
                domain = other_tag
                intent = other_tag
            else:
                domain, intent = domain_intent.split(".")
            if intent in intent_distribution[domain]:
                intent_distribution[domain][intent] += 1
            else:
                intent_distribution[domain][intent] = 0
            sessionId = int(sessionId)
            if sessionId % 10 > 0:
                train_write.writerow(line)
                train_num += 1
            else:
                dev_lines.append(line)
    with open(dev_file, "w") as f_dev:
        write = csv.writer(f_dev, dialect='excel')
        for line in dev_lines:
            write.writerow(line)
    print(curLine(), "dev=%d, train=%d" % (len(dev_lines), train_num))
    for domain, intent_num in intent_distribution.items():
        print(curLine(), domain, intent_num)
def _added_token_counts(data_iterator, try_swapping, max_input_examples=10000):
    """Computes how many times different phrases have to be added.

    Args:
        data_iterator: Iterator to yield source lists and targets. See function
            yield_sources_and_targets in utils.py for the available iterators. The
            strings in the source list will be concatenated, possibly after swapping
            their order if swapping is enabled.
        try_swapping: Whether to try if swapping sources results in less added text.
        max_input_examples: Maximum number of examples to be read from the iterator.

    Returns:
        Tuple (collections.Counter for phrases, added phrases for each example,
        maximum source length).
    """
    phrase_counter = collections.Counter()
    num_examples = 0
    all_added_phrases = []
    max_seq_length = 0
    for sources, target in data_iterator:  # sources may be several sentences, joined with spaces later
        if num_examples >= max_input_examples:
            break
        # source_merge = ' '.join(sources)
        source_merge = sources
        # print("phrase_vocabulary_optimization.py source_merge", source_merge)
        if len(source_merge) > max_seq_length:
            print(curLine(), "max_seq_length=%d, len(source_merge)=%d, source_merge:%s" %
                  (max_seq_length, len(source_merge), source_merge))
            max_seq_length = len(source_merge)
        logging.log_every_n(logging.INFO, f'{num_examples} examples processed.', 10000)
        added_phrases = _get_added_phrases(source_merge, target)
        # print("added_phrases", added_phrases)
        if try_swapping and len(sources) == 2:
            added_phrases_swap = _get_added_phrases(' '.join(sources[::-1]), target)
            # If we can align more and have to add less after swapping, we assume that
            # the sources would be swapped during conversion.
            if len(''.join(added_phrases_swap)) < len(''.join(added_phrases)):
                added_phrases = added_phrases_swap
        for phrase in added_phrases:
            phrase_counter[phrase] += 1
        all_added_phrases.append(added_phrases)
        num_examples += 1
    logging.info(f'{num_examples} examples processed.\n')
    return phrase_counter, all_added_phrases, max_seq_length
def process_sample(self, text1):
    text = normal_transformer(text1)
    if len(text) < 1:
        print(curLine(), "text:%s, text1:%s" % (text, text1))
    if self.args.language.lower() == "chinese":
        processed_text = [self.vocab.index(w) for w in list(text)[:self.args.max_len]]
    else:
        processed_text = [self.vocab.index(w) for w in text.split()[:self.args.max_len]]
    processed_len = len(processed_text)
    return processed_text, processed_len
def __init__(self, args, checkpoint_dir):
    self.args = args
    self.log = Logger(self.args)
    tf.reset_default_graph()
    with tf.Graph().as_default():
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        self.sess = tf.Session(config=config)
        with self.sess.as_default():
            self.model, self.interface, self.states = self.build_model(self.sess)
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir)  # self.model_path)
            if ckpt is None:
                print(curLine(), "there is no model in %s" % checkpoint_dir)
            else:
                file_name = ckpt.model_checkpoint_path.split("/")[-1]
                model_checkpoint_file = os.path.join(checkpoint_dir, file_name)
                print(curLine(), "restore from %s" % model_checkpoint_file)
                # saver = tf.train.import_meta_graph("{}.meta".format(ckpt.model_checkpoint_path))
                # that saver makes inference slower than self.model.saver
                self.model.saver.restore(self.sess, model_checkpoint_file)
def predict_and_write(predictor, sources_batch, previous_line_list, context_list,
                      writer, num_predicted, start_time, batch_num):
    prediction_batch = predictor.predict_batch(sources_batch=sources_batch)
    assert len(prediction_batch) == len(sources_batch)
    for id, [prediction, sources] in enumerate(zip(prediction_batch, sources_batch)):
        output = ""
        if len(prediction) > 1 and prediction != sources:
            # TODO ignore predictions that keep the source entirely or are too short
            output = "%s%s" % (context_list[id], prediction)  # should this be concatenated with the context?
            # print(curLine(), "prediction:", prediction, "sources:", sources, ",output:", output, prediction != sources)
        writer.write("%s\t%s\n" % (previous_line_list[id], output))
    batch_num = batch_num + 1
    num_predicted += len(prediction_batch)
    if batch_num % 200 == 0:
        cost_time = (time.time() - start_time) / 3600.0
        print("%s batch_id=%d, predict %d examples, cost %.3fh." %
              (curLine(), batch_num, num_predicted, cost_time))
    return num_predicted, batch_num
def pinyin_similar_word(entity_info_dict, word):
    similar_word = None
    if word in entity_info_dict:  # the entity already exists, no correction needed
        return 0, word
    all_combination = get_pinyin_combination(entity=word)
    top_similar_score = 0
    for current_combination in all_combination:  # each candidate pronunciation of the word
        current_distance = 10000
        for entity_after, (com_list, pri) in entity_info_dict.items():
            for com in com_list:
                d = distance(com, current_combination)
                if d < current_distance:
                    current_distance = d
                    similar_word = entity_after
        current_similar_score = 1.0 - float(current_distance) / len(current_combination)
        print(curLine(), "current_combination:%s, %f" % (current_combination, current_similar_score),
              similar_word, current_distance)
def process(corpus_folder, raw_file_name):
    raw_file = os.path.join(corpus_folder, raw_file_name)
    # open the file and get the workbook object of the excel file
    workbook = xlrd.open_workbook(raw_file)  # file path
    # get the sheet object by sheet index
    worksheet = workbook.sheet_by_index(0)
    nrows = worksheet.nrows  # total number of rows in the sheet
    ncols = worksheet.ncols  # total number of columns in the sheet
    print(curLine(), "raw_file_name:%s, worksheet:%s nrows=%d, ncols=%d" %
          (raw_file_name, worksheet.name, nrows, ncols))
    assert ncols == 3
    assert nrows > 0
    col_data = worksheet.col_values(0)  # content of the first column
    corpus_list = []
    for line in col_data:
        sent_list = line.strip().split("&&")
        sent_num = len(sent_list)
        for i in range(1, sent_num, 2):
            source = sent_list[i - 1]
            target = sent_list[i]
            # source_length = len(source)
            # if source_length > 8 and (i + 1) % 4 > 0:  # randomly delete a char from 50% of the long sentences
            #     rand = random.uniform(0.1, 0.9)
            #     source_pre = source
            #     swag_location = int(source_length * rand)
            #     source = "%s%s" % (source[:swag_location], source[swag_location + 1:])
            #     print(curLine(), "source_pre:%s, source:%s" % (source_pre, source))
            length = float(len(source) + len(target))
            lcs1 = _compute_lcs(source, target)
            lcs_rate = len(lcs1) / length
            if (lcs_rate < 0.2):
                continue  # the change is too large, ignore
            # if (lcs_rate < 0.3):
            #     continue  # the change is too large, ignore
            # if len(source) * 1.15 < len(target):
            #     new_t = source
            #     source = target
            #     target = new_t
            corpus = "%s\t%s\t%f\n" % (source, target, lcs_rate)
            corpus_list.append(corpus)
    return corpus_list
def _realize_sequence(self, tokens, tags):
    """Realizes output text corresponding to a single source text.

    Args:
        tokens: Tokens of the source text.
        tags: Tags indicating the edit operations.

    Returns:
        The realized text.
    """
    output_tokens = []
    for index, (token, tag) in enumerate(zip(tokens, tags)):
        loc = "0"
        if self.location is not None:
            loc = self.location[index]
        if tag.added_phrase and (loc == "0" or index == 0 or
                                 (index > 0 and self.location[index - 1] == "0")):  # TODO
            if not tag.added_phrase.startswith("##", 0, 2):
                output_tokens.append(tag.added_phrase)
            else:  # word piece
                if len(output_tokens) > 0:
                    output_tokens[-1] += tag.added_phrase[2:]
                else:
                    output_tokens.append(tag.added_phrase[2:])
        if tag.tag_type in (TagType.KEEP, TagType.SWAP) or loc == "1":
            # TODO adjust as needed: positions where location is "1" must not be deleted,
            # but insertion is currently still allowed there
            token = token.upper()  # TODO much of the current corpus is upper case, so predictions are upper-cased
            if token.startswith("##", 0, 2):
                output_tokens.append(token[2:])
            elif "UNK" in token:  # handle the UNK case
                previoud_id = self.token_index_map[index]  # start position of the word that maps to UNK
                next_previoud_id = previoud_id + 1  # end position of the word that maps to UNK
                if index + 1 in self.token_index_map:
                    next_previoud_id = self.token_index_map[index + 1]
                token = self.sources[0][previoud_id:next_previoud_id]  # TODO
                print(curLine(), "self.passage[%d,%d]=%s" %
                      (previoud_id, next_previoud_id, token))
                output_tokens.append(token)
            else:  # word piece
                output_tokens.append(token)
    return self.sep.join(output_tokens)
def rules(raw_query, predict_domain, target_domain_name):
    predict_intent = predict_domain  # OTHERS
    slot_info = raw_query
    if predict_domain == "navigation":
        predict_intent = 'navigation'
        if "打开" in raw_query:
            predict_intent = "open"
        elif "开始" in raw_query:
            predict_intent = "start_navigation"
        for word in predict_utils.cancel_keywords:
            if word in raw_query:
                predict_intent = 'cancel_navigation'
                break
        # slot_info = raw_query
        # if predict_intent == 'navigation':  TODO
        slot_info = exacter_acmation.get_slot_info(raw_query, domain=predict_domain)
        # if predict_intent != 'navigation':  # TODO
        #     print(curLine(), "slot_info:", slot_info)
    elif predict_domain == 'music':
        predict_intent = 'play'
        for word in predict_utils.cancel_keywords:
            if word in raw_query:
                predict_intent = 'pause'
                break
        for word in ["下一", "换一首", "换一曲", "切歌", "其他歌"]:
            if word in raw_query:
                predict_intent = 'next'
                break
        slot_info = exacter_acmation.get_slot_info(raw_query, domain=predict_domain)
        if predict_intent not in ['play', 'pause'] and slot_info != raw_query:
            # adjust the intent based on the slots, e.g. 换一首<singer>高安</singer>的<song>红尘情歌</song>
            print(curLine(), predict_intent, slot_info)
            predict_intent = 'play'
        # if predict_intent != 'play':  # e.g. 换一首<singer>高安</singer>的<song>红尘情歌</song>
        #     print(curLine(), predict_intent, slot_info)
    elif predict_domain == 'phone_call':
        predict_intent = 'make_a_phone_call'
        for word in predict_utils.cancel_keywords:
            if word in raw_query:
                predict_intent = 'cancel'
                break
        slot_info = exacter_acmation.get_slot_info(raw_query, domain=predict_domain)
    return predict_intent, slot_info