def main():
    """Segment NLPCC2016 questions with jieba and write the tokenized sentences.

    Reads question lines (tab-separated, question text in column 0), cleans
    each question, segments it with jieba in accurate mode, and writes the
    space-joined tokens to ../data/nlpcc2016/4-ner/seg/sentence.v6.txt.
    """
    # fnam = "../data/word2vec/zh-cn/wiki_texts_seg.txt.bin"
    # model = models.Word2Vec.load(fnam)
    jieba.set_dictionary('jieba_dict/dict.txt.big')
    # Load the stop-word set (one word per line); currently unused downstream.
    stopwordset = set()
    with open('jieba_dict/stopwords.txt', 'r', encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))
    word_set = set()
    # Question file path
    # path1 = "../data/nlpcc2016/nlpcc-iccpol-2016.kbqa.training.testing-data-all.txt"
    path1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v2.txt'  # rdf path
    # BUG FIX: this read was commented out, leaving `lines` undefined and the
    # loop below raising NameError; restore the file read.
    lines = ct.file_read_all_lines(path1)
    # lines = ['《机械设计基础》这本书的作者是谁', '鑫威kw9000es是个什么产品']
    result_lines = []
    for line in lines:
        sentence = str(line).split('\t')[0]
        # line = "《机械设计基础》这本书的作者是谁"
        sentence = ct.clean_str_question(sentence)  # normalize the raw question text
        words = jieba.cut(sentence, cut_all=False)  # accurate-mode segmentation
        result_lines.append('%s' % ' '.join(words))
    ct.file_wirte_list('../data/nlpcc2016/4-ner/seg/sentence.v6.txt', result_lines)
    print('done')
def re_write(f1, f2):
    """Convert the question-file format.

    Keep only lines containing 'question id'; for each, emit the second
    tab column with spaces removed and lower-cased, then write to *f2*.
    """
    source_lines = ct.file_read_all_lines_strip(f1)
    converted = [
        str(raw).split('\t')[1].replace(' ', '').lower()
        for raw in source_lines
        if 'question id' in str(raw)
    ]
    ct.file_wirte_list(f2, converted)
def re_write_m2id(f1, f_out):
    """Normalize mention2id lines and rewrite them to *f_out*.

    Each line has its spaces removed, '|||' separators turned into tabs,
    and is then passed through ct.clean_str_s.
    """
    normalized_lines = []
    for raw in ct.file_read_all_lines_strip(f1):  # all question lines
        reshaped = str(raw).replace(' ', '').replace('|||', '\t')
        normalized_lines.append(ct.clean_str_s(reshaped))
    ct.file_wirte_list(f_out, normalized_lines)
def prepare_data():
    """Build per-question candidate-entity lists (placeholder scoring) and write them.

    Questions containing 'NULL' are dropped in lock-step with their candidate
    lines; every surviving candidate receives the constant score '1' (a stub —
    the real model-based scorer lives in the CharRNN main), and candidates are
    written tab-joined, sorted by score.
    """
    f1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v1.txt'
    f3 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.txt'
    f4 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.sort_by_ner_lstm.txt'
    questions = ct.file_read_all_lines_strip(f1)
    candidate_lines = ct.file_read_all_lines_strip(f3)
    kept_questions = []
    kept_candidates = []
    # Filter NULL questions, keeping the two files aligned by index.
    for idx in range(len(questions)):
        if 'NULL' in str(questions[idx]):
            continue
        kept_questions.append(questions[idx])
        kept_candidates.append(candidate_lines[idx])
    # For each question, take every candidate entity, mask it in the question
    # text with '♠', attach the placeholder score, and emit the sorted list.
    out_lines = []
    for row, question in enumerate(kept_questions):
        masked_pairs = []
        for entity in kept_candidates[row].split('\t'):
            masked_q = str(question).split('\t')[0].replace(entity, '♠')
            masked_pairs.append((masked_q, entity))
        scored = []
        for _masked_q, entity in masked_pairs:
            score = '1'  # placeholder score for every candidate
            scored.append((entity, score))
        scored.sort(key=lambda pair: pair[1])
        out_lines.append('\t'.join(entity for entity, _ in scored))
    ct.file_wirte_list(f4, out_lines)
def ner_re_writer(f1='../data/nlpcc2016/ner_t1/q.rdf.m_s.filter.txt',
                  f2='../data/nlpcc2016/class/q.rdf.m_s.filter.re_writer.txt'):
    """Rewrite the question bank.

    For every tab-separated line: take the question (column 0, spaces
    removed, lower-cased), mask its entity (column 5) with '♠', append the
    masked question as a new trailing column, and write all lines to *f2*.
    """
    # 1. read the question bank
    # 2. rewrite each question and output
    rewritten = []
    for line in ct.file_read_all_lines_strip(f1):
        columns = str(line).split('\t')
        entity = columns[5]
        question = columns[0].replace(' ', '').lower()
        columns.append(question.replace(entity, '♠'))
        rewritten.append('\t'.join(columns))
    ct.file_wirte_list(f2, rewritten)
    print(1)
def class1(f5='../data/nlpcc2016/5-class/class1.txt', f1="../data/nlpcc2016/2-kb/kb-use.v2.txt"):
    """Mine groups of predicates that share identical answer values in the KB.

    For every subject whose answer values contain duplicates, collect the set
    of predicates carrying each duplicated value, count how often each
    predicate-set occurs globally (written to *f5*), then merge overlapping
    predicate-sets into "word bags" (written to *f5* + '.combine.txt').
    """
    bkh = baike_helper()
    bkh.init_spo(f_in=f1)  # load the knowledge base into bkh.kbqa
    keys = bkh.kbqa.keys()
    ps_dict = dict()  # tab-joined predicate set -> occurrence count (global)
    # key = '\t'.join(list(v1)) value =
    for key in keys:
        vs = bkh.kbqa.get(key)
        # if vs[0]
        # vs[1]
        vs = list(vs)
        # NOTE(review): vs[1] is iterated for answer values while the loop below
        # reads _vs[0]/_vs[1] per element — confirm kbqa's value layout.
        vs1 = [x for x in vs[1]]
        if len(ct.clean_str_answer(vs1)) == len(set(ct.clean_str_answer(vs1))):
            # Skip subjects whose answers contain no duplicate values.
            continue
        _vs_dict = dict()  # answer value -> set of predicates carrying that value
        # Walk this key's entries; when a value repeats, collect its predicates.
        for _vs in vs:
            if _vs[1] in _vs_dict:
                # if _vs[0] in ps_dict:
                #     ps_dict[_vs[0]] += 1
                # else:
                #     ps_dict[_vs[0]] = 1
                s1 = _vs_dict[_vs[1]]
                s1.add(_vs[0])
                _vs_dict[_vs[1]] = s1
            else:
                s1 = set()
                s1.add(_vs[0])
                _vs_dict[_vs[1]] = s1
        # Turn each value -> (predicate1, predicate2, ...) entry into a global
        # counter keyed by the tab-joined predicate set; singletons are skipped.
        for (k1, v1) in _vs_dict.items():
            if len(list(v1)) <= 1:
                continue
            key1 = '\t'.join(list(v1))
            if key1 in ps_dict:
                ps_dict[key1] += 1
            else:
                ps_dict[key1] = 1
    # Write predicate-set counts, sorted via ct.sort_dict.
    tp = ct.sort_dict(ps_dict, True)
    f5s = []
    for t in tp:
        f5s.append("%s\t%s" % (t[0], t[1]))
    ct.file_wirte_list(f5, f5s)
    keys = ps_dict.keys()
    words_bag_list = []  # merged predicate sets ("word bags")
    for key in keys:
        words = set(str(key).split('\t'))
        exist = False
        wl_index = -1
        for word in words:  # for each predicate in this set
            for wl_index in range(len(words_bag_list)):  # match against every existing bag
                if word in words_bag_list[wl_index]:
                    exist = True
                    break
            if exist:
                break
        # Merge the whole set into the first matching bag, or start a new bag.
        if exist:
            for word in words:
                words_bag_list[wl_index].add(word)
        else:
            s1 = set()
            for word in words:
                s1.add(word)
            words_bag_list.append(s1)
    # Dump words_bag_list
    f5s = []
    for words_bag in words_bag_list:
        f5s.append('\t'.join(list(words_bag)))
    ct.file_wirte_list(f5 + '.combine.txt', f5s)
def class2(f5='../data/nlpcc2016/5-class/class2.txt', f1="../data/nlpcc2016/3-questions/q.rdf.ms.re.v1.filter.txt"):
    """Cluster question patterns by the predicates they co-occur with.

    Column 6 of each input line is the (entity-masked) question pattern and
    column 3 the predicate. For every unique pattern, scan all questions whose
    pattern contains it, collecting predicates and a hit count; write
    count + predicates to *f5*, then merge overlapping predicate sets into
    "word bags" written to *f5* + '.combine.txt'.
    """
    f1s = ct.file_read_all_lines_strip(f1)
    f1s_new = [str(x).split('\t')[6] for x in f1s]  # question patterns (column 6)
    q_patten_set = set()
    q_patten_dict = dict()  # pattern -> set of co-occurring predicates
    q_count_dict = dict()   # pattern -> number of matching questions
    for f1_line in f1s_new:
        q_patten_set.add(f1_line)
    # for q1 in q_patten_set:
    #     q_patten_dict[q1] = set()
    #     q_count_dict[q1] = 0
    gc1 = ct.generate_counter()
    # O(|patterns| * |questions|) scan; progress is printed every 100k steps.
    for q1 in q_patten_set:  # each unique question pattern
        for f1_line in f1s:  # each question line
            index = gc1()
            if index % 100000 == 0:
                print("%d - %d " % (index / 100000, len(q_patten_set) * len(f1s) / 100000))
            _q1 = str(f1_line).split('\t')[6]
            _ps = str(f1_line).split('\t')[3]
            q1 = str(q1)
            if _q1 != '♠' and _q1.__contains__(q1):
                # equal or substring match? (containment counts as a hit)
                if q1 in q_patten_dict:
                    s1 = q_patten_dict[q1]
                    s1.add(_ps)
                    q_patten_dict[q1] = s1
                    q_count_dict[q1] += 1
                else:
                    s1 = set()
                    s1.add(_ps)
                    try:
                        q_patten_dict[q1] = s1
                    except Exception as e11:
                        print(e11)
                    q_count_dict[q1] = 1
    # Write: pattern, hit count, tab-joined predicate set.
    tp = ct.sort_dict(q_count_dict)
    f5s = []
    for t in tp:
        f5s.append("%s\t%s\t%s" % (t[0], t[1], '\t'.join(list(q_patten_dict[t[0]]))))
    ct.file_wirte_list(f5, f5s)
    # -------
    keys = q_patten_dict.keys()
    words_bag_list = []  # merged predicate sets ("word bags")
    for key in keys:
        # words = set(str(key).split('\t'))
        words = q_patten_dict.get(key)
        # words e.g.: 规划总面积 建筑面积 显示器尺寸 面积 占地总面积
        exist = False
        wl_index = -1
        for word in words:  # for each predicate
            for wl_index in range(len(words_bag_list)):  # match against every existing bag
                if word in words_bag_list[wl_index]:
                    exist = True
                    break
            if exist:
                break
        # Merge the whole set into the matched bag, or start a new bag.
        if exist:
            wbl = words_bag_list[wl_index]
            for word in words:
                wbl.add(word)
            words_bag_list[wl_index] = wbl
        else:
            s1 = set()
            for word in words:
                s1.add(word)
            words_bag_list.append(s1)
    # Dump words_bag_list
    f5s = []
    for words_bag in words_bag_list:
        f5s.append('\t'.join(list(words_bag)))
    ct.file_wirte_list(f5 + '.combine.txt', f5s)
list1_new = [ baike_helper.entity_re_extract_one_repeat(ct.clean_str_zh2en(x)) for x in res2 ] # 去掉重复 list1_new = ct.list_no_repeat(list1_new) # 去掉重复 # 去掉包含 # 5.8.3 去掉词语包含试试 有一首歌叫 有一首歌 一首歌 if True: # 能略微提高 list1_new_2 = [] for list1_new_word in list1_new: if not ct.be_contains(list1_new_word, list1_new): list1_new_2.append(list1_new_word) list1_new = list1_new_2 ct.print(list1_new) # 使用jieba分词出实体 if False: # 读取别名字典,然后将其按格式 名字 长度 n 输出 names = ct.file_read_all_lines_strip_no_tips( config.cc_par('alias_dict')) alias = [str(x).split('\t')[0] for x in names] # alias = list(sorted(alias,key=lambda x:len(x),reverse=True)) alias2 = ["%s %d n" % (ct.clean_str_s(str(x)), len(x)) for x in alias] path1 = '../word2vec-test/jieba_dict/dict.txt.big' path2 = '../data/nlpcc2016/4-ner/extract_e/dict.txt.big' ct.file_wirte_list(path1, alias2) print('done') print('done')
# NOTE(review): fragment — the enclosing loop over index `i` (and the
# definitions of f1s, f2s, l1, l2, math1) start outside this view. It compares
# the predicate (column 2) of two aligned result files and marks the better one.
if len(str(f2s[i]).split('\t')) < 4:
    continue
print(i)  # NOTE(review): leftover debug output
if i == 36:
    print(3333)  # NOTE(review): leftover debug marker for a specific line
p1 = str(f1s[i]).split('\t')[2].lower()
p2 = str(f2s[i]).split('\t')[2].lower()
if p1.replace(' ', '') != p2.replace(' ', ''):
    # Predicates differ: score each (via math1 — presumably counts word
    # overlap with the question; confirm in its definition) and tag the winner.
    line = str(f1s[i]).split('\t')[0]
    count1 = math1(p1)
    count2 = math1(p2)
    l1_append = ''
    l2_append = ''
    if count1 > count2:
        l1_append = '\t@@@@'
    elif count1 < count2:
        l2_append = '\t@@@@'
    else:
        print('==')
    l1.append(f1s[i] + l1_append)
    l2.append(f2s[i] + l2_append)
ct.file_wirte_list('../data/nlpcc2016/ner_t1/q.rdf.compare-1.txt', l1)
ct.file_wirte_list('../data/nlpcc2016/ner_t1/q.rdf.compare-2.txt', l2)
def main(_):
    """Re-rank NER entity candidates with a trained CharRNN.

    Loads the latest CharRNN checkpoint, then for every question masks each
    candidate entity with '♠', scores the masked question with model.judge,
    sorts candidates by that score, and writes the re-ranked lists to *f4*.
    """
    # prepare_data()
    # FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    # converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model_path = os.path.join('model', FLAGS.name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    model = 'ner'
    dh = data_helper.DataClass(model)
    train_batch_size = 1
    # g = dh.batch_iter_char_rnn(train_batch_size)  # (FLAGS.num_seqs, FLAGS.num_steps)
    embedding_weight = dh.embeddings
    model = CharRNN(dh.converter.vocab_size,  # vocabulary size; all candidates drawn from it
                    num_seqs=train_batch_size,  # FLAGS.num_seqs: sentences per batch
                    num_steps=dh.max_document_length,  # FLAGS.num_steps: sentence length
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size,
                    embedding_weight=embedding_weight,
                    sampling=True,
                    dh=dh
                    )
    model.load(FLAGS.checkpoint_path)
    # cs = []
    # cs.append('♠是什么类型的产品')
    # cs.append('♠是谁')
    # cs.append('♠是哪个公司的长度')
    f1 = '../data/nlpcc2016/6-answer/q.rdf.ms.re.v1.txt'
    f3 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.v1.txt'
    f4 = '../data/nlpcc2016/4-ner/extract_entitys_all_tj.sort_by_ner_lstm.v1.txt'
    f1s = ct.file_read_all_lines_strip(f1)
    f3s = ct.file_read_all_lines_strip(f3)
    f1s_new = []
    f3s_new = []
    # NULL filtering is disabled here (unlike prepare_data): every line is kept.
    for i in range(len(f1s)):
        # if str(f1s[i]).__contains__('NULL'):
        #     continue
        f1s_new.append(f1s[i])
        f3s_new.append(f3s[i])
    # filter NULL
    # take each candidate entity, substitute it into the question, and judge
    # cs.append('立建候时么什是♠')
    # read all candidate entities, score them, keep the top 3, check accuracy
    f4s = []
    _index = -1
    for l1 in f1s_new:  # each question line
        _index += 1
        replace_qs = []
        # Mask each candidate entity in the question text with '♠'.
        for l3 in f3s_new[_index].split('\t'):
            q_1 = str(l1).split('\t')[0].replace(l3, '♠')
            replace_qs.append((q_1, l3))
        entitys = []
        for content, l3 in replace_qs:
            # content = input("input:")
            start = dh.convert_str_to_indexlist_2(content, False)
            # arr = model.sample(FLAGS.max_length, start,
            #                    dh.converter.vocab_size, dh.get_padding_num())
            # converter.vocab_size
            r1, score_list = model.judge(start, dh.converter.vocab_size)
            entitys.append((l3, r1))
            # print(content)
            # print(r1)
            # print(score_list)
            ct.print("%s\t%s\t%s" % (content, l3, r1), 'debug_process')
        # Sort candidates by the model score and keep entity names only.
        entitys.sort(key=lambda x: x[1])
        entitys_new = [x[0] for x in entitys]
        ct.print('\t'.join(entitys_new))
        f4s.append('\t'.join(entitys_new))
    ct.file_wirte_list(f4, f4s)