class Searcher(object):
    """Looks up stored questions similar to a query sentence.

    Uses the inverted word index (``p_bucket.pickle``) and the bucket
    files produced by the clustering step to restrict the Jaccard
    comparison to candidate buckets only.
    """

    def __init__(self, args=None):
        """Load the word->bucket index, tokenizer and thresholds.

        ``args`` defaults to the module's argument parser.  The original
        signature used ``args=_get_parser()``, which evaluated the parser
        once at class-definition (import) time and shared the result
        across all instances; resolving it lazily here preserves the
        call-site behavior while avoiding that pitfall.
        """
        if args is None:
            args = _get_parser()
        p_bucket_path = os.path.join(args.infile, 'p_bucket.pickle')
        # NOTE(review): pickle.load on an untrusted file can execute
        # arbitrary code — this path must come from a trusted run of the
        # clustering step.
        with open(p_bucket_path, 'rb') as infile:
            # word -> list of bucket file names that contain the word
            self.p_bucket = pickle.load(infile)
        self.seg = Segmentor(args)
        self.path = args.infile
        self.sim_th = args.sim_th
        self.stop_words = get_stop_words(args.stop_words)
        self.args = args

    def search(self, sentence):
        """Return stored lines Jaccard-similar to *sentence*, best first.

        Lines scoring below ``args.threshold`` are dropped; at most
        ``args.top_k`` results are returned (``top_k <= 0`` means all).
        Returns ``None`` for empty or non-string input.
        """
        if not sentence or not isinstance(sentence, str):
            return None
        res = list()
        c_bucket = list()
        seg_sen = list(self.seg.cut(sentence))
        seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))
        # Candidate buckets: every bucket indexed under any query token.
        for w in seg_sen:
            if w in self.p_bucket:
                c_bucket += self.p_bucket[w]
        c_bucket = list(set(c_bucket))
        cmp, score = list(), list()
        for bucket in c_bucket:
            bucket_path = os.path.join(self.path, bucket)
            check_file(bucket_path)
            # Context manager replaces the original manual open()/close(),
            # which leaked the handle if an exception hit mid-loop.
            with open(bucket_path, 'r', encoding="utf-8") as infile:
                for inline in infile:
                    inline = inline.rstrip()
                    # Everything before an optional ':::' is the question text.
                    line = inline.split(':::')[0]
                    seg_list = list(self.seg.cut(line))
                    seg_list = list(
                        filter(lambda x: x not in self.stop_words, seg_list))
                    sc = jaccard(seg_sen, seg_list)
                    if sc < self.args.threshold:
                        continue
                    cmp.append(inline)
                    score.append(sc)
        zipped = zip(cmp, score)
        zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
        # top_k <= 0 means "no limit" (slice with None keeps everything).
        right = None if self.args.top_k <= 0 else self.args.top_k
        for (cp, sc) in zipped[:right]:
            res.append(cp)
        return res
def run(self, questions):
    """Cluster the given question lines and return the clusters as JSON.

    ``questions`` is an iterable of text lines; everything before an
    optional ':::' separator is the question text.  Each question joins
    the first existing bucket whose sampled members are all
    Jaccard-similar above ``args.threshold``; otherwise it opens a new
    bucket.  Buckets are then renamed by descending size, merged into
    ``all_cluster.txt`` and reshaped with pandas into
    ``[{"ques": ..., "info": [...]}]`` JSON.
    """
    args = self._get_parser()
    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        # Widen zero-padded bucket names to fit the input line count.
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work
    p_bucket = defaultdict(list)  # word -> names of buckets containing it
    save_idx = 0                  # running index for provisional bucket names
    id_name = '{0:0' + str(args.name_len) + 'd}'  # zero-padded name format
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                # Compare against a sample of the bucket, not all of it.
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(
                            filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(
                        jaccard(seg_list, cmp_list) > args.threshold
                        for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            # No similar bucket found: start a new provisional one.
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename file: biggest cluster becomes bucket 0, and the
    # word index is remapped to the final names
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name
    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge all bucket files into one csv: "<cluster_id>,<question>"
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)
    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+')
    for file in file_list:
        with open(os.path.join(args.output, file)) as f:
            for line in f.readlines():
                # Bucket names are all digits after the rename above, so
                # int(file) yields the numeric cluster id.
                fw.write(str(int(file)) + ',' + line)
    fw.close()
    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')
    # json.dumps(dict(ques=ques))
    # NOTE(review): with duplicate cluster_ids this keeps one
    # representative question per cluster id — verify which row wins.
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    # dataframe reshaping:
    # df 0 aa
    #    0 aaa  =>  aa [aaa]
    #    1 bb       bb []
    # df_dict = {0: aa, 1: bb}
    print(df_dict)
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        # if(ques in li):
        # Drop the representative itself so 'info' lists only the others.
        li.remove(ques)
        result_dict[ques] = li
    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
def main():
    """Cluster the sentences in ``args.infile`` into bucket files.

    Each input line joins the first existing bucket whose sampled
    members are all Jaccard-similar above ``args.threshold``; otherwise
    it starts a new bucket.  Buckets are finally renamed by descending
    size (biggest cluster becomes bucket 0).
    """
    args = _get_parser()
    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    if args.name_len_update:
        # Widen zero-padded bucket names to fit the input line count.
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work
    p_bucket = defaultdict(list)  # word -> names of buckets containing it
    save_idx = 0                  # running index for provisional bucket names
    id_name = '{0:0' + str(args.name_len) + 'd}'  # zero-padded name format
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    # Context manager replaces the original manual open()/close(), which
    # leaked the file handle if an exception interrupted the loop.
    with open(args.infile, 'r', encoding="utf-8") as infile:
        for line in tqdm(infile):
            line = line.rstrip()
            is_match = False
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(
                    filter(lambda x: x not in stop_words, seg_list))
            for wd in seg_list:
                if is_match:
                    break
                w_bucket = p_bucket[wd]
                for bucket in w_bucket:
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    # Compare against a sample of the bucket, not all of it.
                    selected = sample_file(bucket_path, args.sample_number)
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    # remove stop words
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(
                                filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # calculate similarity with each bucket
                    if all(
                            jaccard(seg_list, cmp_list) > args.threshold
                            for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a',
                                  encoding='utf-8') as outfile:
                            outfile.write(line + '\n')
                        for w in seg_list:
                            if bucket not in p_bucket[w]:
                                p_bucket[w].append(bucket)
                        break
            if not is_match:
                # No similar bucket found: start a new provisional one.
                bucket_name = ('tmp' + id_name).format(save_idx)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line + '\n')
                for w in seg_list:
                    p_bucket[w].append(bucket_name)
                save_idx += 1

    # sort and rename file: biggest cluster becomes bucket 0
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)
    print('All is well')