def search(self, sentence):
    """Return stored sentences most similar to `sentence`, or None for invalid input."""
    if not sentence or not isinstance(sentence, str):
        return None

    res = list()
    c_bucket = list()

    # segment the query and drop stop words
    seg_sen = list(self.seg.cut(sentence))
    seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))

    # collect every bucket that shares at least one token with the query
    for w in seg_sen:
        if w in self.p_bucket:
            c_bucket += self.p_bucket[w]
    c_bucket = list(set(c_bucket))

    # score every candidate line in those buckets with Jaccard similarity
    cmp, score = list(), list()
    for bucket in c_bucket:
        bucket_path = os.path.join(self.path, bucket)
        check_file(bucket_path)
        with open(bucket_path, 'r', encoding='utf-8') as infile:
            for inline in infile:
                inline = inline.rstrip()
                line = inline.split(':::')[0]
                seg_list = list(self.seg.cut(line))
                seg_list = list(filter(lambda x: x not in self.stop_words, seg_list))
                sc = jaccard(seg_sen, seg_list)
                if sc < self.args.threshold:
                    continue
                cmp.append(inline)
                score.append(sc)

    # keep the highest-scoring matches; top_k <= 0 means "return all"
    zipped = sorted(zip(cmp, score), key=lambda x: x[1], reverse=True)
    right = None if self.args.top_k <= 0 else self.args.top_k
    for cp, sc in zipped[:right]:
        res.append(cp)
    return res
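# `jaccard` is called above but not defined in this excerpt. A minimal sketch of
# what it is assumed to compute: set-based Jaccard similarity of two token lists,
# |A ∩ B| / |A ∪ B|, returning 1.0 when both lists are empty. The real helper may
# differ in its edge-case handling.
def jaccard(tokens_a, tokens_b):
    """Hypothetical helper: Jaccard similarity of two token lists."""
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a and not set_b:
        return 1.0
    return len(set_a & set_b) / len(set_a | set_b)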
def main():
    global connection, cursor
    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer
    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()

    # split the big list into chunks of n lines and segment them in parallel
    n = 10000
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))

    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))
    pool.close()
    pool.join()

    # merge the per-chunk segmentation results into a single cache
    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache:
                jieba_cache[z] = tmp[z]
            else:
                print(z)

    for st in stop_words:
        stop_words_cache[st] = 1

    all_lines = len(jieba_cache)
    print("Starting: {} lines in total".format(all_lines))
    print("jieba cache built: {} entries".format(len(jieba_cache)))
    print("stop-word cache built: {} entries".format(len(stop_words_cache)))

    all_data = jieba_cache.keys()
    for inline in all_data:
        if inline == '太原去贵阳怎么走':  # leftover debug hook for one specific sentence
            print("")
        i = i + 1
        print("Processing line {} of {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]

        # remove stop words from the current sentence
        if stop_words:
            seg_list = [w for w in seg_list if w not in stop_words_cache]

        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        filt_selected.append([w for w in sen if w not in stop_words_cache])
                    selected = filt_selected
                # calculate similarity with each sampled sentence of the bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        # print("{} jaccard took {}".format(inline, endtime - starttime))

        if not is_match:
            # no existing bucket matched: open a new one seeded with this line
            bucket_name = ('tmp' + id_name).format(save_idx)
            all_bucked[bucket_name] = [line]
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # write each bucket to MySQL, one batch per bucket;
    # group_id is the first sentence of the bucket
    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74', user='******',
                                     password='******', db='august', port=33306)
        cursor = connection.cursor()
        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("Current batch {} of {}".format(batch_size, len(all_bucked)))
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            all_bucked_data)
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
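# `fenci` ("word segmentation") is the worker handed to the multiprocessing pool
# above, but it is not shown in this excerpt. A minimal sketch, assuming it maps
# each stripped line of a chunk to its jieba token list so the parent process can
# merge the per-chunk dicts into `jieba_cache`; the real worker may key or filter
# the lines differently.
import jieba

def fenci(lines):
    """Hypothetical worker: segment a chunk of lines with jieba."""
    result = {}
    for raw in lines:
        raw = raw.rstrip()
        result[raw] = list(jieba.cut(raw))
    return result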
def run(self, questions):
    args = self._get_parser()

    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename files so the largest cluster gets the smallest id
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name

    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge all cluster files into one CSV: "<cluster_id>,<question>"
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)

    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+', encoding='utf-8')
    for file in file_list:
        with open(os.path.join(args.output, file), encoding='utf-8') as f:
            for line in f.readlines():
                fw.write(str(int(file)) + ',' + line)
    fw.close()

    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')
    # json.dumps(dict(ques=ques))

    # dataframe format conversion: map each cluster_id to its first question, e.g.
    #   cluster_id  ques
    #   0           aa        =>  aa [aaa]
    #   0           aaa           bb []
    #   1           bb
    # df_dict = {0: aa, 1: bb}
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    print(df_dict)

    # for every cluster, key on its first question and collect the rest as its group
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        # if (ques in li): li.remove(ques)
        result_dict[ques] = li

    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
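# A minimal usage sketch for `run`, assuming the enclosing class is named
# QuestionCluster and can be constructed without arguments (both are assumptions;
# only the run(questions) signature is visible in this excerpt). It feeds a few
# questions in and prints each group keyed by its representative question.
if __name__ == '__main__':
    import json

    questions = ['怎么开发票', '如何开具发票', '怎么修改绑定手机号']
    clusterer = QuestionCluster()          # hypothetical constructor
    payload = clusterer.run(questions)     # JSON string: [{"ques": ..., "info": [...]}, ...]
    for group in json.loads(payload):
        print(group['ques'], group['info'])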
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1
    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
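# `sample_file` is used above to draw a bounded number of reference sentences from a
# bucket file, but it is not defined in this excerpt. A minimal sketch, assuming it
# returns at most `number` stripped lines chosen at random; the real helper may
# sample or order the lines differently.
import random

def sample_file(path, number):
    """Hypothetical helper: randomly sample up to `number` non-empty lines from a file."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = [ln.rstrip() for ln in f if ln.strip()]
    if number <= 0 or number >= len(lines):
        return lines
    return random.sample(lines, number)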