def clean_summarizes():
    """Filter the pickled content summaries and save the cleaned result.

    Loads ``title_<title_table>.pkl`` and ``content_<content_table>.pkl``
    (table names are read from the loaded config). For every content entry
    that is not ``None``, keeps only the items whose text is non-blank and
    strictly shorter than the longest title. Entries that still hold more
    than two items are saved to ``clean_content_<content_table>.pkl``.

    Each content entry is accessed as ``entry[0][i]`` / ``entry[1][i]``,
    i.e. two index-aligned sequences. Presumably these are the summary texts
    and a per-text companion value -- TODO confirm against the code that
    produces the content pickle.
    """
    conf = ConfigHandler.loadFromFile()
    title_path = 'title_{}.pkl'.format(conf['title_table'])
    content_path = 'content_{}.pkl'.format(conf['content_table'])
    titles = handle_pickle(title_path)
    contents = handle_pickle(content_path)

    # Identity test rather than `!= None`, so objects with a custom
    # __eq__/__ne__ cannot change the outcome.
    entries = [entry for entry in contents if entry is not None]

    # default=0 avoids a ValueError when there are no titles at all; with a
    # limit of 0 nothing passes the length filter and an empty list is saved.
    max_len = max((len(title) for title in titles), default=0)

    cleaned = []
    for entry in entries:
        kept = [[], []]
        for i, text in enumerate(entry[0]):
            if text.strip() != '' and len(text) < max_len:
                kept[0].append(text)
                # Index the companion sequence only for kept items, exactly
                # as the per-index loop did.
                kept[1].append(entry[1][i])
        # Only entries with at least three surviving items are worth keeping.
        if len(kept[0]) > 2:
            cleaned.append(kept)

    handle_pickle('clean_' + content_path, data=cleaned, is_save=True)
def get_titles():
    """Fetch the configured title column from MySQL and pickle its values.

    Builds a ``select <title_column> from <title_table>`` query from the
    loaded config, runs it through ``get_clean_df`` with a ``MysqlHandler``
    created from the ``db_*`` config entries, flattens every column of the
    result into one list and saves that list to ``title_<title_table>.pkl``.
    """
    conf = ConfigHandler.loadFromFile()
    query = f"select {conf['title_column']} from {conf['title_table']}"
    db = MysqlHandler(
        host=conf['db_host'],
        user=conf['db_user'],
        password=conf['db_passwd'],
        port=int(conf['db_port']),
    )
    frame = get_clean_df(query, db)

    # Walk the result column by column, collecting every value in order.
    titles = [value for column in frame for value in frame[column]]

    out_path = f"title_{conf['title_table']}.pkl"
    handle_pickle(out_path, data=titles, is_save=True)
def extract_from_given_title():
    """Run ``extract_from_content`` over every cleaned summary entry.

    Loads the title list from ``title_<title_table>.pkl`` and the cleaned
    summaries from ``clean_content_<content_table>.pkl``, pairs each summary
    entry with the full title list, dispatches the pairs through
    ``apply_by_multiprocessor`` and writes the result to ``result.json``
    (UTF-8, non-ASCII characters kept as-is, 4-space indent).
    """
    conf = ConfigHandler.loadFromFile()
    total_title = handle_pickle('title_{}.pkl'.format(conf['title_table']))
    total_sum = handle_pickle(
        'clean_content_{}.pkl'.format(conf['content_table']))

    # Every work item carries the complete title list alongside one summary.
    data = [(summary, total_title) for summary in total_sum]
    result = apply_by_multiprocessor(data=data, func=extract_from_content)

    # Context manager guarantees the file is flushed and closed, even if
    # serialization raises part-way through.
    with open('result.json', 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, ensure_ascii=False, indent=4)
def clean_title_summarizes_pair():
    """Clean the pickled title/summary rows and save the surviving rows.

    Loads ``<title_table>.pkl`` (table name from the loaded config) and drops
    ``None`` rows. Each remaining row is accessed as ``row[0]`` (a sized
    value whose length is measured; presumably the title -- TODO confirm)
    and ``row[2]`` (two index-aligned sequences, ``row[2][0]`` holding the
    texts).

    Two passes are needed because the length limit is only known after all
    rows have been seen:

    1. Record the largest ``len(row[0])`` and strip empty-string texts from
       ``row[2]``.
    2. Keep only texts strictly shorter than that maximum; rows left with
       more than two texts are collected.

    The collected rows are saved to ``clean_<title_table>.pkl``. Rows are
    modified in place (``row[2]`` is reassigned), so they must be mutable.
    """
    def _filter_pairs(sums, keep):
        # Return a new [texts, companions] pair containing only the
        # positions whose text satisfies ``keep``.
        kept = [[], []]
        for i, text in enumerate(sums[0]):
            if keep(text):
                kept[0].append(text)
                kept[1].append(sums[1][i])
        return kept

    conf = ConfigHandler.loadFromFile()
    path = '{}.pkl'.format(conf['title_table'])
    data = handle_pickle(path)

    # Identity test rather than `!= None`, so objects with a custom
    # __eq__/__ne__ cannot change the outcome.
    rows = [row for row in data if row is not None]

    # Pass 1: find the longest row[0] and discard empty texts.
    max_len = 0
    for row in rows:
        max_len = max(max_len, len(row[0]))
        row[2] = _filter_pairs(row[2], lambda text: text != '')

    # Pass 2: discard over-long texts; keep rows with at least three left.
    news = []
    for row in rows:
        shortened = _filter_pairs(row[2], lambda text: len(text) < max_len)
        if len(shortened[0]) > 2:
            row[2] = shortened
            news.append(row)

    handle_pickle('clean_' + path, data=news, is_save=True)
def get_data():
    """Load the cleaned rows and return de-duplicated titles and summaries.

    Reads ``clean_<title_table>.pkl`` (table name from the loaded config),
    skips ``None`` rows and any row whose ``row[0]`` was already seen, and
    collects ``row[0]`` and ``row[2]`` of the remaining rows.

    Returns:
        A ``(total_title, total_sum)`` tuple of two index-aligned lists: the
        first-seen ``row[0]`` values in file order, and the matching
        ``row[2]`` values.

    NOTE(review): duplicates are tracked in a set, so ``row[0]`` must be
    hashable (e.g. ``str``) -- confirm against the producer of the pickle.
    """
    conf = ConfigHandler.loadFromFile()
    data = handle_pickle('clean_' + '{}.pkl'.format(conf['title_table']))

    total_title = []
    total_sum = []
    # A set gives O(1) duplicate checks; testing membership against the
    # growing result list made the whole loop quadratic.
    seen = set()
    for row in data:
        if row is None:
            continue
        title = row[0]
        if title in seen:
            continue
        seen.add(title)
        total_title.append(title)
        total_sum.append(row[2])
    return total_title, total_sum