def mark_reduplicate(self):
    '''Flag every reply whose text equals that of the reply right before it.

    Each flagged reply gets the note '【与上一条回复重复】' attached through
    self.append_note and has its `suggest` attribute set to False.
    A coloured summary of how many replies were flagged is printed.
    '''
    print('>检查相邻重复:')

    # Pass 1: only collect. Texts are left untouched here so that every
    # comparison is made against the unannotated text of the neighbour.
    duplicates = []
    previous = None
    for current in self.rlist:
        if previous and previous.text == current.text:
            duplicates.append(current)
        previous = current

    # Pass 2: annotate the collected replies.
    for reply in duplicates:
        reply.text = self.append_note(reply.text, '【与上一条回复重复】')
        reply.suggest = False

    # Red when something was flagged, green when nothing was.
    total = len(duplicates)
    tint = color.Fore.RED if total else color.Fore.GREEN
    print('...标记了{0}个重复回复'.format(color.fore_color(total, tint)))
def compile_txt(infile, outfile, discard='', label='', automode=False):
    '''Compile an arranged text into the final text (and discarded text).

    Parameters:
        infile   -- when automode is False, a path that is measured with
                    os.path.getsize and opened through read_input; when
                    automode is True, it is handed to
                    datamachine.bp_to_final unchanged.
        outfile  -- path of the final text (used only when not automode).
        discard  -- path for the discarded replies; any falsy value means
                    "do not keep discarded text".
        label    -- 'page' or 'floor' (case-insensitive) selects the
                    auxiliary labelling passed to bp_to_final as 1 or 2;
                    anything else (including None) selects 0.
        automode -- True: return the in-memory results and write nothing.

    Returns a 4-tuple:
        automode      -> (output, discard_output, info_list, chinese_ct)
        not automode  -> (None, None, info_list, chinese_ct), where
                         chinese_ct is the comma-grouped *string* that
                         was printed, not the raw count.
        no input      -> (None, None, None, None)
    '''
    if not automode:
        # Measure the input before it is replaced by the opened object.
        size1 = os.path.getsize(infile)
        infile = read_input(infile)

    if infile is None:
        return None, None, None, None

    keep_discard = bool(discard)

    # Map the textual label onto the numeric code bp_to_final expects.
    label_codes = {'page': 1, 'floor': 2}
    label = label_codes.get((label or '').lower(), 0)

    output, discard_output, info_list, chinese_ct = \
        datamachine.bp_to_final(infile, keep_discard, label)

    if automode:
        return output, discard_output, info_list, chinese_ct

    # Write the result files.
    write_output(output, outfile, show_size=False)
    if discard_output and discard:
        write_output(discard_output, discard, show_size=False)

    # Format the sizes with thousands separators and colour them.
    size1 = format(size1, ',')
    size2 = format(os.path.getsize(outfile), ',')
    color_size = color.fore_color(size2, color.Fore.MAGENTA)

    chinese_ct = format(chinese_ct, ',')
    color_chinese = color.fore_color(chinese_ct, color.Fore.CYAN)

    print('输入文件{0}字节;输出文件{1}字节,约{2}个汉字。'.format(
        size1, color_size, color_chinese)
    )

    return None, None, info_list, chinese_ct
def mark_cantdeal(self):
    '''Flag replies that still contain a quote which could not be handled.

    A reply is flagged when self.has_unhandled_quote(reply) is truthy:
    the note '【无法处理的回复】' is attached through self.append_note and
    `suggest` is set to False. A coloured summary is printed at the end.
    '''
    print('>查找无法处理的引用')

    flagged = 0
    for reply in self.rlist:
        if not self.has_unhandled_quote(reply):
            continue
        reply.text = self.append_note(reply.text, '【无法处理的回复】')
        reply.suggest = False
        flagged += 1

    # Red when something was flagged, green when nothing was.
    tint = color.Fore.RED if flagged else color.Fore.GREEN
    print('...标记了{0}个无法处理引用的回复'.format(
        color.fore_color(flagged, tint)))
def mark_empty(self):
    '''Flag replies whose text is empty or consists only of whitespace.

    Flagged replies get `suggest` set to False; their text is not
    modified. A coloured summary is printed at the end.
    '''
    print('>标记空白回复:')

    blank_re = red.re_dict(r'^\s*$')

    blanks = [reply for reply in self.rlist if blank_re.match(reply.text)]
    for reply in blanks:
        reply.suggest = False

    # Red when something was flagged, green when nothing was.
    total = len(blanks)
    tint = color.Fore.RED if total else color.Fore.GREEN
    print('...标记了{0}个空白回复'.format(color.fore_color(total, tint)))
def compile_txt(infile, outfile, discard=''):
    '''Compile `infile` into `outfile` and report both file sizes.

    The three arguments are forwarded unchanged to
    datamachine.bp_to_final. The sizes printed are byte counts with
    thousands separators; the output size is coloured.
    '''
    # The input size is taken before compiling.
    input_bytes = os.path.getsize(infile)

    datamachine.bp_to_final(infile, outfile, discard)

    output_bytes = os.path.getsize(outfile)

    shown_in = '{0:,}'.format(input_bytes)
    shown_out = color.fore_color('{0:,}'.format(output_bytes),
                                 color.Fore.MAGENTA)
    print('输入文件{0}字节,输出文件{1}字节'.format(shown_in, shown_out))
def auto(url, pg_count, outfile, discard, label, from_gui=False):
    '''Run the whole pipeline: download, automatic processing, compile.

    Parameters:
        url, pg_count -- forwarded to download_till.
        outfile       -- path of the final text (used only when not
                         from_gui).
        discard       -- path for the discarded replies (used only when
                         not from_gui).
        label         -- forwarded to compile_txt.
        from_gui      -- True: nothing is written to disk and the
                         in-memory results are returned.

    Returns:
        download failed -> (None, None, None, None, None)
        from_gui        -> (output, discard_output, title, info_list,
                            chinese_ct)
        otherwise       -> None, after the files have been written and a
                           summary printed.
    '''
    # Download.
    dl_object, title = download_till(url, pg_count, '', automode=True)
    if dl_object is None:
        return None, None, None, None, None
    print('\n ===下载完毕,准备自动处理===\n')

    # Automatic processing.
    bp_object = bp_process_bp(dl_object, '', automode=True)
    print('\n ===自动处理完毕,准备编译===\n')

    # Compile. For the GUI any non-empty `discard` value is passed so
    # that the discarded text is requested from compile_txt as well.
    if from_gui:
        discard = 'from_gui'

    output, discard_output, info_list, chinese_ct = \
        compile_txt(bp_object, '', discard, label, automode=True)

    if from_gui:
        return output, discard_output, title, info_list, chinese_ct

    # Command-line use: write the result files.
    write_output(output, outfile, show_size=False)
    if discard_output:
        write_output(discard_output, discard, show_size=False)

    # Format the numbers with thousands separators and colour them.
    size2 = format(os.path.getsize(outfile), ',')
    color_size = color.fore_color(size2, color.Fore.MAGENTA)

    chinese_ct = format(chinese_ct, ',')
    color_chinese = color.fore_color(chinese_ct, color.Fore.CYAN)

    print('输出文件{0}字节,约{1}个汉字。'.format(
        color_size, color_chinese)
    )
    return None
def process_1(self):
    '''Custom processing: rewrite quoted replies into a readable form.

    Every reply text is run through one regular expression; on a match
    the text becomes
        回复 <name>:\\n【引用开始】<quote>\\n【引用结束】\\n<rest>
    and the total number of substitutions is printed.

    NOTE(review): the back-references below (\\2, \\3, \\4, \\5) are only
    correct if re_separater contains no capturing group -- confirm
    against its definition.
    '''
    print('>处理引用')

    pieces = [
        # group 1: everything up to and including '@@name##';
        # group 2: the name itself (1-16 non-space characters)
        r'^(?=(.*@@(\S{1,16})##))',
        # consume what group 1 captured
        r'\1',
        # skip lazily to the start of a line
        r'.*?',
        r'(?<=\n)',
        # group 3: text up to a separator that begins a line
        r'(?=(.*?(?<=\n)', re_separater, r'\s+))',
        # ...and no further line-starting separator may follow it
        r'(?!\3.*?(?<=\n)', re_separater, r'\s+)',
        # group 4: the quoted text, surrounding whitespace trimmed
        r'\s*(.*?)\s*',
        # the separator, then group 5: the remainder of the reply
        re_separater, r'\s+(.*)',
    ]
    quote_re = red.re_dict(''.join(pieces), red.DOTALL)
    template = r'回复 \2:\n【引用开始】\4\n【引用结束】\n\5'

    handled = 0
    for reply in self.rlist:
        reply.text, hits = quote_re.subn(template, reply.text)
        handled += hits

    print('...处理了{0}条引用'.format(
        color.fore_color(handled, color.Fore.CYAN)))
def web_to_internal(url, pg_count):
    '''Download a forum thread and convert it to the internal form.

    Parameters:
        url      -- address of the first page to download.
        pg_count -- maximum number of pages to download; a negative
                    value means "no limit".

    Returns the populated Tiezi object, or None when no parser could be
    obtained for the first page or the parser's self-check failed.
    If a later page cannot be fetched, downloading stops and the pages
    collected so far are returned.
    '''
    # Downloader.
    f = Fetcher()
    # Page parser; created lazily from the first downloaded page.
    parser = None
    tz = Tiezi()

    dl_count = 0
    while True:
        # Has the requested number of pages been downloaded?
        if pg_count >= 0 and dl_count >= pg_count:
            print('下载完指定页数{0},停止下载\n'.format(pg_count))
            break

        # Fetch the page.
        data = f.fetch_url(url)
        if not data:
            print('无法读取页面:{0}'.format(url))
            break

        # Prepare the parser.
        if not parser:
            parser = AbPageParser.get_parser(url, data)
            if not parser:
                return None

            # Verify the parser against the first page.
            parser.set_page(url, data)
            if not parser.check_parse_methods():
                print(' 可能是网页改版,导致无法提取数据。')
                print(' 请使用“检测新版本”功能检测是否有新程序可用。')
                print()
                return None

            # Remember the page the download started from.
            tz.begin_url = url
        else:
            # Feed the page to the existing parser.
            parser.set_page(url, data)

        # Fill in the thread information (until an original poster is
        # known).
        if not tz.louzhu:
            pub_date = None

            tz.title = parser.wrap_get_title()
            tz.louzhu = parser.wrap_get_louzhu()

            # On page 1 the first reply supplies the posting date and,
            # if still unknown, the original poster.
            if parser.wrap_get_page_num() == 1:
                rplys = parser.wrap_get_replys()
                if rplys:
                    if not tz.louzhu:
                        tz.louzhu = rplys[0].author
                    pub_date = rplys[0].time.strftime('%Y-%m-%d %H:%M')

            # Last resort: ask the user for the original poster's ID.
            if not tz.louzhu:
                tz.louzhu = input('无法提取楼主ID,请手工输入楼主ID:').strip()

            # Print the thread information.
            print_str = '标题:%s\n楼主:%s\n' % (tz.title, tz.louzhu)
            if pub_date is not None:
                print_str += '发帖时间:%s\n' % pub_date
            save_print(print_str)

            # Name of the local processor for this site.
            tz.local_processor = parser.get_local_processor()

        next_url = parser.wrap_get_next_pg_url()
        pg_num = parser.wrap_get_page_num()

        # Add the page.
        pg = Page(url,
                  pg_num,
                  bool(next_url),
                  parser.wrap_get_replys()
                  )
        tz.add_page(pg)
        dl_count += 1
        print('已下载第{0}页, 共{1}层'.format(pg_num, len(pg.replys)))

        # Was this the last page of the thread?
        if not next_url:
            print('\n下载完帖子的最后一页(第{0}页),停止'.format(pg.page_num))
            break

        url = next_url

    count = sum(len(p.replys) for p in tz.pages)
    color_p1 = color.fore_color(len(tz.pages), color.Fore.YELLOW)
    info = '共载入{pg_count}页,共有回复{rpl_count}条'.format(
        pg_count=color_p1,
        rpl_count=count
    )
    print(info)

    # Audible notification; strictly best-effort. Only ordinary errors
    # are ignored, so Ctrl-C / SystemExit still propagate.
    if winsound is not None:
        try:
            winsound.Beep(400, 320)  # (frequency, duration)
        except Exception:
            pass

    def escape_bp_tag(text):
        '''Escape text that would collide with arranged-text markup.'''
        # Lines starting with a <time>/<mark> tag get a '#' prefix.
        text = red.sub(r'^(<(?:time|mark)>)',
                       r'#\1',
                       text,
                       flags=red.MULTILINE)

        # 【引用开始】 / 【引用结束】 are rewritten with square brackets.
        text = red.sub(r'【(引用(?:开始|结束))】', r'[\1]', text)

        # A text ending in one of the processing notes gets a trailing
        # '#' so that it no longer ends with the note.
        if text.endswith(('【与上一条回复重复】', '【无法处理的回复】')):
            text = text + '#'

        return text

    for p in tz.pages:
        for r in p.replys:
            r.text = escape_bp_tag(r.text)

    return tz
def bp_to_final(infile, keep_discard=True, label=0):
    '''Compile an arranged text into the final text and the discarded text.

    Parameters:
        infile       -- open file-like object holding the arranged text;
                        it is always closed before this function returns
                        or raises.
        keep_discard -- when true, unselected non-blank replies are
                        returned as a second stream.
        label        -- 0: no auxiliary labels;
                        1: a "page N" banner before the first selected
                           reply of each page;
                        2: a floor header (running number, time, page)
                           before every selected reply.

    Returns (output, discard, info_list, chinese_ct):
        output     -- StringIO with the final text.
        discard    -- StringIO with the discarded replies, or None when
                      keep_discard is false or nothing was discarded.
        info_list  -- contents of the leading '<tiezi>' lines.
        chinese_ct -- result of count_chinese on the final text.
    '''
    class placeholder:
        '''Slot in text_list reserved for a page banner.'''
        def __init__(self, posi=0, pagenum=0, show=False):
            self.posi = posi          # index of the slot in text_list
            self.pagenum = pagenum    # page number to show
            self.show = show          # True once the page has a selected reply

    info_list = []
    holder_list = [placeholder()]
    text_list = []
    abandon_list = []

    pickcount, allcount = 0, 0

    # Replaces [img...]...[/img] by 【一张图片N】, where N is the optional
    # number that follows "img" in the opening tag.
    picr = (r'\[img\s*(\d+|)\].*?\[/img\]')
    pattern = red.re_dict(picr)

    # Extracts the page number.
    re_pagenum = red.re_dict(r'^<page>页号:\s*(\d+)\s*$')

    # Extracts date (without century) and time from a <time> line.
    p_time = (r'^<time>[^<]*<\d\d(\d\d-\d{1,2}-\d{1,2})\s+'
              r'(\d{1,2}:\d{1,2})')
    re_time = red.re_dict(p_time)

    # Read the arranged text.
    in_reply = False
    temp = []

    current_page = 0
    current_time = ''

    try:
        for line in infile.readlines():
            if line.startswith('<time>'):
                # Opening marker of a reply.
                if in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<mark>行')
                    break
                in_reply = True

                # Remember the reply time for the floor header.
                if label == 2:
                    m = re_time.search(line)
                    if m:
                        current_time = m.group(1) + ' ' + m.group(2)
                    else:
                        current_time = ''

            elif line.startswith('<mark>'):
                # Closing marker of a reply.
                if not in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<time>行')
                    break

                if line.endswith(('█\n', '█')):
                    # The reply is selected.
                    pickcount += 1

                    if label == 0:
                        pass
                    elif label == 1:
                        # The current page now has content: show its banner.
                        holder_list[-1].show = True
                    elif label == 2:
                        floor_label = ('№.%d ☆☆☆'
                                       ' 发表于%s P.%d '
                                       '☆☆☆\n'
                                       '-------------------------'
                                       '-------------------------'
                                       '\n')
                        floor_label = floor_label % \
                            (pickcount, current_time, current_page)
                        text_list.append(floor_label)

                    text_list.extend(temp)
                    text_list.append('\n')
                elif any(piece.strip() for piece in temp):
                    # Not selected and not blank: goes to the discard text.
                    abandon_list.extend(temp)
                    abandon_list.append('∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞\n\n')

                temp.clear()
                allcount += 1
                in_reply = False

            elif in_reply:
                # Body line of a reply.
                line = pattern.sub(r'【一张图片\1】', line)
                temp.append(line)

            # Because of the previous branch, from here on we are never
            # inside a reply.
            elif not text_list and not abandon_list and \
                    line.startswith('<tiezi>'):
                # Thread information lines before any reply output.
                info_list.append(line[len('<tiezi>'):])

            elif label != 0:
                m = re_pagenum.search(line)
                if m:
                    current_page = int(m.group(1))
                    if label == 1:
                        # Reserve a slot; it is filled in later only if
                        # the page turns out to have a selected reply.
                        text_list.append('')
                        holder = placeholder(len(text_list) - 1,
                                             current_page
                                             )
                        holder_list.append(holder)
    finally:
        # Release the input even if parsing raised.
        infile.close()

    if in_reply:
        print('格式错误:最后一个回复文本的前后包括标志不配对。')

    # Page banners (auxiliary format 1).
    if label == 1:
        for holder in holder_list[1:]:
            if holder.show:
                page_label = ('☆☆☆☆☆'
                              ' 进入第%d页 '
                              '☆☆☆☆☆\n'
                              '----------------'
                              '----------------'
                              '\n\n') % holder.pagenum
                text_list[holder.posi] = page_label

    color_p1 = color.fore_color(allcount, color.Fore.YELLOW)
    color_p2 = color.fore_color(pickcount, color.Fore.YELLOW)
    print('共有{0}条回复,选择了其中{1}条回复'.format(color_p1, color_p2))

    # ---- build the output ----
    # Thread information (if any), a newline, then the selected text.
    if info_list:
        s_iter = itertools.chain(info_list, '\n', text_list)
    else:
        s_iter = iter(text_list)
    s = ''.join(s_iter)

    # Collapse runs of consecutive picture placeholders.
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){3,}',
                r'【多张图片\1】\n\n',
                s)
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){2}',
                r'【两张图片\1】\n\n',
                s)

    output = StringIO(s)

    # Count of Chinese characters in the final text.
    chinese_ct = count_chinese(s)

    # Discarded text.
    if keep_discard and abandon_list:
        s_iter = itertools.chain(info_list, '\n', abandon_list)
        s = ''.join(s_iter)
        discard = StringIO(s)
    else:
        discard = None

    return output, discard, info_list, chinese_ct