예제 #1
0
    def mark_reduplicate(self):
        '''Flag every reply whose text equals the text of the reply before it.

        Each flagged reply gets a note appended to its text through
        self.append_note() and has its `suggest` attribute set to False.
        The number of flagged replies is printed at the end.
        '''
        print('>检查相邻重复:')

        duplicates = []
        previous = None

        # Pass 1: only collect. The texts are compared unmodified; notes are
        # appended afterwards so they cannot influence later comparisons.
        for current in self.rlist:
            if previous and previous.text == current.text:
                duplicates.append(current)
            previous = current

        # Pass 2: annotate the collected replies.
        for dup in duplicates:
            dup.text = self.append_note(dup.text, '【与上一条回复重复】')
            dup.suggest = False

        # Red when something was flagged, green when nothing was.
        total = len(duplicates)
        tint = color.Fore.RED if total else color.Fore.GREEN
        color_p = color.fore_color(total, tint)

        print('...标记了{0}个重复回复'.format(color_p))
예제 #2
0
파일: tz2txt.py 프로젝트: ixinshang/tz2txt
def compile_txt(infile, outfile, 
                discard='', label='', automode=False):
    '''Compile an arranged text into the final text.

    infile   -- with automode off: path of the input file, which is opened
                through read_input(); with automode on: the object handed
                unchanged to datamachine.bp_to_final().
    outfile  -- path of the output file (only used with automode off).
    discard  -- path for the discarded text; an empty value means the
                discarded text is not kept.
    label    -- 'page' or 'floor' (case-insensitive) selects an auxiliary
                label format; any other value selects none.
    automode -- when true nothing is read from or written to disk here.

    Returns a 4-tuple:
      * automode on:  (output, discard_output, info_list, chinese_ct)
        exactly as returned by datamachine.bp_to_final();
      * automode off: (None, None, info_list, chinese_ct), where
        chinese_ct has already been formatted as a string with thousands
        separators;
      * (None, None, None, None) when read_input() returns None.
    '''
    if not automode:
        # Measure the input while `infile` is still a path; the name is
        # rebound to the object returned by read_input() right after.
        size1 = os.path.getsize(infile)
        infile = read_input(infile)
        if infile is None:
            return None, None, None, None

    # The discarded text is only kept when a discard target was given.
    keep_discard = bool(discard)

    # Translate the label name into the numeric code passed to bp_to_final.
    label = {'page': 1, 'floor': 2}.get(label.lower(), 0)

    output, discard_output, info_list, chinese_ct = \
        datamachine.bp_to_final(infile, keep_discard, label)

    if automode:
        return output, discard_output, info_list, chinese_ct

    # Write the result files.
    write_output(output, outfile, show_size=False)
    if discard_output and discard:
        write_output(discard_output, discard, show_size=False)

    # Format the numbers with thousands separators and colorize them.
    size1 = format(size1, ',')

    size2 = format(os.path.getsize(outfile), ',')
    color_size = color.fore_color(size2, color.Fore.MAGENTA)

    chinese_ct = format(chinese_ct, ',')
    color_chinese = color.fore_color(chinese_ct, color.Fore.CYAN)

    print('输入文件{0}字节;输出文件{1}字节,约{2}个汉字。'.format(
                                                size1,
                                                color_size,
                                                color_chinese)
          )

    return None, None, info_list, chinese_ct
예제 #3
0
    def mark_cantdeal(self):
        '''Flag replies for which self.has_unhandled_quote() is true.

        Each flagged reply gets a note appended to its text through
        self.append_note() and has its `suggest` attribute set to False.
        The number of flagged replies is printed at the end.
        '''
        print('>查找无法处理的引用')

        flagged = 0
        for reply in self.rlist:
            if not self.has_unhandled_quote(reply):
                continue
            reply.text = self.append_note(reply.text, '【无法处理的回复】')
            reply.suggest = False
            flagged += 1

        # Red when something was flagged, green when nothing was.
        tint = color.Fore.RED if flagged else color.Fore.GREEN
        color_p = color.fore_color(flagged, tint)

        print('...标记了{0}个无法处理引用的回复'.format(color_p))
예제 #4
0
    def mark_empty(self):
        '''Flag replies whose text is empty or whitespace only.

        Flagged replies have their `suggest` attribute set to False; their
        text is left unchanged. The number of flagged replies is printed
        at the end.
        '''
        print('>标记空白回复:')

        blank_re = red.re_dict(r'^\s*$')

        flagged = 0
        for reply in self.rlist:
            if not blank_re.match(reply.text):
                continue
            reply.suggest = False
            flagged += 1

        # Red when something was flagged, green when nothing was.
        tint = color.Fore.RED if flagged else color.Fore.GREEN
        color_p = color.fore_color(flagged, tint)

        print('...标记了{0}个空白回复'.format(color_p))
예제 #5
0
파일: tz2txt.py 프로젝트: zhufeng/tz2txt
def compile_txt(infile, outfile, discard=''):
    '''Run datamachine.bp_to_final() on the files and report their sizes.

    infile, outfile and discard are passed to bp_to_final() unchanged.
    Afterwards the byte sizes of the input and output files are printed
    with thousands separators.
    '''
    # The input size is taken before the compile step runs.
    input_size = format(os.path.getsize(infile), ',')

    datamachine.bp_to_final(infile, outfile, discard)

    output_size = format(os.path.getsize(outfile), ',')
    colored_output_size = color.fore_color(output_size, color.Fore.MAGENTA)

    report = '输入文件{0}字节,输出文件{1}字节'.format(input_size,
                                                       colored_output_size)
    print(report)
예제 #6
0
파일: tz2txt.py 프로젝트: szmf/tz2txt
def auto(url, pg_count, outfile, discard, label, from_gui=False):
    '''Download a thread, auto-process it and compile it in one run.

    url, pg_count -- passed to download_till().
    outfile       -- path the compiled text is written to (ignored when
                     from_gui is true).
    discard       -- path the discarded text is written to (replaced by a
                     placeholder when from_gui is true).
    label         -- passed on to compile_txt().
    from_gui      -- when true nothing is written to disk and the results
                     are returned to the caller instead.

    Returns:
      * (None, None, None, None, None) when the download yields nothing;
      * (output, discard_output, title, info_list, chinese_ct) when
        from_gui is true;
      * None (implicitly) after writing the files when from_gui is false.
    '''
    # Download
    dl_object, title = download_till(url, pg_count,
                                     '', automode=True)
    if dl_object is None:
        return None, None, None, None, None

    print('\n ===下载完毕,准备自动处理===\n')

    # Automatic processing
    bp_object = bp_process_bp(dl_object, '', automode=True)
    print('\n ===自动处理完毕,准备编译===\n')

    # Compile
    if from_gui:
        # Non-empty placeholder in place of a real path.
        # NOTE(review): presumably this makes compile_txt() keep the
        # discarded text for the GUI -- confirm against compile_txt().
        discard = 'from_gui'
    output, discard_output, info_list, chinese_ct = \
        compile_txt(bp_object, '', discard, label, automode=True)

    if from_gui:
        return output, discard_output, title, info_list, chinese_ct

    # Write the result files.
    write_output(output, outfile, show_size=False)

    if discard_output:
        write_output(discard_output, discard, show_size=False)

    # Format the numbers with thousands separators and colorize them.
    size2 = os.path.getsize(outfile)
    size2 = format(size2, ',')
    color_size = color.fore_color(size2, color.Fore.MAGENTA)

    chinese_ct = format(chinese_ct, ',')
    color_chinese = color.fore_color(chinese_ct, color.Fore.CYAN)

    print('输出文件{0}字节,约{1}个汉字。'.format(
                                                color_size,
                                                color_chinese)
          )
예제 #7
0
파일: tz2txt.py 프로젝트: ixinshang/tz2txt
def auto(url, pg_count, outfile, discard, label, from_gui=False):
    '''Run the whole pipeline: download, automatic processing, compile.

    url, pg_count -- forwarded to download_till().
    outfile       -- destination path of the compiled text; not used when
                     from_gui is true.
    discard       -- destination path of the discarded text; overwritten
                     with a placeholder when from_gui is true.
    label         -- forwarded to compile_txt().
    from_gui      -- true: return the results instead of writing files.

    Returns:
      * a 5-tuple of None when download_till() delivers no object;
      * (output, discard_output, title, info_list, chinese_ct) when
        from_gui is true;
      * None (implicitly) once the files are written when from_gui is
        false.
    '''
    # Step 1: download
    dl_object, title = download_till(url, pg_count,
                                     '', automode=True)
    if dl_object is None:
        return None, None, None, None, None

    print('\n ===下载完毕,准备自动处理===\n')

    # Step 2: automatic processing
    bp_object = bp_process_bp(dl_object, '', automode=True)
    print('\n ===自动处理完毕,准备编译===\n')

    # Step 3: compile
    if from_gui:
        # A non-empty stand-in value, not a real path.
        # NOTE(review): looks like this is meant to make compile_txt()
        # keep the discarded text -- verify against compile_txt().
        discard = 'from_gui'
    output, discard_output, info_list, chinese_ct = \
        compile_txt(bp_object, '', discard, label, automode=True)

    if from_gui:
        return output, discard_output, title, info_list, chinese_ct

    # Command-line use: write the files and print a summary.
    write_output(output, outfile, show_size=False)

    if discard_output:
        write_output(discard_output, discard, show_size=False)

    # Thousands separators plus color for the summary line.
    size2 = os.path.getsize(outfile)
    size2 = format(size2, ',')
    color_size = color.fore_color(size2, color.Fore.MAGENTA)

    chinese_ct = format(chinese_ct, ',')
    color_chinese = color.fore_color(chinese_ct, color.Fore.CYAN)

    print('输出文件{0}字节,约{1}个汉字。'.format(
                                                color_size,
                                                color_chinese)
          )
예제 #8
0
    def process_1(self):
        '''Custom processing: rewrite quoting replies into a marked layout.

        Every reply text in self.rlist is run through one regex
        substitution. A match is replaced by a "回复 <group 2>:" line,
        group 4 wrapped in quote-start / quote-end marker lines, and then
        group 5. The total number of substitutions is printed.
        '''
        print('>处理引用')

        # Fragment used twice: the separator directly after a newline,
        # followed by whitespace.
        sep_line = r'(?<=\n)' + re_separater + r'\s+'

        pattern_text = (
            # group 1: first line up to and including "@@<name>##",
            # group 2: the <name> part (1-16 non-space characters)
            r'^(?=(.*@@(\S{1,16})##))' + r'\1'
            + r'.*?' + r'(?<=\n)'
            # group 3: text up to and including a separator line ...
            + r'(?=(.*?' + sep_line + r'))'
            # ... which must not be followed by a further separator line
            + r'(?!\3.*?' + sep_line + r')'
            # group 4: text before the separator, group 5: text after it
            + r'\s*(.*?)\s*' + re_separater + r'\s+(.*)'
        )
        quote_re = red.re_dict(pattern_text, red.DOTALL)
        template = r'回复 \2:\n【引用开始】\4\n【引用结束】\n\5'

        total = 0
        for reply in self.rlist:
            reply.text, hits = quote_re.subn(template, reply.text)
            total += hits

        colored_total = color.fore_color(total, color.Fore.CYAN)
        print('...处理了{0}条引用'.format(colored_total))
예제 #9
0
    def process_1(self):
        '''Custom processing step that reformats quoting replies.

        One regex substitution is applied to the text of each reply in
        self.rlist: a match becomes a "回复 <group 2>:" line, then
        group 4 between quote-start and quote-end marker lines, then
        group 5. The number of substitutions made is printed.
        '''
        print('>处理引用')

        # The pattern is kept as separate pieces so that the module-level
        # separator expression can be spliced in three times.
        pieces = [
            r'^(?=(.*@@(\S{1,16})##))',   # groups 1 and 2: "@@<name>##" head
            r'\1',
            r'.*?',
            r'(?<=\n)',
            r'(?=(.*?(?<=\n)',            # group 3: up to a separator line
            re_separater,
            r'\s+))',
            r'(?!\3.*?(?<=\n)',           # no further separator line after it
            re_separater,
            r'\s+)',
            r'\s*(.*?)\s*',               # group 4: text before the separator
            re_separater,
            r'\s+(.*)',                   # group 5: text after the separator
        ]
        matcher = red.re_dict(''.join(pieces), red.DOTALL)
        replacement = r'回复 \2:\n【引用开始】\4\n【引用结束】\n\5'

        handled = 0
        for item in self.rlist:
            item.text, replaced = matcher.subn(replacement, item.text)
            handled += replaced

        print('...处理了{0}条引用'.format(
            color.fore_color(handled, color.Fore.CYAN)))
예제 #10
0
def web_to_internal(url, pg_count):
    '''Download a forum thread and convert it to the internal form.

    Pages are fetched starting at `url`, following each page's next-page
    URL, until one of these happens:
      * `pg_count` pages have been downloaded (a negative pg_count
        disables this limit),
      * a page cannot be fetched,
      * a page has no next-page URL.

    Returns the filled Tiezi object, or None when no parser can be
    obtained for the first page or the parser fails its self-check.
    '''

    # Downloader
    f = Fetcher()
    # Page parser; created from the first successfully downloaded page
    parser = None

    tz = Tiezi()
    dl_count = 0
    while True:
        # Has the requested number of pages been downloaded?
        if pg_count >= 0 and dl_count >= pg_count:
            print('下载完指定页数{0},停止下载\n'.format(pg_count))
            break

        # Download the page data
        data = f.fetch_url(url)
        if not data:
            print('无法读取页面:{0}'.format(url))
            break

        # Prepare the parser
        if not parser:
            parser = AbPageParser.get_parser(url, data)
            if not parser:
                return None

            # Check that the parser can handle this page
            parser.set_page(url, data)
            if not parser.check_parse_methods():
                print(' 可能是网页改版,导致无法提取数据。')
                print(' 请使用“检测新版本”功能检测是否有新程序可用。')
                print()
                return None

            # Remember the page the download started from
            tz.begin_url = url
        else:
            # Feed the page data to the existing parser
            parser.set_page(url, data)

        # Fill in the thread information (done until a thread starter
        # is known)
        if not tz.louzhu:
            pub_date = None

            tz.title = parser.wrap_get_title()
            tz.louzhu = parser.wrap_get_louzhu()

            # On page 1 the first reply supplies the publication date and,
            # if still missing, the thread starter
            if parser.wrap_get_page_num() == 1:
                rplys = parser.wrap_get_replys()
                if rplys:
                    if not tz.louzhu:
                        tz.louzhu = rplys[0].author
                    pub_date = rplys[0].time.strftime('%Y-%m-%d %H:%M')

            # Fall back to asking the user for the thread starter's ID
            if not tz.louzhu:
                tz.louzhu = input('无法提取楼主ID,请手工输入楼主ID:').strip()

            # Print the thread information
            print_str = '标题:%s\n楼主:%s\n' % (tz.title, tz.louzhu)
            if pub_date is not None:
                print_str += '发帖时间:%s\n' % pub_date
            save_print(print_str)

            # Name of the local processor for this site
            tz.local_processor = parser.get_local_processor()

        next_url = parser.wrap_get_next_pg_url()
        pg_num = parser.wrap_get_page_num()
        # Add the page
        pg = Page(url,
                  pg_num,
                  bool(next_url),
                  parser.wrap_get_replys()
                  )
        tz.add_page(pg)
        dl_count += 1
        print('已下载第{0}页, 共{1}层'.format(pg_num, len(pg.replys)))

        # Was this the last page of the thread?
        if not next_url:
            print('\n下载完帖子的最后一页(第{0}页),停止'.format(pg.page_num))
            break
        url = next_url

    count = sum(len(p.replys) for p in tz.pages)

    color_p1 = color.fore_color(len(tz.pages), color.Fore.YELLOW)
    info = '共载入{pg_count}页,共有回复{rpl_count}条'.format(
                            pg_count=color_p1,
                            rpl_count=count
                            )
    print(info)

    # Audible notification; winsound may be None when it is unavailable
    if winsound is not None:
        try:
            winsound.Beep(400, 320) # (frequency, duration)
        except Exception:
            # The beep is best-effort only. Catching Exception instead of
            # using a bare except lets KeyboardInterrupt and SystemExit
            # propagate.
            pass

    def escape_bp_tag(text):
        '''Escape text that would collide with the arranged-text markup.'''
        # A line starting with <time> or <mark> gets a leading '#'
        text = red.sub(r'^(<(?:time|mark)>)',
                       r'#\1',
                       text,
                       flags=red.MULTILINE)

        # Quote start / quote end markers: full-width brackets become [ ]
        text = red.sub(r'【(引用(?:开始|结束))】',
                       r'[\1]',
                       text)

        # A reply ending in one of the processing notes gets a trailing '#'
        if text.endswith(('【与上一条回复重复】', '【无法处理的回复】')):
            text = text + '#'

        return text

    for p in tz.pages:
        for r in p.replys:
            r.text = escape_bp_tag(r.text)

    return tz
예제 #11
0
def bp_to_final(infile, keep_discard=True, label=0):
    '''Compile arranged text into the final text and the discarded text.

    infile       -- object providing readlines() and close(); it is
                    closed by this function.
    keep_discard -- when true and at least one reply was discarded, the
                    discarded text is returned as well.
    label        -- 0: no auxiliary labels;
                    1: a page label at the start of every page that has
                       at least one picked reply;
                    2: a floor label in front of every picked reply.

    Input lines are interpreted as follows:
      * a line starting with '<time>' opens a reply;
      * a line starting with '<mark>' closes it; the reply is picked when
        that line ends with '█', otherwise it is discarded (replies made
        up only of blank lines are dropped entirely);
      * lines in between form the reply text;
      * '<tiezi>' lines seen before any reply text was collected are
        stored in info_list without the tag;
      * with label != 0, '<page>' lines update the current page number.

    Returns (output, discard, info_list, chinese_ct):
      output     -- StringIO holding the final text;
      discard    -- StringIO holding the discarded text, or None;
      info_list  -- list of the collected '<tiezi>' line remainders;
      chinese_ct -- result of count_chinese() on the final text.
    '''
    class placeholder:
        # Marks where a page label may later be inserted into text_list.
        def __init__(self, posi=0, pagenum=0, show=False):
            # index of the reserved slot in text_list
            self.posi = posi
            # page number the label announces
            self.pagenum = pagenum
            # set to True once a reply of this page has been picked
            self.show = show

    def is_not_empty(lst):
        # Yields, per item, whether it contains non-whitespace characters.
        for i in lst:
            yield i.strip() != ''
    
    info_list = list()
    # Starts with a dummy holder so holder_list[-1] always exists.
    holder_list = [placeholder()]
    
    text_list = list()
    abandon_list = list()
    
    pickcount, allcount = 0, 0

    # Matches image tags such as
    #   [img]http://img3.laibafile.cn/p/m/1234567.jpg[/img]
    # An optional number inside the opening tag is captured as group 1.
    # Each match is replaced further down by 【一张图片N】, where N is
    # that captured number (possibly empty), not the file name.
    picr = (r'\[img\s*(\d+|)\].*?\[/img\]')
    pattern = red.re_dict(picr)
    
    # Extracts the page number from a <page> line
    re_pagenum = red.re_dict(r'^<page>页号:\s*(\d+)\s*$')
    
    # Extracts date (without the first two digits of the year) and time
    # from a <time> line
    p_time = (r'^<time>[^<]*<\d\d(\d\d-\d{1,2}-\d{1,2})\s+'
              r'(\d{1,2}:\d{1,2})')
    re_time = red.re_dict(p_time)

    # Read the arranged text
    in_reply = False
    # Lines of the reply currently being read
    temp = list()
    
    current_page = 0
    current_time = ''

    for line in infile.readlines():
        if line.startswith('<time>'):
            # Two <time> lines without a <mark> in between: stop reading
            if in_reply == True:
                print('格式错误:回复文本的前后包括标志不配对。\n',
                      '丢失<mark>行')
                break
            in_reply = True
            
            # The time is only needed for floor labels
            if label == 2:
                m = re_time.search(line)
                if m:
                    current_time = m.group(1) + ' ' + m.group(2)
                else:
                    current_time = ''
            
        elif line.startswith('<mark>'):
            # A <mark> line without a preceding <time> line: stop reading
            if in_reply == False:
                print('格式错误:回复文本的前后包括标志不配对。\n',
                      '丢失<time>行')
                break
                                   
            # A trailing '█' on the <mark> line means the reply is picked
            if line.endswith('█\n') or line.endswith('█'):
                pickcount += 1
                
                if label == 0:
                    pass
                elif label == 1:
                    # The most recently seen page now has a picked reply,
                    # so its page label will be shown
                    holder_list[-1].show = True
                elif label == 2:
                    floor_label = ('№.%d ☆☆☆'
                                   ' 发表于%s  P.%d '
                                   '☆☆☆\n'
                                   '-------------------------'
                                   '-------------------------'
                                   '\n')
                    floor_label = floor_label % \
                        (pickcount, current_time, current_page)
                    text_list.append(floor_label)
                    
                text_list.extend(temp)
                text_list.append('\n')

            # Not picked: keep it as discarded text unless it is blank
            elif any(is_not_empty(temp)):
                abandon_list.extend(temp)
                abandon_list.append('∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞\n\n')
                
            temp.clear()
            allcount += 1
            in_reply = False
            
        elif in_reply:
            # Reply body line: replace image tags, then buffer the line
            line = pattern.sub(r'【一张图片\1】', line)
            temp.append(line)

        # Because of the previous elif, everything below only runs
        # when not in_reply
        elif not text_list and not abandon_list and \
             line.startswith('<tiezi>'):
            info_list.append(line[len('<tiezi>'):])
        
        elif label != 0:
            m = re_pagenum.search(line)
            if m:
                current_page = int(m.group(1))
                if label == 1:
                    # Reserve an empty slot; it is filled with the page
                    # label after the loop if the page turns out to
                    # contain a picked reply
                    text_list.append('')
                    holder = placeholder(len(text_list)-1,
                                         current_page
                                         )
                    holder_list.append(holder)

    infile.close()
    if in_reply == True:
        print('格式错误:最后一个回复文本的前后包括标志不配对。')

    
    # Page labels (auxiliary format): fill the reserved slots
    if label == 1:
        # holder_list[0] is the dummy entry and is skipped
        for holder in holder_list[1:]:
            if holder.show:
                page_label = ('☆☆☆☆☆'
                              ' 进入第%d页 '
                              '☆☆☆☆☆\n'
                              '----------------'
                              '----------------'
                              '\n\n') % holder.pagenum
                text_list[holder.posi] = page_label

    color_p1 = color.fore_color(allcount, color.Fore.YELLOW)
    color_p2 = color.fore_color(pickcount, color.Fore.YELLOW)
    print('共有{0}条回复,选择了其中{1}条回复'.format(color_p1, color_p2))

    # Content of output ============
    # Join the pieces; iterating the one-character string '\n' yields a
    # single newline between the info lines and the text
    if info_list:
        s_iter = itertools.chain(info_list, '\n', text_list)
    else:
        s_iter = iter(text_list)
    s = ''.join(s_iter)

    # Runs of three or more image placeholders collapse into one
    # "several images" placeholder
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){3,}',
                r'【多张图片\1】\n\n',
                s)
    
    # Remaining pairs of image placeholders collapse into one
    # "two images" placeholder
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){2}',
                r'【两张图片\1】\n\n',
                s)

    # Output as StringIO
    output = StringIO(s)
    
    # Count of Chinese characters as computed by count_chinese()
    chinese_ct = count_chinese(s)

    # Discarded text
    if keep_discard and abandon_list:
        s_iter = itertools.chain(info_list, '\n', abandon_list)
        s = ''.join(s_iter)
        discard = StringIO(s)
    else:
        discard = None
            
    return output, discard, info_list, chinese_ct