コード例 #1
0
ファイル: datamachine.py プロジェクト: szmf/tz2txt
    def escape_bp_tag(text):
        '''Neutralise markers in *text* that the arranged-text format reserves.

        Three substitutions are applied, in this order:

        1. A line beginning with ``<time>`` or ``<mark>`` gets a ``#`` put in
           front of the tag (assuming ``red.sub`` follows ``re.sub``
           semantics -- TODO confirm against the ``red`` module).
        2. The full-width bracketed quote-start / quote-end /
           supplementary-reply markers are rewritten with ASCII square
           brackets.
        3. If the text ends with one of the two processing-status markers,
           a trailing ``#`` is appended.

        Returns the escaped string.
        '''
        # Step 1: layout tags at the start of any line.
        escaped = red.sub(r'^(<(?:time|mark)>)',
                          r'#\1',
                          text,
                          flags=red.MULTILINE)

        # Step 2: quote-start / quote-end / supplementary-reply markers.
        escaped = red.sub(r'【(引用(?:开始|结束)|补充回复)】', r'[\1]', escaped)

        # Step 3: processing-status markers at the very end of the text.
        status_endings = ('【与上一条回复重复】', '【无法处理的回复】')
        if escaped.endswith(status_endings):
            return escaped + '#'

        return escaped
コード例 #2
0
ファイル: datamachine.py プロジェクト: yongchao88/tz2txt
def bp_to_final(infile, keep_discard=True, label=0):
    '''Compile an arranged text into the final text and the discarded text.

    The input is read line by line. A reply starts at a line beginning
    with ``<time>`` and ends at a line beginning with ``<mark>``; the lines
    in between are the reply body. A reply whose ``<mark>`` line ends with
    the block character is "selected" and goes to the final text; any other
    non-blank reply goes to the discarded text.

    Parameters:
        infile: an open, iterable text stream. It is always closed before
            this function returns, even when parsing raises.
        keep_discard: when true and at least one reply was discarded, a
            stream with the discarded replies is returned as well.
        label: 0 adds no labels; 1 inserts a page banner at each ``<page>``
            line that is followed by a selected reply before the next
            ``<page>`` line; 2 puts a floor header (running number, post
            time, page number) in front of every selected reply.

    Returns:
        A tuple ``(output, discard, info_list, chinese_ct)`` where
        ``output`` is a StringIO with the final text, ``discard`` is a
        StringIO with the discarded text or None, ``info_list`` holds the
        payload of the leading ``<tiezi>`` lines and ``chinese_ct`` is the
        value returned by ``count_chinese`` for the final text.
    '''
    class placeholder:
        # A slot reserved in text_list for a page banner (label == 1).
        def __init__(self, posi=0, pagenum=0, show=False):
            self.posi = posi        # index of the reserved slot in text_list
            self.pagenum = pagenum  # page number taken from the <page> line
            self.show = show        # True once a selected reply follows

    info_list = list()
    # The first holder is a sentinel so holder_list[-1] is always valid,
    # even when a selected reply appears before any <page> line.
    holder_list = [placeholder()]

    text_list = list()
    abandon_list = list()

    pickcount, allcount = 0, 0

    # Replaces an [img N]...[/img] tag with the one-picture marker,
    # carrying over the (possibly empty) number N from the opening tag.
    picr = (r'\[img\s*(\d+|)\].*?\[/img\]')
    pattern = red.re_dict(picr)

    # Extracts the page number.
    re_pagenum = red.re_dict(r'^<page>页号:\s*(\d+)\s*$')

    # Extracts the post time: the date without the first two year digits
    # (group 1) and hour:minute (group 2).
    p_time = (r'^<time>[^<]*<\d\d(\d\d-\d{1,2}-\d{1,2})\s+'
              r'(\d{1,2}:\d{1,2})')
    re_time = red.re_dict(p_time)

    # State of the reader.
    in_reply = False
    temp = list()           # body lines of the reply being read

    current_page = 0
    current_time = ''

    # try/finally guarantees the stream is closed even if parsing raises.
    try:
        for line in infile:
            if line.startswith('<time>'):
                if in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<mark>行')
                    break
                in_reply = True

                # The post time is only needed for floor headers.
                if label == 2:
                    m = re_time.search(line)
                    if m:
                        current_time = m.group(1) + ' ' + m.group(2)
                    else:
                        current_time = ''

            elif line.startswith('<mark>'):
                if not in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<time>行')
                    break

                if line.endswith(('█\n', '█')):
                    # Selected reply.
                    pickcount += 1

                    if label == 0:
                        pass
                    elif label == 1:
                        # The most recent page now has visible content.
                        holder_list[-1].show = True
                    elif label == 2:
                        floor_label = ('№.%d ☆☆☆'
                                       ' 发表于%s  P.%d '
                                       '☆☆☆\n'
                                       '-------------------------'
                                       '-------------------------'
                                       '\n')
                        floor_label = floor_label % \
                            (pickcount, current_time, current_page)
                        text_list.append(floor_label)

                    text_list.extend(temp)
                    text_list.append('\n')

                elif any(item.strip() != '' for item in temp):
                    # Unselected reply with some non-blank content.
                    abandon_list.extend(temp)
                    abandon_list.append('∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞\n\n')

                temp.clear()
                allcount += 1
                in_reply = False

            elif in_reply:
                line = pattern.sub(r'【一张图片\1】', line)
                temp.append(line)

            # Because of the previous elif, everything below runs only
            # while not inside a reply.
            elif not text_list and not abandon_list and \
                    line.startswith('<tiezi>'):
                info_list.append(line[len('<tiezi>'):])

            elif label != 0:
                m = re_pagenum.search(line)
                if m:
                    current_page = int(m.group(1))
                    if label == 1:
                        # Reserve a slot; it is filled with the banner
                        # later only if a selected reply follows.
                        text_list.append('')
                        holder = placeholder(len(text_list)-1,
                                             current_page
                                             )
                        holder_list.append(holder)
    finally:
        infile.close()

    if in_reply:
        print('格式错误:最后一个回复文本的前后包括标志不配对。')

    # Page banners: fill the reserved slots of pages that have content.
    if label == 1:
        for holder in holder_list[1:]:
            if holder.show:
                page_label = ('☆☆☆☆☆'
                              ' 进入第%d页 '
                              '☆☆☆☆☆\n'
                              '----------------'
                              '----------------'
                              '\n\n') % holder.pagenum
                text_list[holder.posi] = page_label

    color_p1 = color.fore_color(allcount, color.Fore.YELLOW)
    color_p2 = color.fore_color(pickcount, color.Fore.YELLOW)
    print('共有{0}条回复,选择了其中{1}条回复'.format(color_p1, color_p2))

    # Build the final text: info lines, a blank line, then the replies.
    if info_list:
        s_iter = itertools.chain(info_list, '\n', text_list)
    else:
        s_iter = iter(text_list)
    s = ''.join(s_iter)

    # Collapse runs of consecutive pictures: three or more first, then
    # exactly two, so a long run is not split into pairs.
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){3,}',
                r'【多张图片\1】\n\n',
                s)

    s = red.sub(r'(?:【一张图片(\d+|)】\s+){2}',
                r'【两张图片\1】\n\n',
                s)

    # Final text as a StringIO.
    output = StringIO(s)

    # Count of Chinese characters in the final text.
    chinese_ct = count_chinese(s)

    # Discarded text.
    if keep_discard and abandon_list:
        s_iter = itertools.chain(info_list, '\n', abandon_list)
        s = ''.join(s_iter)
        discard = StringIO(s)
    else:
        discard = None

    return output, discard, info_list, chinese_ct
コード例 #3
0
ファイル: gui.py プロジェクト: ixinshang/tz2txt
    def doit(self):
        '''Run the fully automatic processing for the URL on the clipboard.

        Steps: read the URL from the clipboard, call ``tz2txt.auto`` with
        the options chosen in the GUI, write the resulting text to the
        output file (gb18030, unencodable characters replaced), optionally
        write the discarded text, and print a short summary.

        Returns None. The method returns early when the clipboard holds no
        valid URL or when ``tz2txt.auto`` fails.
        '''
        # Fetch the URL from the clipboard.
        try:
            u = self.master.clipboard_get().strip()
        except Exception:
            # The clipboard could not be read as text.
            bad = True
            u = ''
        else:
            bad = False

        if bad or not tz2txt.is_url(u):
            self.url.set('无效网址,网址须以http://或https://开头。')
            return
        self.url.set(u)

        # Assist mode -> label. An unexpected value falls back to "no
        # label" instead of leaving the variable unbound.
        assist = self.assist.get()
        label = {1: '', 2: 'page', 3: 'floor'}.get(assist, '')

        # Last page; anything that is not an integer becomes -1.
        till = self.till.get().strip()
        try:
            till = int(till)
        except ValueError:
            till = -1

        # Show the busy state before the long-running call.
        self.status['fg'] = '#993300'
        self.status['text'] = '处理中'
        self.update()

        # The except branch returns; finally restores the idle state on
        # every path.
        try:
            output, discard_output, title, info_list, chinese_ct = \
                tz2txt.auto(u, till, '', '', label, from_gui=True)
            if title is None:
                raise Exception('无法完成全自动处理')

        except Exception as e:
            print('\n出现异常:', e)
            print('===================================\n')
            return

        else:
            # Show the title, with characters outside the Basic
            # Multilingual Plane removed.
            title = red.sub(r'[\U00010000-\U0010FFFF]', r'', title)
            title = title.strip()
            self.url.set(title)

        finally:
            self.status['fg'] = 'blue'
            self.status['text'] = '待机'

        # Output file name.
        if self.rename.get():
            output_fn = title + '.txt'
        else:
            output_fn = self.output.get().strip()

        # Drop characters that are not allowed in file names.
        output_fn = red.sub(r'[\\/:*?"<>|]', r'', output_fn)
        if output_fn == '.txt':
            output_fn = '楼主.txt'

        # Output content.
        text = output.getvalue()
        output.close()

        # Overwrite check: the file exists and there is content and
        # (forced overwrite or the user agrees).
        if os.path.isfile(output_fn) and \
           text and \
           (self.override.get() == 1 or
            messagebox.askyesno('输出文件已存在', '是否覆盖?\n%s' % output_fn)
            ):
            # Remove the existing target; report a failure instead of
            # hiding it, because the write below is then skipped.
            try:
                os.remove(output_fn)
            except OSError as e:
                print('\n删除已有文件时出现异常', e)

        # Write the output.
        if not os.path.isfile(output_fn) and text:
            try:
                with open(output_fn, 'w',
                          encoding='gb18030', errors='replace') as f:
                    f.write(text)
                print('\n已保存为:', output_fn)
            except Exception as e:
                print('\n保存文件时出现异常', e)

            # Report the size only if a file was actually created;
            # getsize raises OSError for a missing file.
            if os.path.isfile(output_fn):
                size2 = os.path.getsize(output_fn)
                size2 = format(size2, ',')
                chinese_ct = format(chinese_ct, ',')
                print('输出文件 {0} 字节,约 {1} 个汉字。'.format(
                                                            size2,
                                                            chinese_ct)
                      )

        # Write the discarded text.
        if discard_output is not None:
            try:
                text = discard_output.getvalue()
                discard_output.close()

                if text:
                    # NOTE(review): discard_fn is not assigned anywhere in
                    # this method. Unless it is a module-level name, this
                    # raises NameError, which the except below reports.
                    # Confirm the intended file name.
                    with open(discard_fn, 'w',
                              encoding='gb18030', errors='replace') as f:
                        f.write(text)
            except Exception as e:
                print('\n保存文件时出现异常', e)

        # Print the info lines up to, but not including, the
        # download-time line.
        print()
        for line in info_list:
            if line.startswith('下载时间:'):
                break
            datamachine.save_print(line.rstrip('\n'))
        print('===================================\n')
コード例 #4
0
ファイル: gui.py プロジェクト: yankaics/tz2txt
    def doit(self):
        '''Run the fully automatic processing for the URL on the clipboard.

        Steps: read the URL from the clipboard, call ``tz2txt.auto`` with
        the options chosen in the GUI, write the resulting text to the
        output file (gb18030, unencodable characters replaced), optionally
        write the discarded text, and print a short summary.

        Returns None. The method returns early when the clipboard holds no
        valid URL or when ``tz2txt.auto`` fails.
        '''
        # Fetch the URL from the clipboard.
        try:
            u = self.master.clipboard_get().strip()
        except Exception:
            # The clipboard could not be read as text.
            bad = True
            u = ''
        else:
            bad = False

        if bad or not tz2txt.is_url(u):
            self.url.set('无效网址,网址须以http://或https://开头。')
            return
        self.url.set(u)

        # Assist mode -> label. An unexpected value falls back to "no
        # label" instead of leaving the variable unbound.
        assist = self.assist.get()
        label = {1: '', 2: 'page', 3: 'floor'}.get(assist, '')

        # Last page; anything that is not an integer becomes -1.
        till = self.till.get().strip()
        try:
            till = int(till)
        except ValueError:
            till = -1

        # Show the busy state before the long-running call.
        self.status['fg'] = '#993300'
        self.status['text'] = '处理中'
        self.update()

        # The except branch returns; finally restores the idle state on
        # every path.
        try:
            output, discard_output, title, info_list, chinese_ct = \
                tz2txt.auto(u, till, '', '', label, from_gui=True)
            if title is None:
                raise Exception('无法完成全自动处理')

        except Exception as e:
            print('\n出现异常:', e)
            print('===================================\n')
            return

        else:
            # Show the title, with characters outside the Basic
            # Multilingual Plane removed.
            title = red.sub(r'[\U00010000-\U0010FFFF]', r'', title)
            title = title.strip()
            self.url.set(title)

        finally:
            self.status['fg'] = 'blue'
            self.status['text'] = '待机'

        # Output file name.
        if self.rename.get():
            output_fn = title + '.txt'
        else:
            output_fn = self.output.get().strip()

        # Drop characters that are not allowed in file names.
        output_fn = red.sub(r'[\\/:*?"<>|]', r'', output_fn)
        if output_fn == '.txt':
            output_fn = '楼主.txt'

        # Output content.
        text = output.getvalue()
        output.close()

        # Overwrite check: the file exists and there is content and
        # (forced overwrite or the user agrees).
        if os.path.isfile(output_fn) and \
           text and \
           (self.override.get() == 1 or
            messagebox.askyesno('输出文件已存在', '是否覆盖?\n%s' % output_fn)
            ):
            # Remove the existing target; report a failure instead of
            # hiding it, because the write below is then skipped.
            try:
                os.remove(output_fn)
            except OSError as e:
                print('\n删除已有文件时出现异常', e)

        # Write the output.
        if not os.path.isfile(output_fn) and text:
            try:
                with open(output_fn, 'w', encoding='gb18030',
                          errors='replace') as f:
                    f.write(text)
                print('\n已保存为:', output_fn)
            except Exception as e:
                print('\n保存文件时出现异常', e)

            # Report the size only if a file was actually created;
            # getsize raises OSError for a missing file.
            if os.path.isfile(output_fn):
                size2 = os.path.getsize(output_fn)
                size2 = format(size2, ',')
                chinese_ct = format(chinese_ct, ',')
                print('输出文件 {0} 字节,约 {1} 个汉字。'.format(size2, chinese_ct))

        # Write the discarded text.
        if discard_output is not None:
            try:
                text = discard_output.getvalue()
                discard_output.close()

                if text:
                    # NOTE(review): discard_fn is not assigned anywhere in
                    # this method. Unless it is a module-level name, this
                    # raises NameError, which the except below reports.
                    # Confirm the intended file name.
                    with open(discard_fn,
                              'w',
                              encoding='gb18030',
                              errors='replace') as f:
                        f.write(text)
            except Exception as e:
                print('\n保存文件时出现异常', e)

        # Print the info lines up to, but not including, the
        # download-time line.
        print()
        for line in info_list:
            if line.startswith('下载时间:'):
                break
            datamachine.save_print(line.rstrip('\n'))
        print('===================================\n')