Ejemplo n.º 1
0
    def parse_multicolumn(cls, s):
        r"""Parse the first ``\multicolumn`` command found in ``s``.

        :param s: LaTeX source of a table cell
        :returns: ``None`` when no ``\multicolumn{..}{..}{..}`` is found,
            otherwise a 3-tuple ``((rows, cols), col_align, text)``:

            * ``(rows, cols)`` is the size of the merged cell, always in that
              fixed order; ``rows`` is 1 unless the cell body holds a
              ``\multirow`` that ``parse_multirow`` could parse
            * ``col_align`` is the raw content of the second brace group
            * ``text`` is the content of the third brace group; when a
              ``\multirow`` was parsed, it is the multirow body spliced back
              between the text that surrounded the command

        >>> TexTabular.parse_multicolumn(r'\multicolumn{2}{|c|}{aa\multirow{3}*{特点}bb}')
        ((3, 2), '|c|', 'aa特点bb')
        """
        # 1 Basic pattern match. The last group is built with a larger
        #   grp_bracket argument (presumably the nesting depth) because the
        #   cell body may itself contain braced commands.
        match = re.search(r'\\multicolumn' + grp_bracket(3, inner=True) * 2
                          + grp_bracket(5, inner=True), s)
        if not match:
            return None

        # 2 Extract the argument values
        col_count, col_align, text = match.groups()
        col_count = int(col_count)

        # 3 A nested multirow contributes the row span
        row_count = 1
        if 'multirow' in text:
            parsed = cls.parse_multirow(text, brace_text_only=False)
            # parse_multirow returns None when the substring 'multirow' is
            # present but does not form a command it recognises; keep the
            # text untouched instead of failing while unpacking None.
            if parsed is not None:
                row_count, _, _, _, text = parsed
        return (row_count, col_count), col_align, text
Ejemplo n.º 2
0
def grp_figure(cnt_groups=0, parpic=False):
    r"""Build the regex source used to match LaTeX figure-inserting commands.

    The commands recognised are ``\includegraphics``, ``\figt``, ``\figc``,
    ``\figr`` and ``\fig``; anything between the command name and the brace
    group is matched lazily.

    Notes kept from the original author:

    * ``D:\2017LaTeX\D招培试卷\高中地理`` used ``\captionfig{3-3.eps}{图~3}``
    * 奕本 has used several image formats since the autumn-2018 textbooks
    * update of 2019-12-24: the captured figure name does not include the
      surrounding braces

    :param cnt_groups: how the pattern is grouped
        0 - no grouping of command or figure name
        1 - only the content of the figure's braces is grouped
        2 - the command name and the figure are grouped
        3 - the command name, the text in between, and the figure are grouped
    :param parpic: when true, wrap the pattern so that it matches the figure
        command inside a ``\parpic`` (with an optional one-character ``[.]``
        argument and optional surrounding braces)
    :return: the pattern as a str, or ``None`` for any other ``cnt_groups``
    """
    inner_brace = grp_bracket(3, inner=True)

    # In every alternation `fig` is listed last: it is a prefix of the other
    # fig* names and would otherwise win before they are tried.
    if cnt_groups == 0:
        pattern = r'\\(?:includegraphics|figt|figc|figr|fig).*?' + grp_bracket(3)
    elif cnt_groups == 1:
        pattern = r'\\(?:includegraphics|figt|figc|figr|fig).*?' + inner_brace
    elif cnt_groups == 2:
        pattern = r'\\(includegraphics|figt|figc|figr|fig).*?' + inner_brace
    elif cnt_groups == 3:
        pattern = r'\\(includegraphics|figt|figc|figr|fig)(.*?)' + inner_brace
    else:
        return None

    if parpic:
        pattern = r'{?\\parpic(?:\[.\])?{' + pattern + r'}*'
    return pattern
Ejemplo n.º 3
0
    def parse_align(cls, s):
        r"""Parse the column-alignment spec of a LaTeX table header.

        The full LaTeX rules are complicated; only some commonly used features
        are handled here.

        :param s: the spec text, with or without its enclosing braces; an
            empty string yields an empty string
        :return: a str with vertical rules and some advanced formatting
            dropped. Its length is the number of table columns and each
            character is the letter describing one column's alignment (this
            could later be extended to carry finer alignment details).

        >>> TexTabular.parse_align('{|c|c|c|c|c|c|c|c|c<{}|c|}')
        'cccccccccc'
        >>> TexTabular.parse_align('{|c|w{6em}|w{23mm}|w{47mm}|w{22mm}|}')
        'cwwww'
        >>> TexTabular.parse_align('cc*{8}{l}')
        'ccllllllll'
        >>> TexTabular.parse_align('|c|')
        'c'
        >>> TexTabular.parse_align('|c|*{2}{m{38mm}<{\\centering}|}')
        'cmm'
        """
        # Expand the repetition shorthand *{n}{column spec}.
        # A bare `*3` is first normalised to `*{3}` so the next pattern applies.
        s = re.sub(r'\*(\d+)', r'*{\1}', s)
        s = re.sub(r'\*{(\d+)}' + grp_bracket(3, inner=True),
                   lambda m: m.group(2) * int(m.group(1)), s)

        # Drop one pair of enclosing braces. startswith/endswith are used
        # rather than s[0]/s[-1] so that an empty spec does not raise IndexError.
        if s.startswith('{') and s.endswith('}'):
            s = s[1:-1]
        # Remove every remaining brace argument, then the decoration characters
        s = re.sub(r'{.*?}', '', s)
        return s.translate(str.maketrans('', '', '|<>!'))
Ejemplo n.º 4
0
        def core(s):
            r"""Return the ``(start, end)`` spans of ``\lewis`` / ``\Lewis`` runs in ``s``.

            A run is one lewis command optionally extended by neighbouring
            ``H`` letters, ``~`` characters and further lewis commands.
            Candidates made up only of ``H`` / ``~`` are discarded.

            :raises ValueError: when ``inner`` (taken from the enclosing
                scope) is set, because this locator has no inner mode
            """
            if inner:  # locating only the content inside \lewis{} is unsupported
                raise ValueError(
                    r"lewis模式没有inner模式,如果需要可以使用NestEnv(s).inner(r'\lewis{')")

            # Basic pattern for a single lewis command
            lewis = r'\\(l|L)ewis' + grp_bracket(5, inner=True)
            # Extended pattern: lets the match grow over adjacent H / ~ / lewis
            extended = rf'(H?~*{lewis}\s*|~*H)*(~*{lewis}|~*H)'

            spans = []
            for found in re.finditer(extended, s):
                if 'lewis' in found.group().lower():
                    spans.append(found.span(0))
            return spans
Ejemplo n.º 5
0
def brieftexstr(s):
    """Reduce a tex string to a compact form suited to comparing two texts.

    Three steps are applied in order:

    1. delete every ``%`` that is followed by what
       ``grp_bracket(2, '<', '>')`` matches (the percent annotations)
    2. delete all whitespace
    3. case-fold the result

    :param s: tex source text
    :return: the normalised str
    """
    without_notes = re.sub(r'%' + grp_bracket(2, '<', '>'), r'', s)
    compact = re.sub(r'\s+', '', without_notes)
    return compact.casefold()
Ejemplo n.º 6
0
        def core(s):
            r"""Return ``[start, end]`` index pairs of ``\<name>`` commands in ``s``.

            Each pair covers the command name plus the star, optional
            argument and brace groups that follow it. These names are read
            from the enclosing scope:

            * ``name`` - regex source for the command name
            * ``star`` - whether a following ``*`` is absorbed
            * ``optional`` - whether a following ``[...]`` group is absorbed
            * ``linefeed`` - maximum number of newlines allowed in the
              whitespace before each absorbed piece
            * ``max_bracket`` / ``min_bracket`` - bounds on the number of
              brace groups; a command with fewer than ``min_bracket`` groups
              is not reported
            """

            def advance(pattern, pos):
                """Return the index just past ``pattern`` matched at ``pos``, else None.

                ``pattern`` must capture leading whitespace as group 1 and
                the payload as group 2.
                """
                hit = re.match(pattern, s[pos:])
                if hit and hit.group(1).count('\n') <= linefeed and hit.group(2):
                    return pos + hit.end()
                return None

            spans, cursor = [], 0
            while True:
                head = re.search(r'\\(' + name + r')(?![a-zA-Z])', s[cursor:])
                if not head:
                    break
                # Match offsets are relative to the slice; shift them back
                begin = head.start() + cursor
                cursor = head.end() + cursor

                if star:
                    moved = advance(r'(\s*)(\*)', cursor)
                    if moved is not None:
                        cursor = moved

                if optional:
                    moved = advance(r'(\s*)(' + grp_bracket(5, '[') + ')', cursor)
                    if moved is not None:
                        cursor = moved

                brace_pattern = r'(\s*)(' + grp_bracket(5) + ')'
                limit = max_bracket
                if max_bracket == float('inf'):
                    command = head.group(1)
                    if command in ('begin', 'end'):
                        limit = 1  # some commands take exactly one brace group
                    if command in ('hfil', 'hfill'):
                        limit = 0  # some commands must not take any brace group

                found = 0
                while found < limit:
                    moved = advance(brace_pattern, cursor)
                    if moved is None:
                        break
                    cursor = moved
                    found += 1

                if found >= min_bracket:
                    spans.append([begin, cursor])

            return spans
Ejemplo n.º 7
0
    def parse_multirow(cls, s, brace_text_only=True):
        r"""Parse the first ``\multirow`` command found in ``s``.

        Unlike ``\multicolumn``, the first and second brace groups of
        ``\multirow`` may each be followed by an optional ``[...]`` argument,
        and the second brace group may be written as a bare ``*``. The content
        of the two ``[...]`` arguments is returned as is, without analysis.

        :param s: LaTeX source of a table cell
        :param brace_text_only: return only the content of the last brace
            group. If False, the text before and after the command is
            spliced around that content (wanted by the callers, to avoid
            losing a prefix or suffix, although such input is rare).
        :return: ``None`` when no command is found, otherwise
            ``(n, bigstructs, width, fixup, text)`` where ``n`` is the row
            count; first arguments of -1, 0 and 1 are all reported as 1
        :raises ValueError: when the first argument is below -1

        >>> TexTabular.parse_multirow(r'\multirow{2}*{特点}')
        (2, None, '*', None, '特点')
        >>> TexTabular.parse_multirow(r'\multirow{2}{*}{特点}')
        (2, None, '*', None, '特点')
        >>> TexTabular.parse_multirow(r'aa\multirow{2}[bla1]{*}[bla2]{特点}bb', brace_text_only=False)
        (2, 'bla1', '*', 'bla2', 'aa特点bb')

        TODO the first number of multirow may be negative, meaning the cell
        is merged upwards by that many rows.
        """
        optional_arg = r'(?:\[(.*?)\])?'
        pattern = (r'\\multirow' + grp_bracket(3, inner=True) + optional_arg
                   + r'(?:{(.*?)}|(\*))' + optional_arg
                   + grp_bracket(5, inner=True))
        found = re.search(pattern, s)
        if not found:
            return None

        n, bigstructs, braced_width, star_width, fixup, text = found.groups()
        if not brace_text_only:
            text = s[:found.start()] + text + s[found.end():]

        n = int(n)
        if n < -1:
            raise ValueError(f'{s} 不支持解析multirow第一个值为负数,向上合并单元格的情况')
        if n <= 1:
            n = 1

        # The width is whichever of the `{...}` / `*` alternatives matched
        return n, bigstructs, braced_width or star_width, fixup, text
Ejemplo n.º 8
0
    def init_from_latex(ws, content):
        r"""Fill the already-created worksheet ``ws`` from LaTeX ``tabular`` source.

        Deliberately not named ``from_latex``: the worksheet is created
        beforehand and this only initialises it from LaTeX code.

        Rows are split on ``\\``; the text after the last ``\\`` is ignored.
        Cells are split on ``&``. ``\multicolumn`` and ``\multirow`` cells
        become merged ranges, and rule commands (``\hline``, ``\toprule``,
        ``\midrule``, ``\bottomrule``, ``\cmidrule{a-b}``, ``\cline{a-b}``)
        become a thin top border on the row they appear in.

        :param ws: target worksheet; besides ``merge_cells`` /
            ``unmerge_cells`` its cells must offer the ``mcell()``,
            ``celltype()`` and ``in_range()`` helpers used below
        :param content: LaTeX source containing a ``\begin{tabular}`` block
        :raises AttributeError: when ``content`` has no
            ``\begin{tabular}{...}`` header
        """
        from openpyxl.styles import Border, Alignment, Side

        from pyxllib.text.pupil import grp_bracket
        from pyxllib.text.latex import TexTabular

        BRACE2 = grp_bracket(2, inner=True)
        BRACE5 = grp_bracket(5, inner=True)
        # One header pattern serves both for reading the column spec and for
        # stripping the header, so that a header written with whitespace
        # (e.g. `\begin{tabular} {ccc}`) is removed as well as parsed.
        HEADER = r'\\begin{tabular}\s*(?:\[.*?\])?\s*' + BRACE5
        # LaTeX column letter -> horizontal alignment; anything else is 'left'
        H_ALIGN = {'l': 'left', 'c': 'center', 'r': 'right'}

        # All rules share one border style for now; no finer parsing is done
        rule_side = Side(border_style='thin', color='000000')

        # Process the table header
        data_tex = re.search(HEADER, content).group(1)
        col_pos = TexTabular.parse_align(data_tex)  # per-column format letters
        total_col = len(col_pos)
        # Remove the begin/end markers
        s = re.sub(HEADER, '', re.sub(r'\\end{tabular}', '', content))
        row, col = 1, 1

        # Split on \\ and handle each row
        for line in re.split(r'\\\\(?!{)', s)[:-1]:
            # 1 Handle every cell of the current row.
            #   Rule commands and newlines are removed from the cell text
            #   first; `line` itself is kept for the border step below.
            cur_line = re.sub(r'\\cmidrule' + BRACE2, '', line)
            cur_line = re.sub(r'\\cline' + BRACE2, '', cur_line)
            for t in (r'\midrule', r'\toprule', r'\bottomrule', r'\hline',
                      '\n'):
                cur_line = cur_line.replace(t, '')

            for item in cur_line.strip().split('&'):
                item = item.strip()
                cur_loc = excel_addr(row, col)

                if 'multicolumn' in item:
                    size, align, text = TexTabular.parse_multicolumn(item)
                    # Without an explicit alignment, use the column default
                    align = TexTabular.parse_align(align) if align else col_pos[col - 1]
                    n, m = size
                    cell = ws[cur_loc].mcell()
                    if cell.value:
                        cell.value += '\n' + text
                    else:
                        cell.value = text
                    ws[cur_loc].alignment = Alignment(
                        horizontal=H_ALIGN.get(align, 'left'),
                        vertical='center')
                    merge_loc = excel_addr(row + n - 1, col + m - 1)
                    ws.merge_cells(f'{cur_loc}:{merge_loc}')
                    col += m
                elif 'multirow' in item:
                    n, bigstructs, width, fixup, text = TexTabular.parse_multirow(
                        item, brace_text_only=False)
                    try:
                        ws[cur_loc] = text
                    except AttributeError:
                        # The target overlaps an existing merged range: shrink
                        # the old range, then write the new cell. E.g. if
                        # A1:A3 was merged and A3 must now stand alone, the
                        # old range is reset to A1:A2.
                        rng = ws[cur_loc].in_range()
                        ws.unmerge_cells(rng.coord)  # undo the old merge
                        ws.merge_cells(re.sub(r'\d+$', f'{row - 1}',
                                              rng.coord))
                        ws[cur_loc] = text
                    ws[cur_loc].alignment = Alignment(
                        horizontal=H_ALIGN.get(col_pos[col - 1], 'left'),
                        vertical='center')
                    merge_loc = excel_addr(row + n - 1, col)
                    ws.merge_cells(f'{cur_loc}:{merge_loc}')
                    col += 1
                else:
                    # Only cells whose celltype() is 0 are written
                    if ws[cur_loc].celltype() == 0:
                        ws[cur_loc].value = item
                        ws[cur_loc].alignment = Alignment(
                            horizontal=H_ALIGN.get(col_pos[col - 1], 'left'),
                            vertical='center')
                    col += 1

            # 2 Borders and other formatting
            if any(rule in line for rule in
                   (r'\midrule', r'\toprule', r'\bottomrule', r'\hline')):
                # Draw a line across the whole row
                loc_1 = excel_addr(row, 1)
                loc_2 = excel_addr(row, total_col)
                for cell in ws[f'{loc_1}:{loc_2}'][0]:
                    cell.border = Border(top=rule_side)
            # Partial rules: \cmidrule{a-b} first, then \cline{a-b}
            for command in ('cmidrule', 'cline'):
                for first, last in re.findall(
                        r'\\' + command + r'{([0-9]+)-([0-9]+)}', line):
                    # Column numbers are passed as int, like every other
                    # excel_addr call in this function
                    loc_1 = excel_addr(row, int(first))
                    loc_2 = excel_addr(row, int(last))
                    for cell in ws[f'{loc_1}:{loc_2}'][0]:
                        cell.border = Border(top=rule_side)
            row, col = row + 1, 1