def parse_multicolumn(cls, s):
    r"""Parse the first ``\multicolumn`` command found in ``s``.

    :param s: LaTeX cell text to search.
    :returns: ``None`` when no ``\multicolumn`` command matches; otherwise a
        3-tuple ``(size, col_align, text)`` where

        - ``size`` is the merged-cell size, always ``(rows, cols)``; ``rows``
          is ``1`` when the cell does not also contain a parsable
          ``\multirow``,
        - ``col_align`` is the raw content of the second brace argument,
        - ``text`` is the content of the third brace argument, with any
          ``\multirow`` wrapper removed (text around it is kept).

    >>> TexTabular.parse_multicolumn(r'\multicolumn{2}{|c|}{aa\multirow{3}*{特点}bb}')
    ((3, 2), '|c|', 'aa特点bb')
    """
    # 1 Basic pattern match. The last argument uses a deeper bracket pattern
    #   than the first two because cell text may nest more braces.
    match = re.search(r'\\multicolumn' + grp_bracket(3, inner=True) * 2
                      + grp_bracket(5, inner=True), s)
    if not match:
        return None

    # 2 Extract the argument values.
    n_cols, col_align, text = match.groups()
    n_cols = int(n_cols)

    # 3 An embedded \multirow contributes the row span. The substring test is
    #   only a cheap pre-check: parse_multirow returns None when the text
    #   merely mentions "multirow" without a parsable command, in which case
    #   the cell is treated as a single row and the text is left untouched.
    n_rows = 1
    if 'multirow' in text:
        parsed = cls.parse_multirow(text, brace_text_only=False)
        if parsed is not None:
            n_rows, _, _, _, text = parsed

    return (n_rows, n_cols), col_align, text
def grp_figure(cnt_groups=0, parpic=False):
    r"""Build a regex source string that matches figure-insertion commands.

    The commands covered are ``\includegraphics``, ``\figt``, ``\figc``,
    ``\figr`` and ``\fig``.

    Historical notes from the original author:

    - ``\captionfig{3-3.eps}{图~3}`` was seen in the project
      ``D:\2017LaTeX\D招培试卷\高中地理`` (it is not part of the pattern
      built here).
    - Several picture formats have been in use since the autumn-2018
      textbooks.
    - Update 2019-12-24: the matched picture name no longer includes the
      surrounding braces.

    :param cnt_groups: how many capture groups the pattern should expose
        (per the original author's notes; the groups inside the brace part
        come from ``grp_bracket``):

        - ``0``: no grouping of the command name,
        - ``1``: only the content inside the picture braces,
        - ``2``: the command name and the picture brace content,
        - ``3``: the command name, the text between the command and the
          braces, and the picture brace content.

        Any other value yields ``None``.
    :param parpic: when true (and a pattern was built), wrap the pattern so
        that it matches the figure command inside ``\parpic[.]{...}``.
    :return: the regex source string, or ``None`` for an unsupported
        ``cnt_groups``.
    """
    ibrace3 = grp_bracket(3, inner=True)

    # In every alternation the bare `fig` must come last: it is a prefix of
    # figt/figc/figr and would otherwise shadow them.
    if cnt_groups == 0:
        s = r'\\(?:includegraphics|figt|figc|figr|fig).*?' + grp_bracket(3)
    elif cnt_groups == 1:
        s = r'\\(?:includegraphics|figt|figc|figr|fig).*?' + ibrace3
    elif cnt_groups == 2:
        s = r'\\(includegraphics|figt|figc|figr|fig).*?' + ibrace3
    elif cnt_groups == 3:
        s = r'\\(includegraphics|figt|figc|figr|fig)(.*?)' + ibrace3
    else:
        s = None

    if s and parpic:
        s = r'{?\\parpic(?:\[.\])?{' + s + r'}*'

    return s
def parse_align(cls, s):
    r"""Parse the column-alignment spec of a LaTeX table header.

    The full LaTeX rules are complicated; only the commonly used features
    are handled here. Vertical bars and advanced alignment modifiers are
    ignored.

    :param s: the column spec text, with or without its outer braces.
    :return: a ``str`` whose length is the number of columns; each character
        is the alignment letter of one column (this could later be extended
        to carry richer alignment information). An empty spec yields ``''``.

    >>> TexTabular.parse_align('{|c|c|c|c|c|c|c|c|c<{}|c|}')
    'cccccccccc'
    >>> TexTabular.parse_align('{|c|w{6em}|w{23mm}|w{47mm}|w{22mm}|}')
    'cwwww'
    >>> TexTabular.parse_align('cc*{8}{l}')
    'ccllllllll'
    >>> TexTabular.parse_align('|c|')
    'c'
    >>> TexTabular.parse_align('|c|*{2}{m{38mm}<{\\centering}|}')
    'cmm'
    """
    # Expand the `*{n}{column spec}` repetition syntax. A bare `*n` is first
    # normalised to `*{n}` so the second pattern can match it.
    s = re.sub(r'\*(\d+)', r'*{\1}', s)
    s = re.sub(r'\*{(\d+)}' + grp_bracket(3, inner=True),
               lambda m: m.group(2) * int(m.group(1)), s)

    # Strip one outer pair of braces. startswith/endswith are used instead of
    # indexing so that an empty spec does not raise IndexError.
    if s.startswith('{') and s.endswith('}'):
        s = s[1:-1]

    # Remove brace arguments (widths, >{...}/<{...} payloads), then the
    # remaining decoration characters, leaving only the column letters.
    s = re.sub(r'{.*?}', '', s)
    return s.translate(str.maketrans('', '', '|<>!'))
def core(s):
    r"""Return the spans of ``\lewis`` / ``\Lewis`` formula runs in ``s``.

    A run is a sequence of lewis commands and bare ``H`` atoms, optionally
    joined by ``~``; only runs that actually contain a lewis command are
    reported.

    ``inner`` is read from the enclosing scope; this locator has no
    inner-content mode, so a truthy ``inner`` is rejected.

    :param s: text to scan.
    :return: list of ``(start, end)`` spans, in order of appearance.
    :raises ValueError: if ``inner`` is truthy.
    """
    if inner:
        # Locating only the content inside the \lewis{...} braces is not supported.
        raise ValueError(
            r"lewis模式没有inner模式,如果需要可以使用NestEnv(s).inner(r'\lewis{')")

    # Basic pattern for a single lewis command.
    lewis = r'\\(l|L)ewis' + grp_bracket(5, inner=True)
    # Extended pattern: allow neighbouring commands and H atoms to join the run.
    run_pattern = rf'(H?~*{lewis}\s*|~*H)*(~*{lewis}|~*H)'

    spans = []
    for hit in re.finditer(run_pattern, s):
        if 'lewis' in hit.group().lower():
            spans.append(hit.span(0))
    return spans
def brieftexstr(s):
    """Reduce a TeX string to a compact form, for comparing two TeX texts.

    Three steps are applied in order: ``%<...>`` annotations are removed,
    all whitespace is removed, and the result is case-folded.

    :param s: TeX source text.
    :return: the normalised string.
    """
    # 1 Remove percent annotations of the form %<...>
    annotation = r'%' + grp_bracket(2, '<', '>')
    without_notes = re.sub(annotation, r'', s)
    # 2 Remove every whitespace character
    compact = re.sub(r'\s+', '', without_notes)
    # 3 Case-fold
    return compact.casefold()
def core(s):
    r"""Locate every ``\name`` command in ``s`` together with its arguments.

    The settings ``name``, ``star``, ``optional``, ``linefeed``,
    ``max_bracket`` and ``min_bracket`` are read from the enclosing scope.
    For each command occurrence the span is extended over an optional ``*``
    (when ``star``), an optional ``[...]`` argument (when ``optional``) and
    then up to ``max_bracket`` brace arguments. An extension is accepted only
    if the whitespace preceding it contains at most ``linefeed`` newlines.

    :param s: text to scan.
    :return: list of ``[left, right]`` spans for occurrences that collected
        at least ``min_bracket`` brace arguments.
    """
    command_pattern = r'\\(' + name + r')(?![a-zA-Z])'

    def accepted(hit):
        # A candidate extension counts only if it matched, its leading
        # whitespace has few enough newlines, and its payload is non-empty.
        return hit and hit.group(1).count('\n') <= linefeed and hit.group(2)

    spans = []
    cursor = 0
    while True:
        head = re.search(command_pattern, s[cursor:])
        if not head:
            break
        begin = cursor + head.start()
        cursor = cursor + head.end()

        if star:
            star_hit = re.match(r'(\s*)(\*)', s[cursor:])
            if accepted(star_hit):
                cursor += star_hit.end()

        if optional:
            opt_hit = re.match(r'(\s*)(' + grp_bracket(5, '[') + ')', s[cursor:])
            if accepted(opt_hit):
                cursor += opt_hit.end()

        brace_pattern = r'(\s*)(' + grp_bracket(5) + ')'
        limit = max_bracket
        if max_bracket == float('inf'):
            command = head.group(1)
            if command in ('begin', 'end'):
                limit = 1  # these commands take exactly one brace argument
            if command in ('hfil', 'hfill'):
                limit = 0  # these commands take no brace argument

        taken = 0
        while taken < limit:
            brace_hit = re.match(brace_pattern, s[cursor:])
            if not accepted(brace_hit):
                break
            cursor += brace_hit.end()
            taken += 1

        if taken >= min_bracket:
            spans.append([begin, cursor])
    return spans
def parse_multirow(cls, s, brace_text_only=True):
    r"""Parse the first ``\multirow`` command found in ``s``.

    Unlike ``\multicolumn``, the first and the second brace argument of
    ``\multirow`` may each be followed by an optional ``[...]`` argument, and
    the second brace argument may be written as a bare ``*``. The contents of
    the two optional arguments are returned as-is, without further analysis.

    :param s: LaTeX cell text to search.
    :param brace_text_only: when true, return only the text inside the final
        braces. When false, the text of ``s`` before and after the command is
        concatenated around it (callers rely on this so that surrounding
        content is not lost).
    :returns: ``None`` when nothing matches; otherwise
        ``(n, bigstructs, width, fixup, text)`` where ``n`` is the row count
        (values from -1 to 1 are normalised to 1), ``bigstructs`` and
        ``fixup`` are the first and second optional arguments (or ``None``),
        and ``width`` is the second brace argument or ``'*'``.
    :raises ValueError: when the first argument is below -1 (merging upwards
        is not supported).

    >>> TexTabular.parse_multirow(r'\multirow{2}*{特点}')
    (2, None, '*', None, '特点')
    >>> TexTabular.parse_multirow(r'\multirow{2}{*}{特点}')
    (2, None, '*', None, '特点')
    >>> TexTabular.parse_multirow(r'aa\multirow{2}[bla1]{*}[bla2]{特点}bb', brace_text_only=False)
    (2, 'bla1', '*', 'bla2', 'aa特点bb')

    TODO: the first number of multirow may be negative, meaning the cell
    merges upwards by that many rows.
    """
    optional_arg = r'(?:\[(.*?)\])?'
    pattern = (r'\\multirow' + grp_bracket(3, inner=True) + optional_arg
               + r'(?:{(.*?)}|(\*))' + optional_arg + grp_bracket(5, inner=True))
    found = re.search(pattern, s)
    if not found:
        return None

    rows, bigstructs, braced_width, star_width, fixup, text = found.groups()
    width = braced_width or star_width

    if not brace_text_only:
        prefix, suffix = s[:found.start()], s[found.end():]
        text = prefix + text + suffix

    rows = int(rows)
    if rows < -1:
        raise ValueError(f'{s} 不支持解析multirow第一个值为负数,向上合并单元格的情况')
    if rows <= 1:
        rows = 1

    return rows, bigstructs, width, fixup, text
def init_from_latex(ws, content):
    r"""Fill an existing worksheet from a LaTeX ``tabular`` in ``content``.

    Deliberately not named ``from_latex``: ``ws`` has been created beforehand
    and this function only initialises it from LaTeX source.

    :param ws: the worksheet to fill.
        NOTE(review): the cell helpers ``mcell()``, ``in_range()`` and
        ``celltype()`` used below are not defined in this block - presumably
        project extensions on openpyxl cells; confirm against the project.
    :param content: LaTeX source containing ``\begin{tabular}{...}`` ...
        ``\end{tabular}``. The column spec of the first ``tabular`` header
        provides the default alignment of every column.
    """
    from openpyxl.styles import Border, Alignment, Side
    from pyxllib.text.pupil import grp_bracket
    from pyxllib.text.latex import TexTabular

    BRACE2 = grp_bracket(2, inner=True)
    BRACE5 = grp_bracket(5, inner=True)

    # One header pattern is shared by parsing and stripping. Previously the
    # stripping pattern did not allow whitespace after \begin{tabular}, so a
    # header like `\begin{tabular} {ccc}` was parsed but then left in the
    # body, leaking into the first cell.
    header_pattern = r'\\begin{tabular}\s*(?:\[.*?\])?\s*' + BRACE5
    rule_commands = (r'\midrule', r'\toprule', r'\bottomrule', r'\hline')
    h_align = {'l': 'left', 'c': 'center', 'r': 'right'}

    # A single uniform border style for now; rules are not distinguished.
    side = Side(border_style='thin', color='000000')

    def draw_top_border(row_no, first_col, last_col):
        """Give every cell of row ``row_no`` in the column range a top border."""
        first_loc = excel_addr(row_no, first_col)
        last_loc = excel_addr(row_no, last_col)
        for cell in ws[f'{first_loc}:{last_loc}'][0]:
            cell.border = Border(top=side)

    # Header: the column spec gives the per-column alignment letters.
    data_tex = re.search(header_pattern, content).group(1)
    col_pos = TexTabular.parse_align(data_tex)
    total_col = len(col_pos)

    # Strip the begin/end markers, leaving only the table body.
    s = re.sub(header_pattern, '', re.sub(r'\\end{tabular}', '', content))
    row, col = 1, 1

    # Rows are separated by `\\` (not followed by `{`); whatever follows the
    # last `\\` is discarded.
    for line in re.split(r'\\\\(?!{)', s)[:-1]:
        # 1 Fill the cells of the current row.
        # Remove rule commands and newlines so only cell content remains.
        cur_line = re.sub(r'\\cmidrule' + BRACE2, '', line)
        cur_line = re.sub(r'\\cline' + BRACE2, '', cur_line)
        for token in rule_commands + ('\n',):
            cur_line = cur_line.replace(token, '')

        for item in cur_line.strip().split('&'):
            item = item.strip()
            cur_loc = excel_addr(row, col)

            if 'multicolumn' in item:
                size, align, text = TexTabular.parse_multicolumn(item)
                # Without an explicit alignment, use the column's default.
                align = TexTabular.parse_align(align) if align else col_pos[col - 1]
                n_rows, n_cols = size
                # Horizontal alignment defaults to left.
                align = h_align.get(align, 'left')
                cell = ws[cur_loc].mcell()
                if cell.value:
                    cell.value += '\n' + text
                else:
                    cell.value = text
                ws[cur_loc].alignment = Alignment(horizontal=align, vertical='center')
                merge_loc = excel_addr(row + n_rows - 1, col + n_cols - 1)
                ws.merge_cells(f'{cur_loc}:{merge_loc}')
                col += n_cols
            elif 'multirow' in item:
                n_rows, _, _, _, text = TexTabular.parse_multirow(
                    item, brace_text_only=False)
                try:
                    ws[cur_loc] = text
                except AttributeError:
                    # The target overlaps an existing merged range: shrink the
                    # old range so it ends on the previous row, then write the
                    # new cell. E.g. if A1:A3 was merged and A3 must now stand
                    # alone, the old range becomes A1:A2.
                    rng = ws[cur_loc].in_range()
                    ws.unmerge_cells(rng.coord)
                    ws.merge_cells(re.sub(r'\d+$', f'{row - 1}', rng.coord))
                    ws[cur_loc] = text
                align = h_align.get(col_pos[col - 1], 'left')
                ws[cur_loc].alignment = Alignment(horizontal=align, vertical='center')
                merge_loc = excel_addr(row + n_rows - 1, col)
                ws.merge_cells(f'{cur_loc}:{merge_loc}')
                col += 1
            else:
                # Only cells whose celltype() is 0 are written; the column
                # still advances for the others.
                if ws[cur_loc].celltype() == 0:
                    ws[cur_loc].value = item
                    align = h_align.get(col_pos[col - 1], 'left')
                    ws[cur_loc].alignment = Alignment(horizontal=align, vertical='center')
                col += 1

        # 2 Borders and other formatting, based on the untouched row text.
        if any(command in line for command in rule_commands):
            # A full-width rule: draw across the whole row.
            draw_top_border(row, 1, total_col)
        if r'\cmidrule' in line:
            for first, last in re.findall(r'\\cmidrule{([0-9]+)-([0-9]+)}', line):
                draw_top_border(row, first, last)
        if r'\cline' in line:
            for first, last in re.findall(r'\\cline{([0-9]+)-([0-9]+)}', line):
                draw_top_border(row, first, last)

        row, col = row + 1, 1