def is_overparen(self):
    """Recognise a two-row table that stands for an over-parenthesis.

    The first row must consist of one table-free cell that matches one of
    the known marker keys (checked through ``_is_in``); the second row
    supplies the single operand.

    Returns:
        True after setting ``node_format`` / ``value_strs`` when the layout
        matches, False otherwise.

    Raises:
        ParseFormatError: the marker matched but the second row does not
            hold exactly one cell.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False

    marker_cells = find_valid_elements(rows[0], '<td', flags=re.I)
    if len(marker_cells) != 1 or '<table' in marker_cells[0]:
        return False

    hash_keys = [
        '7c229fcab3c2cd74bf9054ddd5f18383',
        'b339cff44a322d9b88562650f4ed6061',
    ]
    path_keys = ['/part/94.png']
    # Only the order of the candidate list depends on ``self.md5``.
    if self.md5 is True:
        candidates = path_keys + hash_keys
    else:
        candidates = hash_keys + path_keys
    if not _is_in(candidates, marker_cells[0]):
        return False

    operands = find_valid_elements(rows[1], '<td', with_tag=False,
                                   flags=re.I)
    if len(operands) != 1:
        raise ParseFormatError('[is_overparen]:{}'.format(
            self.html_string))

    self.node_format = r'\overparen{{{}}}'
    self.value_strs = operands
    return True
def is_subsup2(self):
    """Recognise the nested-table form of a base with super- and subscript.

    The outer table has two rows / two cells.  The second cell carries the
    ``font-size:90%`` style and becomes the subscript; the first cell wraps
    an inner two-row table whose cells give the superscript and the base.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.
    """
    outer_rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(outer_rows) != 2:
        return False

    outer_cells = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(outer_cells) != 2:
        return False

    # Only the opening tag of the second cell is inspected for the style.
    open_tag_end = outer_cells[1].find('>') + 1
    if 'style="font-size:90%"' not in outer_cells[1][:open_tag_end]:
        return False

    inner_html = remove_start_tag(outer_cells[0])
    inner_rows = find_valid_elements(inner_html, '<tr', flags=re.I)
    if len(inner_rows) != 2:
        return False
    if 'style="font-size: 90%"' not in inner_rows[0]:
        return False

    inner_cells = find_valid_elements(inner_html, '<td', with_tag=False,
                                      flags=re.I)
    if len(inner_cells) != 2:
        return False
    if '<table' in inner_cells[1]:
        return False

    subscript = remove_start_tag(outer_cells[1])
    self.node_format = '{}^{{{}}}_{{{}}}'
    self.value_strs = [inner_cells[1], inner_cells[0], subscript]
    return True
def is_sqrt_n(self):
    """Recognise a two-row table that stands for an n-th root.

    The second row must be one table-free cell matching a known radical
    marker key; the first row must then hold two cells, which are used as
    the index and the radicand in that order.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.

    Raises:
        ParseFormatError: the marker matched but the first row does not
            hold exactly two cells.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False

    marker_cells = find_valid_elements(rows[1], '<td', flags=re.I)
    if len(marker_cells) != 1 or '<table' in marker_cells[0]:
        return False

    hash_keys = [
        '02d3676c15d163b68f7c8690c7237c0a',
        'b5a0e6f59b3540d7c2e032a6d5cc5ae7',
    ]
    path_keys = ['/part/8730D.png']
    # Only the order of the candidate list depends on ``self.md5``.
    if self.md5 is True:
        candidates = hash_keys + path_keys
    else:
        candidates = path_keys + hash_keys
    if not _is_in(candidates, marker_cells[0]):
        return False

    parts = find_valid_elements(rows[0], '<td', with_tag=False, flags=re.I)
    if len(parts) != 2:
        raise ParseFormatError('[is_sqrt_n]:{}'.format(self.html_string))

    self.node_format = r'\sqrt[{}]{{{}}}'
    self.value_strs = parts
    return True
def is_underset(self):
    r"""Recognise a two-row, two-cell table as an ``\underset`` construct.

    The second cell must carry ``style="font-size: 90%"`` in its opening
    tag.  When the first cell is table-free and matches one of the integral
    marker keys, the result is ``\underset{<second cell>}{\int}``;
    otherwise both cells are emitted as ``\underset{<second>}{<first>}``.

    Returns:
        True after setting ``node_format`` / ``value_strs`` when the layout
        matches, False otherwise.
    """
    trs = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(trs) != 2:
        return False
    tds = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(tds) != 2:
        return False

    # Look for the style only inside the opening tag of the second cell.
    index = tds[1].find('>') + 1
    if 'style="font-size: 90%"' not in tds[1][:index]:
        return False

    # for uncommon integral
    _keys1 = [
        '7ea7ce25490319b1bc0a30f02283c465',
        '3d579d20afec8779d54985a5acf51879'
    ]
    _keys2 = ['/part/8747.png']
    # Only the order of the candidate list depends on ``self.md5``.
    if self.md5 is False:
        candidates = _keys2 + _keys1
    else:
        candidates = _keys1 + _keys2
    if _is_in(candidates, tds[0]) and '<table' not in tds[0]:
        tds = [remove_start_tag(td) for td in tds]
        # Raw string: the old '{{\int}}' relied on the invalid escape '\i'.
        self.node_format = r'\underset{{{}}}{{\int}}'
        self.value_strs = tds[-1:]
        return True

    tds = [remove_start_tag(td) for td in tds]
    self.node_format = r'\underset{{{}}}{{{}}}'
    self.value_strs = tds[::-1]
    return True
def is_sqrt(self):
    """Recognise a one-row, two-cell table that stands for a square root.

    The first cell must match one of the known radical marker keys; the
    second cell is used as the radicand.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 1:
        return False

    cells = find_valid_elements(rows[0], '<td', with_tag=False, flags=re.I)
    # A falsy result (nothing found) is rejected the same way as a wrong
    # number of cells.
    if not cells or len(cells) != 2:
        return False

    hash_keys = [
        '02d3676c15d163b68f7c8690c7237c0a',
        'b5a0e6f59b3540d7c2e032a6d5cc5ae7',
    ]
    path_keys = ['/part/8730D.png']
    if self.md5 is True:
        candidates = hash_keys + path_keys
    else:
        candidates = path_keys + hash_keys
    if not _is_in(candidates, cells[0]):
        return False

    self.node_format = r'\sqrt{{{}}}'
    self.value_strs = [cells[1]]
    return True
def is_underbrace(self, html_string=None):
    """Recognise a two-row, two-cell table that stands for an underbrace.

    Args:
        html_string: markup to inspect; ``self.html_string`` is used when
            this is None or otherwise falsy.

    Returns:
        True after setting ``node_format`` / ``value_strs`` when the second
        cell matches one of the brace marker keys, False otherwise.
    """
    markup = html_string or self.html_string

    rows = find_valid_elements(markup, '<tr>', flags=re.I)
    if len(rows) != 2:
        return False
    cells = find_valid_elements(markup, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    hash_keys = [
        '8cd1f5182b8ab176140b3249472323ac',
        '99129c72930ac651065df803ef39d322',
    ]
    path_keys = ['/part/H123U.png']
    if self.md5 is True:
        candidates = hash_keys + path_keys
    else:
        candidates = path_keys + hash_keys
    if not _is_in(candidates, cells[1]):
        return False

    braced = remove_start_tag(cells[0])
    self.node_format = '\\underbrace{{{}}}'
    self.value_strs = [braced]
    return True
def is_overrightarrow(self):
    """Recognise a two-row table that stands for an arrow drawn over a term.

    The first row must be one table-free cell matching a known arrow marker
    key; the second row supplies the single operand.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.

    Raises:
        ParseFormatError: the marker matched but the second row does not
            hold exactly one cell.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False

    marker_cells = find_valid_elements(rows[0], '<td', flags=re.I)
    if len(marker_cells) != 1 or '<table' in marker_cells[0]:
        return False

    hash_keys = [
        '6bd6a72411898992479ded42ef82f09c',
        'a11fdee4d4b49f96bbe3814ad58f817b',
    ]
    path_keys = ['/part/8594.png']
    # Only the order of the candidate list depends on ``self.md5``.
    if self.md5 is True:
        candidates = path_keys + hash_keys
    else:
        candidates = hash_keys + path_keys
    if not _is_in(candidates, marker_cells[0]):
        return False

    operands = find_valid_elements(rows[1], '<td', with_tag=False,
                                   flags=re.I)
    if len(operands) != 1:
        raise ParseFormatError('[is_overrightarrow]:{}'.format(
            self.html_string))

    self.node_format = r'\overrightarrow{{{}}}'
    self.value_strs = operands
    return True
def is_stackrel(self):
    r"""Recognise a two-row, two-cell table with a small annotation on top.

    The first cell must carry ``style="font-size: 90%"`` in its opening
    tag.  Depending on what the second cell matches, the result is
    ``\xrightarrow``, ``\xrightleftharpoons`` or a generic ``\stackrel``.

    Returns:
        True after setting ``node_format`` / ``value_strs``; False when the
        layout does not match, or when the harpoon marker is found in a
        second cell that also contains a table.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False
    cells = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    open_tag_end = cells[0].find('>') + 1
    if 'style="font-size: 90%"' not in cells[0][:open_tag_end]:
        return False

    def ordered(hash_keys, path_keys):
        # Only the order of the candidate list depends on ``self.md5``.
        if self.md5 is False:
            return path_keys + hash_keys
        return hash_keys + path_keys

    # Annotated right arrow (formerly treated as "underrightarrow").
    if '<table' not in cells[1]:
        arrow_keys = ordered(
            ['6bd6a72411898992479ded42ef82f09c',
             'a11fdee4d4b49f96bbe3814ad58f817b'],
            ['/part/8594.png'])
        if _is_in(arrow_keys, cells[1]):
            label = remove_start_tag(cells[0])
            self.node_format = '\\xrightarrow{{{}}}'
            self.value_strs = [label]
            return True

    # Annotated reversible-reaction harpoons, e.g.
    # http://www.jyeoo.com/bio2/ques/detail/7003bd54-0861-4152-985a-3c7678ade246
    harpoon_keys = ordered(
        ['ee5caff647b9b23babff8113e9147821',
         '341e5bdc05923e0895e330d9c062477f'],
        ['/part/8652L.png'])
    if _is_in(harpoon_keys, cells[1]):
        if '<table' in cells[1]:
            return False
        label = remove_start_tag(cells[0])
        self.node_format = '\\xrightleftharpoons{{{}}}'
        self.value_strs = [label]
        return True

    # Fallback: plain stacked relation built from both cells.
    stripped = [remove_start_tag(cell) for cell in cells]
    self.node_format = r'\stackrel{{{}}}{{{}}}'
    self.value_strs = stripped
    return True
def is_xrightarrow(self):
    r"""Recognise a three-row table: text above, arrow marker, text below.

    The middle cell decides the construct: a right-arrow marker yields
    ``\xrightarrow`` (with the optional lower argument only when the third
    cell is not blank); a harpoon marker yields ``\xrightleftharpoons``.

    Returns:
        True after setting ``node_format`` / ``value_strs``; False when
        nothing matches or the matching middle cell contains a table.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 3:
        return False
    cells = find_valid_elements(self.html_string, '<td', with_tag=False,
                                flags=re.I)
    if len(cells) != 3:
        return False
    above, middle, below = cells[0], cells[1], cells[2]

    def ordered(hash_keys, path_keys):
        # Only the order of the candidate list depends on ``self.md5``.
        if self.md5 is False:
            return path_keys + hash_keys
        return hash_keys + path_keys

    arrow_keys = ordered(
        ['6bd6a72411898992479ded42ef82f09c',
         'a11fdee4d4b49f96bbe3814ad58f817b'],
        ['/part/8594.png'])
    if _is_in(arrow_keys, middle):
        if '<table' in middle:
            return False
        if below.strip():
            self.node_format = '\\xrightarrow[{}]{{{}}}'
            self.value_strs = [below, above]
        else:
            self.node_format = '\\xrightarrow{{{}}}'
            self.value_strs = cells[:1]
        return True

    # \xrightleftharpoons is defined as:
    #   \Newextarrow{\xrightleftharpoons}{10,10}{0x21CC}
    # (\Newextarrow lives in "extpfeil.js").
    harpoon_keys = ordered(
        ['ee5caff647b9b23babff8113e9147821',
         '341e5bdc05923e0895e330d9c062477f'],
        ['/part/8652L.png'])
    if _is_in(harpoon_keys, middle):
        if '<table' in middle:
            return False
        self.node_format = '\\xrightleftharpoons[{}]{{{}}}'
        self.value_strs = [below, above]
        return True

    return False
def is_sum(self):
    r"""Recognise a three-row table as a limit, a sum or a product.

    The middle cell selects the operator: the literal text ``lim`` gives
    ``\lim\limits_{<third>}``; a sum or product marker key gives
    ``\sum`` / ``\prod`` with the first cell as upper and the third cell as
    lower bound.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 3:
        return False
    cells = find_valid_elements(self.html_string, '<td', with_tag=False,
                                flags=re.I)
    if len(cells) != 3:
        return False
    upper, operator, lower = cells[0], cells[1], cells[2]
    if '<table' in operator:
        return False

    if operator == 'lim':
        self.node_format = r'\lim\limits_{{{}}}'
        self.value_strs = [lower]
        return True

    # (hash keys, path keys, LaTeX template); tried in this order.
    big_operators = (
        (['19d3e9593386103f95e71affc87e62ea',
          '5f4100b557a7c8116b2a45e4435b67ae'],
         ['/part/8721.png'],
         r'\sum^{{{}}}_{{{}}}'),
        (['b9331d3ee2218a431c9203512001f479',
          '9b0bbd95adbebda854a4ec3b1c2ab2e6'],
         ['/part/8719.png'],
         r'\prod^{{{}}}_{{{}}}'),
    )
    for hash_keys, path_keys, template in big_operators:
        # Only the order of the candidate list depends on ``self.md5``.
        if self.md5 is False:
            candidates = path_keys + hash_keys
        else:
            candidates = hash_keys + path_keys
        if _is_in(candidates, operator):
            self.node_format = template
            self.value_strs = [upper, lower]
            return True

    return False
def get_sub_nodes(self, value_str):
    """Split ``value_str`` into child nodes around its ``<table`` elements.

    Each table found by ``find_valid_elements`` becomes a ``Node`` holding
    the table markup.  Every non-blank stretch of text before, between or
    after the tables becomes a string ``Node`` (``is_str`` set, format
    ``'{}'``) whose content has been passed through ``latex_excape``.

    Returns:
        The nodes in document order.
    """

    def text_node(fragment):
        # Build the node used for plain text between tables.
        node = Node()
        node.md5 = self.md5
        node.html_string = latex_excape(fragment)
        node.is_str = True
        node.node_format = '{}'
        return node

    nodes = []
    cursor = 0
    for table in find_valid_elements(value_str, '<table', flags=re.I):
        # Search from the cursor so repeated identical tables map to
        # successive positions.
        start = value_str.find(table, cursor)
        leading = value_str[cursor:start]
        if leading.strip():
            nodes.append(text_node(leading))
        cursor = start + len(table)

        table_node = Node()
        table_node.md5 = self.md5
        table_node.html_string = table
        nodes.append(table_node)

    trailing = value_str[cursor:]
    if trailing.strip():
        nodes.append(text_node(trailing))
    return nodes
def test_find_valid_elements(self):
    """find_valid_elements on the fixture yields the two expected elements."""
    expected = [
        '<p> p1 <div id="div">\n <p> target </p> </div> <pre>\n code</pre> </p>',
        '<b> </b>',
    ]
    found = find_valid_elements(self.html_string)
    self.assertEqual(found, expected)
def is_integral(self):
    r"""Recognise a two-row, two-cell table whose second cell is small text.

    When the opening tag of the second cell carries
    ``style="font-size: 90%"`` the table is emitted as
    ``\underset{<second cell>}{<first cell>}``.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False
    cells = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    # Look for the style only inside the opening tag of the second cell.
    open_tag_end = cells[1].find('>') + 1
    if 'style="font-size: 90%"' not in cells[1][:open_tag_end]:
        return False

    contents = [remove_start_tag(cell) for cell in cells]
    self.node_format = '\\underset{{{}}}{{{}}}'
    self.value_strs = contents[::-1]
    return True
def record_questions(self, html_string, subj):
    """Save every ``<table `` element of the page that carries a question id.

    For each element in which ``re_qid`` finds a match, the element is
    passed to ``save_html`` under the key ``'manfen5_zujuan_qs_' + <id>``
    together with ``{'subj': subj}``.

    Returns:
        The number of elements that were saved.
    """
    saved = 0
    for table_html in find_valid_elements(html_string, '<table '):
        match = re_qid.search(table_html)
        if not match:
            continue
        key = 'manfen5_zujuan_qs_' + match.group(1)
        save_html(key, table_html, {'subj': subj})
        saved += 1
    return saved
def get_spans(html_string):
    """Wrap the ``<span `` / ``<img `` elements of ``html_string`` in ``Span``.

    ``<img ...>`` tags are first given an explicit ``</img>`` closing tag so
    that they can be picked up by ``find_valid_elements`` like the spans.

    Returns:
        A list with one ``Span`` per element found, in the order returned
        by ``find_valid_elements``.
    """
    imgs = get_html_element('<img ', html_string, only_tag=True)
    # ``str.replace`` rewrites every occurrence at once, so each distinct
    # tag must be processed only once; otherwise a tag that appears twice
    # would end up with two ``</img>`` closers.
    for img in set(imgs):
        html_string = html_string.replace(img, img + '</img>')

    spans = find_valid_elements(html_string, '<(span|img) ',
                                with_tag=True, regex=True)
    return [Span(span) for span in spans]
def is_align(self):
    r"""Recognise a left-aligned inner table and emit an ``align`` block.

    e.g.
    \(\begin{align} -CAGGATCCC & - \\ -GTCCTAGGG & - \end{align}\)

    The outer table must have one row with three cells: two empty cells
    around a cell that starts with a ``<table`` whose opening tag contains
    ``text-align: left``.  Every cell of the inner table becomes one value;
    cells in a row are joined with ``&`` and rows with ``\\``.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.
    """
    outer_rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(outer_rows) != 1:
        return False
    outer_cells = find_valid_elements(self.html_string, '<td',
                                      with_tag=False, flags=re.I)
    if len(outer_cells) != 3:
        return False
    if outer_cells[0] != '' or outer_cells[2] != '':
        return False

    inner_table = outer_cells[1]
    if not inner_table.startswith('<table'):
        return False
    open_tag_end = inner_table.find('>') + 1
    if 'text-align: left' not in inner_table[:open_tag_end]:
        return False

    values = []
    row_templates = []
    for row in find_valid_elements(inner_table, '<tr', flags=re.I):
        row_cells = find_valid_elements(row, '<td', with_tag=False,
                                        flags=re.I)
        values += row_cells
        row_templates.append(' & '.join(['{}'] * len(row_cells)))

    body = ' \\\\ '.join(row_templates)
    self.node_format = '\\begin{{align}} %s \\end{{align}}' % body
    self.value_strs = values
    return True
def is_xlongequal2(self, html_string=None):
    r"""Recognise a two-cell table whose second cell is a drawn rule.

    case http://www.jyeoo.com/chemistry/ques/detail/b6e8a3b9-d9e5-48ae-903d-40fd3f05e969

    Args:
        html_string: markup to inspect; ``self.html_string`` is used when
            this is None or otherwise falsy.

    Returns:
        True after setting ``node_format`` to ``\xlongequal{...}`` with the
        first cell as its value, False otherwise.
    """
    markup = html_string or self.html_string
    cells = find_valid_elements(markup, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    # The rule cell is matched literally, with or without the space after
    # "font-size:".
    rule_cells = (
        '<td style="font-size: 90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>',
        '<td style="font-size:90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>',
    )
    if cells[1] not in rule_cells:
        return False

    label = remove_start_tag(cells[0])
    self.node_format = '\\xlongequal{{{}}}'
    self.value_strs = [label]
    return True
def is_subsup(self):
    """Recognise a one-row table pairing a base with an ``msubsup`` cell.

    The second cell must contain two ``<div ... "msubsup...`` elements, one
    tagged ``msubsup_sup`` and one ``msubsup_sub`` in its opening tag.

    Returns:
        True after setting ``node_format`` / ``value_strs`` (base,
        superscript, subscript), False when the layout does not match.

    Raises:
        ParseFormatError: the ``msubsup`` cell does not contain exactly two
            such divs, or a div is neither the sup nor the sub variant.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 1:
        return False

    raw_cells = find_valid_elements(rows[0], '<td', with_tag=True,
                                    flags=re.I)
    cells = [remove_start_tag(cell) for cell in raw_cells]
    if len(cells) != 2:
        return False
    if '"msubsup' not in cells[1]:
        return False

    script_divs = find_valid_elements(cells[1], '<div [^<>]+"msubsup',
                                      regex=True, flags=re.I)
    if len(script_divs) != 2:
        raise ParseFormatError('[is_subsup]:{}'.format(self.html_string))

    base = cells[0]
    subscript = ''
    superscript = ''
    for script_div in script_divs:
        # The role is decided from the opening tag only.
        opening = script_div[:script_div.find('>') + 1]
        if 'msubsup_sup' in opening:
            superscript = remove_start_tag(script_div)
        elif 'msubsup_sub' in opening:
            subscript = remove_start_tag(script_div)
        else:
            raise ParseFormatError('[is_subsup][not sub or sup]:{}'.format(
                self.html_string))

    self.node_format = '{{{}}}^{{{}}}_{{{}}}'
    self.value_strs = [base, superscript, subscript]
    return True
def _discard_mathml_displaystyle_for_subsup(mathml):
    """Drop ``<mstyle displaystyle`` wrappers found inside sub/superscripts.

    Fractions inside ``<msub`` / ``<msup`` elements do not need display
    style, so each distinct sub/sup content is passed through
    ``remove_tag`` and substituted back into the MathML once.

    Returns:
        The MathML string after the substitutions.
    """
    contents = find_valid_elements(mathml, '<msu(b|p)', regex=True,
                                   with_tag=False)
    # De-duplicate, then hand the list to ``sort_by_len`` with
    # ``reverse=True`` before substituting.
    ordered = sort_by_len(list(set(contents)), reverse=True)
    for content in ordered:
        cleaned = remove_tag('<mstyle displaystyle', content, all=False)
        mathml = mathml.replace(content, cleaned, 1)
    return mathml
def _restore_latex(html_string):
    r"""Turn ``<latextag>...</latextag>`` placeholders into LaTeX spans.

    Every element returned by ``find_valid_elements`` for ``<latextag>`` is
    stripped of all ``<latextag>`` / ``</latextag>`` markers (so nested
    placeholders collapse into the outermost one) and wrapped as
    ``<span class="afanti-latex">\(##delspace##...\)</span>``.

    Returns:
        ``html_string`` with the placeholders replaced.
    """
    for latextag in find_valid_elements(html_string, '<latextag>'):
        inner = latextag.replace('<latextag>', '').replace('</latextag>', '')
        # Raw string: the previous literal relied on the invalid escape
        # sequences '\(' and '\)'; the resulting text is unchanged.
        wrapped = r'<span class="afanti-latex">\(##delspace##{}\)</span>'.format(
            inner)
        html_string = html_string.replace(latextag, wrapped)
    return html_string
def is_lim(self):
    r"""Recognise a two-row table with ``lim`` on top of its subscript.

    The first row must be a single table-free cell that is exactly
    ``<td>lim</td>``; the second row supplies the single cell placed under
    the limit.

    Returns:
        True after setting ``node_format`` / ``value_strs``, else False.

    Raises:
        ParseFormatError: the first row matched but the second row does not
            hold exactly one cell.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False

    head_cells = find_valid_elements(rows[0], '<td', flags=re.I)
    if len(head_cells) != 1 or '<table' in head_cells[0]:
        return False
    if '<td>lim</td>' not in head_cells[0]:
        return False

    under_cells = find_valid_elements(rows[1], '<td', with_tag=False,
                                      flags=re.I)
    if len(under_cells) != 1:
        raise ParseFormatError('[is_lim]:{}'.format(self.html_string))

    self.node_format = r'\lim\limits_{{{}}}'
    # The cell is stored as is; the substitution of '→' by r'\to ' that
    # used to be applied here is disabled.
    self.value_strs = [under_cells[0]]
    return True
def find_table_options(html_string):
    """Find ``<table`` elements whose ``<td`` contents start with A, B, C, D...

    Tables with fewer than three cells are skipped.  For the rest, the cell
    contents are stripped of everything ``re_tag`` matches and of
    surrounding whitespace, then checked with ``_startswith_abcd``.

    Returns:
        A list of ``[table, tds]`` pairs, where ``tds`` are the unmodified
        cell contents returned by ``get_html_element``.
    """
    matches = []
    for table in find_valid_elements(html_string, '<table', flags=re.I):
        cells = get_html_element('<td', table, with_tag=False, flags=re.I)
        if len(cells) < 3:
            continue
        cell_texts = [re_tag.sub('', cell).strip() for cell in cells]
        if _startswith_abcd(cell_texts):
            matches.append([table, cells])
    return matches
def no_table_format(html_string):
    r"""Convert table-free HTML markup into LaTeX-flavoured text.

    Steps, in order:

    1. ``<img`` elements are handed to ``convert_img_to_latex``.
    2. ``...limit{s}`` placeholders become ``\prod\limits`` / ``\sum\limits``
       / ``\int\limits`` and ``%`` is escaped.
    3. ``<bdo ... underpoint`` elements become ``\underset{˙}{...}``.
    4. ``<span `` elements with ``vertical-align:sub`` / ``:sup`` in their
       opening tag become ``_{...}`` / ``^{...}``; the scan is repeated so
       spans exposed by an earlier replacement are handled too.
    5. Bare ``<span>`` / ``<font>`` tags and their closers are removed.

    Returns:
        The converted string.
    """
    if '<img' in html_string:
        html_string = convert_img_to_latex(html_string)

    # Raw strings: the previous literals relied on invalid escape sequences
    # ('\p', '\l', '\s', '\i'); the resulting text is unchanged.
    html_string = html_string.replace('∏limit{s}', r'\prod\limits ')
    html_string = html_string.replace('πlimit{s}', r'\prod\limits ')
    html_string = html_string.replace('∑limit{s}', r'\sum\limits ')
    html_string = html_string.replace('∫limit{s}', r'\int\limits ')
    html_string = html_string.replace('%', ' \\%')

    if 'underpoint' in html_string:
        underpoints = get_html_element('<bdo [^<>]+underpoint',
                                       html_string,
                                       with_tag=True,
                                       regex=True,
                                       flags=re.I)
        # ``replace`` rewrites every occurrence, so each distinct element
        # is handled once.
        for underpoint in set(underpoints):
            t = remove_start_tag(underpoint)
            underpoint_tex = '\\underset{{˙}}{{{}}}'.format(t)
            html_string = html_string.replace(underpoint, underpoint_tex)

    while True:
        spans = find_valid_elements(html_string, '<span ', flags=re.I)
        if not spans:
            break
        before_pass = html_string
        for span in spans:
            index = span.find('>') + 1
            if 'vertical-align:sub' in span[:index]:
                n_span = remove_start_tag(span)
                html_string = html_string.replace(span, '_{%s}' % n_span, 1)
            elif 'vertical-align:sup' in span[:index]:
                n_span = remove_start_tag(span)
                html_string = html_string.replace(span, '^{%s}' % n_span, 1)
        if html_string == before_pass:
            # Only spans that are neither sub nor sup remain.  Another pass
            # would find exactly the same spans, so stop instead of looping
            # forever; those spans are left untouched.
            break

    html_string = re.sub(r'<(span|font)>', '', html_string, flags=re.I)
    html_string = re.sub(r'</(span|font)>', '', html_string, flags=re.I)
    return html_string
def subsup_to_latex(html_string):
    """
    Convert <sub>/<sup> elements to LaTeX "_{}" / "^{}".

    html_string must be unicode literals.

    Each element is replaced, together with the text chosen as its base,
    by a <latextag>...</latextag> placeholder.  The scan is repeated until
    find_valid_elements reports no more sub/sup elements; the placeholders
    are then converted by _restore_latex and the result is returned.
    """
    html_string = unity_brackets(html_string)
    while True:
        s_tags = find_valid_elements(html_string,
                                     '<(sub|sup)',
                                     regex=True,
                                     flags=re.I | re.U)
        if not s_tags:
            # Nothing left to convert: finalize the placeholders.
            html_string = _restore_latex(html_string)
            return html_string
        for s_tag in s_tags:
            # Position of the element in the *current* string (earlier
            # replacements in this pass have already shifted the text).
            index = html_string.find(s_tag)
            which_pre = ''
            try:
                # The base the script attaches to, taken from the text that
                # precedes the element.
                which_pre = _find_which_pre(html_string[:index])
            except Exception as err:
                # Best effort: log the failure and fall back to
                # _find_first_not_empty on the same preceding text.
                logging.error('[_find_which_pre]: {}'.format(err))
                which_pre = _find_first_not_empty(html_string[:index])
            # Script content without surrounding spaces / non-breaking
            # spaces and without trailing backslashes.
            item = remove_start_tag(s_tag).strip(' \xa0').rstrip('\\').strip(
                ' \xa0')
            if s_tag.lower().startswith('<sub'):
                tex = '<latextag>{{{}}}_{{{}}}</latextag>'.format(
                    which_pre.strip(' \xa0'), item)
            else:
                tex = '<latextag>{{{}}}^{{ {} ##addspace##}}</latextag>'.format(
                    which_pre.strip(' \xa0'), item)
            # Splice the placeholder over the base text plus the element.
            html_string = (html_string[:index - len(which_pre)] + tex +
                           html_string[index + len(s_tag):])
def parse(self, html_string, url, info):
    """Extract one question record from a page.

    The ``<td`` elements of the page are read by position: 0 is the
    question type, 2 the knowledge points, 3 the question body and 4 the
    worked answer.  Body and answer are normalised through
    ``Question.normialize`` before being stored.

    Args:
        html_string: page markup.
        url: page address; stored on ``self.url`` and in the record.
        info: passed to ``self.get_subject``.

    Returns:
        A dict of column values for the question.
    """
    self.url = url
    cells = find_valid_elements(html_string, '<td')

    body_html = self.get_question_html(cells[3])
    answer_html = self.get_jieda(cells[4])
    knowledge_points = self.get_kps(cells[2])
    type_name = self.get_question_type_name(cells[0])

    # Bring body and answer into the unified question style.
    normalised = Question(question_body=body_html,
                          jieda=answer_html).normialize()

    record = dict()
    record['question_html'] = normalised['question_body']
    record['jieda'] = normalised['jieda']
    record['knowledge_point'] = knowledge_points
    record['question_type_name'] = type_name
    record['subject'] = self.get_subject(info)
    # Columns that always receive fixed values here.
    record.update([
        ('fenxi', ''),
        ('dianping', ''),
        ('answer_all_html', ''),
        ('option_html', ''),
        ('difficulty', 0),
        ('zhuanti', ''),
        ('spider_url', url),
        ('spider_source', 80),
        ('question_type', 0),
        ('question_quality', 0),
        ('exam_year', 0),
        ('exam_city', ''),
    ])
    return record
def is_matrix_g(self, key, matrix_t):
    r"""Recognise a bracketed grid and emit a matrix / array template.

    The outer table must have one row with three cells: a left delimiter
    cell, the grid and a right delimiter cell.  ``key`` must occur in at
    least one delimiter cell.

    * Delimiter on both sides: ``\begin{<matrix_t>} ... \end{<matrix_t>}``.
    * Left only:  ``\left<d> \begin{array}{c...} ... \end{array} \right.``
    * Right only: ``\left. \begin{array}{c...} ... \end{array} \right<d>``

    where ``<d>`` is the delimiter belonging to ``matrix_t`` (``pmatrix``,
    ``bmatrix`` or ``vmatrix``; ``.`` for anything else).

    unformated case
    e.g. http://www.jyeoo.com/math2/ques/detail/80c73320-61e5-4df3-bab7-3134bd8bf834
    https://kogler.wordpress.com/2008/03/21/latex-multiline-equations-systems-and-matrices/

    Args:
        key: marker that identifies a delimiter cell.
        matrix_t: LaTeX matrix environment name.

    Returns:
        True after setting ``node_format`` and appending every grid cell to
        ``value_strs``; False when the layout does not match.

    Raises:
        ParseFormatError: the grid cell contains no ``<tr>`` rows.
    """
    trs = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(trs) != 1:
        return False
    _tds = find_valid_elements(trs[0], '<td', with_tag=True, flags=re.I)
    if len(_tds) != 3:
        return False
    if '<table' in _tds[0]:
        return False

    has_left = key in _tds[0]
    has_right = key in _tds[2]
    if not has_left and not has_right:
        return False

    grid_html = remove_start_tag(_tds[1])
    trs = find_valid_elements(grid_html, '<tr>', with_tag=True, flags=re.I)
    trs = [remove_start_tag(tr) for tr in trs]
    if not trs:
        raise ParseFormatError('[is_{}][not row]:{}'.format(
            matrix_t, self.html_string))

    rows = []
    for tr in trs:
        cells = find_valid_elements(tr, '<td', with_tag=True, flags=re.I)
        rows.append([remove_start_tag(cell) for cell in cells])
    if not rows:
        raise ParseFormatError('[is_{}][not col]:{}'.format(
            matrix_t, self.html_string))

    # One '{}' placeholder per cell: '&' between cells, '\\' between rows.
    bucket = r' \\ '.join(' & '.join(['{}'] * len(row)) for row in rows)
    cols = max(len(row) for row in rows)
    for row in rows:
        self.value_strs += row

    delimiters = {
        'pmatrix': ('(', ')'),
        'bmatrix': ('[', ']'),
        'vmatrix': ('|', '|'),
    }
    pare_l, pare_r = delimiters.get(matrix_t, ('.', '.'))

    if has_left and has_right:
        self.node_format = (r'\begin{{%s}} ' % matrix_t + bucket +
                            r' \end{{%s}}' % matrix_t)
    elif has_left:
        self.node_format = (r'\left%s \begin{{array}}{{%s}} ' %
                            (pare_l, 'c' * cols) + bucket +
                            r' \end{{array}} \right.')
    else:
        # The column spec must be parenthesised: '%' and '*' share the same
        # precedence, so the former ``... % 'c' * cols`` repeated the whole
        # prefix ``cols`` times instead of producing 'c' * cols.
        self.node_format = (r'\left. \begin{{array}}{{%s}} ' %
                            ('c' * cols) + bucket +
                            r' \end{{array}} \right%s' % pare_r)
    return True
def is_frac(self):
    """Recognise a two-row, two-cell table as a fraction or a related form.

    Besides the plain fraction this also handles (in this order): a long
    equals sign with text above and below, an overline, a subscript, an
    underbrace with a label, and a long equals sign built from nested
    tables.

    Returns:
        True after setting ``node_format`` / ``value_strs`` when one of the
        forms matches, False otherwise.
    """
    # or is_overline
    trs = find_valid_elements(self.html_string, '<tr>', flags=re.I)
    if len(trs) != 2:
        return False
    tds = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(tds) != 2:
        return False
    # Only the opening tag of the first cell is inspected for the border.
    index = tds[0].find('>') + 1
    if 'border-bottom' in tds[0][:index]:
        # is_xlongequal
        # {{
        # The numerator is itself a table whose second cell is a drawn
        # rule: text above (inner first cell) and below (outer second cell)
        # a long equals sign.
        td = remove_start_tag(tds[0])
        if td.startswith('<table '):
            tables = find_valid_elements(td, '<table ', flags=re.I)
            if len(tables) == 1:
                tds_t = find_valid_elements(tables[0], '<td', flags=re.I)
                if len(tds_t) == 2:
                    if tds_t[1] == '<td style="font-size: 90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>' \
                            or tds_t[1] == '<td style="font-size:90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>':
                        td = remove_start_tag(tds_t[0])
                        td2 = remove_start_tag(tds[1])
                        self.node_format = '\\xlongequal[{}]{{{}}}'
                        self.value_strs = [td2, td]
                        return True
        # }}
        # Plain fraction: first cell over second cell.
        self.node_format = r'\frac{{{}}}{{{}}}'
        tds = [remove_start_tag(td) for td in tds]
        self.value_strs = tds
        return True
    if '<table' not in tds[0]:
        # is_overline
        # {{
        if 'border-top' in tds[0]:
            self.node_format = r'\overline{{{}}}'
            tds = [remove_start_tag(td) for td in tds]
            # Only the second cell is kept as the overlined content.
            self.value_strs = tds[1:]
            return True
        # }}
        # is_sub if not is_underbrace
        # {{
        index = tds[1].find('>') + 1
        td1 = remove_start_tag(tds[1])
        _keys1 = [
            '8cd1f5182b8ab176140b3249472323ac',
            '99129c72930ac651065df803ef39d322'
        ]
        _keys2 = ['/part/H123U.png']
        # The second cell must not be the brace marker itself.
        if not _is_in(_keys1 + _keys2, td1):
            if 'style="font-size:90%"' in tds[1][:index]:
                td0 = remove_start_tag(tds[0])
                self.node_format = r'{}_{{{}}}'
                self.value_strs = [td0, td1]
                return True
        # }}
    # is_underbrace
    # {{
    # case http://www.jyeoo.com/math/ques/detail/2fe3a71c-3616-441f-aa05-30f90f58310b
    td0 = remove_start_tag(tds[0])
    # is_underbrace sets node_format / value_strs on success; they are
    # overwritten below when the label cell matches.
    if self.is_underbrace(html_string=td0):
        if tds[1].startswith('<td style="font-size:90%">'):
            td1 = remove_start_tag(tds[1])
            self.node_format = '{}_{{{}}}'
            self.value_strs = [td0, td1]
            return True
    # }}
    # is_xlongequal 2
    # {{
    if td0.startswith('<table ') and self.is_xlongequal2(html_string=td0):
        if tds[1] == '<td style="font-size: 90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>' \
                or tds[1] == '<td style="font-size:90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>':
            # Keep the format and values that is_xlongequal2 has just set.
            return True
        else:
            # Undo what is_xlongequal2 set, since the table is rejected.
            self.node_format = None
            self.value_strs = []
    # }}
    return False
def is_equations(self):
    r"""Recognise a system of equations grouped by a curly brace.

    The outer table must have one row with three cells.  When the brace
    marker is found in the first cell the result is a ``cases``
    environment; when it is found in the third cell the result is an
    ``array`` closed by ``\right\}``.  Every non-empty row of the middle
    cell contributes one line, with its cells joined by ``&``.

    Returns:
        True after setting ``node_format`` and appending every cell to
        ``value_strs``; False when no brace is found or the brace cell
        contains a table.

    Raises:
        ParseFormatError: the middle cell yields no non-empty rows.
    """
    outer_rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(outer_rows) != 1:
        return False
    # (A substitution rewriting self-closing <td .../> to <td></td> was
    # once applied to the row here; it is disabled.)
    cells = find_valid_elements(outer_rows[0], '<td', with_tag=True,
                                flags=re.I)
    if len(cells) != 3:
        return False

    hash_keys = [
        '6f28da9c3ca14300d4593acc9aad9153',
        '87030ac64f2d9671babadb5ba43bdb62',
    ]
    path_keys = ['/part/123L.png']
    # Only the order of the candidate list depends on ``self.md5``.
    if self.md5 is False:
        brace_keys = path_keys + hash_keys
    else:
        brace_keys = hash_keys + path_keys

    if _is_in(brace_keys, cells[0]):
        # brace is at left
        brace_cell = cells[0]
        opening = r'\begin{{cases}} '
        closing = r' \end{{cases}}'
    elif _is_in(brace_keys, cells[2]):
        # brace is at right
        brace_cell = cells[2]
        opening = r'\left.\begin{{array}}{{l}} '
        closing = r' \end{{array}} \right\}}'
    else:
        return False

    if '<table' in brace_cell:
        return False

    body = remove_start_tag(cells[1])
    grid = []
    for line in find_valid_elements(body, '<tr', with_tag=False,
                                    flags=re.I):
        line_cells = find_valid_elements(line, '<td', with_tag=False,
                                         flags=re.I)
        if line_cells:
            grid.append(line_cells)
    if not grid:
        raise ParseFormatError('[is_equations]:{}'.format(
            self.html_string))

    layout = ' \\\\ '.join(
        [' & '.join(['{}'] * len(line_cells)) for line_cells in grid])
    self.node_format = opening + layout + closing
    for line_cells in grid:
        self.value_strs += line_cells
    return True