def is_subsup2(self):
    """Try to match a two-row table as a base with a superscript and a subscript.

    The match requires exactly two ``<tr`` and two ``<td`` elements in
    ``self.html_string`` (as returned by ``find_valid_elements``), a second
    cell whose start tag carries ``style="font-size:90%"``, and a first cell
    whose content again yields two rows and two cells, the first of those
    rows containing ``style="font-size: 90%"``.

    Returns:
        True after setting ``self.node_format`` and ``self.value_strs``;
        False (leaving both untouched) when the markup does not match.
    """
    outer_rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(outer_rows) != 2:
        return False
    outer_cells = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(outer_cells) != 2:
        return False

    upper_cell, lower_cell = outer_cells
    # Only the start tag of the lower cell is inspected for the style marker.
    tag_end = lower_cell.find('>') + 1
    if 'style="font-size:90%"' not in lower_cell[:tag_end]:
        return False

    upper_body = remove_start_tag(upper_cell)
    inner_rows = find_valid_elements(upper_body, '<tr', flags=re.I)
    if len(inner_rows) != 2 or 'style="font-size: 90%"' not in inner_rows[0]:
        return False

    inner_cells = find_valid_elements(upper_body, '<td', with_tag=False,
                                      flags=re.I)
    if len(inner_cells) != 2:
        return False
    if '<table' in inner_cells[1]:
        return False

    lower_body = remove_start_tag(lower_cell)
    self.node_format = '{}^{{{}}}_{{{}}}'
    # Order matches the format: base, superscript, subscript.
    self.value_strs = [inner_cells[1], inner_cells[0], lower_body]
    return True
def is_underset(self):
    """Try to match a two-row table as ``\\underset{lower}{upper}``.

    The match requires exactly two ``<tr`` and two ``<td`` elements in
    ``self.html_string`` and a second cell whose start tag carries
    ``style="font-size: 90%"``.

    When the first cell matches one of the integral-image keys (via
    ``_is_in``) and contains no nested ``<table``, the node becomes
    ``\\underset{lower}{\\int}`` and only the lower cell is kept as a value.

    Returns:
        True after setting ``self.node_format`` and ``self.value_strs``;
        False when the markup does not match.
    """
    trs = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(trs) != 2:
        return False
    tds = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(tds) != 2:
        return False
    index = tds[1].find('>') + 1
    if 'style="font-size: 90%"' not in tds[1][:index]:
        return False

    # for uncommon integral
    _keys1 = [
        '7ea7ce25490319b1bc0a30f02283c465',
        '3d579d20afec8779d54985a5acf51879'
    ]
    _keys2 = ['/part/8747.png']
    # The key order depends on self.md5: path key first when md5 is False.
    keys = _keys2 + _keys1 if self.md5 is False else _keys1 + _keys2
    if _is_in(keys, tds[0]):
        if '<table' not in tds[0]:
            tds = [remove_start_tag(td) for td in tds]
            # Backslash is escaped explicitly; '\i' is not a valid escape.
            self.node_format = '\\underset{{{}}}{{\\int}}'
            self.value_strs = tds[-1:]
            return True

    tds = [remove_start_tag(td) for td in tds]
    self.node_format = '\\underset{{{}}}{{{}}}'
    # Lower cell first, then upper cell, matching the format string.
    self.value_strs = tds[::-1]
    return True
def is_stackrel(self):
    """Try to match a two-row table whose upper cell is the small annotation.

    The match requires exactly two ``<tr`` and two ``<td`` elements in
    ``self.html_string`` and a first cell whose start tag carries
    ``style="font-size: 90%"``.

    Depending on which image key ``_is_in`` finds in the lower cell, the
    node becomes ``\\xrightarrow{upper}``, ``\\xrightleftharpoons{upper}`` or
    the generic ``\\stackrel{upper}{lower}``.

    Returns:
        True after setting ``self.node_format`` and ``self.value_strs``;
        False when the markup does not match.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False
    cells = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    upper, lower = cells
    tag_end = upper.find('>') + 1
    if 'style="font-size: 90%"' not in upper[:tag_end]:
        return False

    path_first = self.md5 is False

    # is_underrightarrow, changed to is_xrightarrow
    if '<table' not in lower:
        arrow_md5 = [
            '6bd6a72411898992479ded42ef82f09c',
            'a11fdee4d4b49f96bbe3814ad58f817b'
        ]
        arrow_path = ['/part/8594.png']
        if path_first:
            arrow_keys = arrow_path + arrow_md5
        else:
            arrow_keys = arrow_md5 + arrow_path
        if _is_in(arrow_keys, lower):
            annotation = remove_start_tag(upper)
            self.node_format = '\\xrightarrow{{{}}}'
            self.value_strs = [annotation]
            return True

    # is_xrightleftharpoons
    # for case, http://www.jyeoo.com/bio2/ques/detail/7003bd54-0861-4152-985a-3c7678ade246
    harpoon_md5 = [
        'ee5caff647b9b23babff8113e9147821',
        '341e5bdc05923e0895e330d9c062477f'
    ]
    harpoon_path = ['/part/8652L.png']
    if path_first:
        harpoon_keys = harpoon_path + harpoon_md5
    else:
        harpoon_keys = harpoon_md5 + harpoon_path
    if _is_in(harpoon_keys, lower):
        if '<table' in lower:
            return False
        annotation = remove_start_tag(upper)
        self.node_format = '\\xrightleftharpoons{{{}}}'
        self.value_strs = [annotation]
        return True

    # Generic case: keep both cells, upper first.
    stripped = [remove_start_tag(cell) for cell in cells]
    self.node_format = r'\stackrel{{{}}}{{{}}}'
    self.value_strs = stripped
    return True
def is_underbrace(self, html_string=None):
    """Try to match a two-row table whose lower cell is an underbrace image.

    Args:
        html_string: markup to inspect; falls back to ``self.html_string``
            when omitted or falsy.

    Returns:
        True after setting ``self.node_format`` to ``\\underbrace{...}`` and
        ``self.value_strs`` to the stripped upper cell; False when there are
        not exactly two ``<tr>`` / ``<td`` elements or the lower cell does
        not match any brace key.
    """
    source = html_string or self.html_string
    rows = find_valid_elements(source, '<tr>', flags=re.I)
    if len(rows) != 2:
        return False
    cells = find_valid_elements(source, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    brace_md5 = [
        '8cd1f5182b8ab176140b3249472323ac',
        '99129c72930ac651065df803ef39d322'
    ]
    brace_path = ['/part/H123U.png']
    # md5 keys come first only when self.md5 is exactly True.
    if self.md5 is True:
        candidates = brace_md5 + brace_path
    else:
        candidates = brace_path + brace_md5
    if not _is_in(candidates, cells[1]):
        return False

    upper_body = remove_start_tag(cells[0])
    self.node_format = '\\underbrace{{{}}}'
    self.value_strs = [upper_body]
    return True
def handle_spans(html_string):
    """Rewrite styled ``<span>`` elements found in *html_string*.

    Spans whose start tag mentions ``text-decoration`` are replaced using
    the ``UNDERLINE`` / ``LINE_THROUGH`` templates (or unwrapped for
    ``none``); spans mentioning ``vertical-align`` become ``<sub>`` /
    ``<sup>`` elements or are unwrapped. Distinct spans are processed
    longest first.

    Returns:
        The rewritten markup.
    """
    found = get_html_element('<span [^<>]+(text-decoration|vertical-align)',
                             html_string, regex=True, flags=re.I)
    for span in sorted(set(found), key=len, reverse=True):
        inner = remove_start_tag(span)
        start_tag = span[:span.find('>')].lower()

        if 'text-decoration' in start_tag:
            if 'underline' in start_tag:
                replacement = UNDERLINE.format(inner)
            elif 'none' in start_tag:
                replacement = inner
            elif 'line-through' in start_tag:
                replacement = LINE_THROUGH.format(inner)
            else:
                # Other decorations are left as they are.
                continue
        elif 'vertical-align' in start_tag:
            if ':sub' in start_tag:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in start_tag:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
        else:
            continue

        html_string = html_string.replace(span, replacement)
    return html_string
def no_table_format(html_string):
    """Convert table-free markup fragments into LaTeX text.

    Steps, in order: images are converted through ``convert_img_to_latex``
    when an ``<img`` is present; the ``…limit{s}`` markers become
    ``\\prod\\limits`` / ``\\sum\\limits`` / ``\\int\\limits``; ``%`` is
    escaped; ``<bdo … underpoint`` elements become ``\\underset{˙}{…}``;
    ``<span `` elements with ``vertical-align:sub`` / ``sup`` become
    ``_{…}`` / ``^{…}``; finally bare ``<span>`` / ``<font>`` tags and their
    closing tags are removed.

    Returns:
        The converted string.
    """
    if '<img' in html_string:
        html_string = convert_img_to_latex(html_string)

    # Raw strings keep the backslashes literal without invalid escapes.
    html_string = html_string.replace('∏limit{s}', r'\prod\limits ')
    html_string = html_string.replace('πlimit{s}', r'\prod\limits ')
    html_string = html_string.replace('∑limit{s}', r'\sum\limits ')
    html_string = html_string.replace('∫limit{s}', r'\int\limits ')
    html_string = html_string.replace('%', ' \\%')

    if 'underpoint' in html_string:
        underpoints = get_html_element('<bdo [^<>]+underpoint', html_string,
                                       with_tag=True, regex=True, flags=re.I)
        for underpoint in set(underpoints):
            t = remove_start_tag(underpoint)
            underpoint_tex = '\\underset{{˙}}{{{}}}'.format(t)
            html_string = html_string.replace(underpoint, underpoint_tex)

    while True:
        spans = find_valid_elements(html_string, '<span ', flags=re.I)
        if not spans:
            break
        before = html_string
        for span in spans:
            index = span.find('>') + 1
            if 'vertical-align:sub' in span[:index]:
                n_span = remove_start_tag(span)
                html_string = html_string.replace(span, '_{%s}' % n_span, 1)
            elif 'vertical-align:sup' in span[:index]:
                n_span = remove_start_tag(span)
                html_string = html_string.replace(span, '^{%s}' % n_span, 1)
        if html_string == before:
            # Nothing was rewritten in this pass (the remaining spans are
            # neither sub nor sup), so another pass would see the same
            # input; stop instead of looping forever.
            break

    html_string = re.sub(r'<(span|font)>', '', html_string, flags=re.I)
    html_string = re.sub(r'</(span|font)>', '', html_string, flags=re.I)
    return html_string
def get_question_html(self, html_string):
    """Extract and normalise the question markup from *html_string*.

    The content of the first ``<div`` returned by ``get_html_element`` is
    used; when none is found, the start tag of *html_string* is removed
    instead. The result is then passed through ``self.fix_any``,
    ``center_image``, ``self.html_magic.bewitch`` and
    ``self.format_options``.

    Returns:
        The processed markup with surrounding whitespace stripped.
    """
    divs = get_html_element('<div', html_string, with_tag=False, limit=1)
    body = divs[0] if divs else remove_start_tag(html_string)

    body = self.fix_any(body)
    body = center_image(body)
    body = self.html_magic.bewitch(body, spider_url=self.url)
    body = self.format_options(body)
    return body.strip()
def fix_any(html_string):
    """Wrap ``<math`` elements and convert emphasis-dot spans.

    Each distinct ``<math`` element is wrapped in
    ``<span class="afanti-latex">``; each ``<span class="founderdotem">``
    element is replaced by a ``<bdo class="aft_underpoint">`` holding the
    same content.

    Returns:
        The rewritten markup.
    """
    for math in set(get_html_element('<math', html_string, flags=re.I)):
        wrapped = '<span class="afanti-latex">{}</span>'.format(math)
        html_string = html_string.replace(math, wrapped)

    # Characters marked with an emphasis dot.
    for span in get_html_element('<span class="founderdotem">', html_string):
        content = remove_start_tag(span)
        dotted = '<bdo class="aft_underpoint">{}</bdo>'.format(content)
        html_string = html_string.replace(span, dotted)

    return html_string
def is_xlongequal2(self, html_string=None):
    """Try to match a two-cell table whose second cell is a drawn rule line.

    case http://www.jyeoo.com/chemistry/ques/detail/b6e8a3b9-d9e5-48ae-903d-40fd3f05e969

    Args:
        html_string: markup to inspect; falls back to ``self.html_string``
            when omitted or falsy.

    Returns:
        True after setting ``self.node_format`` to ``\\xlongequal{...}`` and
        ``self.value_strs`` to the stripped first cell; False otherwise.
    """
    source = html_string or self.html_string
    cells = find_valid_elements(source, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    # The second cell must equal one of these two spellings exactly.
    rule_cells = (
        '<td style="font-size: 90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>',
        '<td style="font-size:90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>',
    )
    if cells[1] not in rule_cells:
        return False

    above = remove_start_tag(cells[0])
    self.node_format = '\\xlongequal{{{}}}'
    self.value_strs = [above]
    return True
def is_subsup(self):
    """Try to match a one-row, two-cell table as ``{base}^{sup}_{sub}``.

    The second cell must contain ``"msubsup`` and exactly two matching
    ``<div`` elements, one tagged ``msubsup_sup`` and one ``msubsup_sub``
    in their start tags.

    Returns:
        True after setting ``self.node_format`` and ``self.value_strs``;
        False when the row/cell counts or the ``"msubsup`` marker do not
        match.

    Raises:
        ParseFormatError: when the number of ``msubsup`` divs is not two,
            or a div is neither the sup nor the sub part.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 1:
        return False

    raw_cells = find_valid_elements(rows[0], '<td', with_tag=True, flags=re.I)
    cells = [remove_start_tag(cell) for cell in raw_cells]
    if len(cells) != 2:
        return False

    base, scripts = cells
    if '"msubsup' not in scripts:
        return False

    divs = find_valid_elements(scripts, '<div [^<>]+"msubsup', regex=True,
                               flags=re.I)
    if len(divs) != 2:
        raise ParseFormatError('[is_subsup]:{}'.format(self.html_string))

    parts = {'sup': '', 'sub': ''}
    for div in divs:
        start_tag = div[:div.find('>') + 1]
        if 'msubsup_sup' in start_tag:
            parts['sup'] = remove_start_tag(div)
        elif 'msubsup_sub' in start_tag:
            parts['sub'] = remove_start_tag(div)
        else:
            raise ParseFormatError('[is_subsup][not sub or sup]:{}'.format(
                self.html_string))

    self.node_format = '{{{}}}^{{{}}}_{{{}}}'
    self.value_strs = [base, parts['sup'], parts['sub']]
    return True
def format_spans(html_string):
    """Rewrite styled spans, then remove every remaining ``<span``.

    Styled spans (``text-decoration`` / ``vertical-align``) are replaced as
    in the templates ``UNDERLINE`` / ``LINE_THROUGH`` or by ``<sub>`` /
    ``<sup>``. The templates are temporarily spelled with ``<sspan`` so the
    subsequent ``remove_tag('<span', ...)`` passes do not touch them; the
    placeholder is turned back into ``<span`` at the end.

    Returns:
        The rewritten markup.
    """
    def _shield(template):
        # Rename span tags so they survive the later '<span' removal.
        return template.replace('<span', '<sspan').replace('</span>',
                                                           '</sspan>')

    line_through_tpl = _shield(LINE_THROUGH)
    underline_tpl = _shield(UNDERLINE)

    found = get_html_element('<span [^<>]+(text-decoration|vertical-align)',
                             html_string, regex=True, flags=re.I)
    for span in sorted(set(found), key=len, reverse=True):
        inner = remove_start_tag(span)
        start_tag = span[:span.find('>')].lower()

        if 'text-decoration' in start_tag:
            if 'underline' in start_tag:
                replacement = underline_tpl.format(inner)
            elif 'none' in start_tag:
                replacement = inner
            elif 'line-through' in start_tag:
                replacement = line_through_tpl.format(inner)
            else:
                continue
        elif 'vertical-align' in start_tag:
            if ':sub' in start_tag:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in start_tag:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
        else:
            continue

        html_string = html_string.replace(span, replacement)

    # Always run one removal pass, then repeat while spans are still found.
    html_string = remove_tag('<span', html_string, all=False, flags=re.I)
    while get_html_element('<span', html_string):
        html_string = remove_tag('<span', html_string, all=False, flags=re.I)

    restored = html_string.replace('<sspan', '<span')
    return restored.replace('</sspan>', '</span>')
def is_integral(self):
    """Try to match a two-row table as ``\\underset{lower}{upper}``.

    The match requires exactly two ``<tr`` and two ``<td`` elements in
    ``self.html_string`` and a second cell whose start tag carries
    ``style="font-size: 90%"``.

    Returns:
        True after setting ``self.node_format`` and ``self.value_strs``
        (lower cell first); False when the markup does not match.
    """
    rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(rows) != 2:
        return False
    cells = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(cells) != 2:
        return False

    lower_cell = cells[1]
    tag_end = lower_cell.find('>') + 1
    if 'style="font-size: 90%"' not in lower_cell[:tag_end]:
        return False

    stripped = [remove_start_tag(cell) for cell in cells]
    self.node_format = '\\underset{{{}}}{{{}}}'
    self.value_strs = list(reversed(stripped))
    return True
def to_latex(html_string, raw=False, md5=False):
    """Replace ``mathtag="math`` spans in *html_string* with LaTeX.

    Spans without a ``<table`` are simply unwrapped. Spans with a table are
    parsed into a ``Node`` tree (``parse`` / ``convert``), post-processed by
    ``Node.no_table_format`` and substituted back into the markup.

    Args:
        html_string: markup to convert.
        raw: when False the LaTeX is wrapped in
            ``<span class="afanti-latex">\\( … \\)</span>``; otherwise only
            in ``\\( … \\)``.
        md5: when exactly True, ``root_node.md5`` is set to True before
            parsing.

    Returns:
        A tuple ``(converted_html, latexes)`` where *latexes* lists the
        LaTeX string produced for each table-bearing span, in order.
    """
    jy_math_span_list = get_html_element('<span [^<>]*?mathtag="math',
                                         html_string, regex=True,
                                         with_tag=True, flags=re.I)
    latexes = []
    for jy_math_span_ori in jy_math_span_list:
        jy_math_span = remove_start_tag(jy_math_span_ori)

        # if not <table, no need to convert
        if '<table' not in jy_math_span.lower():
            html_string = html_string.replace(jy_math_span_ori, jy_math_span)
            continue

        # Expand self-closing <td .../> into an explicit empty cell.
        jy_math_span = re.sub(r'(<td[^<>]*)/>', r'\1></td>', jy_math_span,
                              flags=re.I)

        root_node = Node()
        if md5 is True:
            root_node.md5 = True
        root_node.node_format = '{}'
        root_node.value_strs = [jy_math_span]
        parse(root_node)

        latex = convert(root_node).strip()
        latex = Node.no_table_format(latex)
        latexes.append(latex)

        # Raw strings: '\(' and '\)' are not valid escape sequences.
        if raw is False:
            replacement = r'<span class="afanti-latex">\( {} \)</span>'.format(
                latex)
        else:
            replacement = r'\( {} \)'.format(latex)
        html_string = html_string.replace(jy_math_span_ori, replacement)

    return html_string.strip(), latexes
def subsup_to_latex(html_string):
    """
    convert sub/sup to latex "_{}" / "^{}"
    html_string must be unicode literals

    Each ``<sub>`` / ``<sup>`` element is replaced, together with the text
    chosen as its base (``_find_which_pre``, falling back to
    ``_find_first_not_empty`` on error), by a ``<latextag>`` fragment. The
    search is repeated until no such element is left, then the result is
    passed through ``_restore_latex``.

    Returns:
        The converted string.
    """
    html_string = unity_brackets(html_string)
    while True:
        s_tags = find_valid_elements(html_string, '<(sub|sup)', regex=True,
                                     flags=re.I | re.U)
        if not s_tags:
            return _restore_latex(html_string)

        replaced = False
        for s_tag in s_tags:
            index = html_string.find(s_tag)
            if index < 0:
                # An earlier replacement in this pass changed the text of
                # this element; using -1 as a slice index would corrupt the
                # string. It is picked up again on the next pass.
                continue

            prefix = html_string[:index]
            try:
                which_pre = _find_which_pre(prefix)
            except Exception as err:
                # Best-effort fallback, as before: log and use the simpler
                # base finder.
                logging.error('[_find_which_pre]: {}'.format(err))
                which_pre = _find_first_not_empty(prefix)

            item = remove_start_tag(s_tag).strip(' \xa0').rstrip('\\').strip(
                ' \xa0')
            if s_tag.lower().startswith('<sub'):
                tex = '<latextag>{{{}}}_{{{}}}</latextag>'.format(
                    which_pre.strip(' \xa0'), item)
            else:
                tex = '<latextag>{{{}}}^{{ {} ##addspace##}}</latextag>'.format(
                    which_pre.strip(' \xa0'), item)

            # NOTE(review): assumes which_pre is the trailing part of
            # prefix, so it is cut off together with the tag - confirm.
            html_string = (html_string[:index - len(which_pre)] + tex +
                           html_string[index + len(s_tag):])
            replaced = True

        if not replaced:
            # None of the reported elements could be located verbatim;
            # another pass would see the same input, so stop here.
            return _restore_latex(html_string)
def to_latex(html_string, raw=False):
    """Replace ``math-model`` spans in *html_string* with LaTeX.

    Each matching span is stripped of its start tag, split with
    ``get_spans``, parsed into a ``Node`` tree and rendered via ``str``.

    Args:
        html_string: markup to convert.
        raw: when False the LaTeX is wrapped in
            ``<span class="afanti-latex">\\( … \\)</span>``; otherwise only
            in ``\\( … \\)``.

    Returns:
        The markup with every matched span replaced.
    """
    xb_math_span_list = get_html_element('<span [^<>]*?math-model',
                                         html_string, regex=True,
                                         with_tag=True, flags=re.I)
    for xb_math_span_ori in xb_math_span_list:
        xb_math_span = remove_start_tag(xb_math_span_ori)
        spans = get_spans(xb_math_span)
        root_node = Node(spans)
        parse(root_node)
        latex = str(root_node)

        # Raw strings: '\(' and '\)' are not valid escape sequences.
        if raw is False:
            replacement = r'<span class="afanti-latex">\( {} \)</span>'.format(
                latex)
        else:
            replacement = r'\( {} \)'.format(latex)
        html_string = html_string.replace(xb_math_span_ori, replacement)
    return html_string
def is_matrix_g(self, key, matrix_t):
    """Try to match a one-row, three-cell table as a matrix-like structure.

    The outer cells are checked for the delimiter marker *key*; the middle
    cell holds the rows (``<tr>``) and entries (``<td``) of the matrix.

    Args:
        key: substring identifying the delimiter in the first and/or third
            cell.
        matrix_t: LaTeX environment name. ``pmatrix``, ``bmatrix`` and
            ``vmatrix`` select ``()``, ``[]`` and ``||`` for the one-sided
            case; any other name uses ``.``.

    Returns:
        True after appending all entries to ``self.value_strs`` and setting
        ``self.node_format``; False when the markup does not match.

    Raises:
        ParseFormatError: when the middle cell contains no ``<tr>`` row.
    """
    trs = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(trs) != 1:
        return False
    _tds = find_valid_elements(trs[0], '<td', with_tag=True, flags=re.I)
    if len(_tds) != 3:
        return False
    if '<table' in _tds[0]:
        return False

    key_left = key in _tds[0]
    key_right = key in _tds[2]
    if not key_left and not key_right:
        return False

    td = remove_start_tag(_tds[1])
    trs = find_valid_elements(td, '<tr>', with_tag=True, flags=re.I)
    trs = [remove_start_tag(tr) for tr in trs]
    if not trs:
        raise ParseFormatError('[is_{}][not row]:{}'.format(
            matrix_t, self.html_string))

    rows = []
    for tr in trs:
        tds = find_valid_elements(tr, '<td', with_tag=True, flags=re.I)
        rows.append([remove_start_tag(cell) for cell in tds])

    # One '{}' placeholder per entry; entries joined by '&', rows by '\\'.
    bucket = r' \\ '.join(' & '.join(['{}'] * len(row)) for row in rows)
    cols = max(len(row) for row in rows)
    for row in rows:
        self.value_strs += row

    #
    # unformated case
    # e.g. http://www.jyeoo.com/math2/ques/detail/80c73320-61e5-4df3-bab7-3134bd8bf834
    #
    # https://kogler.wordpress.com/2008/03/21/latex-multiline-equations-systems-and-matrices/
    #
    delimiters = {
        'pmatrix': ('(', ')'),
        'bmatrix': ('[', ']'),
        'vmatrix': ('|', '|'),
    }
    pare_l, pare_r = delimiters.get(matrix_t, ('.', '.'))

    if key_left and key_right:
        self.node_format = (r'\begin{{%s}} ' % matrix_t + bucket +
                            r' \end{{%s}}' % matrix_t)
    elif key_left:
        self.node_format = (r'\left%s \begin{{array}}{{%s}} ' %
                            (pare_l, 'c' * cols) + bucket +
                            r' \end{{array}} \right.')
    else:
        # Parenthesised on purpose: '%' and '*' share precedence, so the
        # unparenthesised form repeated the whole prefix `cols` times.
        self.node_format = (r'\left. \begin{{array}}{{%s}} ' % ('c' * cols) +
                            bucket +
                            r' \end{{array}} \right%s' % pare_r)
    return True
def is_frac(self):
    """Try to match a two-row, two-cell table as a fraction or a related form.

    Checked in order, the first match wins:

    * first cell's start tag has ``border-bottom``: ``\\xlongequal[..]{..}``
      when the cell wraps a single table ending in a rule-line cell,
      otherwise ``\\frac{..}{..}``;
    * first cell has no nested ``<table``: ``\\overline{..}`` when it
      mentions ``border-top``, or ``base_{sub}`` when the second cell is
      not a brace image and its start tag has ``style="font-size:90%"``;
    * first cell content is itself an underbrace (``self.is_underbrace``)
      and the second cell starts with ``<td style="font-size:90%">``:
      ``base_{sub}``;
    * first cell content matches ``self.is_xlongequal2`` and the second
      cell is a rule-line cell: the format set by ``is_xlongequal2`` is kept.

    Returns:
        True when one of the forms matched (``self.node_format`` and
        ``self.value_strs`` are set); False otherwise.
    """
    # or is_overline
    trs = find_valid_elements(self.html_string, '<tr>', flags=re.I)
    if len(trs) != 2:
        return False
    tds = find_valid_elements(self.html_string, '<td', flags=re.I)
    if len(tds) != 2:
        return False
    # Only the start tag of the first cell is inspected for 'border-bottom'.
    index = tds[0].find('>') + 1
    if 'border-bottom' in tds[0][:index]:
        # is_xlongequal
        # {{
        td = remove_start_tag(tds[0])
        if td.startswith('<table '):
            tables = find_valid_elements(td, '<table ', flags=re.I)
            if len(tables) == 1:
                tds_t = find_valid_elements(tables[0], '<td', flags=re.I)
                if len(tds_t) == 2:
                    # The inner table's second cell must be the rule line.
                    if tds_t[1] == '<td style="font-size: 90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>' \
                            or tds_t[1] == '<td style="font-size:90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>':
                        td = remove_start_tag(tds_t[0])
                        td2 = remove_start_tag(tds[1])
                        self.node_format = '\\xlongequal[{}]{{{}}}'
                        self.value_strs = [td2, td]
                        return True
        # }}
        self.node_format = r'\frac{{{}}}{{{}}}'
        tds = [remove_start_tag(td) for td in tds]
        self.value_strs = tds
        return True
    if '<table' not in tds[0]:
        # is_overline
        # {{
        if 'border-top' in tds[0]:
            self.node_format = r'\overline{{{}}}'
            tds = [remove_start_tag(td) for td in tds]
            # Only the second cell is kept as the overlined content.
            self.value_strs = tds[1:]
            return True
        # }}
        # is_sub if not is_underbrace
        # {{
        index = tds[1].find('>') + 1
        td1 = remove_start_tag(tds[1])
        _keys1 = [
            '8cd1f5182b8ab176140b3249472323ac',
            '99129c72930ac651065df803ef39d322'
        ]
        _keys2 = ['/part/H123U.png']
        # A second cell matching a brace key is not treated as a subscript.
        if not _is_in(_keys1 + _keys2, td1):
            if 'style="font-size:90%"' in tds[1][:index]:
                td0 = remove_start_tag(tds[0])
                self.node_format = r'{}_{{{}}}'
                self.value_strs = [td0, td1]
                return True
        # }}
    # is_underbrace
    # {{
    # case http://www.jyeoo.com/math/ques/detail/2fe3a71c-3616-441f-aa05-30f90f58310b
    td0 = remove_start_tag(tds[0])
    # NOTE(review): is_underbrace sets node_format/value_strs when it
    # matches; they are not reset here if the startswith test below fails.
    if self.is_underbrace(html_string=td0):
        if tds[1].startswith('<td style="font-size:90%">'):
            td1 = remove_start_tag(tds[1])
            self.node_format = '{}_{{{}}}'
            self.value_strs = [td0, td1]
            return True
    # }}
    # is_xlongequal 2
    # {{
    if td0.startswith('<table ') and self.is_xlongequal2(html_string=td0):
        if tds[1] == '<td style="font-size: 90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>' \
                or tds[1] == '<td style="font-size:90%"><div style="border-top:1px solid black;line-height:1px">.</div></td>':
            return True
        else:
            # Undo what is_xlongequal2 stored, since the outer table
            # did not match.
            self.node_format = None
            self.value_strs = []
    # }}
    return False
def is_equations(self):
    """Try to match a one-row, three-cell table as a braced equation system.

    The brace image key is looked for in the first cell (left brace,
    rendered with ``cases``) and otherwise in the third cell (right brace,
    rendered with ``\\left. … \\right\\}``). The middle cell holds the rows
    and entries.

    Returns:
        True after setting ``self.node_format`` and appending all entries
        to ``self.value_strs``; False when the markup does not match or the
        brace cell contains a nested ``<table``.

    Raises:
        ParseFormatError: when the middle cell yields no non-empty row.
    """
    outer_rows = find_valid_elements(self.html_string, '<tr', flags=re.I)
    if len(outer_rows) != 1:
        return False
    cells = find_valid_elements(outer_rows[0], '<td', with_tag=True,
                                flags=re.I)
    if len(cells) != 3:
        return False

    md5_keys = [
        '6f28da9c3ca14300d4593acc9aad9153',
        '87030ac64f2d9671babadb5ba43bdb62'
    ]
    path_keys = ['/part/123L.png']
    if self.md5 is False:
        brace_keys = path_keys + md5_keys
    else:
        brace_keys = md5_keys + path_keys

    if _is_in(brace_keys, cells[0]):
        # brace is at left
        brace_cell = cells[0]
        head = r'\begin{{cases}} '
        tail = r' \end{{cases}}'
    elif _is_in(brace_keys, cells[2]):
        # brace is at right
        brace_cell = cells[2]
        head = r'\left.' + r'\begin{{array}}{{l}} '
        tail = r' \end{{array}}' + r' \right\}}'
    else:
        return False

    if '<table' in brace_cell:
        return False

    body = remove_start_tag(cells[1])
    rows = []
    for tr in find_valid_elements(body, '<tr', with_tag=False, flags=re.I):
        entries = find_valid_elements(tr, '<td', with_tag=False, flags=re.I)
        if entries:
            rows.append(entries)
    if not rows:
        raise ParseFormatError('[is_equations]:{}'.format(
            self.html_string))

    # One '{}' placeholder per entry; entries joined by '&', rows by '\\'.
    layout = ' \\\\ '.join(' & '.join(['{}'] * len(row)) for row in rows)
    self.node_format = head + layout + tail
    for row in rows:
        self.value_strs += row
    return True