def fix_any(html_string):
    """Wrap MathML elements and rewrite emphasis-dot spans.

    Each distinct element returned for ``<math`` is wrapped in a
    ``<span class="afanti-latex">``.  Each ``<span class="founderdotem">``
    element is replaced by a ``<bdo class="aft_underpoint">`` whose content
    is ``remove_start_tag(span)``.

    :param html_string: HTML fragment to rewrite.
    :return: the rewritten HTML string.
    """
    latex_wrapper = '<span class="afanti-latex">{}</span>'
    for math_elem in set(get_html_element('<math', html_string, flags=re.I)):
        html_string = html_string.replace(math_elem,
                                          latex_wrapper.format(math_elem))

    # emphasis-dotted characters
    underpoint_wrapper = '<bdo class="aft_underpoint">{}</bdo>'
    for dotted in get_html_element('<span class="founderdotem">', html_string):
        inner = remove_start_tag(dotted)
        html_string = html_string.replace(dotted,
                                          underpoint_wrapper.format(inner))
    return html_string
def format_spans(html_string):
    """Rewrite decorated ``<span>`` elements, then strip every other span.

    Spans whose start tag carries ``text-decoration`` or ``vertical-align``
    are replaced (longest first) by the UNDERLINE / LINE_THROUGH templates,
    by ``<sub>`` / ``<sup>``, or by their bare content.  All remaining
    ``<span`` elements are then unwrapped with ``remove_tag``.

    :param html_string: HTML fragment to rewrite.
    :return: the rewritten HTML string.
    """
    # Template spans are spelled <sspan> while the stripping loop below
    # runs, so it does not unwrap them; they are renamed back at the end.
    protected_line_through = (LINE_THROUGH.replace('<span', '<sspan')
                              .replace('</span>', '</sspan>'))
    protected_underline = (UNDERLINE.replace('<span', '<sspan')
                           .replace('</span>', '</sspan>'))

    candidates = get_html_element(
        '<span [^<>]+(text-decoration|vertical-align)',
        html_string, regex=True, flags=re.I)
    # Longest first, so an enclosing span is rewritten before its children.
    candidates = sorted(set(candidates), key=len, reverse=True)

    for span in candidates:
        inner = remove_start_tag(span)
        start_tag = span[:span.find('>')].lower()
        if 'text-decoration' in start_tag:
            if 'underline' in start_tag:
                replacement = protected_underline.format(inner)
            elif 'none' in start_tag:
                replacement = inner
            elif 'line-through' in start_tag:
                replacement = protected_line_through.format(inner)
            else:
                # Unknown decoration: leave the span untouched here.
                continue
        elif 'vertical-align' in start_tag:
            if ':sub' in start_tag:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in start_tag:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
        else:
            continue
        html_string = html_string.replace(span, replacement)

    # Unwrap spans repeatedly until none is reported any more.
    spans_left = True
    while spans_left:
        html_string = remove_tag('<span', html_string, all=False, flags=re.I)
        spans_left = bool(get_html_element('<span', html_string))

    html_string = html_string.replace('<sspan', '<span')
    html_string = html_string.replace('</sspan>', '</span>')
    return html_string
def convert_img_to_latex(html_string):
    """Replace known big-operator images with the matching LaTeX command.

    Every ``<img `` tag is tested with ``_is_in`` against the marker tuples
    below (two hash-like ids and a ``/part/<codepoint>.png`` path per
    symbol); a matching tag is replaced by the LaTeX command.

    Fix: the intersection images (``/part/8745.png``, U+2229) used to be
    replaced by ``\\bigcup``; they now yield ``\\bigcap``.

    :param html_string: HTML fragment possibly containing operator images.
    :return: the HTML string with recognised images replaced.
    """
    operator_images = (
        # ∑
        (('19d3e9593386103f95e71affc87e62ea',
          '5f4100b557a7c8116b2a45e4435b67ae',
          '/part/8721.png'), '\\sum '),
        # ∏
        (('b9331d3ee2218a431c9203512001f479',
          '9b0bbd95adbebda854a4ec3b1c2ab2e6',
          '/part/8719.png'), '\\prod '),
        # ∫
        (('7ea7ce25490319b1bc0a30f02283c465',
          '3d579d20afec8779d54985a5acf51879',
          '/part/8747.png'), '\\int '),
        # ⋃
        (('c7fdd6777f4de5a1f490f96c17a414b3',
          'cd0da1bb4f8a3b0e0656578988afb49a',
          '/part/8746.png'), '\\bigcup '),
        # ⋂
        (('429c05d3df51962ccb70c6a9306e78ff',
          '6c9dd31c5750dd38b991f3616df1517c',
          '/part/8745.png'), '\\bigcap '),
    )
    for img in get_html_element('<img ', html_string, only_tag=True):
        # Every symbol is checked for every image, as before (no early exit).
        for markers, latex in operator_images:
            if _is_in(markers, img):
                html_string = html_string.replace(img, latex)
    return html_string
def handle_mathml(html_string, uri2oss, url):
    """Replace each ``<math`` element with an ``<img>`` of its rendered LaTeX.

    The MathML elements are converted with ``to_latexes`` / ``fix_latex``,
    rendered by ``to_pngs`` into ``working/latex_imgs/<md5>.png`` and each
    element is replaced by
    ``<span data-latex="base64,..."><img ... class="afanti_latex"></span>``.

    Fixes: the image attribute was emitted as ``heigh`` (now ``height``) and
    the deprecated ``logging.warn`` is replaced by ``logging.warning``.

    :param html_string: HTML fragment containing MathML.
    :param uri2oss: object whose ``convert(name, 56)`` returns the image URL.
    :param url: source URL, used only in the warning message.
    :return: the rewritten HTML, or ``False`` as soon as one PNG result is
        ``False``.
    """
    img_dir = 'working/latex_imgs/'
    mathmls = get_html_element('<math', html_string)
    latexes = [fix_latex(lt) for lt in to_latexes(mathmls)]
    png_paths = [img_dir + md5_string(latex) + '.png' for latex in latexes]
    png_results = to_pngs(latexes, png_paths, check=False)

    for latex, mathml, png_path, png_result in zip(
            latexes, mathmls, png_paths, png_results):
        if png_result is False:
            logging.warning('latex2png:{} {}'.format(url, latex))
            return False

        w, h = get_image_size(png_path)
        latex_base64 = compat_base64.b64encode(
            latex.encode('utf-8')).decode()
        span = '<span data-latex="base64,{}">'.format(latex_base64)
        md5_name = os.path.basename(png_path)
        # NOTE(review): 56 is an opaque argument of uri2oss.convert here;
        # confirm its meaning against that API.
        oss_img_url = uri2oss.convert(md5_name, 56)
        # Displayed size is half the pixel size plus 2.
        img = span + ('<img src="{}" width="{}" height="{}" '
                      'class="afanti_latex"></span>'.format(
                          oss_img_url, w // 2 + 2, h // 2 + 2))
        html_string = html_string.replace(mathml, img)
    return html_string
def displaystyle(html_string, latex_tag=None, regex=False, flags=re.U,
                 latex=True, mml=True):
    """Apply display style to LaTeX fragments and MathML elements.

    :param html_string: HTML fragment to rewrite.
    :param latex_tag: if given, LaTeX fragments are collected with
        ``get_html_element(latex_tag, ...)``; otherwise ``find_latexes`` is
        used when ``_re_displaystyle_target`` matches.
    :param regex: forwarded to ``get_html_element`` with ``latex_tag``.
    :param flags: forwarded to ``get_html_element`` with ``latex_tag``.
    :param latex: LaTeX fragments are processed only when this is ``True``.
    :param mml: MathML is processed only when this is ``True`` and the
        string contains ``<math`` (case-insensitive).
    :return: the rewritten HTML string.
    """
    if latex is True:
        if latex_tag:
            tex_fragments = get_html_element(latex_tag, html_string,
                                             regex=regex, flags=flags)
        elif _re_displaystyle_target.search(html_string):
            tex_fragments = find_latexes(html_string)
        else:
            tex_fragments = []
        for fragment in set(tex_fragments):
            styled = _displaystyle(fragment)
            styled = _discard_latex_displaystyle_for_subsup(styled)
            html_string = html_string.replace(fragment, styled)

    if mml is not True or '<math' not in html_string.lower():
        return html_string

    wrapper = '<mstyle displaystyle="true">{}</mstyle>'
    unique_elems = list(set(find_mathml_elems(html_string, with_tag=True)))
    for elem in sort_by_len(unique_elems, reverse=True):
        html_string = html_string.replace(elem, wrapper.format(elem))
    return _discard_mathml_displaystyle_for_subsup(html_string)
def handle_spans(html_string):
    """Rewrite ``<span>`` elements that carry decoration or vertical alignment.

    Spans whose start tag contains ``text-decoration`` or ``vertical-align``
    are processed longest first: underline / line-through use the UNDERLINE
    / LINE_THROUGH templates, ``none`` keeps only the content, ``:sub`` /
    ``:sup`` become ``<sub>`` / ``<sup>``, any other vertical alignment
    keeps only the content.

    :param html_string: HTML fragment to rewrite.
    :return: the rewritten HTML string.
    """
    found = get_html_element('<span [^<>]+(text-decoration|vertical-align)',
                             html_string, regex=True, flags=re.I)
    ordered = sorted(set(found), key=len, reverse=True)

    for span in ordered:
        inner = remove_start_tag(span)
        opening = span[:span.find('>')].lower()
        decorated = 'text-decoration' in opening

        if decorated and 'underline' in opening:
            replacement = UNDERLINE.format(inner)
        elif decorated and 'none' in opening:
            replacement = inner
        elif decorated and 'line-through' in opening:
            replacement = LINE_THROUGH.format(inner)
        elif decorated:
            # Some other decoration: the span is left as it is.
            continue
        elif 'vertical-align' not in opening:
            continue
        elif ':sub' in opening:
            replacement = '<sub>{}</sub>'.format(inner)
        elif ':sup' in opening:
            replacement = '<sup>{}</sup>'.format(inner)
        else:
            replacement = inner
        html_string = html_string.replace(span, replacement)
    return html_string
def remove_empty_elements(html_string, filter=None):
    """Remove elements that contain no text once tags are stripped.

    An element counts as empty when nothing is left after ``re_tag`` and
    ``re_empty_str`` have been substituted away.

    Fix: the ``filter`` argument used to be ignored because the built-in
    default always won (``_filter = _filter or filter``); a caller-supplied
    filter now takes precedence.

    :param html_string: HTML fragment to clean.
    :param filter: optional callable ``filter(elem)``; return ``True`` to
        remove the empty element, ``False`` to keep it.  The default keeps
        elements containing ``aft_``, ``afanti_``, ``<u>`` or ``<img ``.
    :return: the cleaned, stripped HTML string.
    """
    def _default_filter(elem):
        elem = elem.lower()
        keep = ('aft_' in elem
                or 'afanti_' in elem
                or '<u>' in elem
                or '<img ' in elem)
        return not keep

    should_remove = filter or _default_filter

    elems = get_html_element('<([a-zA-Z][a-zA-Z0-9:]*)', html_string,
                             regex=True)
    # Longest first, so outer elements are handled before their children.
    elems = sorted(set(elems), key=len, reverse=True)
    for elem in elems:
        elem_text = re_tag.sub('', elem)
        elem_text = re_empty_str.sub('', elem_text)
        if elem_text:
            continue
        if not should_remove(elem):
            continue
        html_string = html_string.replace(elem, '')
    return html_string.strip()
def remove_tag(tag, html_string, regex=False, flags=re.U, all=False,
               check=None):
    """Unwrap or delete the elements matched by *tag*.

    :param tag: start-tag text, or a pattern when ``regex`` is not ``False``.
    :param html_string: HTML fragment to rewrite.
    :param regex: when ``False`` the tag is first looked up as a
        case-insensitive substring; otherwise ``re.search`` with ``flags``.
    :param flags: regex flags, also forwarded to ``get_html_element``.
    :param all: if true, matched elements are removed together with their
        content; otherwise only the leading and trailing tag are stripped.
    :param check: optional callable; elements for which it returns exactly
        ``False`` are left untouched.
    :return: the rewritten HTML string (unchanged if the tag is not found).
    """
    if regex is False:
        present = tag.lower() in html_string.lower()
    else:
        present = re.search(tag, html_string, flags=flags) is not None
    if not present:
        return html_string

    for elem in get_html_element(tag, html_string, regex=regex, flags=flags):
        if check is not None and check(elem) is False:
            continue
        if all:
            replacement = ''
        else:
            without_start = re.sub(r'^<[^<>]+>', '', elem)
            replacement = re.sub(r'</[^<>]+>$', '', without_start)
        html_string = html_string.replace(elem, replacement)
    return html_string
def _extract_dimension(img, name):
    """Return ``name="value"`` for a width/height found in *img*, else ''."""
    # The look-behind keeps compound properties such as ``max-width`` or
    # ``line-height`` from being mistaken for the plain dimension.
    mod = re.search(
        r'(?<![\w-])(%s)\s*[:=]\s*([^<>]+?)(?:;|\s|/|>)' % name,
        img, flags=re.I)
    if not mod:
        return ''
    # Strip any quote captured with the value so the emitted attribute is
    # always balanced (e.g. for a trailing ``width:100px"`` inside style="").
    value = mod.group(2).strip().strip('"\'')
    return '%s="%s"' % (mod.group(1), value)


def center_image(html_string):
    """Rebuild every ``<img`` tag as a vertically centred image.

    The new tag keeps only the quoted ``src`` attribute and the width and
    height (from attributes or ``name: value`` style declarations) and adds
    ``style="vertical-align: middle;"``.  Images without a quoted ``src``
    are left unchanged.

    Changes from the previous version: the blanket ``except Exception`` is
    replaced by an explicit no-match check, compound CSS properties are no
    longer picked up as width/height, stray quotes in a captured value no
    longer produce an unbalanced attribute, and empty parts do not leave
    double spaces in the tag.

    :param html_string: HTML fragment to rewrite.
    :return: the rewritten HTML string.
    """
    imgs = get_html_element('<img', html_string, only_tag=True, flags=re.I)
    for img in imgs:
        src_match = re.search(r"""src\s*=\s*["'][^"'<>]+?["']""", img,
                              flags=re.I)
        if src_match is None:
            continue
        parts = (src_match.group(),
                 _extract_dimension(img, 'width'),
                 _extract_dimension(img, 'height'))
        attrs = ' '.join(part for part in parts if part)
        new_img = '<img %s style="vertical-align: middle;">' % attrs
        html_string = html_string.replace(img, new_img)
    return html_string
def get_answer_all_html(entity):
    """Return the answer HTML of an ``<li class="Answer">`` entity.

    The content of the first matching element is taken, ``<XHTML`` wrappers
    are unwrapped and the first ``【答案】`` label is removed.

    Fix: indexing ``[0]`` on an empty result raised before the intended
    empty check could run; a missing element now yields ``''``.

    :param entity: HTML of one answer entity.
    :return: the cleaned answer HTML, or ``''`` when absent or empty.
    """
    found = get_html_element('<li class="Answer">', entity, with_tag=False)
    if not found or not found[0]:
        return ''
    ans = remove_tag('<XHTML', found[0], all=False).strip()
    ans = ans.replace('【答案】', '', 1)
    return ans.strip()
def get_fenxi(entity):
    """Return the analysis HTML of an ``<li class="Analytical">`` entity.

    The content of the first matching element is taken, ``<XHTML`` wrappers
    are unwrapped and the first ``【解析】`` label is removed.

    Fix: indexing ``[0]`` on an empty result raised before the intended
    empty check could run; a missing element now yields ``''``.

    :param entity: HTML of one analysis entity.
    :return: the cleaned analysis HTML, or ``''`` when absent or empty.
    """
    found = get_html_element('<li class="Analytical">', entity,
                             with_tag=False)
    if not found or not found[0]:
        return ''
    fx = remove_tag('<XHTML', found[0], all=False).strip()
    fx = fx.replace('【解析】', '', 1)
    return fx.strip()
def restore_src(html_string):
    """Rename ``src-base64=`` back to ``src=`` in ``<img`` tags.

    Only the first ``src-base64=`` inside each matched tag is renamed, and
    only the first occurrence of that tag in the document is replaced.

    :param html_string: HTML fragment to rewrite.
    :return: the rewritten HTML string.
    """
    pattern = '<img [^<>]*src-base64='
    for tag in get_html_element(pattern, html_string, regex=True,
                                only_tag=True):
        restored = tag.replace('src-base64=', 'src=', 1)
        html_string = html_string.replace(tag, restored, 1)
    return html_string
def get_jieda(self, html_string):
    """Extract the solution HTML from the first ``<font color=red>`` element.

    The content is passed through ``self.fix_any``, ``center_image`` and
    ``self.html_magic.bewitch``; a trailing ``</p>`` is dropped when the
    result ends with ``</div></p>``.

    :param html_string: page HTML.
    :return: the processed, stripped solution HTML.
    """
    answer = get_html_element('<font color=red>', html_string,
                              with_tag=False, limit=1)[0]
    answer = center_image(self.fix_any(answer))
    answer = self.html_magic.bewitch(answer, spider_url=self.url)
    if answer.endswith('</div></p>'):
        # Cut only the trailing '</p>' (4 characters).
        answer = answer[:-len('</p>')]
    return answer.strip()
def test_get_html_element(self):
    """``get_html_element`` accepts a tuple of selector dicts.

    The first result for the ``<p>`` / ``<div`` / ``<p>`` sequence against
    the fixture must be ``' target '``.
    """
    selectors = (
        {'e': '<p>', 'with_tag': False},
        {'e': '<div', 'with_tag': False},
        {'e': '<p>', 'with_tag': False},
    )
    matches = get_html_element(selectors, self.html_string)
    self.assertEqual(matches[0], ' target ')
def make_option(entity):
    """Render ``<span class="option">`` contents as an option table.

    Each option becomes one ``<tr><td class="aft_option">`` row whose
    ``data`` attribute is ``OPTION_DICT[index]``.

    :param entity: HTML containing the option spans.
    :return: the ``<table class="aft_option_wrapper">`` HTML.
    """
    row_template = '<tr><td class="aft_option" data="{}">{}</td></tr>'
    table_template = (
        '<table class="aft_option_wrapper" style="width: 100%;">'
        '<tbody class="measureRoot">{}</tbody></table>'
    )
    rows = []
    contents = get_html_element('<span class="option">', entity,
                                with_tag=False)
    for position, content in enumerate(contents):
        rows.append(row_template.format(OPTION_DICT[position], content))
    return table_template.format(''.join(rows))
def get_question_type_str(html_string):
    """Return the question-type text found in the first ``<div class="T">``.

    The text is what ``type">(.+?)</tt>`` captures inside that element.

    Fix: a page without ``<div class="T">`` used to raise on ``[0]``; it now
    yields ``''`` like the no-match case.

    :param html_string: page HTML.
    :return: the captured type text, or ``''`` when not found.
    """
    found = get_html_element('<div class="T">', html_string,
                             with_tag=False, limit=1)
    if not found:
        return ''
    mod = re.search('type">(.+?)</tt>', found[0])
    if not mod:
        return ''
    return mod.group(1)
def get_question_html(self, html_string):
    """Return the processed contents of every ``<div class="content">``.

    Each content goes through ``abs_url``, ``center_image`` and
    ``self.html_magic.bewitch`` and is stripped.  The second entry is
    additionally passed through ``self.fix_any`` with ``'\\r'`` removed, so
    at least two content divs are required.

    :param html_string: page HTML.
    :return: list of processed HTML strings.
    """
    contents = get_html_element('<div class="content">', html_string,
                                with_tag=False)
    rs = [
        self.html_magic.bewitch(center_image(abs_url(content)),
                                spider_url=self.url).strip()
        for content in contents
    ]
    fixed_second = self.fix_any(rs[1])
    rs[1] = fixed_second.replace('\r', '').strip()
    return rs
def get_difficulty(html_string):
    """Return the difficulty found in the first ``<div class="T">``.

    The label captured by ``difficulty">(.+?)<`` is looked up in ``DIFFS``;
    unknown labels map to ``0``.

    Fix: a page without ``<div class="T">`` used to raise on ``[0]``; it now
    yields ``''`` like the no-match case.

    :param html_string: page HTML.
    :return: ``DIFFS`` value for the label, ``0`` for an unknown label, or
        ``''`` when no label is found.
    """
    found = get_html_element('<div class="T">', html_string,
                             with_tag=False, limit=1)
    if not found:
        return ''
    mod = re.search('difficulty">(.+?)<', found[0])
    if not mod:
        return ''
    return DIFFS.get(mod.group(1), 0)
def get_question_html(self, html_string):
    """Return the processed question HTML.

    Uses the content of the first ``<div`` element, or, when none is found,
    ``remove_start_tag(html_string)``.  The text then goes through
    ``self.fix_any``, ``center_image``, ``self.html_magic.bewitch`` and
    ``self.format_options``.

    :param html_string: question HTML.
    :return: the processed, stripped question HTML.
    """
    divs = get_html_element('<div', html_string, with_tag=False, limit=1)
    body = divs[0] if divs else remove_start_tag(html_string)
    body = center_image(self.fix_any(body))
    body = self.html_magic.bewitch(body, spider_url=self.url)
    return self.format_options(body).strip()
def get_render_html(self, raw_render_html):
    """Return the cleaned content of ``<div id="-mathjax-render-div-">``.

    :param raw_render_html: HTML expected to contain the render div.
    :return: result of ``self.remove_some_elem`` on its content, stripped.
    :raises ParserError: when the render div is not found.
    """
    found = get_html_element('<div id="-mathjax-render-div-">',
                             raw_render_html, with_tag=False, limit=1)
    if found:
        return self.remove_some_elem(found[0]).strip()
    raise ParserError('Cant\'t find <div id="-mathjax-render-div-">')
def get_spans(html_string):
    """Wrap every valid ``<span `` / ``<img `` element in a ``Span``.

    ``</img>`` is appended after each ``<img `` tag first, before
    ``find_valid_elements`` is run over the string.

    :param html_string: HTML fragment.
    :return: list of ``Span`` objects, in the order the elements are found.
    """
    for img_tag in get_html_element('<img ', html_string, only_tag=True):
        html_string = html_string.replace(img_tag, img_tag + '</img>')
    elements = find_valid_elements(html_string, '<(span|img) ',
                                   with_tag=True, regex=True)
    return [Span(element) for element in elements]
def get_question_html(entity):
    """Return the question (or option table) HTML of one entity.

    Entities starting with ``<li`` are read from ``<li class="IsTopic">``;
    all others from ``<span class="optionoption">``.  ``<XHTML`` wrappers
    are unwrapped, and entities starting with ``<span`` are rendered
    through ``make_option``.

    Fix: indexing ``[0]`` on an empty result raised before the intended
    empty check could run; a missing element now yields ``''``.

    :param entity: HTML of one question or option entity.
    :return: the cleaned HTML, or ``''`` when absent or empty.
    """
    if entity.startswith('<li'):
        start_tag = '<li class="IsTopic">'
    else:
        start_tag = '<span class="optionoption">'
    found = get_html_element(start_tag, entity, with_tag=False, limit=1)
    if not found or not found[0]:
        return ''
    qs = remove_tag('<XHTML', found[0], all=False).strip()
    if entity.startswith('<span'):
        qs = make_option(qs)
    return qs.strip()
def find_table_options(html_string):
    """Find ``<table>`` elements whose ``<td>`` contents start with ABCD...

    Tables with fewer than three cells are skipped; the rest are kept when
    ``_startswith_abcd`` accepts their tag-stripped cell texts.

    :param html_string: HTML fragment.
    :return: list of ``[table, tds]`` pairs (``tds`` are the raw contents).
    """
    rs = []
    for table in find_valid_elements(html_string, '<table', flags=re.I):
        tds = get_html_element('<td', table, with_tag=False, flags=re.I)
        if len(tds) >= 3:
            cell_texts = [re_tag.sub('', td).strip() for td in tds]
            if _startswith_abcd(cell_texts):
                rs.append([table, tds])
    return rs
def no_table_format(html_string):
    """Convert leftover HTML markup in a formula string to LaTeX.

    Operator images, ``limit{s}`` markers, ``%``, underpoint ``<bdo>``
    elements and sub/sup ``<span>`` elements are rewritten; bare
    ``<span>`` / ``<font>`` tags and all their closing tags are removed.

    Fixes: the replacement strings used invalid escape sequences
    (``'\\prod'`` written with a single backslash), now spelled with
    explicit backslashes and the same runtime value; and the span loop
    never ended when a remaining ``<span ...>`` was neither sub nor sup,
    so it now stops after a pass that changes nothing.

    :param html_string: formula text possibly containing HTML.
    :return: the converted string.
    """
    if '<img' in html_string:
        html_string = convert_img_to_latex(html_string)

    replacements = (
        ('∏limit{s}', '\\prod\\limits '),
        ('πlimit{s}', '\\prod\\limits '),
        ('∑limit{s}', '\\sum\\limits '),
        ('∫limit{s}', '\\int\\limits '),
        ('%', ' \\%'),
    )
    for plain, latex in replacements:
        html_string = html_string.replace(plain, latex)

    if 'underpoint' in html_string:
        underpoints = get_html_element('<bdo [^<>]+underpoint', html_string,
                                       with_tag=True, regex=True, flags=re.I)
        for underpoint in set(underpoints):
            t = remove_start_tag(underpoint)
            underpoint_tex = '\\underset{{˙}}{{{}}}'.format(t)
            html_string = html_string.replace(underpoint, underpoint_tex)

    # Repeat so that spans exposed by an earlier pass are converted too.
    while True:
        spans = find_valid_elements(html_string, '<span ', flags=re.I)
        if not spans:
            break
        before = html_string
        for span in spans:
            index = span.find('>') + 1
            if 'vertical-align:sub' in span[:index]:
                n_span = remove_start_tag(span)
                html_string = html_string.replace(span, '_{%s}' % n_span, 1)
            elif 'vertical-align:sup' in span[:index]:
                n_span = remove_start_tag(span)
                html_string = html_string.replace(span, '^{%s}' % n_span, 1)
        if html_string == before:
            # Only unconvertible spans remain; another pass would be
            # identical, so stop instead of looping forever.
            break

    html_string = re.sub(r'<(span|font)>', '', html_string, flags=re.I)
    html_string = re.sub(r'</(span|font)>', '', html_string, flags=re.I)
    return html_string
def to_latex(html_string, raw=False, md5=False):
    r"""Convert ``mathtag="math`` spans that contain tables into LaTeX.

    Spans without a ``<table`` are simply unwrapped.  The others are parsed
    through a ``Node`` tree (``parse`` / ``convert``), post-processed by
    ``Node.no_table_format`` and substituted as ``\( latex \)``.

    Fix: the ``\( {} \)`` templates relied on invalid escape sequences;
    the backslashes are now explicit and the runtime strings are unchanged.

    :param html_string: HTML fragment to convert.
    :param raw: when ``False`` the LaTeX is wrapped in
        ``<span class="afanti-latex">``; otherwise it is inserted bare.
    :param md5: when ``True``, ``md5 = True`` is set on the root node.
    :return: tuple ``(converted html stripped, list of LaTeX strings)``.
    """
    jy_math_span_list = get_html_element('<span [^<>]*?mathtag="math',
                                         html_string, regex=True,
                                         with_tag=True, flags=re.I)
    latexes = []
    for jy_math_span_ori in jy_math_span_list:
        jy_math_span = remove_start_tag(jy_math_span_ori)

        # if not <table, no need to convert
        if '<table' not in jy_math_span.lower():
            html_string = html_string.replace(jy_math_span_ori, jy_math_span)
            continue

        # Expand self-closing <td .../> into <td ...></td>.
        jy_math_span = re.sub(r'(<td[^<>]*)/>', r'\1></td>', jy_math_span,
                              flags=re.I)
        root_node = Node()
        if md5 is True:
            root_node.md5 = True
        root_node.node_format = '{}'
        root_node.value_strs = [jy_math_span]
        parse(root_node)
        latex = convert(root_node).strip()
        latex = Node.no_table_format(latex)
        latexes.append(latex)

        if raw is False:
            latex_span = '<span class="afanti-latex">\\( {} \\)</span>'.format(
                latex)
            html_string = html_string.replace(jy_math_span_ori, latex_span)
        else:
            html_string = html_string.replace(jy_math_span_ori,
                                              '\\( {} \\)'.format(latex))
    return html_string.strip(), latexes
def to_latex(html_string, raw=False):
    r"""Convert ``math-model`` spans into inline LaTeX.

    Each matching span is split with ``get_spans``, parsed as a ``Node``
    tree, and replaced by ``\( latex \)`` where the LaTeX is
    ``str(root_node)``.

    Fix: the ``\( {} \)`` templates relied on invalid escape sequences;
    the backslashes are now explicit and the runtime strings are unchanged.

    :param html_string: HTML fragment to convert.
    :param raw: when ``False`` the LaTeX is wrapped in
        ``<span class="afanti-latex">``; otherwise it is inserted bare.
    :return: the converted HTML string.
    """
    xb_math_span_list = get_html_element('<span [^<>]*?math-model',
                                         html_string, regex=True,
                                         with_tag=True, flags=re.I)
    for xb_math_span_ori in xb_math_span_list:
        xb_math_span = remove_start_tag(xb_math_span_ori)
        spans = get_spans(xb_math_span)
        root_node = Node(spans)
        parse(root_node)
        latex = str(root_node)
        if raw is False:
            latex_span = '<span class="afanti-latex">\\( {} \\)</span>'.format(
                latex)
            html_string = html_string.replace(xb_math_span_ori, latex_span)
        else:
            html_string = html_string.replace(xb_math_span_ori,
                                              '\\( {} \\)'.format(latex))
    return html_string
def parse(self, html_string, url, aft_subj_id):
    """Parse one question page into the column dict.

    Topic / option / answer / analysis elements are collected in document
    order.  Answers and analyses are prefixed with ``(q). `` once the topic
    counter is above zero.  Exam year and paper name come from the first
    topic via ``get_exam_info``.  Each group is joined with ``<br>\\n`` and
    passed through ``bewitch``, ``center_image``, ``fix_any`` and
    ``displaystyle`` before ``Question(...).normialize()`` produces the
    ``*_origin`` columns.

    :param html_string: page HTML.
    :param url: page URL, stored as ``spider_url`` and given to ``bewitch``.
    :param aft_subj_id: stored as ``subject``.
    :return: dict of columns.
    """
    question_parts, answer_parts, fenxi_parts = [], [], []
    # class marker -> (extractor, bucket); the first marker found in the
    # element's first 30 characters wins.
    handlers = {
        '"IsTopic"': (get_question_html, question_parts),
        '"optionoption"': (get_question_html, question_parts),
        '"Answer"': (get_answer_all_html, answer_parts),
        '"Analytical"': (get_fenxi, fenxi_parts),
    }

    exam_year = 0
    paper_name = ''
    topic_index = -1
    elems = get_html_element(
        '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
        html_string, regex=True)
    for elem in elems:
        head = elem[:30]
        marker = next((key for key in handlers if key in head), None)
        if marker is None:
            continue
        extractor, bucket = handlers[marker]
        entity = extractor(elem)
        if topic_index > 0 and marker in ('"Answer"', '"Analytical"'):
            entity = '({}). {}'.format(topic_index, entity)
        if topic_index == -1 and marker == '"IsTopic"':
            exam_year, paper_name = get_exam_info(entity)
            entity = remove_exam_info(entity)
        bucket.append(entity)
        if marker == '"IsTopic"':
            topic_index += 1

    def _polish(parts):
        # Shared clean-up pipeline for question, answer and analysis.
        html = '<br>\n'.join(parts)
        html = self.html_magic.bewitch(html, spider_url=url)
        html = center_image(html)
        html = fix_any(html)
        return displaystyle(html, latex=False, mml=True)

    question_html = _polish(question_parts)
    answer_all_html = _polish(answer_parts)
    fenxi = _polish(fenxi_parts)

    cols = {
        'difficulty': get_difficulty(html_string),
        'question_type_str': get_question_type_str(html_string),
        'question_html': '',
        'option_html': '',
        'answer_all_html': '',
        'jieda': '',
        'fenxi': '',
        'dianping': '',
        'option_html_origin': '',
        'jieda_origin': '',
        'dianping_origin': '',
        'zhuanti': '',
        'paper_name': paper_name,
        'paper_url': '',
        'spider_url': url,
        'subject': aft_subj_id,
        'spider_source': 56,
        'question_type': 0,
        'question_quality': 0,
        'knowledge_point': '',
        'exam_year': exam_year,
        'exam_city': '',
    }

    standard_question = Question(
        question_body=question_html,
        answer=answer_all_html,
        analy=fenxi,
    ).normialize()
    cols['question_html_origin'] = standard_question['question_body']
    cols['answer_all_html_origin'] = standard_question['answer']
    cols['fenxi_origin'] = standard_question['analy']
    return cols
def beautify_html(html_string):
    """Aggressively clean up an HTML fragment.  Use with caution.

    Steps, in order: strip; remove style tags; remove ``<!...>``
    declarations/comments; drop heading/strong/font/em/namespaced/xml tags;
    unwrap ``<b>``; remove anchors; turn super/sub/underline spans into
    ``<sup>`` / ``<sub>`` / ``<u>``; unwrap font-family spans; centre
    images; remove empty elements; limit ``&nbsp;`` runs.

    Changes from the previous version: regex literals containing ``\\w`` /
    ``\\s`` are now raw strings (same patterns, no invalid-escape warnings),
    the three near-identical span branches are merged, and the disabled
    (commented-out) steps were dropped.

    :param html_string: HTML fragment to clean.
    :return: the cleaned HTML string.
    """
    html_string = html_string.strip()

    # remove style tag
    html_string = remove_style_tag(html_string)

    # remove comment
    html_string = re.sub(r'<![^<>]+>', '', html_string)

    # remove h1, strong, font, em, namespaced (word) tags and xml
    html_string = re.sub(
        r'<(/|)(h\d*|strong|font|em|[\w]+:[\w]+|xml)[^<>]*>', '',
        html_string, flags=re.I)

    # remove b
    html_string = remove_tag('<b>', html_string, flags=re.I)

    # remove a
    html_string = remove_a_tag(html_string)

    # fix super and sub tag
    tags = get_html_element(
        r'<span [^<>]+[^\w<>](super|sub|underline)[^\w<>]',
        html_string, regex=True, flags=re.I)
    for tag in tags:
        lowered = tag.lower()
        if 'super' in lowered:
            new_name = 'sup'
        elif 'underline' in lowered:
            new_name = 'u'
        else:
            new_name = 'sub'
        # [:-7] cuts the trailing '</span>' before the new end tag is added.
        text = re.sub(r'<span [^<>]+>', '<%s>' % new_name, tag,
                      flags=re.I)[:-7] + '</%s>' % new_name
        html_string = html_string.replace(tag, text, 1)

    # remove verbose (font-family) spans
    html_string = remove_tag(r'<span (?:style\s*=[^<>]+?font-family)',
                             html_string, regex=True, flags=re.I)

    html_string = center_image(html_string)

    # NOTE: p / div styles are deliberately NOT removed.

    # remove empty elements
    html_string = remove_empty_elements(html_string)

    # remove more &nbsp;
    html_string = limit_nbsp(html_string)

    return html_string
async def get_pages(self, info):
    """Poll the paged question listing forever and save unseen questions.

    Pages are requested 100 items at a time (``skip = page_num * 100``).
    Every ``Problems_item`` whose key is not archived is stored with
    ``save_html``.  After more than 30 consecutive pages without a new
    question the scan sleeps ``INTERNAL`` seconds and restarts at page 0.

    Changes from the previous version: ``with await self.lock`` (removed in
    Python 3.9) is now ``async with self.lock``, and the deprecated
    ``logging.warn`` is now ``logging.warning``.

    :param info: dict describing the listing; must contain ``key``, and the
        item built from it must provide ``aft_subj_id``.
    :return: never returns normally (infinite loop).
    """
    no_new_question = 0
    page_num = 0
    N = 0
    while True:
        if no_new_question > 30:
            # Nothing new for a while: pause, then rescan from page 0.
            no_new_question = 0
            page_num = 0
            await asyncio.sleep(INTERNAL)
            continue

        ninfo = dict(info)
        ninfo['skip'] = page_num * 100
        item = make_page_item(ninfo)
        logging.info('[get_pages]: {}, {}'.format(info['key'], page_num))
        # Hard-coded proxy; previously _proxy.get(server_id=105).
        item.proxy = 'http://' + '119.7.227.133:9990'
        item.cookies = self.cookies

        # The lock is held across the 10 s sleep and the request.
        async with self.lock:
            await asyncio.sleep(10)
            resp = await self.async_web_request(item, check_html=check_pg)
        if not (resp and resp.content):
            continue

        html_string = resp.text
        if not N:
            # The count is read from the text between '</div>|*|' and the
            # next '|'.
            s = html_string.rfind('</div>|*|') + len('</div>|*|')
            e = html_string.find('|', s)
            qs_num = html_string[s:e]
            if not qs_num:
                logging.warning('not qs_num: {}'.format(
                    json.dumps(item.json(), ensure_ascii=False)))
                continue
            N = int(qs_num) + 100

        if page_num * 100 > N:
            # NOTE(review): page_num is not reset here, so once past the end
            # this branch repeats on every iteration - confirm intended.
            await asyncio.sleep(INTERNAL)
            continue

        questions = get_html_element('<div [^<>]*class="Problems_item"',
                                     html_string, regex=True)
        has_qs = False
        for qs in questions:
            s = qs.find('<tt>') + 4
            e = qs.find('</tt>')
            qid = qs[s:e]
            hkey = 'dz101_question_{}'.format(qid)
            if is_archived(hkey):
                continue
            has_qs = True
            logging.info('[question]: {}, {}'.format(info['key'], hkey))
            save_html(hkey, qs, ninfo['aft_subj_id'], ninfo)

        if not has_qs:
            no_new_question += 1
        else:
            no_new_question = 0
        page_num += 1
        logging.info('[page done]')
def find_mathml_elems(html_string, with_tag=True):
    """Return the ``mfrac`` / ``msubsup`` / ``munder`` elements in the HTML.

    :param html_string: HTML fragment to search (case-insensitive).
    :param with_tag: forwarded to ``get_html_element``.
    :return: whatever ``get_html_element`` returns for the pattern.
    """
    pattern = '<(mfrac|msubsup|munder)'
    return get_html_element(pattern, html_string, with_tag=with_tag,
                            regex=True, flags=re.I)