Beispiel #1
0
def fix_any(html_string):
    """Wrap MathML elements and rewrite emphasis-dot spans.

    Each distinct ``<math`` element returned by ``get_html_element`` is
    wrapped in ``<span class="afanti-latex">``.  Each
    ``<span class="founderdotem">`` element is replaced by a
    ``<bdo class="aft_underpoint">`` element holding the result of
    ``remove_start_tag`` for that span.

    Returns the rewritten HTML string.
    """
    for math_elem in set(get_html_element('<math', html_string, flags=re.I)):
        wrapped = '<span class="afanti-latex">{}</span>'.format(math_elem)
        html_string = html_string.replace(math_elem, wrapped)

    # Characters carrying an emphasis dot.
    dotted_spans = get_html_element('<span class="founderdotem">', html_string)
    for dotted in dotted_spans:
        inner = remove_start_tag(dotted)
        replacement = '<bdo class="aft_underpoint">{}</bdo>'.format(inner)
        html_string = html_string.replace(dotted, replacement)

    return html_string
Beispiel #2
0
def format_spans(html_string):
    """Normalise decorated ``<span>`` elements and strip all other spans.

    Spans whose start tag mentions ``text-decoration`` or ``vertical-align``
    are rewritten (underline / line-through templates, ``<sub>``, ``<sup>``
    or just their inner content).  Afterwards every remaining ``<span``
    element is unwrapped with ``remove_tag``.

    The UNDERLINE / LINE_THROUGH templates are emitted with a temporary
    ``<sspan>`` tag name and renamed back to ``<span>`` at the very end, so
    the literal ``<span`` does not occur in them while spans are stripped.
    """
    def _mask(template):
        # Temporarily rename span tags inside a template.
        return template.replace('<span', '<sspan')\
                       .replace('</span>', '</sspan>')

    masked_line_through = _mask(LINE_THROUGH)
    masked_underline = _mask(UNDERLINE)

    found = get_html_element('<span [^<>]+(text-decoration|vertical-align)',
                             html_string,
                             regex=True,
                             flags=re.I)

    # Distinct spans, longest first.
    candidates = sorted(set(found), key=len, reverse=True)

    for span in candidates:
        inner = remove_start_tag(span)
        start_tag = span[:span.find('>')].lower()

        if 'text-decoration' in start_tag:
            if 'underline' in start_tag:
                replacement = masked_underline.format(inner)
            elif 'none' in start_tag:
                replacement = inner
            elif 'line-through' in start_tag:
                replacement = masked_line_through.format(inner)
            else:
                # Unrecognised decoration: leave the span as it is.
                continue
        elif 'vertical-align' in start_tag:
            if ':sub' in start_tag:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in start_tag:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
        else:
            continue

        html_string = html_string.replace(span, replacement)

    # Unwrap spans until get_html_element no longer reports any.
    html_string = remove_tag('<span', html_string, all=False, flags=re.I)
    while get_html_element('<span', html_string):
        html_string = remove_tag('<span', html_string, all=False, flags=re.I)

    # Restore the real tag name of the generated markup.
    html_string = html_string.replace('<sspan', '<span')\
                             .replace('</sspan>', '</span>')

    return html_string
Beispiel #3
0
def convert_img_to_latex(html_string):
    """Replace images of big math operators with their LaTeX command.

    Every ``<img `` tag returned by ``get_html_element`` is checked with
    ``_is_in`` against the marker tuples below (two hash-like strings and a
    ``/part/<code point>.png`` path per symbol).  A matching tag is replaced
    by the corresponding LaTeX command followed by a space.

    Returns the rewritten HTML string.
    """
    # (markers identifying the image, LaTeX replacement)
    symbol_table = (
        # ∑ (code point 8721)
        (('19d3e9593386103f95e71affc87e62ea',
          '5f4100b557a7c8116b2a45e4435b67ae', '/part/8721.png'),
         '\\sum '),
        # ∏ (code point 8719)
        (('b9331d3ee2218a431c9203512001f479',
          '9b0bbd95adbebda854a4ec3b1c2ab2e6', '/part/8719.png'),
         '\\prod '),
        # ∫ (code point 8747)
        (('7ea7ce25490319b1bc0a30f02283c465',
          '3d579d20afec8779d54985a5acf51879', '/part/8747.png'),
         '\\int '),
        # ⋃ (code point 8746)
        (('c7fdd6777f4de5a1f490f96c17a414b3',
          'cd0da1bb4f8a3b0e0656578988afb49a', '/part/8746.png'),
         '\\bigcup '),
        # ⋂ (code point 8745): intersection is \bigcap, not \bigcup
        (('429c05d3df51962ccb70c6a9306e78ff',
          '6c9dd31c5750dd38b991f3616df1517c', '/part/8745.png'),
         '\\bigcap '),
    )

    imgs = get_html_element('<img ', html_string, only_tag=True)
    for img in imgs:
        # Every entry is tested (no early exit), as in the original chain of
        # independent ``if`` statements.
        for markers, latex in symbol_table:
            if _is_in(markers, img):
                html_string = html_string.replace(img, latex)

    return html_string
Beispiel #4
0
def handle_mathml(html_string, uri2oss, url):
    """Replace every MathML element with an image of its rendered LaTeX.

    The ``<math`` elements are converted to LaTeX (``to_latexes`` followed by
    ``fix_latex``), rendered to PNG files under ``working/latex_imgs/`` named
    after ``md5_string(latex)``, and each element is replaced by::

        <span data-latex="base64,..."><img ... class="afanti_latex"></span>

    :param html_string: HTML that may contain ``<math`` elements.
    :param uri2oss: object whose ``convert(name, 56)`` yields the image URL.
    :param url: only used in the warning logged when rendering fails.
    :return: the rewritten HTML, or ``False`` as soon as one formula could
        not be rendered.
    """
    img_dir = 'working/latex_imgs/'
    mathmls = get_html_element('<math', html_string)
    latexes = [fix_latex(lt) for lt in to_latexes(mathmls)]
    png_paths = [img_dir + md5_string(latex) + '.png' for latex in latexes]
    png_results = to_pngs(latexes, png_paths, check=False)
    for latex, mathml, png_path, png_result in zip(latexes, mathmls, png_paths,
                                                   png_results):
        if png_result is False:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning('latex2png:{} {}'.format(url, latex))
            return False

        w, h = get_image_size(png_path)

        latex_base64 = compat_base64.b64encode(latex.encode('utf-8')).decode()

        span = '<span data-latex="base64,{}">'.format(latex_base64)
        md5_name = os.path.basename(png_path)
        # NOTE(review): 56 is presumably the spider-source id -- confirm.
        oss_img_url = uri2oss.convert(md5_name, 56)
        # The attribute was previously misspelled "heigh", which is not a
        # valid HTML attribute.
        img = span + ('<img src="{}" width="{}" height="{}" '
                      'class="afanti_latex"></span>'.format(
                          oss_img_url, w // 2 + 2, h // 2 + 2))
        html_string = html_string.replace(mathml, img)

    return html_string
Beispiel #5
0
def displaystyle(html_string, latex_tag=None, regex=False,
                 flags=re.U, latex=True, mml=True):
    """
    Apply display style to LaTeX snippets and MathML elements.

    With ``latex is True`` the snippets are taken from ``latex_tag`` elements
    when a tag is given, otherwise from ``find_latexes`` (only if
    ``_re_displaystyle_target`` matches); each distinct snippet is replaced
    by its ``_displaystyle`` form after
    ``_discard_latex_displaystyle_for_subsup``.

    With ``mml is True`` and a ``<math`` in the input, every element found by
    ``find_mathml_elems`` is wrapped in ``<mstyle displaystyle="true">`` and
    the result goes through ``_discard_mathml_displaystyle_for_subsup``.
    """
    if latex is True:
        if latex_tag:
            snippets = get_html_element(latex_tag, html_string,
                                        regex=regex, flags=flags)
        elif _re_displaystyle_target.search(html_string):
            snippets = find_latexes(html_string)
        else:
            snippets = []

        for snippet in set(snippets):
            styled = _displaystyle(snippet)
            styled = _discard_latex_displaystyle_for_subsup(styled)
            html_string = html_string.replace(snippet, styled)

    if mml is True and '<math' in html_string.lower():
        unique_elems = list(set(find_mathml_elems(html_string, with_tag=True)))

        # Ordered with sort_by_len(..., reverse=True) before wrapping.
        for elem in sort_by_len(unique_elems, reverse=True):
            wrapped = ('<mstyle displaystyle="true">{}</mstyle>').format(elem)
            html_string = html_string.replace(elem, wrapped)
        html_string = _discard_mathml_displaystyle_for_subsup(html_string)

    return html_string
Beispiel #6
0
def handle_spans(html_string):
    """Rewrite ``<span>`` elements carrying text-decoration / vertical-align.

    * ``text-decoration`` with ``underline`` -> ``UNDERLINE`` template
    * ``text-decoration`` with ``none`` -> inner content only
    * ``text-decoration`` with ``line-through`` -> ``LINE_THROUGH`` template
    * ``vertical-align`` with ``:sub`` / ``:sup`` -> ``<sub>`` / ``<sup>``
    * any other ``vertical-align`` -> inner content only

    Spans with an unrecognised ``text-decoration`` value are left untouched.
    """
    matched = get_html_element('<span [^<>]+(text-decoration|vertical-align)',
                               html_string,
                               regex=True,
                               flags=re.I)

    # Distinct spans, longest first.
    ordered = sorted(set(matched), key=len, reverse=True)
    for span in ordered:
        inner = remove_start_tag(span)
        head = span[:span.find('>')].lower()

        if 'text-decoration' in head:
            if 'underline' in head:
                html_string = html_string.replace(span,
                                                  UNDERLINE.format(inner))
            elif 'none' in head:
                html_string = html_string.replace(span, inner)
            elif 'line-through' in head:
                html_string = html_string.replace(span,
                                                  LINE_THROUGH.format(inner))
            continue

        if 'vertical-align' in head:
            if ':sub' in head:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in head:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
            html_string = html_string.replace(span, replacement)

    return html_string
Beispiel #7
0
def remove_empty_elements(html_string, filter=None):
    """Remove elements that hold no text once tags and blanks are stripped.

    :param html_string: HTML to clean.
    :param filter: optional callable taking the element string; return a
        true value to remove the (empty) element, a false value to keep it.
        When omitted, the built-in filter keeps elements containing
        ``aft_``, ``afanti_``, ``<u>`` or ``<img `` (case-insensitive).
        (The parameter name shadows the builtin; it is kept for callers.)
    :return: the cleaned HTML, stripped of surrounding whitespace.
    """
    def _default_filter(elem):
        elem = elem.lower()
        protected = ('aft_' in elem
                     or 'afanti_' in elem
                     or '<u>' in elem
                     or '<img ' in elem)
        return not protected

    # The original wrote ``_filter or filter``; the inner function is always
    # truthy, so a caller-supplied filter was silently ignored.
    should_remove = filter or _default_filter

    elems = get_html_element('<([a-zA-Z][a-zA-Z0-9:]*)',
                             html_string,
                             regex=True)
    # Distinct elements, longest first.
    elems = sorted(set(elems), key=len, reverse=True)
    for elem in elems:
        elem_text = re_tag.sub('', elem)
        elem_text = re_empty_str.sub('', elem_text)
        if elem_text:
            continue
        if not should_remove(elem):
            continue
        html_string = html_string.replace(elem, '')

    return html_string.strip()
Beispiel #8
0
def remove_tag(tag,
               html_string,
               regex=False,
               flags=re.U,
               all=False,
               check=None):
    '''
    Unwrap or delete the elements selected by ``tag``.

    When ``all`` is true the whole element (text included) is removed;
    otherwise only its leading start tag and trailing end tag are dropped.
    ``check``, when given, is called with each element; a return value of
    ``False`` leaves that element untouched.
    '''

    # Cheap pre-check so nothing is looked up when the tag cannot be there.
    if regex is False:
        present = tag.lower() in html_string.lower()
    else:
        present = re.search(tag, html_string, flags=flags) is not None
    if not present:
        return html_string

    for element in get_html_element(tag, html_string, regex=regex,
                                    flags=flags):
        if check is not None and check(element) is False:
            continue

        if all:
            replacement = ''
        else:
            without_start = re.sub(r'^<[^<>]+>', '', element)
            replacement = re.sub(r'</[^<>]+>$', '', without_start)

        html_string = html_string.replace(element, replacement)
    return html_string
Beispiel #9
0
def center_image(html_string):
    """Rebuild every ``<img`` tag with a vertical-align: middle style.

    Only the ``src`` attribute and, when present, the width and height
    (given either as attributes or as CSS declarations) are carried over
    into the new tag.  Tags without a usable ``src`` are left unchanged.
    """
    def _size_attr(pattern, tag):
        # Extract a width/height and normalise it to name="value" form.
        found = re.search(pattern, tag, flags=re.I)
        if not found:
            return ''
        attr = re.sub(r'\s*:\s*', '=', found.group(1), count=1)
        if '"' not in attr and '\'' not in attr:
            attr = attr.replace('=', '="', 1) + '"'
        return attr

    for img in get_html_element('<img', html_string, only_tag=True,
                                flags=re.I):
        try:
            src = re.search(r"""src\s*=\s*["'][^"'<>]+?["']""",
                            img,
                            flags=re.I).group()
        except Exception:
            # No src attribute found (or the tag is unusable): skip it.
            continue

        width = _size_attr(r'\W(width\s*(:|=)\s*[^<>]+?)(;|\s|/|>)', img)
        height = _size_attr(r'\W(height\s*(:|=)\s*[^<>]+?)(;|\s|/|>)', img)

        attrs = ' '.join((src, width, height)).strip()
        html_string = html_string.replace(
            img, '<img %s style="vertical-align: middle;">' % attrs)

    return html_string
Beispiel #10
0
def get_answer_all_html(entity):
    """Extract the answer text from an ``<li class="Answer">`` entity.

    ``<XHTML`` wrappers are unwrapped with ``remove_tag`` and the first
    '【答案】' ("answer") label is dropped.

    :return: the cleaned answer, or ``''`` when the element is missing or
        empty.
    """
    found = get_html_element('<li class="Answer">', entity, with_tag=False)
    # Indexing an empty result used to raise before the fallback could run.
    if not found or not found[0]:
        return ''

    ans = remove_tag('<XHTML', found[0], all=False).strip()
    ans = ans.replace('【答案】', '', 1)
    return ans.strip()
Beispiel #11
0
def get_fenxi(entity):
    """Extract the analysis text from an ``<li class="Analytical">`` entity.

    ``<XHTML`` wrappers are unwrapped with ``remove_tag`` and the first
    '【解析】' ("analysis") label is dropped.

    :return: the cleaned analysis, or ``''`` when the element is missing or
        empty.
    """
    found = get_html_element('<li class="Analytical">', entity,
                             with_tag=False)
    # Indexing an empty result used to raise before the fallback could run.
    if not found or not found[0]:
        return ''

    fx = remove_tag('<XHTML', found[0], all=False).strip()
    fx = fx.replace('【解析】', '', 1)
    return fx.strip()
Beispiel #12
0
def restore_src(html_string):
    """Rename the ``src-base64=`` attribute of ``<img`` tags back to ``src=``.

    For each matching tag only the first ``src-base64=`` inside the tag is
    renamed, and only the first occurrence of that tag in the document is
    replaced.
    """
    tags = get_html_element('<img [^<>]*src-base64=',
                            html_string,
                            regex=True,
                            only_tag=True)
    for tag in tags:
        restored = tag.replace('src-base64=', 'src=', 1)
        html_string = html_string.replace(tag, restored, 1)
    return html_string
Beispiel #13
0
 def get_jieda(self, html_string):
     """Return the cleaned content of the first ``<font color=red>`` element.

     The content goes through ``self.fix_any``, ``center_image`` and
     ``self.html_magic.bewitch``; a trailing ``</p>`` is cut off when the
     result ends with ``</div></p>``.
     """
     answer = get_html_element('<font color=red>', html_string,
                               with_tag=False, limit=1)[0]
     answer = self.fix_any(answer)
     answer = center_image(answer)
     answer = self.html_magic.bewitch(answer, spider_url=self.url)
     if answer.endswith('</div></p>'):
         # Drop the four characters of the closing "</p>".
         answer = answer[:-len('</p>')]
     return answer.strip()
Beispiel #14
0
    def test_get_html_element(self):
        # A tuple of selector dicts is passed instead of a single tag; the
        # first result must be the text ' target '.
        selectors = (
            dict(e='<p>', with_tag=False),
            dict(e='<div', with_tag=False),
            dict(e='<p>', with_tag=False),
        )
        first_match = get_html_element(selectors, self.html_string)[0]

        self.assertEqual(first_match, ' target ')
Beispiel #15
0
def make_option(entity):
    """Render the ``<span class="option">`` contents as an option table.

    Each option becomes one ``<tr><td class="aft_option">`` row whose
    ``data`` attribute is ``OPTION_DICT[index]`` for the option's position.
    """
    row_template = '<tr><td class="aft_option" data="{}">{}</td></tr>'
    table_template = ('<table class="aft_option_wrapper" '
                      'style="width: 100%;">'
                      '<tbody class="measureRoot">{}</tbody></table>')

    options = get_html_element('<span class="option">', entity, with_tag=False)
    rows = []
    for index, cell in enumerate(options):
        rows.append(row_template.format(OPTION_DICT[index], cell))
    return table_template.format(''.join(rows))
Beispiel #16
0
def get_question_type_str(html_string):
    """Return the question type found in the first ``<div class="T">``.

    The type is the text between ``type">`` and ``</tt>``; ``''`` is
    returned when that pattern does not occur.
    """
    header = get_html_element('<div class="T">',
                              html_string,
                              with_tag=False,
                              limit=1)[0]
    match = re.search('type">(.+?)</tt>', header)
    return match.group(1) if match else ''
Beispiel #17
0
 def get_question_html(self, html_string):
     """Return the cleaned contents of all ``<div class="content">`` elements.

     Every content goes through ``abs_url``, ``center_image`` and
     ``self.html_magic.bewitch``; the second entry additionally goes through
     ``self.fix_any`` and has carriage returns removed.
     """
     def _clean(fragment):
         fragment = abs_url(fragment)
         fragment = center_image(fragment)
         fragment = self.html_magic.bewitch(fragment, spider_url=self.url)
         return fragment.strip()

     contents = get_html_element('<div class="content">', html_string,
                                 with_tag=False)
     results = [_clean(content) for content in contents]
     # The entry at index 1 gets the extra fix_any pass.
     results[1] = self.fix_any(results[1]).replace('\r', '').strip()
     return results
Beispiel #18
0
def get_difficulty(html_string):
    """Return the difficulty found in the first ``<div class="T">``.

    The text following ``difficulty">`` is looked up in ``DIFFS`` (0 when it
    is not a known key); ``''`` is returned when the pattern does not occur.
    """
    header = get_html_element('<div class="T">',
                              html_string,
                              with_tag=False,
                              limit=1)[0]
    match = re.search('difficulty">(.+?)<', header)
    if match is None:
        return ''
    return DIFFS.get(match.group(1), 0)
Beispiel #19
0
 def get_question_html(self, html_string):
     """Return the cleaned question body.

     The body is the content of the first ``<div`` element, or the result of
     ``remove_start_tag`` on the whole input when no div is found.  It then
     goes through ``self.fix_any``, ``center_image``,
     ``self.html_magic.bewitch`` and ``self.format_options``.
     """
     matches = get_html_element('<div', html_string, with_tag=False, limit=1)
     body = matches[0] if matches else remove_start_tag(html_string)

     body = self.fix_any(body)
     body = center_image(body)
     body = self.html_magic.bewitch(body, spider_url=self.url)
     body = self.format_options(body)
     return body.strip()
Beispiel #20
0
    def get_render_html(self, raw_render_html):
        """Return the cleaned content of the MathJax render div.

        :param raw_render_html: HTML expected to contain
            ``<div id="-mathjax-render-div-">``.
        :return: the div content after ``self.remove_some_elem``, stripped.
        :raises ParserError: when the div is not found.
        """
        elem = get_html_element('<div id="-mathjax-render-div-">',
                                raw_render_html,
                                with_tag=False,
                                limit=1)
        if not elem:
            # Message typo fixed ("Cant't" -> "Can't").
            raise ParserError('Can\'t find <div id="-mathjax-render-div-">')

        return self.remove_some_elem(elem[0]).strip()
Beispiel #21
0
def get_spans(html_string):
    """Return ``Span`` objects for the ``<span `` / ``<img `` elements.

    ``<img `` tags are first given an explicit ``</img>`` end tag, then the
    elements found by ``find_valid_elements`` are each wrapped in ``Span``.
    """
    imgs = get_html_element('<img ', html_string, only_tag=True)

    # str.replace rewrites every occurrence at once, so a tag that is listed
    # twice must be handled only once; otherwise it would end up with
    # several "</img>" end tags.
    handled = set()
    for img in imgs:
        if img in handled:
            continue
        handled.add(img)
        html_string = html_string.replace(img, img + '</img>')

    spans = find_valid_elements(html_string, '<(span|img) ', with_tag=True,
                                regex=True)
    return [Span(span) for span in spans]
Beispiel #22
0
def get_question_html(entity):
    """Extract the question (or option) HTML from an entity.

    Entities starting with ``<li`` are read from ``<li class="IsTopic">``,
    all others from ``<span class="optionoption">``.  ``<XHTML`` wrappers are
    unwrapped; entities starting with ``<span`` are additionally rendered
    with ``make_option``.

    :return: the cleaned HTML, or ``''`` when the element is missing or
        empty.
    """
    if entity.startswith('<li'):
        selector = '<li class="IsTopic">'
    else:
        selector = '<span class="optionoption">'

    found = get_html_element(selector, entity, with_tag=False, limit=1)
    # Indexing an empty result used to raise before the fallback could run.
    if not found or not found[0]:
        return ''

    qs = remove_tag('<XHTML', found[0], all=False).strip()

    if entity.startswith('<span'):
        qs = make_option(qs)

    return qs.strip()
Beispiel #23
0
def find_table_options(html_string):
    """
    Find the <table> elements whose <td> contents start with A, B, C, D ...

    Returns a list of ``[table, tds]`` pairs; tables with fewer than three
    cells are ignored.
    """
    results = []
    for table in find_valid_elements(html_string, '<table', flags=re.I):
        cells = get_html_element('<td', table, with_tag=False, flags=re.I)
        if len(cells) >= 3:
            # Compare on the tag-free, stripped cell text.
            plain_cells = [re_tag.sub('', cell).strip() for cell in cells]
            if _startswith_abcd(plain_cells):
                results.append([table, cells])
    return results
Beispiel #24
0
    def no_table_format(html_string):
        """Convert the remaining inline HTML of a formula into LaTeX.

        * operator images -> LaTeX commands (``convert_img_to_latex``)
        * ``∏limit{s}`` / ``πlimit{s}`` / ``∑limit{s}`` / ``∫limit{s}`` ->
          the operator followed by ``\\limits``
        * ``%`` -> `` \\%``
        * underpoint ``<bdo>`` elements -> ``\\underset{˙}{...}``
        * sub / sup spans -> ``_{...}`` / ``^{...}``
        * bare ``<span>`` / ``<font>`` tags are removed

        Returns the converted string.
        """
        if '<img' in html_string:
            html_string = convert_img_to_latex(html_string)

        # Raw strings: '\p', '\l', '\s' and '\i' are invalid escape
        # sequences in ordinary literals (same runtime value, no warning).
        html_string = html_string.replace('∏limit{s}', r'\prod\limits ')
        html_string = html_string.replace('πlimit{s}', r'\prod\limits ')
        html_string = html_string.replace('∑limit{s}', r'\sum\limits ')
        html_string = html_string.replace('∫limit{s}', r'\int\limits ')

        html_string = html_string.replace('%', ' \\%')

        if 'underpoint' in html_string:
            underpoints = get_html_element('<bdo [^<>]+underpoint',
                                           html_string,
                                           with_tag=True,
                                           regex=True,
                                           flags=re.I)

            for underpoint in set(underpoints):
                t = remove_start_tag(underpoint)
                underpoint_tex = '\\underset{{˙}}{{{}}}'.format(t)
                html_string = html_string.replace(underpoint, underpoint_tex)

        while True:
            spans = find_valid_elements(html_string, '<span ', flags=re.I)
            if not spans:
                break

            before = html_string
            for span in spans:
                index = span.find('>') + 1
                if 'vertical-align:sub' in span[:index]:
                    n_span = remove_start_tag(span)
                    html_string = html_string.replace(span, '_{%s}' % n_span,
                                                      1)
                elif 'vertical-align:sup' in span[:index]:
                    n_span = remove_start_tag(span)
                    html_string = html_string.replace(span, '^{%s}' % n_span,
                                                      1)

            if html_string == before:
                # Nothing was converted in this pass (the remaining spans are
                # neither sub nor sup); another pass would find the very same
                # spans and the loop would never end.
                break

        html_string = re.sub(r'<(span|font)>', '', html_string, flags=re.I)
        html_string = re.sub(r'</(span|font)>', '', html_string, flags=re.I)

        return html_string
Beispiel #25
0
def to_latex(html_string, raw=False, md5=False):
    """Convert ``mathtag="math`` spans that contain tables into LaTeX.

    Spans without a ``<table`` are simply replaced by the result of
    ``remove_start_tag``.  The others are parsed into a ``Node`` tree,
    converted with ``convert`` and post-processed with
    ``Node.no_table_format``.

    :param html_string: HTML to convert.
    :param raw: when ``False`` the LaTeX is wrapped in
        ``<span class="afanti-latex">\\( ... \\)</span>``; otherwise only in
        ``\\( ... \\)``.
    :param md5: when ``True``, ``md5`` is set to ``True`` on the root node.
    :return: tuple ``(converted html, list of LaTeX strings)``.
    """
    jy_math_span_list = get_html_element('<span [^<>]*?mathtag="math',
                                         html_string,
                                         regex=True,
                                         with_tag=True,
                                         flags=re.I)

    latexes = []

    for jy_math_span_ori in jy_math_span_list:
        jy_math_span = remove_start_tag(jy_math_span_ori)

        # Without a <table there is nothing to convert.
        if '<table' not in jy_math_span.lower():
            html_string = html_string.replace(jy_math_span_ori, jy_math_span)
            continue

        # Expand self-closing <td .../> into <td ...></td>.
        jy_math_span = re.sub(r'(<td[^<>]*)/>',
                              r'\1></td>',
                              jy_math_span,
                              flags=re.I)
        root_node = Node()
        if md5 is True:
            root_node.md5 = True
        root_node.node_format = '{}'
        root_node.value_strs = [jy_math_span]
        parse(root_node)
        latex = convert(root_node).strip()
        latex = Node.no_table_format(latex)

        latexes.append(latex)

        # Raw strings: '\(' and '\)' are invalid escape sequences in
        # ordinary literals (same runtime value, no warning).
        if raw is False:
            replacement = r'<span class="afanti-latex">\( {} \)</span>'.format(
                latex)
        else:
            replacement = r'\( {} \)'.format(latex)
        html_string = html_string.replace(jy_math_span_ori, replacement)

    return html_string.strip(), latexes
Beispiel #26
0
def to_latex(html_string, raw=False):
    """Convert ``math-model`` spans into LaTeX.

    Each matching span is split with ``get_spans``, parsed into a ``Node``
    tree, and replaced by ``str(root_node)`` wrapped in ``\\( ... \\)``.

    :param html_string: HTML to convert.
    :param raw: when ``False`` the LaTeX is additionally wrapped in
        ``<span class="afanti-latex">``.
    :return: the converted HTML string.
    """
    xb_math_span_list = get_html_element('<span [^<>]*?math-model',
                                         html_string,
                                         regex=True,
                                         with_tag=True,
                                         flags=re.I)

    for xb_math_span_ori in xb_math_span_list:
        xb_math_span = remove_start_tag(xb_math_span_ori)

        spans = get_spans(xb_math_span)
        root_node = Node(spans)
        parse(root_node)

        latex = str(root_node)

        # Raw strings: '\(' and '\)' are invalid escape sequences in
        # ordinary literals (same runtime value, no warning).
        if raw is False:
            replacement = r'<span class="afanti-latex">\( {} \)</span>'.format(
                latex)
        else:
            replacement = r'\( {} \)'.format(latex)
        html_string = html_string.replace(xb_math_span_ori, replacement)

    return html_string
Beispiel #27
0
    def parse(self, html_string, url, aft_subj_id):
        """Parse one question page into a column dict.

        The ``IsTopic`` / ``optionoption`` / ``Answer`` / ``Analytical``
        elements are collected into question, answer and analysis parts.
        Each group is joined with ``<br>``, cleaned, and normalised through
        ``Question(...).normialize()``.

        :param html_string: raw page HTML.
        :param url: stored as ``spider_url`` and passed to ``bewitch``.
        :param aft_subj_id: stored as ``subject``.
        :return: dict of column name -> value.
        """
        exam_year = 0
        paper_name = ''

        question_parts = []
        answer_parts = []
        fenxi_parts = []

        # class marker -> (extractor, list collecting its results)
        handlers = {
            '"IsTopic"': (get_question_html, question_parts),
            '"optionoption"': (get_question_html, question_parts),
            '"Answer"': (get_answer_all_html, answer_parts),
            '"Analytical"': (get_fenxi, fenxi_parts),
        }

        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)

        # q counts the IsTopic elements seen so far, starting at -1.
        q = -1
        for elem in elems:
            head = elem[:30]
            for key, (extract, bucket) in handlers.items():
                if key not in head:
                    continue

                entity = extract(elem)
                if q > 0 and key in ('"Answer"', '"Analytical"'):
                    # Prefix answers / analyses with the running number.
                    entity = '({}). {}'.format(q, entity)

                if q == -1 and key == '"IsTopic"':
                    # The first topic carries the exam information.
                    exam_year, paper_name = get_exam_info(entity)
                    entity = remove_exam_info(entity)

                bucket.append(entity)

                if key == '"IsTopic"':
                    q += 1
                break

        def _polish(parts):
            # Join the parts and run the shared clean-up pipeline.
            joined = '<br>\n'.join(parts)
            joined = self.html_magic.bewitch(joined, spider_url=url)
            joined = center_image(joined)
            joined = fix_any(joined)
            return displaystyle(joined, latex=False, mml=True)

        question_html = _polish(question_parts)
        answer_all_html = _polish(answer_parts)
        fenxi = _polish(fenxi_parts)

        cols = {
            'difficulty': get_difficulty(html_string),
            'question_type_str': get_question_type_str(html_string),

            'question_html': '',
            'option_html': '',
            'answer_all_html': '',
            'jieda': '',
            'fenxi': '',
            'dianping': '',

            'option_html_origin': '',
            'jieda_origin': '',
            'dianping_origin': '',

            'zhuanti': '',
            'paper_name': paper_name,
            'paper_url': '',
            'spider_url': url,
            'subject': aft_subj_id,
            'spider_source': 56,
            'question_type': 0,
            'question_quality': 0,
            'knowledge_point': '',
            'exam_year': exam_year,
            'exam_city': '',
        }

        standard_question = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        ).normialize()
        cols['question_html_origin'] = standard_question['question_body']
        cols['answer_all_html_origin'] = standard_question['answer']
        cols['fenxi_origin'] = standard_question['analy']

        return cols
Beispiel #28
0
def beautify_html(html_string):
    '''
    Clean up noisy HTML.  Use with caution.

    Steps, in order: strip surrounding whitespace, remove style tags,
    ``<!...>`` declarations/comments, heading / strong / font / em /
    namespaced / xml tags, ``<b>`` and ``<a>`` wrappers; turn super / sub /
    underline spans into ``<sup>`` / ``<sub>`` / ``<u>``; unwrap spans that
    only set a font family; normalise images; drop empty elements and limit
    ``&nbsp;`` runs.

    Returns the cleaned HTML string.
    '''
    html_string = html_string.strip()

    # remove style tag
    html_string = remove_style_tag(html_string)

    # remove comment
    html_string = re.sub(r'<![^<>]+>', '', html_string)

    # remove h1, strong, font, em, namespaced (word:tag) and xml tags
    # NOTE(review): "h\d*" followed by "[^<>]*" also matches any other tag
    # whose name starts with "h" (e.g. <hr>, <html>), and "em" also matches
    # <embed> -- confirm this is intended.
    html_string = re.sub(r'<(/|)(h\d*|strong|font|em|[\w]+:[\w]+|xml)[^<>]*>',
                         '',
                         html_string,
                         flags=re.I)

    # remove b
    html_string = remove_tag('<b>', html_string, flags=re.I)

    # remove a
    html_string = remove_a_tag(html_string)

    # fix super and sub tag
    # (raw string: '\w' is an invalid escape sequence in an ordinary literal)
    tags = get_html_element(r'<span [^<>]+[^\w<>](super|sub|underline)[^\w<>]',
                            html_string,
                            regex=True,
                            flags=re.I)
    for tag in tags:
        lowered = tag.lower()
        if 'super' in lowered:
            name = 'sup'
        elif 'underline' in lowered:
            name = 'u'
        else:
            name = 'sub'
        # [:-7] cuts the trailing "</span>" (7 characters) before the new
        # end tag is appended.
        text = re.sub(r'<span [^<>]+>', '<%s>' % name, tag,
                      flags=re.I)[:-7] + '</%s>' % name
        html_string = html_string.replace(tag, text, 1)

    # remove verbose span (spans that only set a font family)
    # (raw string: '\s' is an invalid escape sequence in an ordinary literal)
    html_string = remove_tag(r'<span (?:style\s*=[^<>]+?font-family)',
                             html_string,
                             regex=True,
                             flags=re.I)

    html_string = center_image(html_string)

    # DO NOT remove p, div style

    # remove empty elements
    html_string = remove_empty_elements(html_string)

    # remove more &nbsp;
    html_string = limit_nbsp(html_string)

    return html_string
Beispiel #29
0
    async def get_pages(self, info):
        """Poll the question list pages forever and archive new questions.

        Pages are requested 100 questions at a time (``skip = page_num *
        100``).  Every ``Problems_item`` div whose id is not archived yet is
        stored with ``save_html``.  After more than 30 consecutive pages
        without a new question the paging restarts from page 0 after
        sleeping ``INTERNAL`` seconds.

        :param info: dict describing the list to crawl; ``info['key']`` is
            only used for logging.  The coroutine never returns.
        """
        marker = '</div>|*|'
        no_new_question = 0
        page_num = 0
        N = 0  # upper bound of questions to page through; 0 = not known yet

        while True:
            if no_new_question > 30:
                no_new_question = 0
                page_num = 0
                await asyncio.sleep(INTERNAL)
                continue

            ninfo = dict(info)
            ninfo['skip'] = page_num * 100

            item = make_page_item(ninfo)

            logging.info('[get_pages]: {}, {}'.format(info['key'], page_num))

            # NOTE(review): hard-coded proxy; the original comment hints at
            # _proxy.get(server_id=105) as the intended source.
            item.proxy = 'http://' + '119.7.227.133:9990'
            item.cookies = self.cookies

            # "with await lock" was removed in Python 3.9; "async with" is
            # the supported way to hold an asyncio lock.
            async with self.lock:
                await asyncio.sleep(10)
                resp = await self.async_web_request(item, check_html=check_pg)
            if not (resp and resp.content):
                continue

            html_string = resp.text

            if not N:
                # The total is expected right after the last "</div>|*|",
                # terminated by the next "|".
                qs_num = ''
                pos = html_string.rfind(marker)
                if pos != -1:
                    s = pos + len(marker)
                    e = html_string.find('|', s)
                    qs_num = html_string[s:e]
                if not qs_num:
                    logging.warning('not qs_num: {}'.format(
                        json.dumps(item.json(), ensure_ascii=False)))
                    continue
                N = int(qs_num) + 100

            if page_num * 100 > N:
                # NOTE(review): page_num is not reset here, so this branch
                # repeats on every following iteration -- confirm intended.
                await asyncio.sleep(INTERNAL)
                continue

            questions = get_html_element('<div [^<>]*class="Problems_item"',
                                         html_string,
                                         regex=True)

            has_qs = False
            for qs in questions:
                # The question id is the text of the first <tt> element.
                s = qs.find('<tt>') + 4
                e = qs.find('</tt>')
                qid = qs[s:e]
                hkey = 'dz101_question_{}'.format(qid)

                if is_archived(hkey):
                    continue

                has_qs = True
                logging.info('[question]: {}, {}'.format(info['key'], hkey))
                save_html(hkey, qs, ninfo['aft_subj_id'], ninfo)

            if not has_qs:
                no_new_question += 1
            else:
                no_new_question = 0

            page_num += 1
            logging.info('[page done]')
Beispiel #30
0
def find_mathml_elems(html_string, with_tag=True):
    """Return the ``<mfrac``, ``<msubsup`` and ``<munder`` elements found.

    The lookup is case-insensitive; ``with_tag`` is passed straight through
    to ``get_html_element``.
    """
    return get_html_element('<(mfrac|msubsup|munder)', html_string,
                            with_tag=with_tag, regex=True, flags=re.I)