Ejemplo n.º 1
0
 def remove_some_elem(self, elem):
     """Strip MathJax rendering residue from an HTML fragment.

     In order: drops empty MathJax preview spans, empty ``style``
     attributes and the ``MJXc-processed`` marker by literal replacement;
     hands ``<script `` and MathJax preview openings to ``remove_tag``
     with ``all=True``; deletes every `` id="..."`` attribute that has a
     non-empty value; and finally returns the result of
     ``self.compress_class``.
     """
     # Literal residue, removed in the same order as it is listed.
     for junk in ('<span class="MathJax_Preview"></span>',
                  ' style=""',
                  'MJXc-processed'):
         elem = elem.replace(junk, '')

     for opening in ('<script ', '<span class="MathJax_Preview">'):
         elem = remove_tag(opening, elem, all=True)

     # Non-greedy so each id attribute is matched on its own.
     elem = re.sub(r' id=".+?"', '', elem)
     return self.compress_class(elem)
Ejemplo n.º 2
0
def get_fenxi(entity):
    """Return the analysis section of a question entity.

    Takes the first item returned by ``get_html_element`` for
    ``<li class="Analytical">`` (requested with ``with_tag=False``).
    When that item is empty, returns ``''``. Otherwise the item is passed
    through ``remove_tag('<XHTML', ..., all=False)``, the first
    occurrence of the leading label is dropped, and surrounding
    whitespace is stripped.
    """
    found = get_html_element('<li class="Analytical">',
                             entity,
                             with_tag=False)
    analysis = found[0]
    if not analysis:
        return ''

    analysis = remove_tag('<XHTML', analysis, all=False).strip()
    # Only the first label is removed; later occurrences are content.
    return analysis.replace('【解析】', '', 1).strip()
Ejemplo n.º 3
0
def get_answer_all_html(entity):
    """Return the answer section of a question entity.

    Takes the first item returned by ``get_html_element`` for
    ``<li class="Answer">`` (requested with ``with_tag=False``). When
    that item is empty, returns ``''``. Otherwise the item is passed
    through ``remove_tag('<XHTML', ..., all=False)``, the first
    occurrence of the leading label is dropped, and surrounding
    whitespace is stripped.
    """
    found = get_html_element('<li class="Answer">',
                             entity,
                             with_tag=False)
    answer = found[0]
    if not answer:
        return ''

    answer = remove_tag('<XHTML', answer, all=False).strip()
    # Only the first label is removed; later occurrences are content.
    return answer.replace('【答案】', '', 1).strip()
Ejemplo n.º 4
0
    def __init__(self, span):
        """Extract position, text and optional width from a span string.

        Sets ``self.span`` to the raw input, ``self.left`` and
        ``self.top`` to the integers captured by ``re_left`` and
        ``re_top``, ``self.text`` to ``remove_tag('<span ', span)``, and
        ``self.width`` to the integer captured by ``re_width`` or
        ``None`` when that pattern does not match.

        Raises:
            AttributeError: if ``re_left`` or ``re_top`` finds no match
                in ``span`` (``.group`` is called on ``None``).
        """
        self.span = span
        self.left = int(re_left.search(span).group(1))
        self.top = int(re_top.search(span).group(1))
        self.text = remove_tag('<span ', span)

        # Width is the only optional attribute.
        width_match = re_width.search(span)
        self.width = int(width_match.group(1)) if width_match else None
Ejemplo n.º 5
0
def format_spans(html_string):
    """Rewrite decorated ``<span>`` elements and strip the remaining spans.

    Spans whose opening tag mentions ``text-decoration`` or
    ``vertical-align`` are collected, de-duplicated and processed longest
    first. With ``inner`` being ``remove_start_tag(span)``:

    * ``text-decoration`` + ``underline``    -> ``UNDERLINE`` template
    * ``text-decoration`` + ``none``         -> ``inner``
    * ``text-decoration`` + ``line-through`` -> ``LINE_THROUGH`` template
    * ``text-decoration`` with none of these -> left untouched
    * ``vertical-align`` + ``:sub``          -> ``<sub>inner</sub>``
    * ``vertical-align`` + ``:sup``          -> ``<sup>inner</sup>``
    * any other ``vertical-align``           -> ``inner``

    Afterwards ``remove_tag('<span', ...)`` is applied repeatedly until
    ``get_html_element('<span', ...)`` finds nothing. The templates are
    inserted with their tags renamed to ``sspan`` so that this removal
    does not touch them; the rename is undone before returning.
    """

    def _shield(template):
        # Rename span -> sspan so the blanket '<span' removal skips it.
        return template.replace('<span', '<sspan')\
                       .replace('</span>', '</sspan>')

    line_through_tpl = _shield(LINE_THROUGH)
    underline_tpl = _shield(UNDERLINE)

    matched = get_html_element(
        '<span [^<>]+(text-decoration|vertical-align)',
        html_string,
        regex=True,
        flags=re.I)

    for span in sorted(set(matched), key=len, reverse=True):
        inner = remove_start_tag(span)
        # Everything before the first '>' is the opening tag.
        opening = span[:span.find('>')].lower()

        if 'text-decoration' in opening:
            if 'underline' in opening:
                replacement = underline_tpl.format(inner)
            elif 'none' in opening:
                replacement = inner
            elif 'line-through' in opening:
                replacement = line_through_tpl.format(inner)
            else:
                # Unrecognised decoration: leave the span as it is.
                continue
        elif 'vertical-align' in opening:
            if ':sub' in opening:
                replacement = '<sub>{}</sub>'.format(inner)
            elif ':sup' in opening:
                replacement = '<sup>{}</sup>'.format(inner)
            else:
                replacement = inner
        else:
            continue

        html_string = html_string.replace(span, replacement)

    # Always strip at least once, then keep going while spans remain.
    while True:
        html_string = remove_tag('<span', html_string, all=False, flags=re.I)
        if not get_html_element('<span', html_string):
            break

    # Undo the shielding applied to the inserted templates.
    return html_string.replace('<sspan', '<span')\
                      .replace('</sspan>', '</span>')
Ejemplo n.º 6
0
def _discard_mathml_displaystyle_for_subsup(mathml):
    """Drop ``<mstyle displaystyle`` wrappers found inside sub/superscripts.

    Fractions placed inside ``<msub>``/``<msup>`` do not need display
    style, so for every distinct element matched by ``<msu(b|p)``
    (longest first) ``remove_tag('<mstyle displaystyle', ..., all=False)``
    is applied and the first occurrence of the original text in
    ``mathml`` is replaced with the result.
    """
    bodies = find_valid_elements(mathml, '<msu(b|p)',
                                 regex=True, with_tag=False)
    unique_bodies = list(set(bodies))

    for body in sort_by_len(unique_bodies, reverse=True):
        cleaned = remove_tag('<mstyle displaystyle', body, all=False)
        mathml = mathml.replace(body, cleaned, 1)
    return mathml
Ejemplo n.º 7
0
 def fix_any(self, html_string):
     """Normalise an HTML fragment: paragraphs, spans, blanks, underlines.

     Steps, in order: remove newlines; rewrite ``re_p_tag`` matches to a
     bare ``<p>``; run ``handle_spans``; apply ``remove_tag('<span', ...)``;
     delete a fixed list of empty or filler fragments; replace
     ``re_nbsp`` matches with six ``&nbsp;``; replace ``re_underline``
     matches with ``UNDERLINE`` wrapping six ``&nbsp;``; and rename
     ``<sspan`` back to ``<span``.
     """
     html_string = html_string.replace('\n', '')
     html_string = re_p_tag.sub('<p>', html_string)
     html_string = handle_spans(html_string)
     html_string = remove_tag('<span', html_string)

     # Empty or filler fragments, removed in this exact order.
     for old, new in (('<p></p>', ''),
                      ('<p><br></p>', ''),
                      ('<div><br></div>', ''),
                      ('<o:p></o:p>', ''),
                      ('</p><br>', '</p>')):
         html_string = html_string.replace(old, new)

     six_nbsp = '&nbsp;' * 6
     html_string = re_nbsp.sub(six_nbsp, html_string)
     html_string = re_underline.sub(UNDERLINE.format(six_nbsp), html_string)

     # NOTE(review): only the opening '<sspan' is renamed here; presumably
     # handle_spans leaves closing tags as '</span>' -- confirm.
     return html_string.replace('<sspan', '<span')
Ejemplo n.º 8
0
def get_question_html(entity):
    """Return the question (or option) HTML contained in ``entity``.

    When ``entity`` starts with ``<li`` the ``<li class="IsTopic">``
    element is looked up; otherwise the ``<span class="optionoption">``
    element is. An empty first result yields ``''``. The content is then
    passed through ``remove_tag('<XHTML', ..., all=False)`` and, only
    when ``entity`` starts with ``<span``, through ``make_option``.
    """
    if entity.startswith('<li'):
        opening = '<li class="IsTopic">'
    else:
        opening = '<span class="optionoption">'

    question = get_html_element(opening,
                                entity,
                                with_tag=False,
                                limit=1)[0]
    if not question:
        return ''

    question = remove_tag('<XHTML', question, all=False).strip()

    # Option formatting applies to span entities only, not to every
    # non-<li> entity.
    if entity.startswith('<span'):
        question = make_option(question)

    return question.strip()
Ejemplo n.º 9
0
 def fix_any(self, html_string):
     """Run ``format_spans``, drop ``<font`` and anchor tags, then strip.

     The input goes through ``format_spans``, ``remove_tag('<font', ...)``
     and ``remove_a_tag`` in that order; the stripped result is returned.
     """
     without_fonts = remove_tag('<font', format_spans(html_string))
     return remove_a_tag(without_fonts).strip()
Ejemplo n.º 10
0
 def fix_any(html_string):
     """Apply ``remove_tag('<div', ...)`` and turn ``&nbsp;`` into spaces.

     ``remove_tag`` is called with ``flags=re.I``; every ``&nbsp;``
     entity in its result is then replaced with a plain space.
     """
     without_divs = remove_tag('<div', html_string, flags=re.I)
     return without_divs.replace('&nbsp;', ' ')
Ejemplo n.º 11
0
    def parse(self, key, qs_json, as_json, aft_subj_id):
        """Build the column dict for one question from its JSON payloads.

        Args:
            key: Passed to ``html_magic.bewitch`` as ``spider_url`` and
                stored under ``cols['spider_url']``.
            qs_json: Question payload. ``'test'`` is required; ``'diff'``,
                ``'docname'`` and ``'typesname'`` are optional.
            as_json: Answer payload. The record at
                ``as_json['data'][1][0][0]`` supplies the optional keys
                ``'answer'``, ``'analytic'``, ``'kllist'`` and
                ``'remark'``.
            aft_subj_id: Stored under ``cols['subject']``.

        Returns:
            dict: the populated columns.

        Raises:
            KeyError, IndexError: if ``qs_json['test']`` or the nested
                answer record is missing.
        """

        def clean(raw_html):
            # Shared pipeline applied to every HTML column.
            html = self.html_magic.bewitch(raw_html, spider_url=key)
            return center_image(fix_any(html))

        cols = dict()

        cols['question_html'] = clean(qs_json['test'])

        # A missing or zero 'diff' maps to difficulty 0; otherwise the
        # value is inverted on a 0-100 scale.
        # NOTE(review): int() truncates, so a float such as 0.29 yields
        # 100 - 28 rather than 100 - 29 -- confirm whether rounding is
        # wanted before changing it.
        diff = qs_json.get('diff')
        cols['difficulty'] = (100 - int(diff * 100)) if diff else 0

        paper_name = (qs_json.get('docname') or '')
        cols['paper_name'] = paper_name

        # First four-digit year-like number in the paper name. The
        # lookahead (rather than a consumed non-digit) lets a year at the
        # very end of the name match as well.
        year_match = re.search(r'([12][09][0189]\d)(?!\d)', paper_name)
        cols['exam_year'] = int(year_match.group(1)) if year_match else 0

        cols['question_type_str'] = (qs_json.get('typesname') or '')

        as_js = as_json['data'][1][0][0]
        cols['answer_all_html'] = clean(as_js.get('answer') or '')
        cols['fenxi'] = clean(as_js.get('analytic') or '')

        # 'kllist' holds '<br>'-separated paths whose levels are joined
        # by ' >> '; the last level of each path is the knowledge point.
        kpstr = remove_tag('<span', (as_js.get('kllist') or ''), all=True)
        paths = [line.split(' >> ') for line in kpstr.split('<br>')]
        cols['knowledge_point'] = ';'.join(path[-1] for path in paths)
        cols['knowledge_point_json'] = json.dumps(paths,
                                                  ensure_ascii=False)

        cols['other_info'] = clean(as_js.get('remark') or '')

        # Fixed values for this source.
        cols['spider_url'] = key
        cols['subject'] = aft_subj_id
        cols['exam_city'] = ''
        cols['paper_url'] = ''
        cols['zhuanti'] = ''
        cols['option_html'] = ''
        cols['jieda'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 52
        cols['question_type'] = 0
        cols['question_quality'] = 0

        return cols