예제 #1
0
def main():
    """Push a sample question fragment through the HTML pipeline and print it.

    The fragment is handed to ``HtmlMagic.bewitch`` together with the page
    URL and source id 8, the result is passed through ``center_image``, and
    the final string is written to stdout.
    """
    sample_markup = '''
<TBODY>
    <TR>
        <TD>若
            <IMG style="WIDTH: 18px; HEIGHT: 16px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716662863.png">=3,
            <IMG style="WIDTH: 18px; HEIGHT: 14px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716732789.png">=7,则x﹣y的值为&nbsp;&nbsp;&nbsp;&nbsp;</TD>
    </TR>
    <TR>
        <TD>
            <DIV align=right>[&nbsp;&nbsp;&nbsp;&nbsp; ]</DIV>
        </TD>
    </TR>
    <TR>
        <TD>A.±4&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>B.±10&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>C.﹣4或﹣10&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>D.±4或±10</TD>
    </TR>
</TBODY>
</TABLE>
    '''

    # Page the fragment above was taken from; forwarded as ``spider_url``.
    page_url = (
        'http://www.mofangge.com/html/qDetail/02/c1/201208/1kzkc102222121.html'
    )

    magic = HtmlMagic(8, download=True, beautify=False)
    bewitched = magic.bewitch(
        sample_markup,
        spider_url=page_url,
        spider_source=8,
    )

    print(center_image(bewitched))
예제 #2
0
 def get_jieda(self, html_string):
     """Return the cleaned content of the first ``<font color=red>`` element.

     The element content (without its own tag) is passed through
     ``self.fix_any``, ``center_image`` and ``self.html_magic.bewitch``.
     When the result ends with ``</div></p>`` the trailing ``</p>`` is
     dropped. The value is returned with surrounding whitespace stripped.

     Raises whatever indexing the helper's result raises when no element
     was found (the first match is taken unconditionally).
     """
     matches = get_html_element('<font color=red>', html_string,
                                with_tag=False, limit=1)
     answer = center_image(self.fix_any(matches[0]))
     answer = self.html_magic.bewitch(answer, spider_url=self.url)
     if answer.endswith('</div></p>'):
         # Cut only the closing ``</p>``; the ``</div>`` stays.
         answer = answer[:-len('</p>')]
     return answer.strip()
예제 #3
0
 def get_question_html(self, html_string):
     """Return the cleaned contents of every ``<div class="content">``.

     Each element content goes through ``abs_url``, ``center_image`` and
     ``self.html_magic.bewitch`` and is stripped. The second entry is
     additionally passed through ``self.fix_any`` and has carriage returns
     removed, so at least two elements are expected in ``html_string``.
     """
     contents = get_html_element('<div class="content">', html_string, with_tag=False)
     cleaned = [
         self.html_magic.bewitch(
             center_image(abs_url(content)), spider_url=self.url
         ).strip()
         for content in contents
     ]
     # Only the second block receives the extra fix-up pass.
     second = self.fix_any(cleaned[1])
     cleaned[1] = second.replace('\r', '').strip()
     return cleaned
예제 #4
0
 def get_question_html(self, html_string):
     """Return the cleaned question markup extracted from ``html_string``.

     The content of the first ``<div`` element is used when one is found;
     otherwise ``remove_start_tag(html_string)`` is used. The chosen markup
     is passed through ``self.fix_any``, ``center_image``,
     ``self.html_magic.bewitch`` and ``self.format_options`` and returned
     stripped.
     """
     divs = get_html_element('<div', html_string, with_tag=False, limit=1)
     body = divs[0] if divs else remove_start_tag(html_string)
     body = center_image(self.fix_any(body))
     body = self.html_magic.bewitch(body, spider_url=self.url)
     return self.format_options(body).strip()
예제 #5
0
    def parse(self, html_string, url, aft_subj_id):
        """Turn one crawled page into a column dict.

        Args:
            html_string: page markup to extract from.
            url: page URL; forwarded as ``spider_url`` and stored as-is.
            aft_subj_id: stored unchanged under ``'subject'``.

        Returns:
            dict of columns. The normalised question/answer/analysis markup
            is stored under the ``*_origin`` keys; the plain
            ``question_html`` / ``answer_all_html`` / ``fenxi`` keys are
            left as empty strings.
        """
        stems, answers, analyses = [], [], []

        # class marker -> (extractor, list collecting its results).
        # Dict order is the order in which markers are tested.
        handlers = {
            '"IsTopic"': (get_question_html, stems),
            '"optionoption"': (get_question_html, stems),
            '"Answer"': (get_answer_all_html, answers),
            '"Analytical"': (get_fenxi, analyses),
        }

        exam_year, paper_name = 0, ''

        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)

        topic_no = -1
        for elem in elems:
            head = elem[:30]
            marker = next((m for m in handlers if m in head), None)
            if marker is None:
                continue

            extract, bucket = handlers[marker]
            piece = extract(elem)
            is_topic = marker == '"IsTopic"'

            # Answers/analyses seen after the second topic get a "(n). " prefix.
            if topic_no > 0 and marker in ('"Answer"', '"Analytical"'):
                piece = '({}). {}'.format(topic_no, piece)

            # The very first topic also carries the exam info.
            if is_topic and topic_no == -1:
                exam_year, paper_name = get_exam_info(piece)
                piece = remove_exam_info(piece)

            bucket.append(piece)
            if is_topic:
                topic_no += 1

        def polish(parts):
            # join -> bewitch -> center_image -> fix_any -> displaystyle
            markup = self.html_magic.bewitch('<br>\n'.join(parts),
                                             spider_url=url)
            markup = fix_any(center_image(markup))
            return displaystyle(markup, latex=False, mml=True)

        question_html = polish(stems)
        answer_all_html = polish(answers)
        fenxi = polish(analyses)

        cols = {
            'difficulty': get_difficulty(html_string),
            'question_type_str': get_question_type_str(html_string),
            'question_html': '',
            'option_html': '',
            'answer_all_html': '',
            'jieda': '',
            'fenxi': '',
            'dianping': '',
            'option_html_origin': '',
            'jieda_origin': '',
            'dianping_origin': '',
            'zhuanti': '',
            'paper_name': paper_name,
            'paper_url': '',
            'spider_url': url,
            'subject': aft_subj_id,
            'spider_source': 56,
            'question_type': 0,
            'question_quality': 0,
            'knowledge_point': '',
            'exam_year': exam_year,
            'exam_city': '',
        }

        normalized = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        ).normialize()
        cols['question_html_origin'] = normalized['question_body']
        cols['answer_all_html_origin'] = normalized['answer']
        cols['fenxi_origin'] = normalized['analy']

        return cols
예제 #6
0
    def parse(self, html_string, url, aft_subj_id):
        """Turn one crawled page into a column dict, or ``False`` on failure.

        Args:
            html_string: page markup to extract from.
            url: page URL; forwarded as ``spider_url`` and to
                ``handle_mathml``, and stored as-is.
            aft_subj_id: stored unchanged under ``'subject'``.

        Returns:
            dict of columns, or ``False`` as soon as ``handle_mathml``
            returns ``False`` for the question, the answer or the analysis.
        """
        stems, answers, analyses = [], [], []

        # class marker -> (extractor, list collecting its results).
        # Dict order is the order in which markers are tested.
        handlers = {
            '"IsTopic"': (get_question_html, stems),
            '"optionoption"': (get_question_html, stems),
            '"Answer"': (get_answer_all_html, answers),
            '"Analytical"': (get_fenxi, analyses),
        }

        exam_year, paper_name = 0, ''

        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)

        topic_no = -1
        for elem in elems:
            head = elem[:30]
            marker = next((m for m in handlers if m in head), None)
            if marker is None:
                continue

            extract, bucket = handlers[marker]
            piece = extract(elem)
            is_topic = marker == '"IsTopic"'

            # Answers/analyses seen after the second topic get a "(n). " prefix.
            if topic_no > 0 and marker in ('"Answer"', '"Analytical"'):
                piece = '({}). {}'.format(topic_no, piece)

            # The very first topic also carries the exam info.
            if is_topic and topic_no == -1:
                exam_year, paper_name = get_exam_info(piece)
                piece = remove_exam_info(piece)

            bucket.append(piece)
            if is_topic:
                topic_no += 1

        def render(raw_markup):
            # bewitch -> center_image -> handle_mathml (may yield False)
            markup = self.html_magic.bewitch(raw_markup, spider_url=url)
            return handle_mathml(center_image(markup), self.uri2oss, url)

        cols = {}

        # The un-rendered question markup is kept alongside the rendered one.
        cols['question_all_html'] = '<br>\n'.join(stems)
        rendered = render(cols['question_all_html'])
        if rendered is False:
            return False
        cols['question_html'] = rendered

        for column, parts in (('answer_all_html', answers),
                              ('fenxi', analyses)):
            rendered = render('<br>\n'.join(parts))
            if rendered is False:
                return False
            cols[column] = rendered

        cols.update({
            'difficulty': get_difficulty(html_string),
            'question_type_str': get_question_type_str(html_string),
            'dianping': '',
            'zhuanti': '',
            'paper_name': paper_name,
            'paper_url': '',
            'spider_url': url,
            'subject': aft_subj_id,
            'spider_source': 56,
            'question_type': 0,
            'question_quality': 0,
            'knowledge_point': '',
            'knowledge_point_json': json.dumps([]),
            'exam_year': exam_year,
            'exam_city': '',
            'option_html': '',
        })

        return cols
예제 #7
0
    def parse(self, key, qs_json, as_json, html_id):
        """Build the column dict for one question page plus its answer JSON.

        Args:
            key: page URL; forwarded as ``spider_url`` to every ``bewitch``
                call and stored under ``'spider_url'``.
            qs_json: question page content handed to ``html_magic.bewitch``;
                the result is searched with regular expressions, so it is
                treated as markup here.
            as_json: answer payload; ``as_json['data'][1][0][0]`` must offer
                ``.get`` for ``'answer'``, ``'analytic'`` and ``'remark'``.
            html_id: stored unchanged under ``'html_id'``.

        Returns:
            dict of columns. ``knowledge_point*``, ``paper_name_abbr``,
            ``subject``, ``question_type_str`` and ``question_type`` are only
            present when the corresponding markup was found.

        Raises:
            IndexError: when the question body ``<div>`` is not in the page.
        """
        cols = dict()

        html = self.html_magic.bewitch(qs_json, spider_url=key)
        html = fix_any(html)
        html = center_image(html)

        ################################################################

        kp_blocks = re.findall(
            '<div class="answer-context f-roman">(.+?)</div>', html, re.S)
        if kp_blocks:
            kp_paths = []
            kp_names = []
            for line in kp_blocks[0].split('<br/>'):
                path = remove_tags(line).split(' >> ')
                kp_paths.append(path)
                # The leaf of each path is the knowledge point name. Blank
                # leaves (e.g. from a trailing ``<br/>``) are left out so the
                # joined string never ends in a dangling separator.
                if path[-1].strip():
                    kp_names.append(path[-1])
            # Joining avoids the old ``[:-2]`` slice, which cut off the last
            # character of the final name whenever there was no blank
            # trailing entry.
            cols['knowledge_point'] = ';'.join(kp_names)
            cols['knowledge_point_json'] = json.dumps(kp_paths,
                                                      ensure_ascii=False)

        ################################################################

        # Always a string, so the year search below cannot receive the
        # empty list ``re.findall`` returns when the span is missing.
        paper_name = ''
        paper_names = re.findall('id="docname">(.+?)</span>', html)
        if paper_names:
            paper_name = paper_names[0]
            cols['paper_name_abbr'] = paper_name
            subject = 0
            for key1, value1 in self.subject_item.items():
                if key1 in paper_name:
                    subject = value1
            cols['subject'] = subject

        ################################################################

        question_type_str = re.findall(
            '<p class="left">(.+?)</p><p class="right">', html)
        if len(question_type_str) != 0:
            question_type_str = question_type_str[0]
            cols['question_type_str'] = question_type_str
            for keys, values in self.pattern_item.items():
                if keys in question_type_str:
                    question_type_str = values
            # Anything still two or more characters long is mapped to '3'.
            if len(question_type_str) >= 2:
                question_type_str = '3'
            cols['question_type'] = question_type_str

        ################################################################

        question_html = re.findall(
            '<div class="test-item-body TD-body f-roman">(.+?)</div>', html,
            re.S)
        question_html = question_html[0].strip()

        ################################################################

        # One matched star marker counts as 20 difficulty points.
        diff = re.findall('class="staryellow">(.+?)<a', html)
        difficulty = len(diff) * 20
        cols['difficulty'] = difficulty

        ################################################################

        mod = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
        if mod:
            exam_year = mod.group(1)
        else:
            exam_year = 0
        cols['exam_year'] = int(exam_year)

        ################################################################

        as_js = as_json['data'][1][0][0]
        answer_all_html = self.html_magic.bewitch((as_js.get('answer') or ''),
                                                  spider_url=key)
        answer_all_html = fix_any(answer_all_html)
        answer_all_html = center_image(answer_all_html)

        ################################################################

        fenxi = self.html_magic.bewitch((as_js.get('analytic') or ''),
                                        spider_url=key)
        fenxi = fix_any(fenxi)
        fenxi = center_image(fenxi)

        ################################################################

        _question = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        )
        standard_question = _question.normialize()
        cols['question_html'] = standard_question['question_body']
        cols['answer_all_html'] = standard_question['answer']
        cols['fenxi'] = standard_question['analy']

        ################################################################

        other_info = (as_js.get('remark') or '')
        other_info = self.html_magic.bewitch(other_info, spider_url=key)
        other_info = fix_any(other_info)
        cols['other_info'] = center_image(other_info)

        ################################################################

        cols['spider_url'] = key
        cols['exam_city'] = ''
        cols['paper_url'] = ''
        cols['zhuanti'] = ''
        cols['option_html'] = ''
        cols['jieda'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 52
        cols['question_quality'] = 0
        cols['html_id'] = html_id

        return cols
예제 #8
0
    def parse(self, key, qs_json, as_json, aft_subj_id):
        """Build the column dict for one question from its JSON payloads.

        Args:
            key: page URL; forwarded as ``spider_url`` to every ``bewitch``
                call and stored under ``'spider_url'``.
            qs_json: question mapping; ``'test'`` is required, ``'diff'``,
                ``'docname'`` and ``'typesname'`` are read with ``.get``.
            as_json: answer payload; ``as_json['data'][1][0][0]`` must offer
                ``.get`` for ``'answer'``, ``'analytic'``, ``'kllist'`` and
                ``'remark'``.
            aft_subj_id: stored unchanged under ``'subject'``.

        Returns:
            dict of columns.
        """

        def clean(raw_markup):
            # bewitch -> fix_any -> center_image
            markup = self.html_magic.bewitch(raw_markup, spider_url=key)
            return center_image(fix_any(markup))

        cols = dict()

        cols['question_html'] = clean(qs_json['test'])

        ################################################################

        # A missing or falsy 'diff' yields difficulty 0.
        diff = qs_json.get('diff')
        cols['difficulty'] = (100 - int(diff * 100)) if diff else 0

        ################################################################

        paper_name = qs_json.get('docname') or ''
        cols['paper_name'] = paper_name

        ################################################################

        year_match = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
        cols['exam_year'] = int(year_match.group(1)) if year_match else 0

        ################################################################

        cols['question_type_str'] = qs_json.get('typesname') or ''

        ################################################################

        as_js = as_json['data'][1][0][0]
        cols['answer_all_html'] = clean(as_js.get('answer') or '')

        ################################################################

        cols['fenxi'] = clean(as_js.get('analytic') or '')

        ################################################################

        kp_markup = remove_tag('<span', as_js.get('kllist') or '', all=True)
        # Each <br>-separated line is a ' >> '-separated path; its last
        # segment is used as the knowledge point name.
        kp_paths = [line.split(' >> ') for line in kp_markup.split('<br>')]
        cols['knowledge_point'] = ';'.join(path[-1] for path in kp_paths)
        cols['knowledge_point_json'] = json.dumps(kp_paths,
                                                  ensure_ascii=False)

        ################################################################

        cols['other_info'] = clean(as_js.get('remark') or '')

        ################################################################

        cols.update({
            'spider_url': key,
            'subject': aft_subj_id,
            'exam_city': '',
            'paper_url': '',
            'zhuanti': '',
            'option_html': '',
            'jieda': '',
            'dianping': '',
            'spider_source': 52,
            'question_type': 0,
            'question_quality': 0,
        })

        return cols
예제 #9
0
 def format_html(self, html_string):
     """Return ``html_string`` after the standard clean-up chain.

     The markup is passed through ``self.fix_any``, then ``center_image``,
     then ``self.html_magic.bewitch`` with ``self.url`` as ``spider_url``.
     """
     centered = center_image(self.fix_any(html_string))
     return self.html_magic.bewitch(centered, spider_url=self.url)
예제 #10
0
    def parse(self, url, js, aft_subj_id):
        """Build the column dict for one question described by ``js``.

        Args:
            url: page URL; forwarded as ``spider_url`` to every ``bewitch``
                call and stored under ``'spider_url'``.
            js: question payload passed to the extraction helpers;
                ``js['difficulty_int']`` is read directly.
            aft_subj_id: stored unchanged under ``'subject'``.

        Returns:
            dict of columns. Each cleaned fragment is always stored under
            its ``*_origin`` key; the plain key holds the same markup only
            when it does not contain ``'afanti-latex'``, otherwise ``''``.
        """

        def clean(raw_markup):
            # fix_any -> bewitch -> center_image
            markup = fix_any(raw_markup)
            markup = self.html_magic.bewitch(markup, spider_url=url)
            return center_image(markup)

        def without_latex(markup):
            # Markup containing the 'afanti-latex' marker is withheld from
            # the plain column.
            return '' if 'afanti-latex' in markup else markup

        cols = dict()

        # Check whether this is a multi-question item (e.g. a cloze test).
        if is_multi_qs(js):
            question_html, option_html = get_multi_question(js)
        else:
            question_html, option_html = get_question(js)

        question_html = clean(question_html)
        cols['question_html_origin'] = question_html
        cols['question_html'] = without_latex(question_html)

        # Options are optional. A missing value (None or empty) becomes ''
        # so the substring test below cannot raise TypeError on None.
        option_html = clean(option_html) if option_html else ''
        cols['option_html_origin'] = option_html
        cols['option_html'] = without_latex(option_html)

        ################################################################

        answer_all_html, fenxi = get_answers(js)

        answer_all_html = clean(answer_all_html)
        cols['answer_all_html_origin'] = answer_all_html
        cols['answer_all_html'] = without_latex(answer_all_html)

        fenxi = clean(fenxi)
        cols['fenxi_origin'] = fenxi
        cols['fenxi'] = without_latex(fenxi)

        ################################################################

        cols['difficulty'] = (js['difficulty_int'] or 0)

        ################################################################

        cols['question_type_name'] = get_question_type_name(js)

        ################################################################

        cols['knowledge_point'] = ''
        cols['jieda_origin'] = ''
        cols['jieda'] = ''
        cols['exam_year'] = 0
        cols['exam_city'] = ''
        cols['spider_url'] = url
        cols['subject'] = aft_subj_id
        cols['zhuanti'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 53
        cols['question_type'] = 0
        cols['question_quality'] = 0

        return cols