def main():
    """Smoke-test the pipeline on one hard-coded mofangge.com question snippet."""
    raw = ''' <TBODY> <TR> <TD>若 <IMG style="WIDTH: 18px; HEIGHT: 16px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716662863.png">=3, <IMG style="WIDTH: 18px; HEIGHT: 14px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716732789.png">=7,则x﹣y的值为 </TD> </TR> <TR> <TD> <DIV align=right>[ ]</DIV> </TD> </TR> <TR> <TD>A.±4 <BR>B.±10 <BR>C.﹣4或﹣10 <BR>D.±4或±10</TD> </TR> </TBODY> </TABLE> '''
    magic = HtmlMagic(8, download=True, beautify=False)
    processed = magic.bewitch(
        raw,
        spider_url=
        'http://www.mofangge.com/html/qDetail/02/c1/201208/1kzkc102222121.html',
        spider_source=8,
    )
    # Normalize inline images, then dump the result for eyeballing.
    print(center_image(processed))
def get_jieda(self, html_string):
    """Extract and clean the solution ("jieda") fragment from the red-font element.

    Takes the first ``<font color=red>`` element, runs it through the
    fix/center/bewitch pipeline, and trims a dangling ``</p>`` suffix.
    """
    fragment = get_html_element('<font color=red>', html_string,
                                with_tag=False, limit=1)[0]
    fragment = self.fix_any(fragment)
    fragment = center_image(fragment)
    fragment = self.html_magic.bewitch(fragment, spider_url=self.url)
    # Drop only the trailing </p> (4 chars); the closing </div> is kept.
    if fragment.endswith('</div></p>'):
        fragment = fragment[:-len('</p>')]
    return fragment.strip()
def get_question_html(self, html_string):
    """Collect and clean every ``<div class="content">`` block of the page.

    :param html_string: raw page HTML
    :returns: list of cleaned HTML fragments, one per content div; the
        second entry (question body, when present) additionally gets
        fix_any applied and carriage returns removed.
    """
    rs = []
    cns = get_html_element('<div class="content">', html_string, with_tag=False)
    for cn in cns:
        cn = abs_url(cn)
        cn = center_image(cn)
        cn = self.html_magic.bewitch(cn, spider_url=self.url)
        rs.append(cn.strip())
    # BUG FIX: the unconditional rs[1] raised IndexError on pages with
    # fewer than two content blocks; only post-process it when present.
    if len(rs) > 1:
        rs[1] = self.fix_any(rs[1]).replace('\r', '').strip()
    return rs
def get_question_html(self, html_string):
    """Return the cleaned question body extracted from the first <div>.

    Falls back to stripping the outer start tag when no <div> is found.
    """
    matches = get_html_element('<div', html_string, with_tag=False, limit=1)
    body = matches[0] if matches else remove_start_tag(html_string)
    body = self.fix_any(body)
    body = center_image(body)
    body = self.html_magic.bewitch(body, spider_url=self.url)
    return self.format_options(body).strip()
def parse(self, html_string, url, aft_subj_id):
    """Parse a crawled question page into a dict of DB columns.

    Scans the page for <li>/<span> elements whose class marks them as
    question stem ("IsTopic"), options ("optionoption"), answer
    ("Answer") or analysis ("Analytical"), extracts each with its
    handler, cleans the joined HTML, and fills in the fixed column set.

    :param html_string: raw page HTML
    :param url: page URL, passed as spider_url for resource rewriting
    :param aft_subj_id: subject id stored in cols['subject']
    :returns: dict mapping column name -> value
    """
    cols = dict()
    exam_year = 0
    paper_name = ''
    # Accumulators for the three text sections.
    question_html_t = list()
    answer_all_html_t = list()
    fenxi_t = list()
    # Which accumulator each element class feeds (options join the question).
    cols_dict = {
        '"IsTopic"': question_html_t,
        '"optionoption"': question_html_t,
        '"Answer"': answer_all_html_t,
        '"Analytical"': fenxi_t,
    }
    # Extraction handler per element class (module-level functions).
    entities = {
        '"IsTopic"': get_question_html,
        '"optionoption"': get_question_html,
        '"Answer"': get_answer_all_html,
        '"Analytical"': get_fenxi,
    }
    elems = get_html_element(
        '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
        html_string, regex=True)
    # q counts question stems seen so far; -1 means "none yet".
    q = -1
    for elem in elems:
        for key in entities.keys():
            # The class attribute sits within the first 30 chars of the tag.
            if key in elem[:30]:
                entity = entities[key](elem)
                # From the second stem on, prefix answers/analyses with
                # the sub-question number.
                if q > 0 and key in ('"Answer"', '"Analytical"'):
                    entity = '({}). {}'.format(q, entity)
                # The very first stem carries exam year / paper name.
                if q == -1 and key == '"IsTopic"':
                    exam_year, paper_name = get_exam_info(entity)
                    entity = remove_exam_info(entity)
                cols_dict[key].append(entity)
                if key == '"IsTopic"':
                    q += 1
                break
    # Clean each joined section: bewitch -> center images -> fix_any ->
    # display-style the embedded MathML.
    question_all_html = '<br>\n'.join(question_html_t)
    question_html = self.html_magic.bewitch(question_all_html, spider_url=url)
    question_html = center_image(question_html)
    question_html = fix_any(question_html)
    question_html = displaystyle(question_html, latex=False, mml=True)
    #cols['question_html_origin'] = question_html
    answer_all_html = '<br>\n'.join(answer_all_html_t)
    answer_all_html = self.html_magic.bewitch(answer_all_html, spider_url=url)
    answer_all_html = center_image(answer_all_html)
    answer_all_html = fix_any(answer_all_html)
    answer_all_html = displaystyle(answer_all_html, latex=False, mml=True)
    #cols['answer_all_html_origin'] = answer_all_html
    fenxi = '<br>\n'.join(fenxi_t)
    fenxi = self.html_magic.bewitch(fenxi, spider_url=url)
    fenxi = center_image(fenxi)
    fenxi = fix_any(fenxi)
    fenxi = displaystyle(fenxi, latex=False, mml=True)
    #cols['fenxi_origin'] = fenxi
    cols['difficulty'] = get_difficulty(html_string)
    cols['question_type_str'] = get_question_type_str(html_string)
    # Columns intentionally left blank for this source; the *_origin
    # variants are filled from the normalized Question below.
    cols['question_html'] = ''
    cols['option_html'] = ''
    cols['answer_all_html'] = ''
    cols['jieda'] = ''
    cols['fenxi'] = ''
    cols['dianping'] = ''
    cols['option_html_origin'] = ''
    cols['jieda_origin'] = ''
    cols['dianping_origin'] = ''
    cols['zhuanti'] = ''
    cols['paper_name'] = paper_name
    cols['paper_url'] = ''
    cols['spider_url'] = url
    cols['subject'] = aft_subj_id
    cols['spider_source'] = 56
    cols['question_type'] = 0
    cols['question_quality'] = 0
    cols['knowledge_point'] = ''
    cols['exam_year'] = exam_year
    cols['exam_city'] = ''
    # Normalize the three sections via the shared Question model.
    # (normialize is the project's own spelling.)
    _question = Question(
        question_body=question_html,
        answer=answer_all_html,
        analy=fenxi,
    )
    standard_question = _question.normialize()
    cols['question_html_origin'] = standard_question['question_body']
    cols['answer_all_html_origin'] = standard_question['answer']
    cols['fenxi_origin'] = standard_question['analy']
    return cols
def parse(self, html_string, url, aft_subj_id):
    """Parse a crawled question page into a dict of DB columns.

    Same element-dispatch scheme as the sibling parser: <li>/<span>
    elements are routed by class to the question/answer/analysis
    accumulators. Each joined section is then cleaned and its MathML
    converted via handle_mathml.

    :param html_string: raw page HTML
    :param url: page URL, passed as spider_url for resource rewriting
    :param aft_subj_id: subject id stored in cols['subject']
    :returns: dict of columns, or False when any handle_mathml call fails
    """
    cols = dict()
    exam_year = 0
    paper_name = ''
    # Accumulators for the three text sections.
    question_html_t = list()
    answer_all_html_t = list()
    fenxi_t = list()
    # Which accumulator each element class feeds (options join the question).
    cols_dict = {
        '"IsTopic"': question_html_t,
        '"optionoption"': question_html_t,
        '"Answer"': answer_all_html_t,
        '"Analytical"': fenxi_t,
    }
    # Extraction handler per element class (module-level functions).
    entities = {
        '"IsTopic"': get_question_html,
        '"optionoption"': get_question_html,
        '"Answer"': get_answer_all_html,
        '"Analytical"': get_fenxi,
    }
    elems = get_html_element(
        '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
        html_string, regex=True)
    # q counts question stems seen so far; -1 means "none yet".
    q = -1
    for elem in elems:
        for key in entities.keys():
            # The class attribute sits within the first 30 chars of the tag.
            if key in elem[:30]:
                entity = entities[key](elem)
                # From the second stem on, prefix answers/analyses with
                # the sub-question number.
                if q > 0 and key in ('"Answer"', '"Analytical"'):
                    entity = '({}). {}'.format(q, entity)
                # The very first stem carries exam year / paper name.
                if q == -1 and key == '"IsTopic"':
                    exam_year, paper_name = get_exam_info(entity)
                    entity = remove_exam_info(entity)
                cols_dict[key].append(entity)
                if key == '"IsTopic"':
                    q += 1
                break
    # handle_mathml returns False on failure; in that case the whole
    # record is rejected by returning False to the caller.
    question_all_html = '<br>\n'.join(question_html_t)
    cols['question_all_html'] = question_all_html
    question_html = self.html_magic.bewitch(question_all_html, spider_url=url)
    question_html = center_image(question_html)
    question_html = handle_mathml(question_html, self.uri2oss, url)
    if question_html is False:
        return False
    cols['question_html'] = question_html
    answer_all_html = '<br>\n'.join(answer_all_html_t)
    answer_all_html = self.html_magic.bewitch(answer_all_html, spider_url=url)
    answer_all_html = center_image(answer_all_html)
    answer_all_html = handle_mathml(answer_all_html, self.uri2oss, url)
    if answer_all_html is False:
        return False
    cols['answer_all_html'] = answer_all_html
    fenxi = '<br>\n'.join(fenxi_t)
    fenxi = self.html_magic.bewitch(fenxi, spider_url=url)
    fenxi = center_image(fenxi)
    fenxi = handle_mathml(fenxi, self.uri2oss, url)
    if fenxi is False:
        return False
    cols['fenxi'] = fenxi
    cols['difficulty'] = get_difficulty(html_string)
    cols['question_type_str'] = get_question_type_str(html_string)
    # Fixed / blank columns for this source.
    cols['dianping'] = ''
    cols['zhuanti'] = ''
    cols['paper_name'] = paper_name
    cols['paper_url'] = ''
    cols['spider_url'] = url
    cols['subject'] = aft_subj_id
    cols['spider_source'] = 56
    cols['question_type'] = 0
    cols['question_quality'] = 0
    cols['knowledge_point'] = ''
    cols['knowledge_point_json'] = json.dumps([])
    cols['exam_year'] = exam_year
    cols['exam_city'] = ''
    cols['option_html'] = ''
    return cols
def parse(self, key, qs_json, as_json, html_id):
    """Parse one crawled question page (HTML + answer JSON) into DB columns.

    :param key: page URL; used as spider_url and stored in cols['spider_url']
    :param qs_json: raw question-page HTML
    :param as_json: answer payload; answer/analytic/remark live at
        as_json['data'][1][0][0]
    :param html_id: id of the stored raw-HTML row
    :returns: dict mapping column name -> value
    """
    cols = dict()
    html = self.html_magic.bewitch(qs_json, spider_url=key)
    html = fix_any(html)
    html = center_image(html)
    ################################################################
    # Knowledge points: each <br/>-separated line is an "A >> B >> C"
    # path; the flat field keeps the leaves, the JSON field the paths.
    knowledge_point = re.findall(
        '<div class="answer-context f-roman">(.+?)</div>', html, re.S)
    if len(knowledge_point) != 0:
        knowledge_point_jsons = []
        leaves = []
        for line in knowledge_point[0].split('<br/>'):
            path = remove_tags(line).split(' >> ')
            leaves.append(path[-1])
            knowledge_point_jsons.append(path)
        # BUG FIX: the old code appended ';' per leaf and then sliced
        # [:-2], which cut the last character off the final knowledge
        # point; join produces the intended ';'-separated list.
        cols['knowledge_point'] = ';'.join(leaves)
        cols['knowledge_point_json'] = json.dumps(knowledge_point_jsons,
                                                  ensure_ascii=False)
    ################################################################
    # Paper name and subject (subject inferred from name keywords).
    names = re.findall('id="docname">(.+?)</span>', html)
    # BUG FIX: paper_name used to stay a *list* when the regex missed,
    # making the exam-year re.search below raise TypeError.
    paper_name = names[0] if names else ''
    if paper_name:
        cols['paper_name_abbr'] = paper_name
        subject = 0
        for subj_key, subj_id in self.subject_item.items():
            if subj_key in paper_name:
                subject = subj_id
        cols['subject'] = subject
    ################################################################
    # Question type: map the label through pattern_item; anything left
    # unmapped (still a multi-char string) is bucketed as '3' (other).
    question_type_str = re.findall(
        '<p class="left">(.+?)</p><p class="right">', html)
    if len(question_type_str) != 0:
        question_type_str = question_type_str[0]
        cols['question_type_str'] = question_type_str
        for keys, values in self.pattern_item.items():
            if keys in question_type_str:
                question_type_str = values
        if len(question_type_str) >= 2:
            question_type_str = '3'
        cols['question_type'] = question_type_str
    ################################################################
    # Question body (assumed always present on these pages).
    question_html = re.findall(
        '<div class="test-item-body TD-body f-roman">(.+?)</div>', html, re.S)
    question_html = question_html[0].strip()
    ################################################################
    # Difficulty: number of yellow stars x 20.
    diff = re.findall('class="staryellow">(.+?)<a', html)
    cols['difficulty'] = len(diff) * 20
    ################################################################
    # Exam year from the paper name (e.g. "2019..."); 0 when absent.
    mod = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
    exam_year = mod.group(1) if mod else 0
    cols['exam_year'] = int(exam_year)
    ################################################################
    as_js = as_json['data'][1][0][0]
    answer_all_html = self.html_magic.bewitch((as_js.get('answer') or ''),
                                              spider_url=key)
    answer_all_html = fix_any(answer_all_html)
    answer_all_html = center_image(answer_all_html)
    ################################################################
    fenxi = self.html_magic.bewitch((as_js.get('analytic') or ''),
                                    spider_url=key)
    fenxi = fix_any(fenxi)
    fenxi = center_image(fenxi)
    ################################################################
    # Normalize body/answer/analysis via the shared Question model.
    # (normialize is the project's own spelling.)
    _question = Question(
        question_body=question_html,
        answer=answer_all_html,
        analy=fenxi,
    )
    standard_question = _question.normialize()
    cols['question_html'] = standard_question['question_body']
    cols['answer_all_html'] = standard_question['answer']
    cols['fenxi'] = standard_question['analy']
    ################################################################
    other_info = (as_js.get('remark') or '')
    other_info = self.html_magic.bewitch(other_info, spider_url=key)
    other_info = fix_any(other_info)
    cols['other_info'] = center_image(other_info)
    ################################################################
    cols['spider_url'] = key
    cols['exam_city'] = ''
    cols['paper_url'] = ''
    cols['zhuanti'] = ''
    cols['option_html'] = ''
    cols['jieda'] = ''
    cols['dianping'] = ''
    cols['spider_source'] = 52
    cols['question_quality'] = 0
    cols['html_id'] = html_id
    return cols
def parse(self, key, qs_json, as_json, aft_subj_id):
    """Parse one question (JSON payloads) into a dict of DB columns.

    :param key: page URL; used as spider_url and stored in cols['spider_url']
    :param qs_json: question payload with 'test' (HTML), 'diff',
        'docname', 'typesname' fields
    :param as_json: answer payload; answer/analytic/kllist/remark live
        at as_json['data'][1][0][0]
    :param aft_subj_id: subject id stored in cols['subject']
    :returns: dict mapping column name -> value
    """
    cols = dict()
    question_html = qs_json['test']
    question_html = self.html_magic.bewitch(question_html, spider_url=key)
    question_html = fix_any(question_html)
    cols['question_html'] = center_image(question_html)
    ################################################################
    # 'diff' is a 0..1 ease score; map to 0..100 difficulty.
    if not qs_json.get('diff'):
        difficulty = 0
    else:
        difficulty = (100 - int(qs_json.get('diff', 0) * 100))
    cols['difficulty'] = difficulty
    ################################################################
    paper_name = (qs_json.get('docname') or '')
    cols['paper_name'] = paper_name
    ################################################################
    # Exam year from the paper name (e.g. "2019..."); 0 when absent.
    mod = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
    if mod:
        exam_year = mod.group(1)
    else:
        exam_year = 0
    cols['exam_year'] = int(exam_year)
    ################################################################
    cols['question_type_str'] = (qs_json.get('typesname') or '')
    ################################################################
    as_js = as_json['data'][1][0][0]
    answer_all_html = self.html_magic.bewitch((as_js.get('answer') or ''),
                                              spider_url=key)
    answer_all_html = fix_any(answer_all_html)
    cols['answer_all_html'] = center_image(answer_all_html)
    ################################################################
    fenxi = self.html_magic.bewitch((as_js.get('analytic') or ''),
                                    spider_url=key)
    fenxi = fix_any(fenxi)
    cols['fenxi'] = center_image(fenxi)
    ################################################################
    # Knowledge points: 'kllist' holds <br>-separated "A >> B >> C"
    # paths; keep the leaf per path for the flat field and the full
    # path for the JSON field.
    knowledge_point_json = list()
    knowledge_point = list()
    kpstr = (as_js.get('kllist') or '')
    kpstr = remove_tag('<span', kpstr, all=True)
    kpl = kpstr.split('<br>')
    for kps in kpl:
        kps = kps.split(' >> ')
        knowledge_point.append(kps[-1])
        knowledge_point_json.append(kps)
    knowledge_point = ';'.join(knowledge_point)
    knowledge_point_json = json.dumps(knowledge_point_json,
                                      ensure_ascii=False)
    cols['knowledge_point'] = knowledge_point
    cols['knowledge_point_json'] = knowledge_point_json
    ################################################################
    other_info = (as_js.get('remark') or '')
    other_info = self.html_magic.bewitch(other_info, spider_url=key)
    other_info = fix_any(other_info)
    cols['other_info'] = center_image(other_info)
    ################################################################
    # Fixed / blank columns for this source.
    cols['spider_url'] = key
    cols['subject'] = aft_subj_id
    cols['exam_city'] = ''
    cols['paper_url'] = ''
    cols['zhuanti'] = ''
    cols['option_html'] = ''
    cols['jieda'] = ''
    cols['dianping'] = ''
    cols['spider_source'] = 52
    cols['question_type'] = 0
    cols['question_quality'] = 0
    return cols
def format_html(self, html_string):
    """Run a fragment through the standard cleanup pipeline.

    Order matters: fix_any, then center_image, then bewitch with this
    parser's URL as spider_url.
    """
    cleaned = self.fix_any(html_string)
    cleaned = center_image(cleaned)
    return self.html_magic.bewitch(cleaned, spider_url=self.url)
def parse(self, url, js, aft_subj_id):
    """Parse one question JSON record into a dict of DB columns.

    The raw (cleaned) HTML of each section is always stored in its
    ``*_origin`` column; the plain column is only published when the
    fragment contains no un-rendered 'afanti-latex' placeholder.

    :param url: page URL; used as spider_url and stored in cols['spider_url']
    :param js: question payload (body, options, answers, difficulty_int)
    :param aft_subj_id: subject id stored in cols['subject']
    :returns: dict mapping column name -> value
    """
    def _clean(fragment):
        # Shared cleanup pipeline: fix_any -> bewitch -> center_image.
        fragment = fix_any(fragment)
        fragment = self.html_magic.bewitch(fragment, spider_url=url)
        return center_image(fragment)

    def _store(field, fragment):
        # Keep the raw version; publish the plain column only when no
        # 'afanti-latex' placeholder remains.
        cols[field + '_origin'] = fragment
        cols[field] = fragment if 'afanti-latex' not in fragment else ''

    cols = dict()
    # Multi-part questions (e.g. cloze tests) are extracted differently.
    if is_multi_qs(js):
        question_html, option_html = get_multi_question(js)
    else:
        question_html, option_html = get_question(js)
    _store('question_html', _clean(question_html))
    # Option columns are only written when options exist at all.
    if option_html:
        _store('option_html', _clean(option_html))
    ################################################################
    answer_all_html, fenxi = get_answers(js)
    _store('answer_all_html', _clean(answer_all_html))
    _store('fenxi', _clean(fenxi))
    ################################################################
    cols['difficulty'] = (js['difficulty_int'] or 0)
    ################################################################
    cols['question_type_name'] = get_question_type_name(js)
    ################################################################
    # Fixed / blank columns for this source.
    cols['knowledge_point'] = ''
    cols['jieda_origin'] = ''
    cols['jieda'] = ''
    cols['exam_year'] = 0
    cols['exam_city'] = ''
    cols['spider_url'] = url
    cols['subject'] = aft_subj_id
    cols['zhuanti'] = ''
    cols['dianping'] = ''
    cols['spider_source'] = 53
    cols['question_type'] = 0
    cols['question_quality'] = 0
    return cols