def main():
    """Demo entry point: push one sample mofangge question fragment through
    HtmlMagic (spider source 8), center its images, and print the result.

    NOTE(review): source was whitespace-collapsed; the HTML literal below is
    kept byte-identical to what is on disk.
    """
    html_string = ''' <TBODY> <TR> <TD>若 <IMG style="WIDTH: 18px; HEIGHT: 16px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716662863.png">=3, <IMG style="WIDTH: 18px; HEIGHT: 14px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716732789.png">=7,则x﹣y的值为 </TD> </TR> <TR> <TD> <DIV align=right>[ ]</DIV> </TD> </TR> <TR> <TD>A.±4 <BR>B.±10 <BR>C.﹣4或﹣10 <BR>D.±4或±10</TD> </TR> </TBODY> </TABLE> '''
    # download=True: fetch and re-host every <img> referenced by the fragment.
    html_magic = HtmlMagic(8, download=True, beautify=False)
    html_string = html_magic.bewitch(
        html_string,
        spider_url=
        'http://www.mofangge.com/html/qDetail/02/c1/201208/1kzkc102222121.html',
        spider_source=8,
    )
    # Wrap/center <img> tags for display.
    html_string = center_image(html_string)
    print(html_string)
def __init__(self, archive_image=False, download=False):
    """Create the shared img formatting / re-hosting helper.

    74 is this parser's fixed spider_source id.
    """
    self.html_magic = HtmlMagic(
        74,
        archive_image=archive_image,
        download=download,
        beautify=False,
    )
def __init__(self, archive_image=False, download=False):
    """Set up the HtmlMagic helper (spider source 52) plus the lookup
    tables mapping Chinese subject / pattern names to internal ids."""
    self.html_magic = HtmlMagic(
        52,
        beautify=False,
        archive_image=archive_image,
        download=download,
    )
    # Subject name -> internal subject id (kept as strings).
    self.subject_item = {
        '语文': '1',
        '数学': '2',
        '英语': '3',
        '科学': '4',
        '物理': '5',
        '化学': '6',
        '地理': '7',
        '历史': '8',
        '生物': '9',
        '政治': '10',
    }
    # Question-pattern keyword -> question_type id.
    self.pattern_item = {'单选': '1', '填空': '2', '多选': '4', '选择': '1'}
class ImageCover(object):
    """Re-archive <img> resources for AnoahQuestion rows whose HTML fields
    still point at hosts other than the afanti image host."""

    NAME = "image_cover"
    model = AnoahQuestion
    # HTML columns that may contain images needing re-hosting.
    fields = [
        'fenxi', 'option_html', 'question_html_origin', 'option_html_origin',
        'fenxi_origin', 'answer_all_html_origin', 'answer_all_html'
    ]

    def set_magic(self):
        """Lazily build the HtmlMagic instance (spider source 75)."""
        from afanti_tiku_lib.html.magic import HtmlMagic
        self.html_magic = HtmlMagic(75, archive_image=True, download=True)

    def get_objects_id(self):
        """Return every question_id in the table as a flat list."""
        rows = self.model.objects.all().values_list('question_id')
        return [row[0] for row in rows]

    def has_cover(self, html):
        """True when the html already references the afanti image host."""
        return 'http://qimg.afanti100.com/data' in html

    def is_image_in(self, question):
        """Return a predicate telling whether a field still needs work."""
        def in_question(field):
            print(question.question_id)
            html = getattr(question, field)
            if not html:
                return False
            if self.has_cover(html):
                return False
            return '<img' in html
        return in_question

    def bewitch_html(self, question):
        """Return a callable that rewrites one field of `question` in place."""
        def bewitch_question(field):
            new_html = self.html_magic.bewitch(getattr(question, field),
                                               spider_url=question.spider_url)
            setattr(question, field, new_html)
        return bewitch_question

    def run_parser(self, _id):
        """Rewrite every image-bearing field of one question and save it."""
        q = self.model.objects.get(question_id=_id)
        rewrite = self.bewitch_html(q)
        needs_work = self.is_image_in(q)
        touched = [rewrite(field) for field in self.fields
                   if needs_work(field)]
        if touched:
            q.save()

    def start(self):
        self.run()

    def run(self):
        self.set_magic()
        for _id in self.get_objects_id():
            self.run_parser(_id)
def parse_detail(row):
    """Parse one crawled question row (dict) into a normalized result dict.

    Cleans topic/answer/analy HTML (href removal, tag stripping, image
    re-hosting), maps the Chinese pattern name to a question_type id, and
    copies the remaining columns through ``mapping_dict``.
    """
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    spider_source = int(row['spider_source'])
    spider_url = row['spider_url']
    image_parse = HtmlMagic(spider_source=spider_source,
                            download=True,
                            archive_image=False)
    result1 = {}
    pattern = row['pattern']
    result1['question_type_name'] = pattern
    for key, value in pattern_item.items():
        if key in pattern:
            pattern = value
    # Anything that did not map to a single-char id is bucketed as "other" (3).
    if len(pattern) >= 2:
        pattern = '3'
    result1['question_type'] = pattern
    topic = row['topic']
    topic = replace_href(topic)
    topic = remove_tags(text=topic, which_ones=('h1', 'div'))
    topic = image_parse.bewitch(html_string=topic,
                                spider_url=spider_url,
                                spider_source=spider_source)
    answer = row['answer']
    answer = replace_href(answer)
    answer = image_parse.bewitch(html_string=answer,
                                 spider_url=spider_url,
                                 spider_source=spider_source)
    analy = row['analy']
    analy = replace_href(analy)
    analy = image_parse.bewitch(html_string=analy,
                                spider_url=spider_url,
                                spider_source=spider_source)
    # Normalize body/answer/analysis styling through the Question model.
    _question = Question(
        question_body=topic,
        answer=answer,
        analy=analy,
    )
    standard_question = _question.normialize()
    result1['question_body'] = standard_question['question_body']
    result1['answer'] = standard_question['answer']
    result1['analy'] = standard_question['analy']
    source_shijuan = row['source_shijuan']
    source_shijuan = re.findall('<span class="colf43">来源:(.+?)</span>',
                                source_shijuan)
    if len(source_shijuan) != 0:
        result1['paper_name'] = source_shijuan[0]
    else:
        result1['paper_name'] = ''
    # result key -> row key
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source',
        # BUG FIX: was 'spdier_source' (typo). The row key is
        # 'spider_source' (read at the top of this function), so the
        # result field always came back empty.
        'spider_source': 'spider_source'
    }
    result2 = {key: row.get(value, '') for key, value in mapping_dict.items()}
    result = dict(result1, **result2)
    return result
class Manfen5ZujuanParser(object):
    """Parse a manfen5.com zujuan question page (spider source 80) into the
    column dict used by the question pipeline."""

    def __init__(self, archive_image=False, download=False):
        # img formatting / re-hosting helper
        self.html_magic = HtmlMagic(80, # XXX, spider_source
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)

    def parse(self, html_string, url, info):
        """Extract question body, answer (jieda), knowledge points and the
        question type name, then return the cols dict.

        NOTE(review): assumes at least five valid <td> cells in a fixed
        layout (0: type, 2: kps, 3: question, 4: answer) -- confirm upstream.
        """
        self.url = url
        cols = dict()
        tds = find_valid_elements(html_string, '<td')
        question_html = self.get_question_html(tds[3])
        jieda = self.get_jieda(tds[4])
        kps = self.get_kps(tds[2])
        question_type_name = self.get_question_type_name(tds[0])
        # format question object
        _question = Question(question_body = question_html, jieda = jieda)
        # unity question style
        unity_question = _question.normialize()
        cols['question_html'] = unity_question['question_body']
        cols['jieda'] = unity_question['jieda']
        cols['knowledge_point'] = kps
        cols['question_type_name'] = question_type_name
        cols['subject'] = self.get_subject(info)
        # Fields not present on this site get neutral defaults.
        cols['fenxi'] = ''
        cols['dianping'] = ''
        cols['answer_all_html'] = ''
        cols['option_html'] = ''
        cols['difficulty'] = 0
        cols['zhuanti'] = ''
        cols['spider_url'] = url
        cols['spider_source'] = 80
        cols['question_type'] = 0
        cols['question_quality'] = 0
        cols['exam_year'] = 0
        cols['exam_city'] = ''
        return cols

    def get_question_html(self, html_string):
        """Clean and rewrite the question cell; falls back to stripping the
        leading tag when no inner <div> is found."""
        e = get_html_element('<div', html_string, with_tag=False, limit=1)
        if e:
            e = e[0]
        else:
            e = remove_start_tag(html_string)
        e = self.fix_any(e)
        e = center_image(e)
        e = self.html_magic.bewitch(e, spider_url=self.url)
        e = self.format_options(e)
        return e.strip()

    def get_jieda(self, html_string):
        """Extract and clean the red-font answer block."""
        e = get_html_element('<font color=red>', html_string, with_tag=False,
                             limit=1)[0]
        e = self.fix_any(e)
        e = center_image(e)
        e = self.html_magic.bewitch(e, spider_url=self.url)
        # NOTE(review): drops only the trailing '</p>' (4 chars), leaving the
        # '</div>' in place -- presumably intended; confirm.
        if e.endswith('</div></p>'):
            e = e[:-4]
        return e.strip()

    def get_kps(self, html_string):
        # Knowledge points are comma separated on the page; the pipeline
        # uses ';' as separator.
        e = get_html_element('<b', html_string, with_tag=False,
                             limit=1)[0].replace(',', ';')
        return e.strip()

    def get_question_type_name(self, html_string):
        e = get_html_element('题型:<b>', html_string, with_tag=False,
                             limit=1)[0]
        return e.strip()

    def get_subject(self, info):
        # SUBJS maps the site's subject slug to the internal subject id;
        # returns None for unknown slugs.
        return SUBJS.get(info['subj'])

    def fix_any(self, html_string):
        """Common cleanup: normalise spans, drop <font> and <a> tags."""
        html_string = format_spans(html_string)
        html_string = remove_tag('<font', html_string)
        html_string = remove_a_tag(html_string)
        return html_string.strip()

    def format_options(self, html_string):
        # Put each option marker on its own line.
        html_string = re_opts.sub(r' <br>\2 ', html_string)
        return html_string
def set_magic(self):
    """Lazily create the HtmlMagic helper used to re-host images
    (spider source 75, download + archive enabled)."""
    from afanti_tiku_lib.html.magic import HtmlMagic
    magic = HtmlMagic(75, download=True, archive_image=True)
    self.html_magic = magic
class VkoParser(object):
    """Parse a vko question JSON object (spider source 74) into pipeline
    columns."""

    def __init__(self, archive_image=False, download=False):
        # img formatting / re-hosting helper
        self.html_magic = HtmlMagic(
            74, # XXX, spider_source
            archive_image=archive_image,
            download=download,
            beautify=False)

    def parse(self, js, url):
        """Build the cols dict from the question JSON ``js``.

        Side effects: get_question_html() fills self._paper / self._year
        from the paper banner embedded in the question body.
        """
        self.url = url
        self._paper = ''
        self._year = 0
        cols = dict()
        question_html = self.get_question_html(js)
        cols['question_html'] = question_html
        answer_all_html = self.get_answer_all_html(js)
        cols['answer_all_html'] = answer_all_html
        jieda = self.get_jieda(js)
        cols['jieda'] = jieda
        # Fields not provided by this source default to empty/zero.
        cols['option_html'] = ''
        cols['fenxi'] = ''
        cols['dianping'] = ''
        cols['paper_name'] = self._paper
        cols['difficulty'] = 0
        cols['zhuanti'] = ''
        cols['spider_url'] = url
        cols['subject'] = 0
        cols['spider_source'] = 74
        cols['question_type'] = 0
        cols['question_quality'] = 0
        cols['knowledge_point'] = ''
        cols['exam_year'] = self._year
        cols['exam_city'] = ''
        return cols

    def get_question_html(self, js):
        """Format the body; also captures the paper name / exam year banner
        and strips it from the html."""
        html_string = js['content']
        html_string = self.format_html(html_string)
        mod = re_paper.search(html_string)
        if mod:
            self._paper = mod.group(1)
            html_string = re_paper.sub('<p>', html_string)
            mod = re_year.search(self._paper)
            if mod:
                self._year = int(mod.group(1))
        return html_string

    def get_answer_all_html(self, js):
        html_string = js.get('answer') or ''
        html_string = self.format_html(html_string)
        return html_string

    def get_jieda(self, js):
        """Join all examsResolve entries into one <br>-separated block."""
        if not js.get('examsResolve'):
            return ''
        jiedas = []
        for er in js['examsResolve']:
            jiedas.append(er['content'])
        html_string = '<br>'.join(jiedas)
        html_string = self.format_html(html_string)
        return html_string

    def format_html(self, html_string):
        """Common pipeline: cleanup, center images, rewrite image urls."""
        html_string = self.fix_any(html_string)
        html_string = center_image(html_string)
        html_string = self.html_magic.bewitch(html_string,
                                              spider_url=self.url)
        return html_string

    def fix_any(self, html_string):
        """Flatten and de-noise the raw html fragment."""
        html_string = html_string.replace('\n', '')
        html_string = re_p_tag.sub('<p>', html_string)
        html_string = handle_spans(html_string)
        html_string = remove_tag('<span', html_string)
        # Drop empty paragraphs and stray break markup.
        html_string = html_string.replace('<p></p>', '')\
                                 .replace('<p><br></p>', '')\
                                 .replace('<div><br></div>', '')\
                                 .replace('<o:p></o:p>', '')\
                                 .replace('</p><br>', '</p>')
        html_string = re_nbsp.sub(' ' * 6, html_string)
        html_string = re_underline.sub(UNDERLINE.format(' ' * 6), html_string)
        # NOTE(review): '<sspan' looks like an artefact introduced upstream
        # (handle_spans?) -- confirm before removing this repair.
        html_string = html_string.replace('<sspan', '<span')
        return html_string
def __init__(self, archive_image=False, download=False):
    """Build the shared HtmlMagic helper for spider source 56."""
    self.html_magic = HtmlMagic(
        56,
        beautify=False,
        download=download,
        archive_image=archive_image,
    )
def tableToJson(table):
    """Load up to 320000 rows from ``table`` in the html_archive DB and
    convert each crawled question row into a JSON-ready dict.

    Returns a list of dicts (one per row).
    """
    config = json.load(open(CONFIG_FILE))
    conn = pymysql.connect(host=config['host'],
                           user=config['user'],
                           passwd=config['password'],
                           db='html_archive',
                           port=3306,
                           charset="utf8",
                           use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        # table comes from trusted code; identifiers cannot be parameterized.
        sql = 'select * from %s limit 320000' % table
        cur.execute(sql)
        data = cur.fetchall()
        cur.close()
    finally:
        conn.close()  # BUG FIX: the connection was previously never closed
    jsonData = []
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    for row in data:
        spider_source = int(row['spider_source'])
        image_parse = HtmlMagic(spider_source=spider_source,
                                download=True,
                                archive_image=False)
        result1 = {}
        spider_url = row['spider_url']
        result1['spider_url'] = spider_url
        # NOTE(review): raises IndexError when the url does not contain
        # 'shiti/<id>.html' -- confirm all rows match.
        question_id = re.findall('shiti/(.+).html', spider_url)
        result1['question_id'] = question_id[0]
        pattern = row['pattern']
        result1['question_type_name'] = pattern
        for key, value in pattern_item.items():
            if key in pattern:
                pattern = value
        # Unmapped multi-char patterns are bucketed as "other" (3).
        if len(pattern) >= 2:
            pattern = '3'
        result1['question_type'] = pattern
        topic = row['topic']
        topic = replace_href(topic)
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        topic = image_parse.bewitch(html_string=topic,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['question_body'] = topic
        answer = row['answer']
        answer = replace_href(answer)
        answer = image_parse.bewitch(html_string=answer,
                                     spider_url=spider_url,
                                     spider_source=spider_source)
        result1['answer'] = answer
        analy = row['analy']
        analy = replace_href(analy)
        analy = image_parse.bewitch(html_string=analy,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['analy'] = analy
        source_shijuan = row['source_shijuan']
        source_shijuan = re.findall('<span class="colf43">来源:(.+?)</span>',
                                    source_shijuan)
        if len(source_shijuan) != 0:
            result1['paper_name'] = source_shijuan[0]
        else:
            # ROBUSTNESS: always emit paper_name (the sibling parse_detail
            # helper does the same).
            result1['paper_name'] = ''
        # result key -> row key
        mapping_dict = {
            # NOTE(review): output key misspelled ('spider_sorce'); kept
            # unchanged for downstream compatibility -- confirm before fixing.
            'spider_sorce': 'spider_source',
            'subject': 'subject',
            'knowledge_point': 'kaodian',
            'difficulty': 'difficulty',
            'book': 'book',
            'version': 'version',
            'source': 'spider_source'
        }
        result2 = {
            key: row.get(value, '')
            for key, value in mapping_dict.items()
        }
        result = dict(result1, **result2)
        jsonData.append(result)
    return jsonData
class Dz101QuestionParser(object):
    """Parse a 101.dz question page (spider source 56) into pipeline cols.

    The page marks entities with css classes: IsTopic (stem/sub-question),
    optionoption (choices), Answer, Analytical.
    """

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(56,
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)
        # uri2oss re-hosts generated MathML images onto OSS.
        self.uri2oss = self.html_magic.image_magic.uri2oss

    def parse(self, html_string, url, aft_subj_id):
        """Return the cols dict, or False when MathML conversion fails."""
        cols = dict()
        exam_year = 0
        paper_name = ''
        question_html_t = list()
        answer_all_html_t = list()
        fenxi_t = list()
        # css-class marker -> accumulator for that entity kind
        cols_dict = {
            '"IsTopic"': question_html_t,
            '"optionoption"': question_html_t,
            '"Answer"': answer_all_html_t,
            '"Analytical"': fenxi_t,
        }
        # css-class marker -> extraction helper
        entities = {
            '"IsTopic"': get_question_html,
            '"optionoption"': get_question_html,
            '"Answer"': get_answer_all_html,
            '"Analytical"': get_fenxi,
        }
        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)
        # q counts sub-questions: -1 until the first IsTopic (which carries
        # the exam banner); afterwards answers/analyses get a "(q)." prefix.
        q = -1
        for elem in elems:
            for key in entities.keys():
                if key in elem[:30]:
                    entity = entities[key](elem)
                    if q > 0 and key in ('"Answer"', '"Analytical"'):
                        entity = '({}). {}'.format(q, entity)
                    if q == -1 and key == '"IsTopic"':
                        exam_year, paper_name = get_exam_info(entity)
                        entity = remove_exam_info(entity)
                    cols_dict[key].append(entity)
                    if key == '"IsTopic"':
                        q += 1
                    break
        question_all_html = '<br>\n'.join(question_html_t)
        cols['question_all_html'] = question_all_html
        question_html = self.html_magic.bewitch(question_all_html,
                                                spider_url=url)
        question_html = center_image(question_html)
        question_html = handle_mathml(question_html, self.uri2oss, url)
        # handle_mathml signals failure with False; abort the whole parse.
        if question_html is False:
            return False
        cols['question_html'] = question_html
        answer_all_html = '<br>\n'.join(answer_all_html_t)
        answer_all_html = self.html_magic.bewitch(answer_all_html,
                                                  spider_url=url)
        answer_all_html = center_image(answer_all_html)
        answer_all_html = handle_mathml(answer_all_html, self.uri2oss, url)
        if answer_all_html is False:
            return False
        cols['answer_all_html'] = answer_all_html
        fenxi = '<br>\n'.join(fenxi_t)
        fenxi = self.html_magic.bewitch(fenxi, spider_url=url)
        fenxi = center_image(fenxi)
        fenxi = handle_mathml(fenxi, self.uri2oss, url)
        if fenxi is False:
            return False
        cols['fenxi'] = fenxi
        cols['difficulty'] = get_difficulty(html_string)
        cols['question_type_str'] = get_question_type_str(html_string)
        # Remaining columns have no data on this site.
        cols['dianping'] = ''
        cols['zhuanti'] = ''
        cols['paper_name'] = paper_name
        cols['paper_url'] = ''
        cols['spider_url'] = url
        cols['subject'] = aft_subj_id
        cols['spider_source'] = 56
        cols['question_type'] = 0
        cols['question_quality'] = 0
        cols['knowledge_point'] = ''
        cols['knowledge_point_json'] = json.dumps([])
        cols['exam_year'] = exam_year
        cols['exam_city'] = ''
        cols['option_html'] = ''
        return cols
class Wln100QuestionParser(object):
    """Parse a wln100 question page (spider source 52) into pipeline cols.

    This variant receives the question as raw html (qs_json) plus a
    separate answer JSON (as_json).
    """

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(52,
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)
        # Subject name -> internal subject id.
        self.subject_item = {
            '语文': '1',
            '数学': '2',
            '英语': '3',
            '科学': '4',
            '物理': '5',
            '化学': '6',
            '地理': '7',
            '历史': '8',
            '生物': '9',
            '政治': '10'
        }
        # Question-pattern keyword -> question_type id.
        self.pattern_item = {'单选': '1', '填空': '2', '多选': '4', '选择': '1'}

    def parse(self, key, qs_json, as_json, html_id):
        """Build the cols dict.

        ``key`` is the spider url; ``qs_json`` is (despite the name) the
        question page html; ``as_json`` is the answer JSON.
        """
        cols = dict()
        html = self.html_magic.bewitch(qs_json, spider_url=key)
        html = fix_any(html)
        html = center_image(html)
        ################################################################
        # Knowledge points: "a >> b >> c<br/>..." -- keep the leaf names
        # joined by ';' plus the full path list as JSON.
        knowledge_point = re.findall(
            '<div class="answer-context f-roman">(.+?)</div>', html, re.S)
        if len(knowledge_point) != 0:
            knowledge_point_jsons = []
            knowledge_points = ''
            knowledge_point = knowledge_point[0]
            knowledge_point_json = knowledge_point.split('<br/>')
            for i in knowledge_point_json:
                knowledge_points += remove_tags(i).split(' >> ')[-1] + ';'
                node_i = remove_tags(i).split(' >> ')
                #node_i = json.dumps(node_i, ensure_ascii=False)
                knowledge_point_jsons.append(node_i)
            knowledge_point_jsons = json.dumps(knowledge_point_jsons,
                                               ensure_ascii=False)
            # NOTE(review): [:-2] trims the trailing ';' plus the last
            # character of the final knowledge point; [:-1] looks intended
            # (the sibling parser uses ';'.join) -- confirm before changing.
            cols['knowledge_point'] = knowledge_points[:-2]
            cols['knowledge_point_json'] = knowledge_point_jsons
        ################################################################
        paper_name = re.findall('id="docname">(.+?)</span>', html)
        if len(paper_name) != 0:
            paper_name = paper_name[0]
            cols['paper_name_abbr'] = paper_name
            subject = 0
            for key1, value1 in self.subject_item.items():
                if key1 in paper_name:
                    subject = value1
            cols['subject'] = subject
        ################################################################
        question_type_str = re.findall(
            '<p class="left">(.+?)</p><p class="right">', html)
        if len(question_type_str) != 0:
            question_type_str = question_type_str[0]
            cols['question_type_str'] = question_type_str
            for keys, values in self.pattern_item.items():
                if keys in question_type_str:
                    question_type_str = values
            # Unmapped multi-char names are bucketed as "other" (3).
            if len(question_type_str) >= 2:
                question_type_str = '3'
            cols['question_type'] = question_type_str
        ################################################################
        question_html = re.findall(
            '<div class="test-item-body TD-body f-roman">(.+?)</div>', html,
            re.S)
        question_html = question_html[0].strip()
        #cols['question_html'] = question_html
        ################################################################
        # One yellow star == 20 difficulty points.
        diff = re.findall('class="staryellow">(.+?)<a', html)
        difficulty = len(diff) * 20
        cols['difficulty'] = difficulty
        ################################################################
        # Pull a 4-digit year out of the paper name.
        # NOTE(review): when no docname was found above, paper_name is still
        # a list here and re.search would raise TypeError -- confirm that
        # docname is always present.
        mod = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
        if mod:
            exam_year = mod.group(1)
        else:
            exam_year = 0
        cols['exam_year'] = int(exam_year)
        ################################################################
        as_js = as_json['data'][1][0][0]
        answer_all_html = self.html_magic.bewitch((as_js.get('answer') or ''),
                                                  spider_url=key)
        answer_all_html = fix_any(answer_all_html)
        answer_all_html = center_image(answer_all_html)
        #cols['answer_all_html'] = center_image(answer_all_html)
        ################################################################
        fenxi = self.html_magic.bewitch((as_js.get('analytic') or ''),
                                        spider_url=key)
        fenxi = fix_any(fenxi)
        fenxi = center_image(fenxi)
        #cols['fenxi'] = fenxi
        ################################################################
        # Normalize body/answer/analysis styling through the Question model.
        _question = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        )
        standard_question = _question.normialize()
        cols['question_html'] = standard_question['question_body']
        cols['answer_all_html'] = standard_question['answer']
        cols['fenxi'] = standard_question['analy']
        ################################################################
        other_info = (as_js.get('remark') or '')
        other_info = self.html_magic.bewitch(other_info, spider_url=key)
        other_info = fix_any(other_info)
        cols['other_info'] = center_image(other_info)
        ################################################################
        cols['spider_url'] = key
        cols['exam_city'] = ''
        cols['paper_url'] = ''
        cols['zhuanti'] = ''
        cols['option_html'] = ''
        cols['jieda'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 52
        cols['question_quality'] = 0
        cols['html_id'] = html_id
        return cols
def tableToJson(table):
    """Load rows (offset 100000, limit 100000) from ``table`` in the
    html_archive DB and convert each stored html blob into a result dict.

    Rows whose html blob cannot be parsed are now skipped; previously a
    swallowed parse failure left ``html_content`` unbound and the very next
    line crashed the whole run with a NameError.
    """
    config = json.load(open(CONFIG_FILE))
    conn = pymysql.connect(host=config['host'],
                           user=config['user'],
                           passwd=config['password'],
                           db='html_archive',
                           port=3306,
                           charset="utf8",
                           use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        sql = 'select * from %s limit 100000,100000' % table
        cur.execute(sql)
        data = cur.fetchall()
        cur.close()
    finally:
        conn.close()  # BUG FIX: the connection was previously never closed
    jsonData = []
    for row in data:
        image_parse = HtmlMagic(75, download=True, archive_image=False)
        result = {}
        result['question_id'] = row['source_id']
        # NOTE(review): output key misspelled ('spider_sorce'); kept
        # unchanged for downstream compatibility -- confirm before fixing.
        result['spider_sorce'] = 75
        result['spider_url'] = row['key2']
        result['subject'] = row['subject']
        result['question_type'] = row['question_type']
        # The stored blob is JS-ish ("aorder":false etc.), so eval() needs
        # these aliases in scope.
        false = False
        true = True
        null = None
        html_content = None
        try:
            if isinstance(row['html'], str):
                html_contents = row['html']
                html_contents = remove_biaoqian(html_contents)
                # SECURITY: eval() on DB content -- acceptable only because
                # the archive is written by our own spiders; prefer
                # json/ast.literal_eval if the format ever allows it.
                html_contents = eval(html_contents)
                if isinstance(html_contents, bytes):
                    html_contents = html_contents.decode()
                html_contents = image_parse.bewitch(html_string=html_contents,
                                                    spider_url=row['key2'],
                                                    spider_source='75')
                html_content = eval(html_contents)
            elif isinstance(html_contents, dict):
                # NOTE(review): html_contents here is left over from the
                # previous loop iteration -- looks unintended; kept as-is.
                html_contents = image_parse.bewitch(
                    html_string=str(html_contents),
                    spider_url=row['key2'],
                    spider_source='75')
                html_content = eval(html_contents)
        except Exception:
            pass
        if html_content is None:
            # BUG FIX: previously fell through and raised NameError below.
            continue
        # result key -> html_content key
        mapping_dict = {
            'difficulty': 'difficulty',
            'question_body': 'prompt',
            'comment': 'comment',
            'analy': 'parse'
        }
        result2 = {
            key: html_content.get(value, '')
            for key, value in mapping_dict.items()
        }
        try:
            options = html_content['options']
            option = []
            if options:
                for keys, values in options.items():
                    value_items = {}
                    value_items['value'] = keys
                    value_items['content'] = values
                    option.append(value_items)
                result['option_lst'] = option
        except Exception:
            pass
        try:
            answer = html_content['answer']
            if len(answer) == 0:
                answer = ''
                result['answer'] = answer
            else:
                if isinstance(answer, str):
                    result['answer'] = answer
                elif isinstance(answer, list):
                    # Flatten one level of nesting into a space-joined string.
                    answers = ''
                    for i in answer:
                        if isinstance(i, str):
                            answers += i + ' '
                        elif isinstance(i, list):
                            answers += i[0] + ' '
                    if len(answers) == 0:
                        answers = ''
                    result['answer'] = answers
        except Exception:
            pass
        try:
            sub_question_lst = html_content['items']
            sub_question_lsts = []
            if sub_question_lst:
                for i in range(len(sub_question_lst)):
                    sub_question = parse_sub_question_lst(sub_question_lst[i])
                    sub_question_lsts.append(sub_question)
                result['sub_question_lst'] = sub_question_lsts
        except Exception:
            pass
        try:
            result['flag'] = row['flag']
        except Exception:
            pass
        result1 = dict(result, **result2)
        jsonData.append(result1)
    return jsonData
class Wln100QuestionParser(object):
    """Parse a wln100 question/answer JSON pair (spider source 52).

    Unlike the html-scraping variant, this one reads structured fields
    straight from the two JSON payloads.
    """

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(52,
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)

    def parse(self, key, qs_json, as_json, aft_subj_id):
        """Build the cols dict; ``key`` is the spider url."""
        cols = dict()
        question_html = qs_json['test']
        question_html = self.html_magic.bewitch(question_html,
                                                spider_url=key)
        question_html = fix_any(question_html)
        cols['question_html'] = center_image(question_html)
        ################################################################
        # 'diff' is a 0..1 float; convert to a 0..100 difficulty score.
        if not qs_json.get('diff'):
            difficulty = 0
        else:
            difficulty = (100 - int(qs_json.get('diff', 0) * 100))
        cols['difficulty'] = difficulty
        ################################################################
        paper_name = (qs_json.get('docname') or '')
        cols['paper_name'] = paper_name
        ################################################################
        # Pull a 4-digit year out of the paper name.
        mod = re.search(r'([12][09][0189]\d)[^\d]', paper_name)
        if mod:
            exam_year = mod.group(1)
        else:
            exam_year = 0
        cols['exam_year'] = int(exam_year)
        ################################################################
        cols['question_type_str'] = (qs_json.get('typesname') or '')
        ################################################################
        as_js = as_json['data'][1][0][0]
        answer_all_html = self.html_magic.bewitch((as_js.get('answer') or ''),
                                                  spider_url=key)
        answer_all_html = fix_any(answer_all_html)
        cols['answer_all_html'] = center_image(answer_all_html)
        ################################################################
        fenxi = self.html_magic.bewitch((as_js.get('analytic') or ''),
                                        spider_url=key)
        fenxi = fix_any(fenxi)
        cols['fenxi'] = center_image(fenxi)
        ################################################################
        # kllist: "a >> b >> c<br>..." -- keep the leaf names for
        # knowledge_point and the full path lists as JSON.
        knowledge_point_json = list()
        knowledge_point = list()
        kpstr = (as_js.get('kllist') or '')
        kpstr = remove_tag('<span', kpstr, all=True)
        kpl = kpstr.split('<br>')
        for kps in kpl:
            kps = kps.split(' >> ')
            knowledge_point.append(kps[-1])
            knowledge_point_json.append(kps)
        knowledge_point = ';'.join(knowledge_point)
        knowledge_point_json = json.dumps(knowledge_point_json,
                                          ensure_ascii=False)
        cols['knowledge_point'] = knowledge_point
        cols['knowledge_point_json'] = knowledge_point_json
        ################################################################
        other_info = (as_js.get('remark') or '')
        other_info = self.html_magic.bewitch(other_info, spider_url=key)
        other_info = fix_any(other_info)
        cols['other_info'] = center_image(other_info)
        ################################################################
        cols['spider_url'] = key
        cols['subject'] = aft_subj_id
        cols['exam_city'] = ''
        cols['paper_url'] = ''
        cols['zhuanti'] = ''
        cols['option_html'] = ''
        cols['jieda'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 52
        cols['question_type'] = 0
        cols['question_quality'] = 0
        return cols
class GzywtkParser(object):
    """Parse a gzywtk.com question page (spider source 68, subject 21)."""

    def __init__(self, archive_image=False, download=False):
        # img formatting / re-hosting helper
        self.html_magic = HtmlMagic(68, # XXX, spider_source
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)

    def parse(self, html_string, url):
        """Extract question/answer, knowledge points and paper info."""
        self.url = url
        cols = dict()
        question_html, jieda = self.get_question_html(html_string)
        cols['question_html'] = question_html
        cols['jieda'] = jieda
        kps = self.get_kps(html_string)
        cols['knowledge_point'] = kps
        paper_url, paper_name = self.get_paper(html_string)
        cols['paper_url'] = paper_url
        cols['paper_name'] = paper_name
        # Columns with no data on this site get neutral defaults.
        cols['answer_all_html'] = ''
        cols['fenxi'] = ''
        cols['dianping'] = ''
        cols['difficulty'] = 0
        cols['zhuanti'] = ''
        cols['spider_url'] = url
        cols['subject'] = 21
        cols['spider_source'] = 68
        cols['question_type'] = 0
        cols['question_quality'] = 0
        cols['exam_year'] = 0
        cols['exam_city'] = ''
        cols['option_html'] = ''
        return cols

    def get_question_html(self, html_string):
        """Return [question, jieda] built from the '<div class="content">'
        blocks.

        NOTE(review): assumes at least two content divs; rs[1] raises
        IndexError otherwise -- confirm upstream filtering.
        """
        rs = []
        cns = get_html_element('<div class="content">', html_string,
                               with_tag=False)
        for cn in cns:
            cn = abs_url(cn)
            cn = center_image(cn)
            cn = self.html_magic.bewitch(cn, spider_url=self.url)
            rs.append(cn.strip())
        rs[1] = self.fix_any(rs[1]).replace('\r', '').strip()
        return rs

    def get_kps(self, html_string):
        # Returns None implicitly when the page has no '考点详细' line.
        for line in html_string.split('\n'):
            if '<b>考点详细:</b>' in line:
                kps = re.findall('</b>(.+?)</li>', line)
                kps2 = kps[0].replace('-', ';')
                return ';'.join(re_kps.findall(line)) or kps2

    def get_paper(self, html_string):
        """Return (paper_url, paper_name), or ('', '') when unmatched.

        NOTE(review): re.search(...).group() raises AttributeError when the
        page has no '所属试卷' anchor -- confirm it is always present.
        """
        e = re.search('所属试卷:(.+?)</a>', html_string).group()
        #e = get_html_element('<li>所属试卷:', html_string, limit=1)[0]
        mod = re_paper.search(e)
        if mod:
            paper = 'http://www.gzywtk.com' + mod.group(1)
            paper_name = mod.group(2)
            return paper, paper_name
        else:
            return '', ''

    def fix_any(self, html_string):
        # Truncate everything from the first inline '<a href=' onwards.
        # NOTE(review): when no '<a href=' exists, find() returns -1 and the
        # slice silently drops the last character -- confirm intent.
        i = html_string.find('<a href=')
        return html_string[:i]
class Parser(object):
    """Parse jtyhjy question blobs (spider source 78) and upsert them into
    question_db_offline.jtyhjy_question_20171010."""

    def __init__(self):
        self.logger = logging.getLogger('iter')
        self.sql_client = MySQLClient(**QUESTION_DICT)
        self.html_magic = HtmlMagic(spider_source=78, download=True,
                                    proxy=True)

    def deal_one_item(self, item):
        """Normalize one crawl item and replace its DB row.

        Raises: re-raises any DB error after logging it.
        """
        html = item['html']
        html_id = item['html_id']
        spider_url = item['key']
        # BUG FIX: was ``True and item['info'] or {}`` which fed a dict to
        # json.loads when info was empty (TypeError); default to the empty
        # JSON object string instead.
        subject_dict = json.loads(item['info'] or '{}')
        subject_string = subject_dict.get('name', '')
        subject = convert_str_subject_to_int(subject_string)
        question_item = dict(
            spider_source=78,
            spider_url=spider_url,
            subject=subject
        )
        question_dict = self.parse(html)
        question_item['knowledge_point'] = question_dict['knowledge_point']
        question_item['paper_name_abbr'] = question_dict['paper_name_abbr']
        # Normalize body/answer/analysis styling through the Question model.
        question_dict = Question(**question_dict).normialize()
        question_item['question_html'] = question_dict['question_body']
        question_item['option_html'] = ''
        question_item['jieda'] = ''
        question_item['zhuanti'] = ''
        question_item['question_type'] = 0
        question_item['answer_all_html'] = question_dict['answer']
        question_item['fenxi'] = question_dict['analy']
        question_item['dianping'] = question_dict['comment']
        # Strip word-processor markup and re-host images in every html field.
        for key in ['question_html', 'option_html', 'answer_all_html',
                    'fenxi', 'dianping']:
            question_item[key] = sub_word_tag(question_item[key])
            question_item[key] = self.html_magic.bewitch(
                question_item[key],
                question_item['spider_url'],
                spider_source=78,
                headers=HEADERS
            )
        try:
            # Delete-then-insert emulates an upsert keyed on spider_url.
            if self.sql_client.select(
                    'select spider_url from question_db_offline.jtyhjy_question_20171010 where spider_url =%s',
                    spider_url):
                self.sql_client.update(
                    'delete from question_db_offline.jtyhjy_question_20171010 where spider_url = %s limit 1',
                    spider_url)
            self.sql_client.insert(
                'question_db_offline.jtyhjy_question_20171010',
                **question_item)
        except Exception as err:
            self.logger.warning(
                'html_id: %s. error happend when insert question: %s',
                html_id, err
            )
            # BUG FIX: bare raise preserves the original traceback
            # (``raise err`` re-raised from here instead).
            raise

    def parse(self, html):
        """Map the crawled JSON blob (str or dict) to internal field names.

        Returns an empty dict when the blob is empty/null.
        """
        if isinstance(html, dict):
            question_json = html
        else:
            question_json = json.loads(html)
        question_dict = {}
        if not question_json:
            return question_dict
        question_dict['question_body'] = question_json.get('bodyHtmlText', '')
        question_dict['answer'] = question_json.get('answerHtmlText', '')
        question_dict['analy'] = question_json.get('analysisHtmlText', '')
        question_dict['knowledge_point'] = question_json.get('knowledgeName',
                                                             '')
        question_dict['paper_name_abbr'] = question_json.get('queSource', '')
        question_dict['difficulty'] = question_json.get('difficult', '')
        return question_dict
class Dz101QuestionParser(object):
    """Parse a 101.dz question page (spider source 56) into pipeline cols.

    This variant stores the normalized body/answer/analysis in the
    *_origin columns and leaves the display columns empty.
    """

    def __init__(self, archive_image=False, download=False):
        self.html_magic = HtmlMagic(56,
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)

    def parse(self, html_string, url, aft_subj_id):
        """Return the cols dict built from the css-class-marked entities."""
        cols = dict()
        exam_year = 0
        paper_name = ''
        question_html_t = list()
        answer_all_html_t = list()
        fenxi_t = list()
        # css-class marker -> accumulator for that entity kind
        cols_dict = {
            '"IsTopic"': question_html_t,
            '"optionoption"': question_html_t,
            '"Answer"': answer_all_html_t,
            '"Analytical"': fenxi_t,
        }
        # css-class marker -> extraction helper
        entities = {
            '"IsTopic"': get_question_html,
            '"optionoption"': get_question_html,
            '"Answer"': get_answer_all_html,
            '"Analytical"': get_fenxi,
        }
        elems = get_html_element(
            '<(li|span) class="(IsTopic|Answer|Analytical|optionoption)',
            html_string,
            regex=True)
        # q counts sub-questions: -1 until the first IsTopic (which carries
        # the exam banner); afterwards answers/analyses get a "(q)." prefix.
        q = -1
        for elem in elems:
            for key in entities.keys():
                if key in elem[:30]:
                    entity = entities[key](elem)
                    if q > 0 and key in ('"Answer"', '"Analytical"'):
                        entity = '({}). {}'.format(q, entity)
                    if q == -1 and key == '"IsTopic"':
                        exam_year, paper_name = get_exam_info(entity)
                        entity = remove_exam_info(entity)
                    cols_dict[key].append(entity)
                    if key == '"IsTopic"':
                        q += 1
                    break
        question_all_html = '<br>\n'.join(question_html_t)
        question_html = self.html_magic.bewitch(question_all_html,
                                                spider_url=url)
        question_html = center_image(question_html)
        question_html = fix_any(question_html)
        question_html = displaystyle(question_html, latex=False, mml=True)
        #cols['question_html_origin'] = question_html
        answer_all_html = '<br>\n'.join(answer_all_html_t)
        answer_all_html = self.html_magic.bewitch(answer_all_html,
                                                  spider_url=url)
        answer_all_html = center_image(answer_all_html)
        answer_all_html = fix_any(answer_all_html)
        answer_all_html = displaystyle(answer_all_html, latex=False, mml=True)
        #cols['answer_all_html_origin'] = answer_all_html
        fenxi = '<br>\n'.join(fenxi_t)
        fenxi = self.html_magic.bewitch(fenxi, spider_url=url)
        fenxi = center_image(fenxi)
        fenxi = fix_any(fenxi)
        fenxi = displaystyle(fenxi, latex=False, mml=True)
        #cols['fenxi_origin'] = fenxi
        cols['difficulty'] = get_difficulty(html_string)
        cols['question_type_str'] = get_question_type_str(html_string)
        # Display columns are intentionally left empty in this variant.
        cols['question_html'] = ''
        cols['option_html'] = ''
        cols['answer_all_html'] = ''
        cols['jieda'] = ''
        cols['fenxi'] = ''
        cols['dianping'] = ''
        cols['option_html_origin'] = ''
        cols['jieda_origin'] = ''
        cols['dianping_origin'] = ''
        cols['zhuanti'] = ''
        cols['paper_name'] = paper_name
        cols['paper_url'] = ''
        cols['spider_url'] = url
        cols['subject'] = aft_subj_id
        cols['spider_source'] = 56
        cols['question_type'] = 0
        cols['question_quality'] = 0
        cols['knowledge_point'] = ''
        cols['exam_year'] = exam_year
        cols['exam_city'] = ''
        # Normalize body/answer/analysis styling through the Question model
        # into the *_origin columns.
        _question = Question(
            question_body=question_html,
            answer=answer_all_html,
            analy=fenxi,
        )
        standard_question = _question.normialize()
        cols['question_html_origin'] = standard_question['question_body']
        cols['answer_all_html_origin'] = standard_question['answer']
        cols['fenxi_origin'] = standard_question['analy']
        return cols
def __init__(self):
    """Wire up logging, the question DB client and the HTML/image
    rewriter (spider source 78, via proxy)."""
    self.logger = logging.getLogger('iter')
    self.html_magic = HtmlMagic(spider_source=78, proxy=True, download=True)
    self.sql_client = MySQLClient(**QUESTION_DICT)
def tableToJson(table):
    """Dump up to 1000 not-yet-archived rows of *table* into a list of dicts.

    Rows whose ``topic`` already points at the yitiku image CDN are filtered
    out in SQL; every HTML field is run through HtmlMagic so its images get
    re-hosted.

    :param table: name of the table to read from the ``html_archive`` db
    :return: list of question dicts ready for JSON serialization
    """
    # Use a context manager so the config file handle is closed promptly
    # (the original json.load(open(...)) leaked it).
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)
    first_id = 1

    conn = pymysql.connect(host=config['host'],
                           user=config['user'],
                           passwd=config['password'],
                           db='html_archive',
                           port=3306,
                           charset="utf8",
                           use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    try:
        # pymysql cursors support the context-manager protocol, which also
        # closes the cursor when execute/fetch raises.
        with conn.cursor() as cur:
            sql = ('select * from {0} where source_id > {1} and topic not like '
                   '"%yitikuimage.oss-cn-qingdao.aliyuncs.com%" limit 1000'
                   ).format(table, first_id)
            cur.execute(sql)
            data = cur.fetchall()
    finally:
        # The original never closed the connection.
        conn.close()

    pattern_item = {
        '单选': '1',
        '填空': '2',
        '多选': '4'
    }
    jsonData = []
    for row in data:
        spider_source = int(row['spider_source'])
        spider_url = row['spider_url']
        # spider_source varies per row, so HtmlMagic is built per row.
        image_parse = HtmlMagic(spider_source=spider_source,
                                download=True,
                                archive_image=False)
        result1 = {}
        pattern = row['pattern']
        result1['question_type_name'] = pattern
        # Map the Chinese pattern label to its numeric code.
        for key, value in pattern_item.items():
            if key in pattern:
                pattern = value
        # Anything still longer than one char was unrecognized: bucket as '3'.
        if len(pattern) >= 2:
            pattern = '3'
        result1['question_type'] = pattern

        topic = row['topic']
        topic = replace_href(topic)
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        topic = image_parse.bewitch(html_string=topic,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['question_body'] = topic

        answer = row['answer']
        answer = replace_href(answer)
        answer = image_parse.bewitch(html_string=answer,
                                     spider_url=spider_url,
                                     spider_source=spider_source)
        result1['answer'] = answer

        analy = row['analy']
        analy = replace_href(analy)
        analy = image_parse.bewitch(html_string=analy,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['analy'] = analy

        source_shijuan = row['source_shijuan']
        source_shijuan = re.findall('<span class="colf43">来源:(.+?)</span>',
                                    source_shijuan)
        if len(source_shijuan) != 0:
            result1['paper_name'] = source_shijuan[0]

        # Output key -> source row column.
        mapping_dict = {
            'question_id': 'source_id',
            'subject': 'subject',
            'spider_url': 'spider_url',
            'knowledge_point': 'kaodian',
            'difficulty': 'difficulty',
            'source': 'spider_source'
        }
        result2 = {
            key: row.get(value, '')
            for key, value in mapping_dict.items()
        }
        jsonData.append(dict(result1, **result2))
    return jsonData
class Zuoye17QuestionParser(object):
    """Parser for 17zuoye questions (spider_source 53)."""

    def __init__(self, archive_image=False, download=False):
        # beautify=False keeps the scraped markup untouched apart from the
        # image rewriting HtmlMagic performs.
        self.html_magic = HtmlMagic(53,
                                    archive_image=archive_image,
                                    download=download,
                                    beautify=False)

    def _clean(self, html, url):
        """Shared pipeline: fix markup, re-host images, center them."""
        html = fix_any(html)
        html = self.html_magic.bewitch(html, spider_url=url)
        return center_image(html)

    @staticmethod
    def _plain(html):
        """Non-origin copy is kept only when no LaTeX marker is present."""
        return '' if 'afanti-latex' in html else html

    def parse(self, url, js, aft_subj_id):
        """Parse one question JS payload into a ``cols`` dict.

        :param url: question page URL (used when rewriting image sources)
        :param js: decoded question payload
        :param aft_subj_id: internal subject id, stored in ``cols['subject']``
        :return: dict of question columns
        """
        cols = dict()

        # Multi-part questions (e.g. cloze tests) use the multi extractor.
        if is_multi_qs(js):
            question_html, option_html = get_multi_question(js)
        else:
            question_html, option_html = get_question(js)

        question_html = self._clean(question_html, url)
        cols['question_html_origin'] = question_html
        cols['question_html'] = self._plain(question_html)

        # Option fields are only present for choice questions.
        if option_html:
            option_html = self._clean(option_html, url)
            cols['option_html_origin'] = option_html
            cols['option_html'] = self._plain(option_html)

        answer_all_html, fenxi = get_answers(js)

        answer_all_html = self._clean(answer_all_html, url)
        cols['answer_all_html_origin'] = answer_all_html
        cols['answer_all_html'] = self._plain(answer_all_html)

        fenxi = self._clean(fenxi, url)
        cols['fenxi_origin'] = fenxi
        cols['fenxi'] = self._plain(fenxi)

        cols['difficulty'] = (js['difficulty_int'] or 0)
        cols['question_type_name'] = get_question_type_name(js)

        # Fields without a source on this site default to empty values.
        cols['knowledge_point'] = ''
        cols['jieda_origin'] = ''
        cols['jieda'] = ''
        cols['exam_year'] = 0
        cols['exam_city'] = ''
        cols['spider_url'] = url
        cols['subject'] = aft_subj_id
        cols['zhuanti'] = ''
        cols['dianping'] = ''
        cols['spider_source'] = 53
        cols['question_type'] = 0
        cols['question_quality'] = 0
        return cols