def main():
    """Run a sample question fragment through the image pipeline and print it.

    The fragment is handed to ``HtmlMagic.bewitch`` (spider source 8, with
    downloading enabled and beautifying disabled), the result is passed to
    ``center_image``, and the final HTML is printed to stdout.
    """
    source_id = 8
    page_url = 'http://www.mofangge.com/html/qDetail/02/c1/201208/1kzkc102222121.html'
    # Sample fragment kept on one line so its content stays exactly as given.
    sample = ''' <TBODY> <TR> <TD>若 <IMG style="WIDTH: 18px; HEIGHT: 16px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716662863.png">=3, <IMG style="WIDTH: 18px; HEIGHT: 14px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716732789.png">=7,则x﹣y的值为 </TD> </TR> <TR> <TD> <DIV align=right>[ ]</DIV> </TD> </TR> <TR> <TD>A.±4 <BR>B.±10 <BR>C.﹣4或﹣10 <BR>D.±4或±10</TD> </TR> </TBODY> </TABLE> '''

    magic = HtmlMagic(source_id, download=True, beautify=False)
    bewitched = magic.bewitch(
        sample,
        spider_url=page_url,
        spider_source=source_id,
    )
    print(center_image(bewitched))
def __init__(self, archive_image=False, download=False):
    """Create the ``HtmlMagic`` helper used for image formatting.

    Args:
        archive_image: Forwarded unchanged to ``HtmlMagic``.
        download: Forwarded unchanged to ``HtmlMagic``.
    """
    # XXX: 74 is the spider_source id passed as HtmlMagic's first argument.
    spider_source = 74
    self.html_magic = HtmlMagic(
        spider_source,
        archive_image=archive_image,
        download=download,
        beautify=False,
    )
def __init__(self, archive_image=False, download=False):
    """Create the ``HtmlMagic`` helper and the subject / question-type tables.

    Args:
        archive_image: Forwarded unchanged to ``HtmlMagic``.
        download: Forwarded unchanged to ``HtmlMagic``.
    """
    spider_source = 52
    self.html_magic = HtmlMagic(
        spider_source,
        archive_image=archive_image,
        download=download,
        beautify=False,
    )

    # Subject name -> subject code (codes are strings).
    self.subject_item = {
        '语文': '1',
        '数学': '2',
        '英语': '3',
        '科学': '4',
        '物理': '5',
        '化学': '6',
        '地理': '7',
        '历史': '8',
        '生物': '9',
        '政治': '10',
    }

    # Question-type name -> type code; two of the names share code '1'.
    self.pattern_item = {
        '单选': '1',
        '填空': '2',
        '多选': '4',
        '选择': '1',
    }
def tableToJson(table):
    """Load up to 1000 rows from ``table`` and convert them to question dicts.

    Connection settings (``host``, ``user``, ``password``) are read from the
    JSON file at ``CONFIG_FILE``. Rows are selected from the ``html_archive``
    database where ``source_id > 1`` and ``topic`` does not reference the
    yitikuimage OSS host. For every row the topic / answer / analysis HTML is
    passed through ``replace_href`` and ``HtmlMagic.bewitch``.

    Args:
        table: Name of the table to read. It is interpolated directly into
            the SQL text, so it must be a trusted identifier.

    Returns:
        A list of dicts, one per row. ``paper_name`` is present only when the
        row's ``source_shijuan`` contains a matching source span.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)

    first_id = 1
    sql = (
        'select * from {0} where source_id > {1} and topic not like '
        '"%yitikuimage.oss-cn-qingdao.aliyuncs.com%" limit 1000'
    ).format(table, first_id)

    conn = pymysql.connect(
        host=config['host'],
        user=config['user'],
        passwd=config['password'],
        db='html_archive',
        port=3306,
        charset="utf8",
        use_unicode=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # All rows are fetched up front, so cursor and connection can both be
    # released before the (slow) per-row image processing starts.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    # Output key -> source column, copied verbatim from the row.
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source',
    }

    jsonData = []
    for row in data:
        spider_source = int(row['spider_source'])
        spider_url = row['spider_url']
        image_parse = HtmlMagic(spider_source=spider_source, download=True,
                                archive_image=False)
        result1 = {}

        pattern = row['pattern']
        result1['question_type_name'] = pattern
        for key, value in pattern_item.items():
            if key in pattern:
                # First matching name wins; the one-character code can never
                # contain another key, so stopping here changes nothing.
                pattern = value
                break
        if len(pattern) >= 2:
            # No known type name matched: fall back to code '3'.
            pattern = '3'
        result1['question_type'] = pattern

        topic = replace_href(row['topic'])
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        result1['question_body'] = image_parse.bewitch(
            html_string=topic, spider_url=spider_url,
            spider_source=spider_source)

        answer = replace_href(row['answer'])
        result1['answer'] = image_parse.bewitch(
            html_string=answer, spider_url=spider_url,
            spider_source=spider_source)

        analy = replace_href(row['analy'])
        result1['analy'] = image_parse.bewitch(
            html_string=analy, spider_url=spider_url,
            spider_source=spider_source)

        paper_names = re.findall('<span class="colf43">来源:(.+?)</span>',
                                 row['source_shijuan'])
        if paper_names:
            result1['paper_name'] = paper_names[0]

        result2 = {key: row.get(value, '') for key, value in mapping_dict.items()}
        jsonData.append(dict(result1, **result2))
    return jsonData
def __init__(self, archive_image=False, download=False):
    """Create the ``HtmlMagic`` helper stored on ``self.html_magic``.

    Args:
        archive_image: Forwarded unchanged to ``HtmlMagic``.
        download: Forwarded unchanged to ``HtmlMagic``.
    """
    spider_source = 56
    self.html_magic = HtmlMagic(
        spider_source,
        archive_image=archive_image,
        download=download,
        beautify=False,
    )
def parse_detail(row):
    """Convert one crawled question row into the normalized output dict.

    The topic / answer / analysis HTML is passed through ``replace_href`` and
    ``HtmlMagic.bewitch`` (the topic additionally through ``remove_tags``),
    then normalized via ``Question(...).normialize()``.

    Args:
        row: Mapping holding at least ``spider_source``, ``spider_url``,
            ``pattern``, ``topic``, ``answer``, ``analy`` and
            ``source_shijuan``.

    Returns:
        A dict with the question type, normalized body / answer / analysis,
        ``paper_name`` (``''`` when no source span is found) and the fields
        copied from the row through the mapping table below.
    """
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    spider_source = int(row['spider_source'])
    spider_url = row['spider_url']
    image_parse = HtmlMagic(spider_source=spider_source, download=True,
                            archive_image=False)
    result1 = {}

    pattern = row['pattern']
    result1['question_type_name'] = pattern
    for key, value in pattern_item.items():
        if key in pattern:
            # First matching name wins; the one-character code can never
            # contain another key, so stopping here changes nothing.
            pattern = value
            break
    if len(pattern) >= 2:
        # No known type name matched: fall back to code '3'.
        pattern = '3'
    result1['question_type'] = pattern

    topic = replace_href(row['topic'])
    topic = remove_tags(text=topic, which_ones=('h1', 'div'))
    topic = image_parse.bewitch(html_string=topic, spider_url=spider_url,
                                spider_source=spider_source)

    answer = replace_href(row['answer'])
    answer = image_parse.bewitch(html_string=answer, spider_url=spider_url,
                                 spider_source=spider_source)

    analy = replace_href(row['analy'])
    analy = image_parse.bewitch(html_string=analy, spider_url=spider_url,
                                spider_source=spider_source)

    # 'normialize' is the method name exactly as this code calls it on Question.
    standard_question = Question(
        question_body=topic,
        answer=answer,
        analy=analy,
    ).normialize()
    result1['question_body'] = standard_question['question_body']
    result1['answer'] = standard_question['answer']
    result1['analy'] = standard_question['analy']

    paper_names = re.findall('<span class="colf43">来源:(.+?)</span>',
                             row['source_shijuan'])
    result1['paper_name'] = paper_names[0] if paper_names else ''

    # Output key -> source column. 'spider_source' previously pointed at the
    # misspelled column 'spdier_source', so the field was always ''.
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source',
        'spider_source': 'spider_source',
    }
    result2 = {key: row.get(value, '') for key, value in mapping_dict.items()}
    return dict(result1, **result2)
def set_magic(self):
    """Attach an ``HtmlMagic`` instance for spider source 75 to ``self``.

    ``HtmlMagic`` is imported inside the method rather than at module level.
    """
    from afanti_tiku_lib.html.magic import HtmlMagic

    magic_options = {'archive_image': True, 'download': True}
    self.html_magic = HtmlMagic(75, **magic_options)
def __init__(self):
    """Set up the logger, the MySQL client and the ``HtmlMagic`` helper."""
    self.logger = logging.getLogger('iter')
    self.sql_client = MySQLClient(**QUESTION_DICT)

    magic_options = dict(spider_source=78, download=True, proxy=True)
    self.html_magic = HtmlMagic(**magic_options)
def tableToJson(table):
    """Load up to 320000 rows from ``table`` and convert them to question dicts.

    Connection settings (``host``, ``user``, ``password``) are read from the
    JSON file at ``CONFIG_FILE``; rows come from the ``html_archive``
    database. For every row the topic / answer / analysis HTML is passed
    through ``replace_href`` and ``HtmlMagic.bewitch``, and the question id is
    extracted from the row's ``spider_url``.

    Args:
        table: Name of the table to read. It is interpolated directly into
            the SQL text, so it must be a trusted identifier.

    Returns:
        A list of dicts, one per row. ``paper_name`` is present only when the
        row's ``source_shijuan`` contains a matching source span.

    Raises:
        IndexError: If a row's ``spider_url`` does not match the
            ``shiti/<id>.html`` pattern.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)

    sql = 'select * from %s limit 320000' % table

    conn = pymysql.connect(
        host=config['host'],
        user=config['user'],
        passwd=config['password'],
        db='html_archive',
        port=3306,
        charset="utf8",
        use_unicode=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # All rows are fetched up front, so cursor and connection can both be
    # released before the (slow) per-row image processing starts.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    # Output key -> source column, copied verbatim from the row.
    # NOTE(review): 'spider_sorce' is misspelled but is part of the emitted
    # schema; kept as-is in case consumers depend on it.
    mapping_dict = {
        'spider_sorce': 'spider_source',
        'subject': 'subject',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'book': 'book',
        'version': 'version',
        'source': 'spider_source',
    }

    jsonData = []
    for row in data:
        spider_source = int(row['spider_source'])
        image_parse = HtmlMagic(spider_source=spider_source, download=True,
                                archive_image=False)
        result1 = {}

        spider_url = row['spider_url']
        result1['spider_url'] = spider_url
        question_id = re.findall('shiti/(.+).html', spider_url)
        result1['question_id'] = question_id[0]

        pattern = row['pattern']
        result1['question_type_name'] = pattern
        for key, value in pattern_item.items():
            if key in pattern:
                # First matching name wins; the one-character code can never
                # contain another key, so stopping here changes nothing.
                pattern = value
                break
        if len(pattern) >= 2:
            # No known type name matched: fall back to code '3'.
            pattern = '3'
        result1['question_type'] = pattern

        topic = replace_href(row['topic'])
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        result1['question_body'] = image_parse.bewitch(
            html_string=topic, spider_url=spider_url,
            spider_source=spider_source)

        answer = replace_href(row['answer'])
        result1['answer'] = image_parse.bewitch(
            html_string=answer, spider_url=spider_url,
            spider_source=spider_source)

        analy = replace_href(row['analy'])
        result1['analy'] = image_parse.bewitch(
            html_string=analy, spider_url=spider_url,
            spider_source=spider_source)

        paper_names = re.findall('<span class="colf43">来源:(.+?)</span>',
                                 row['source_shijuan'])
        if paper_names:
            result1['paper_name'] = paper_names[0]

        result2 = {key: row.get(value, '') for key, value in mapping_dict.items()}
        jsonData.append(dict(result1, **result2))
    return jsonData
def tableToJson(table):
    """Load rows 100000-199999 of ``table`` and convert them to question dicts.

    Connection settings (``host``, ``user``, ``password``) are read from the
    JSON file at ``CONFIG_FILE``; rows come from the ``html_archive``
    database. Each row's ``html`` column holds a JSON-like payload that is
    cleaned with ``remove_biaoqian``, passed through ``HtmlMagic.bewitch`` and
    evaluated into a dict, from which the question fields are extracted.

    A row whose payload cannot be parsed is still emitted, with empty
    payload-derived fields; it no longer inherits the previous row's payload.

    Args:
        table: Name of the table to read. It is interpolated directly into
            the SQL text, so it must be a trusted identifier.

    Returns:
        A list of dicts, one per row. ``option_lst``, ``answer``,
        ``sub_question_lst`` and ``flag`` are present only when the
        corresponding data exists and could be processed.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)

    sql = 'select * from %s limit 100000,100000' % table

    conn = pymysql.connect(
        host=config['host'],
        user=config['user'],
        passwd=config['password'],
        db='html_archive',
        port=3306,
        charset="utf8",
        use_unicode=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    # All rows are fetched up front, so cursor and connection can both be
    # released before the (slow) per-row processing starts.
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # The payload contains JSON literals such as "aorder":false. eval() below
    # resolves these names from this function's locals, so they must stay
    # defined here even though nothing else references them.
    false = False
    true = True
    null = None

    # Output key -> key inside the parsed payload.
    mapping_dict = {
        'difficulty': 'difficulty',
        'question_body': 'prompt',
        'comment': 'comment',
        'analy': 'parse',
    }

    jsonData = []
    for row in data:
        image_parse = HtmlMagic(75, download=True, archive_image=False)
        # NOTE(review): 'spider_sorce' is misspelled but is part of the
        # emitted schema; kept as-is in case consumers depend on it.
        result = {
            'question_id': row['source_id'],
            'spider_sorce': 75,
            'spider_url': row['key2'],
            'subject': row['subject'],
            'question_type': row['question_type'],
        }

        # Reset per row: previously a failed parse left the PREVIOUS row's
        # payload in place (or raised NameError on the very first row).
        html_content = {}
        try:
            if isinstance(row['html'], str):
                # SECURITY: eval() executes whatever expression is stored in
                # the database. Kept because payloads may be bytes literals
                # as well as JSON-like dicts; only run on trusted data.
                html_contents = eval(remove_biaoqian(row['html']))
                if isinstance(html_contents, bytes):
                    html_text = html_contents.decode()
                elif isinstance(html_contents, dict):
                    html_text = str(html_contents)
                else:
                    html_text = None
                if html_text is not None:
                    html_text = image_parse.bewitch(
                        html_string=html_text, spider_url=row['key2'],
                        spider_source='75')
                    html_content = eval(html_text)
        except Exception:
            # Best effort: an unparsable payload yields empty fields.
            html_content = {}
        if not isinstance(html_content, dict):
            html_content = {}

        result2 = {
            key: html_content.get(value, '')
            for key, value in mapping_dict.items()
        }

        try:
            options = html_content['options']
            option = []
            if options:
                option = [
                    {'value': label, 'content': content}
                    for label, content in options.items()
                ]
            result['option_lst'] = option
        except Exception:
            # Missing or malformed options: leave 'option_lst' unset.
            pass

        try:
            answer = html_content['answer']
            if len(answer) == 0:
                result['answer'] = ''
            elif isinstance(answer, str):
                result['answer'] = answer
            elif isinstance(answer, list):
                # Each entry is a string or a list whose first element is the
                # answer text; every piece is followed by a single space.
                pieces = []
                for item in answer:
                    if isinstance(item, str):
                        pieces.append(item)
                    elif isinstance(item, list):
                        pieces.append(item[0])
                result['answer'] = ''.join(piece + ' ' for piece in pieces)
        except Exception:
            # Missing or malformed answer: leave 'answer' unset.
            pass

        try:
            sub_question_lst = html_content['items']
            sub_question_lsts = []
            if sub_question_lst:
                sub_question_lsts = [
                    parse_sub_question_lst(sub_question)
                    for sub_question in sub_question_lst
                ]
            result['sub_question_lst'] = sub_question_lsts
        except Exception:
            # Missing or malformed sub-questions: leave the key unset.
            pass

        if 'flag' in row:
            result['flag'] = row['flag']

        # Payload-derived fields take precedence over the row-level ones.
        jsonData.append(dict(result, **result2))
    return jsonData