コード例 #1
0
def main():
    """Run a sample HTML fragment through HtmlMagic and center_image, then print it."""
    sample_markup = '''
<TBODY>
    <TR>
        <TD>若
            <IMG style="WIDTH: 18px; HEIGHT: 16px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716662863.png">=3,
            <IMG style="WIDTH: 18px; HEIGHT: 14px; VERTICAL-ALIGN: middle" src="http://pic1.mofangge.com/upload/papers/c02/20120814/20120814192716732789.png">=7,则x﹣y的值为&nbsp;&nbsp;&nbsp;&nbsp;</TD>
    </TR>
    <TR>
        <TD>
            <DIV align=right>[&nbsp;&nbsp;&nbsp;&nbsp; ]</DIV>
        </TD>
    </TR>
    <TR>
        <TD>A.±4&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>B.±10&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>C.﹣4或﹣10&nbsp;&nbsp;&nbsp;&nbsp;
            <BR>D.±4或±10</TD>
    </TR>
</TBODY>
</TABLE>
    '''
    # Page the fragment was taken from; handed to bewitch() as spider_url.
    source_page = 'http://www.mofangge.com/html/qDetail/02/c1/201208/1kzkc102222121.html'

    magic = HtmlMagic(8, download=True, beautify=False)
    processed = magic.bewitch(sample_markup,
                              spider_url=source_page,
                              spider_source=8)

    # Post-process with center_image and dump the final markup to stdout.
    print(center_image(processed))
コード例 #2
0
 def __init__(self, archive_image=False, download=False):
     """Create the HtmlMagic helper used for image formatting.

     Both flags are forwarded unchanged to HtmlMagic; beautify is always off.
     """
     magic_options = {
         'archive_image': archive_image,
         'download': download,
         'beautify': False,
     }
     # XXX: 74 is passed positionally as the spider_source.
     self.html_magic = HtmlMagic(74, **magic_options)
コード例 #3
0
 def __init__(self, archive_image=False, download=False):
     """Set up the HtmlMagic helper plus the subject and question-type code tables.

     Both flags are forwarded unchanged to HtmlMagic; beautify is always off.
     """
     magic_options = {
         'archive_image': archive_image,
         'download': download,
         'beautify': False,
     }
     self.html_magic = HtmlMagic(52, **magic_options)

     # Subject name -> code string; codes are '1'..'10' in this order.
     subject_names = ('语文', '数学', '英语', '科学', '物理',
                      '化学', '地理', '历史', '生物', '政治')
     self.subject_item = {
         name: str(code)
         for code, name in enumerate(subject_names, start=1)
     }

     # Question-type keyword -> code string; '选择' shares code '1' with '单选'.
     self.pattern_item = dict([
         ('单选', '1'),
         ('填空', '2'),
         ('多选', '4'),
         ('选择', '1'),
     ])
コード例 #4
0
def tableToJson(table):
    """Load up to 1000 rows from ``table`` and convert them to question dicts.

    Connection settings (host, user, password) are read from the JSON file
    at CONFIG_FILE. Rows are selected from the ``html_archive`` database
    where ``source_id > 1`` and ``topic`` does not already reference
    yitikuimage.oss-cn-qingdao.aliyuncs.com.

    Args:
        table: name of the table to read; interpolated into the SQL text.

    Returns:
        A list with one dict per row containing the processed question body,
        answer and analysis plus metadata copied from the row.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)
    first_id = 1
    conn = pymysql.connect(host=config['host'], user=config['user'], passwd=config['password'], db='html_archive',
                           port=3306, charset="utf8", use_unicode=True, cursorclass=pymysql.cursors.DictCursor)
    # Release the cursor and the connection even when the query fails.
    try:
        cur = conn.cursor()
        try:
            # Alternative queries used previously:
            #sql = 'select * from %s where question_type = 12 limit 5000' % table
            #sql = 'select * from %s where question_type = 2 limit 500 ' % table
            #sql = 'select * from {} where topic not like "%yitikuimage.oss-cn-qingdao.aliyuncs.com%" '.format(table)
            # NOTE(review): the table name is formatted straight into the SQL
            # text; only pass trusted table names.
            sql = 'select * from {0} where source_id > {1} and topic not like "%yitikuimage.oss-cn-qingdao.aliyuncs.com%"  limit 1000'.format(
                table, first_id)
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # Question-type keyword -> code string.
    pattern_item = {
        '单选': '1',
        '填空': '2',
        '多选': '4'
    }
    # Output key -> source column; identical for every row, so built once.
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source'
    }
    jsonData = []
    for row in data:
        spider_source = int(row['spider_source'])
        spider_url = row['spider_url']
        # A fresh helper per row because spider_source is taken from the row.
        image_parse = HtmlMagic(spider_source=spider_source, download=True, archive_image=False)
        result1 = {}

        pattern = row['pattern']
        result1['question_type_name'] = pattern
        for key, value in pattern_item.items():
            if key in pattern:
                pattern = value
        # Anything still two or more characters long was not mapped to a
        # single-character code above and is recorded as '3'.
        if len(pattern) >= 2:
            pattern = '3'
        result1['question_type'] = pattern

        topic = row['topic']
        topic = replace_href(topic)
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        topic = image_parse.bewitch(html_string=topic, spider_url=spider_url,
                                    spider_source=spider_source)
        result1['question_body'] = topic

        answer = row['answer']
        answer = replace_href(answer)
        answer = image_parse.bewitch(html_string=answer, spider_url=spider_url,
                                     spider_source=spider_source)
        result1['answer'] = answer

        analy = row['analy']
        analy = replace_href(analy)
        analy = image_parse.bewitch(html_string=analy, spider_url=spider_url,
                                    spider_source=spider_source)
        result1['analy'] = analy

        # paper_name is only set when the source span is present.
        source_shijuan = row['source_shijuan']
        source_shijuan = re.findall('<span class="colf43">来源:(.+?)</span>', source_shijuan)
        if len(source_shijuan) != 0:
            result1['paper_name'] = source_shijuan[0]

        # Columns missing from the row default to ''.
        result2 = {
            key: row.get(value, '')
            for key, value in mapping_dict.items()
        }

        # Keys in result2 take precedence over result1 on collision.
        result = dict(result1, **result2)
        jsonData.append(result)
    return jsonData
コード例 #5
0
 def __init__(self, archive_image=False, download=False):
     """Create the HtmlMagic helper for spider source 56.

     Both flags are forwarded unchanged to HtmlMagic; beautify is always off.
     """
     magic_options = {
         'archive_image': archive_image,
         'download': download,
         'beautify': False,
     }
     self.html_magic = HtmlMagic(56, **magic_options)
コード例 #6
0
def parse_detail(row):
    """Convert one archived question row into a normalized result dict.

    The topic, answer and analysis HTML are passed through replace_href and
    HtmlMagic.bewitch (the topic additionally through remove_tags for
    ``h1``/``div``), then normalized together via Question.normialize().

    Args:
        row: mapping for one database row; must contain ``spider_source``,
            ``spider_url``, ``pattern``, ``topic``, ``answer``, ``analy`` and
            ``source_shijuan``. Other columns are optional and default to ''.

    Returns:
        A dict with the question type, normalized body/answer/analysis,
        paper name ('' when none is found) and metadata copied from the row.
    """
    # Question-type keyword -> code string.
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    spider_source = int(row['spider_source'])
    spider_url = row['spider_url']
    image_parse = HtmlMagic(spider_source=spider_source,
                            download=True,
                            archive_image=False)
    result1 = {}

    pattern = row['pattern']
    result1['question_type_name'] = pattern
    for key, value in pattern_item.items():
        if key in pattern:
            pattern = value
    # Anything still two or more characters long was not mapped to a
    # single-character code above and is recorded as '3'.
    if len(pattern) >= 2:
        pattern = '3'
    result1['question_type'] = pattern

    topic = row['topic']
    topic = replace_href(topic)
    topic = remove_tags(text=topic, which_ones=('h1', 'div'))
    topic = image_parse.bewitch(html_string=topic,
                                spider_url=spider_url,
                                spider_source=spider_source)

    answer = row['answer']
    answer = replace_href(answer)
    answer = image_parse.bewitch(html_string=answer,
                                 spider_url=spider_url,
                                 spider_source=spider_source)

    analy = row['analy']
    analy = replace_href(analy)
    analy = image_parse.bewitch(html_string=analy,
                                spider_url=spider_url,
                                spider_source=spider_source)

    # The three HTML parts are normalized together and the normalized
    # versions are what end up in the result.
    _question = Question(
        question_body=topic,
        answer=answer,
        analy=analy,
    )
    standard_question = _question.normialize()
    result1['question_body'] = standard_question['question_body']
    result1['answer'] = standard_question['answer']
    result1['analy'] = standard_question['analy']

    source_shijuan = row['source_shijuan']
    source_shijuan = re.findall('<span class="colf43">来源:(.+?)</span>',
                                source_shijuan)
    if len(source_shijuan) != 0:
        result1['paper_name'] = source_shijuan[0]
    else:
        result1['paper_name'] = ''

    # Output key -> source column. 'spider_source' previously read the
    # misspelled column 'spdier_source' and therefore always came out as ''.
    mapping_dict = {
        'question_id': 'source_id',
        'subject': 'subject',
        'spider_url': 'spider_url',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'source': 'spider_source',
        'spider_source': 'spider_source'
    }
    # Columns missing from the row default to ''.
    result2 = {key: row.get(value, '') for key, value in mapping_dict.items()}

    # Keys in result2 take precedence over result1 on collision.
    result = dict(result1, **result2)

    return result
コード例 #7
0
 def set_magic(self):
     """Attach an HtmlMagic instance for source 75 with archiving and download on."""
     # Imported at call time rather than at module import time.
     from afanti_tiku_lib.html.magic import HtmlMagic
     magic_options = {'archive_image': True, 'download': True}
     self.html_magic = HtmlMagic(75, **magic_options)
コード例 #8
0
 def __init__(self):
     """Wire up the 'iter' logger, the MySQL client and the HtmlMagic helper."""
     self.logger = logging.getLogger('iter')
     # Connection settings come from the module-level QUESTION_DICT.
     self.sql_client = MySQLClient(**QUESTION_DICT)
     magic_options = {
         'spider_source': 78,
         'download': True,
         'proxy': True,
     }
     self.html_magic = HtmlMagic(**magic_options)
コード例 #9
0
def tableToJson(table):
    """Load up to 320000 rows from ``table`` and convert them to question dicts.

    Connection settings (host, user, password) are read from the JSON file
    at CONFIG_FILE; rows come from the ``html_archive`` database.

    Args:
        table: name of the table to read; interpolated into the SQL text.

    Returns:
        A list with one dict per row containing the processed question body,
        answer and analysis plus metadata copied from the row.

    Raises:
        IndexError: if a row's ``spider_url`` does not match
            ``shiti/(.+).html`` (no question id can be extracted).
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)
    conn = pymysql.connect(host=config['host'],
                           user=config['user'],
                           passwd=config['password'],
                           db='html_archive',
                           port=3306,
                           charset="utf8",
                           use_unicode=True,
                           cursorclass=pymysql.cursors.DictCursor)
    # Release the cursor and the connection even when the query fails.
    try:
        cur = conn.cursor()
        try:
            #sql = 'select * from %s ' % table
            # NOTE(review): the table name is formatted straight into the SQL
            # text; only pass trusted table names.
            sql = 'select * from %s limit 320000' % table
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    jsonData = []
    # Question-type keyword -> code string.
    pattern_item = {'单选': '1', '填空': '2', '多选': '4'}
    # Output key -> source column; identical for every row, so built once.
    # NOTE(review): the output key 'spider_sorce' is misspelled; it is kept
    # as-is because downstream consumers may rely on it.
    mapping_dict = {
        'spider_sorce': 'spider_source',
        'subject': 'subject',
        'knowledge_point': 'kaodian',
        'difficulty': 'difficulty',
        'book': 'book',
        'version': 'version',
        'source': 'spider_source'
    }
    for row in data:
        spider_source = int(row['spider_source'])
        # A fresh helper per row because spider_source is taken from the row.
        image_parse = HtmlMagic(spider_source=spider_source,
                                download=True,
                                archive_image=False)
        result1 = {}
        spider_url = row['spider_url']
        result1['spider_url'] = spider_url
        # The question id is the part of the URL between 'shiti/' and the
        # trailing 'html' suffix.
        question_id = re.findall('shiti/(.+).html', spider_url)
        result1['question_id'] = question_id[0]

        pattern = row['pattern']
        result1['question_type_name'] = pattern
        for key, value in pattern_item.items():
            if key in pattern:
                pattern = value
        # Anything still two or more characters long was not mapped to a
        # single-character code above and is recorded as '3'.
        if len(pattern) >= 2:
            pattern = '3'
        result1['question_type'] = pattern

        topic = row['topic']
        topic = replace_href(topic)
        topic = remove_tags(text=topic, which_ones=('h1', 'div'))
        topic = image_parse.bewitch(html_string=topic,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['question_body'] = topic

        answer = row['answer']
        answer = replace_href(answer)
        answer = image_parse.bewitch(html_string=answer,
                                     spider_url=spider_url,
                                     spider_source=spider_source)
        result1['answer'] = answer

        analy = row['analy']
        analy = replace_href(analy)
        analy = image_parse.bewitch(html_string=analy,
                                    spider_url=spider_url,
                                    spider_source=spider_source)
        result1['analy'] = analy

        # paper_name is only set when the source span is present.
        source_shijuan = row['source_shijuan']
        source_shijuan = re.findall('<span class="colf43">来源:(.+?)</span>',
                                    source_shijuan)
        if len(source_shijuan) != 0:
            result1['paper_name'] = source_shijuan[0]

        # Columns missing from the row default to ''.
        result2 = {
            key: row.get(value, '')
            for key, value in mapping_dict.items()
        }

        #result['exam_year'] = row['year']
        #result['exam_city'] = row['province']
        # Keys in result2 take precedence over result1 on collision.
        result = dict(result1, **result2)
        jsonData.append(result)
    return jsonData
コード例 #10
0
def tableToJson(table):
    """Convert a 100000-row window of ``table`` into question dicts.

    Connection settings (host, user, password) are read from the JSON file
    at CONFIG_FILE; rows come from the ``html_archive`` database using
    ``limit 100000,100000``. Each row's ``html`` column holds a literal
    payload that is cleaned with remove_biaoqian, evaluated, passed through
    HtmlMagic.bewitch and evaluated again to obtain the question dict.

    Parsing is best-effort: when a row's payload cannot be parsed, the
    payload-derived fields are emitted as '' and the optional keys
    (``option_lst``, ``answer``, ``sub_question_lst``) are omitted.

    Args:
        table: name of the table to read; interpolated into the SQL text.

    Returns:
        A list with one dict per row.
    """
    # Close the config file deterministically instead of leaking the handle.
    with open(CONFIG_FILE) as config_file:
        config = json.load(config_file)
    conn = pymysql.connect(host=config['host'], user=config['user'], passwd=config['password'], db='html_archive',
                           port=3306, charset="utf8", use_unicode=True, cursorclass=pymysql.cursors.DictCursor)
    # Release the cursor and the connection even when the query fails.
    try:
        cur = conn.cursor()
        try:
            #sql = 'select * from {}  where html like "%img%" limit 300'.format(table)
            # NOTE(review): the table name is formatted straight into the SQL
            # text; only pass trusted table names.
            sql = 'select * from %s limit 100000,100000' % table
            cur.execute(sql)
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # The stored payload contains JSON-style literals such as "aorder":false.
    # eval() resolves names through this function's locals, so without these
    # bindings it fails with "name 'false' is not defined".
    false = False
    true = True
    null = None

    # Output key -> key inside the parsed payload; identical for every row.
    mapping_dict = {
        'difficulty': 'difficulty',
        'question_body': 'prompt',
        'comment': 'comment',
        'analy': 'parse'
    }

    jsonData = []
    for row in data:
        image_parse = HtmlMagic(75, download=True, archive_image=False)
        result = {}  # fields taken directly from the row
        result['question_id'] = row['source_id']
        # NOTE(review): the output key 'spider_sorce' is misspelled; it is
        # kept as-is because downstream consumers may rely on it.
        result['spider_sorce'] = 75
        result['spider_url'] = row['key2']
        result['subject'] = row['subject']
        result['question_type'] = row['question_type']

        # Reset per row: previously a failed parse either raised NameError
        # (first row) or silently reused the previous row's payload.
        html_content = {}
        try:
            if isinstance(row['html'], str):
                html_contents = remove_biaoqian(row['html'])
                # SECURITY NOTE(review): eval() executes whatever is stored in
                # the database column. Only run this against trusted data;
                # consider a literal-only parser if the payload allows it.
                html_contents = eval(html_contents)
                if isinstance(html_contents, bytes):
                    html_contents = html_contents.decode()
                    html_contents = image_parse.bewitch(html_string=html_contents, spider_url=row['key2'],
                                                        spider_source='75')
                    html_content = eval(html_contents)
                elif isinstance(html_contents, dict):
                    html_contents = image_parse.bewitch(html_string=str(html_contents), spider_url=row['key2'],
                                                        spider_source='75')
                    html_content = eval(html_contents)
        except Exception:
            # Deliberate best-effort: an unparsable payload yields an empty
            # record rather than aborting the whole export.
            html_content = {}
        if not isinstance(html_content, dict):
            # The lookups below need a mapping; anything else is treated as
            # "no payload".
            html_content = {}

        result2 = {
            key: html_content.get(value, '')
            for key, value in mapping_dict.items()
        }

        # Options: a mapping of option label -> option content.
        try:
            options = html_content['options']
            option = []
            if options:
                for keys, values in options.items():
                    option.append({'value': keys, 'content': values})
            result['option_lst'] = option
        except (KeyError, AttributeError):
            # No options, or not a mapping: leave 'option_lst' unset.
            pass

        # Answer: a string, or a list whose items are strings or lists
        # (only the first element of a nested list is used).
        try:
            answer = html_content['answer']
            if len(answer) == 0:
                result['answer'] = ''
            elif isinstance(answer, str):
                result['answer'] = answer
            elif isinstance(answer, list):
                parts = []
                for item in answer:
                    if isinstance(item, str):
                        parts.append(item)
                    elif isinstance(item, list):
                        parts.append(item[0])
                # Every part is followed by a single space, including the last.
                result['answer'] = ''.join(part + ' ' for part in parts)
        except (KeyError, TypeError, IndexError):
            # Missing or unexpectedly shaped answer: leave 'answer' unset.
            pass

        # Sub-questions are parsed by parse_sub_question_lst, one per item.
        try:
            sub_question_lst = html_content['items']
            if sub_question_lst:
                result['sub_question_lst'] = [
                    parse_sub_question_lst(sub_question)
                    for sub_question in sub_question_lst
                ]
        except Exception:
            # Best-effort: the helper's failure modes are not visible here.
            pass

        try:
            result['flag'] = row['flag']
        except KeyError:
            # The 'flag' column is optional.
            pass

        # Keys in result2 take precedence over result on collision.
        result1 = dict(result, **result2)
        jsonData.append(result1)
    return jsonData