Beispiel #1
0
def save_base64_img_to_local(html_string, local_dir, source):
    """Store every base64-embedded image of *html_string* on disk and relink it.

    For each element returned by ``find_base64_imgs`` the payload is decoded
    and written to ``<local_dir>/<source>/<md5 of payload><DEFAULT_IMG_EXT>``,
    and the first occurrence of the original ``src`` text in the HTML is
    replaced with ``src-base64="<url from uri2oss.convert>"``.

    Args:
        html_string: HTML text to scan and rewrite.
        local_dir: Base directory under which the images are written.
        source: Source identifier; converted to ``str`` and used both as the
            sub-directory name and as the second argument of
            ``uri2oss.convert``.

    Returns:
        The HTML string with the processed ``src`` fragments replaced.

    Raises:
        NotFindSrc: If ``get_src`` yields ``None`` as the src for an image.
        OSError: If the target directory cannot be created.
    """
    source = str(source)

    for img in find_base64_imgs(html_string):
        src, url = get_src(img)
        if src is None:
            raise NotFindSrc(img)

        encoded = get_encode_base64_str(url)
        image_bytes = compat_base64.b64decode(encoded.encode('utf-8'))

        # The file name is derived from the encoded payload, so identical
        # images map to the same file.
        md5_name = md5_string(encoded) + DEFAULT_IMG_EXT
        img_path = os.path.join(local_dir, source, md5_name)
        oss_url = uri2oss.convert(md5_name, source)

        # Create the directory EAFP-style: a separate exists() check followed
        # by makedirs() can fail when another worker creates the directory in
        # between. Only re-raise when the directory is really missing.
        target_dir = os.path.dirname(img_path)
        try:
            os.makedirs(target_dir)
        except OSError:
            if not os.path.isdir(target_dir):
                raise

        with open(img_path, 'wb') as fd:
            fd.write(image_bytes)

        # Replace only the first occurrence so repeated src values are
        # consumed one per loop iteration.
        html_string = html_string.replace(
            src, 'src-base64="{}"'.format(oss_url), 1)

    return html_string
Beispiel #2
0
    def archive_imgs(absurl,
                     spider_source,
                     spider_url='',
                     table=None,
                     md5=None,
                     image_filename=None,
                     mysql=None,
                     ignore=True,
                     config_file=None):
        """Insert one image record into the image table.

        Falsy optional arguments are replaced by defaults: ``table`` falls
        back to ``ImageMagic.IMAGE_TABLE``, ``md5`` to ``md5_string(absurl)``,
        ``image_filename`` to ``md5 + DEFAULT_IMG_EXT`` and ``mysql`` to a new
        ``CommonMysql('question_db_offline', config_file=config_file)``.

        The row written contains ``url``, ``source``, ``spider_url``, ``md5``
        and ``image_filename``; ``ignore`` is forwarded to ``mysql.insert``.
        """
        if not table:
            table = ImageMagic.IMAGE_TABLE
        if not md5:
            md5 = md5_string(absurl)
        if not image_filename:
            image_filename = md5 + DEFAULT_IMG_EXT
        if not mysql:
            mysql = CommonMysql('question_db_offline',
                                config_file=config_file)

        record = {
            'url': absurl,
            'source': spider_source,
            'spider_url': spider_url,
            'md5': md5,
            'image_filename': image_filename,
        }

        # Serialize the insert: only one connection may run it at a time.
        _acquireLock()
        try:
            mysql.insert(table, record, ignore=ignore)
        finally:
            _releaseLock()
Beispiel #3
0
def handle_mathml(html_string, uri2oss, url):
    """Replace the MathML fragments of *html_string* with rendered PNG images.

    The fragments returned by ``get_html_element('<math', html_string)`` are
    passed through ``to_latexes`` and ``fix_latex``, rendered with ``to_pngs``
    into ``working/latex_imgs/<md5 of latex>.png``, and each fragment is
    replaced in the HTML by a ``<span data-latex="base64,...">`` wrapping an
    ``<img>`` that points at the URL produced by ``uri2oss.convert``.

    Args:
        html_string: HTML text containing the MathML fragments.
        uri2oss: Object whose ``convert(name, source)`` returns the image URL.
        url: Page URL; used only in the warning message on render failure.

    Returns:
        The rewritten HTML string, or ``False`` as soon as one formula's
        render result is ``False``.
    """
    img_dir = 'working/latex_imgs/'
    mathmls = get_html_element('<math', html_string)
    latexes = [fix_latex(lt) for lt in to_latexes(mathmls)]
    png_paths = [img_dir + md5_string(latex) + '.png' for latex in latexes]
    png_results = to_pngs(latexes, png_paths, check=False)

    for latex, mathml, png_path, png_result in zip(latexes, mathmls,
                                                   png_paths, png_results):
        if png_result is False:
            # logging.warn is deprecated; logging.warning is the supported
            # spelling. The message text is unchanged.
            logging.warning('latex2png:%s %s', url, latex)
            return False

        w, h = get_image_size(png_path)

        # Keep the LaTeX source next to the image, base64-encoded so it is
        # safe inside an HTML attribute.
        latex_base64 = compat_base64.b64encode(latex.encode('utf-8')).decode()
        span = '<span data-latex="base64,{}">'.format(latex_base64)

        md5_name = os.path.basename(png_path)
        # 56 is the hard-coded source identifier handed to uri2oss.convert.
        oss_img_url = uri2oss.convert(md5_name, 56)

        # The image is displayed at half the PNG pixel size plus 2.
        # The attribute was previously misspelled "heigh", so the computed
        # height never took effect.
        img = span + ('<img src="{}" width="{}" height="{}" '
                      'class="afanti_latex"></span>'.format(
                          oss_img_url, w // 2 + 2, h // 2 + 2))
        html_string = html_string.replace(mathml, img)

    return html_string
Beispiel #4
0
def save_html(key, html_string, flag=0):
    """Archive one page in ``gzywtk_spider_html_archive_table``.

    Builds a row with ``key``, ``html``, the md5 of the HTML, the fixed
    ``source`` value 68 and ``flag``, turns it into an insert statement via
    ``html_archive.insert_sql(..., ignore=True)`` and runs it with ``execute``.
    This function does not call ``commit`` itself.
    """
    connection = get_mysql_connection()

    row = {
        'key': key,
        'html': html_string,
        'md5': md5_string(html_string),
        'source': 68,
        'flag': flag,
    }
    statement, params = html_archive.insert_sql(
        'gzywtk_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
Beispiel #5
0
def login(username, password):
    """Log in to www.dz101.com and return the cookies built for the session.

    Three GET requests are issued on one ``requests.Session``: the login
    page, the ``IsUser`` check carrying the user name and the md5 of the
    password, and finally ``get_session`` with the assembled cookies.
    The responses themselves are not inspected.

    Returns:
        dict with the keys ``PHPSESSID``, ``MyName`` and ``Automatic_login``.
    """
    http = requests.Session()

    # First hit the login page on the same session.
    http.get('http://www.dz101.com/common/Login', headers=common_headers)

    check_url = (
        'http://www.dz101.com/common/IsUser?Step=IsLogin'
        '&IsMobile={}&PasswordA={}'
        '&PasswordB=undefined&IsVerify=undefined&appstr=Teacher'
        '&province=undefined&city=undefined&county=undefined'
        '&unit_id=undefined&my_school=undefined&verify_token=undefined'
    ).format(username, md5_string(password))
    http.get(check_url, headers=common_headers)

    # NOTE(review): the 'Automatic_login' template contains no placeholders,
    # so the value is the literal '******' — the original template looks
    # redacted; confirm the intended format.
    auto_login = '******'.format(username, md5_string(password))
    cookies = {
        'PHPSESSID': http.cookies.get('PHPSESSID'),
        'MyName': username,
        'Automatic_login': auto_login,
    }

    http.get('http://www.dz101.com/common/get_session',
             headers=common_headers,
             cookies=cookies)

    return cookies
Beispiel #6
0
def save_html(key, js, flag=0):
    """Archive one JSON payload in ``vko_spider_html_archive_table``.

    ``js`` is serialized with ``json.dumps(ensure_ascii=False,
    sort_keys=True)``; the row stores that text as ``html`` together with
    its md5, ``key``, the fixed ``source`` value 74 and ``flag``. The insert
    statement comes from ``html_archive.insert_sql(..., ignore=True)`` and is
    run with ``execute``. This function does not call ``commit`` itself.
    """
    connection = get_mysql_connection()

    serialized = json.dumps(js, ensure_ascii=False, sort_keys=True)

    row = {
        'key': key,
        'html': serialized,
        'md5': md5_string(serialized),
        'source': 74,
        'flag': flag,
    }
    statement, params = html_archive.insert_sql(
        'vko_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
Beispiel #7
0
def extract_image_info_from_base64_image(src):
    """Extract the binary image content from an ``<img src="data:image...">`` src value.

    The value is matched against the module-level ``_re_img_base64`` pattern,
    which must expose the named groups ``img_type`` and ``base64_string``.

    Args:
        src: The ``src`` attribute value of a base64-embedded image.

    Returns:
        dict with ``url`` (the original ``src``), ``md5`` (md5 of the base64
        text), ``img_content`` (the decoded bytes) and ``ext`` (``'.'`` plus
        the image type, or ``''`` when no type was captured).

    Raises:
        ValueError: If ``src`` does not match the pattern. ``ValueError`` is a
            subclass of ``Exception``, so existing ``except Exception``
            handlers keep working.
    """
    # Earlier inline pattern kept for reference; the module-level
    # _re_img_base64 is what is actually used:
    # pattern = re.compile(r'^data:image/(?P<img_type>(png)|(jpg)|(jpeg)|(gif)|(bmp)|(svg));base64,(?P<base64_string>.*?)$', re.I)
    match = _re_img_base64.match(src)
    if not match:
        raise ValueError('cannot extract image from src')

    base64_string = match.group('base64_string')
    img_type = match.group('img_type')
    return {
        'url': src,
        'md5': md5_string(base64_string),
        'img_content': base64.b64decode(base64_string),
        'ext': '.' + img_type if img_type else '',
    }
Beispiel #8
0
def save_html(url, html_string, subj_id, info, flag=0):
    """Archive one page in ``dz101_spider_html_archive_table``.

    ``info`` is serialized with ``json.dumps(ensure_ascii=False)``. The row
    stores ``url`` as ``key``, the HTML and its md5, ``subj_id`` as
    ``subject``, the fixed ``source`` value 56, ``flag`` and the serialized
    ``info``. The insert statement comes from
    ``html_archive.insert_sql(..., ignore=True)`` and is run with ``execute``.
    This function does not call ``commit`` itself.
    """
    connection = get_mysql_connection()

    info_json = json.dumps(info, ensure_ascii=False)
    row = {
        'key': url,
        'html': html_string,
        'md5': md5_string(html_string),
        'subject': subj_id,
        'source': 56,
        'flag': flag,
        'info': info_json,
    }
    statement, params = html_archive.insert_sql(
        'dz101_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
Beispiel #9
0
def save_html(key, cn, flag=0):
    """Archive one piece of content in ``wln100_spider_html_archive_table``.

    ``cn`` is stored as-is when it is a ``str``; anything else is first
    serialized with ``json.dumps(ensure_ascii=False, sort_keys=True)``. The
    row holds ``key``, the content as ``html``, its md5, the fixed ``source``
    value 52 and ``flag``. The insert statement comes from
    ``html_archive.insert_sql(..., ignore=True)`` and is run with ``execute``.
    This function does not call ``commit`` itself.
    """
    connection = get_mysql_connection()

    if isinstance(cn, str):
        content = cn
    else:
        content = json.dumps(cn, ensure_ascii=False, sort_keys=True)

    row = {
        'key': key,
        'html': content,
        'md5': md5_string(content),
        'source': 52,
        'flag': flag,
    }
    statement, params = html_archive.insert_sql(
        'wln100_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
Beispiel #10
0
def save_html(js, info, request_info, flag=0):
    """Archive one JSON payload in ``17zuoye_spider_html_archive_table``.

    ``js`` is serialized with ``json.dumps(ensure_ascii=False)`` and stored
    as ``html`` together with its md5. The row key is
    ``'17zuoye_qs_' + js['_id']``, ``subject`` is read from
    ``info['subject']``, ``request_info`` is stored unchanged, ``source`` is
    the fixed value 53 and ``flag`` is passed through. The insert statement
    comes from ``html_archive.insert_sql(..., ignore=True)`` and is run with
    ``execute``. This function does not call ``commit`` itself.
    """
    connection = get_mysql_connection()

    serialized = json.dumps(js, ensure_ascii=False)
    row = {
        'key': '17zuoye_qs_{}'.format(js['_id']),
        'html': serialized,
        'md5': md5_string(serialized),
        'subject': info['subject'],
        'request_info': request_info,
        'source': 53,
        'flag': flag,
    }
    statement, params = html_archive.insert_sql(
        '17zuoye_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
Beispiel #11
0
def save_answer(js, info, request_info, testid, flag=0):
    """Archive one answer payload in ``wln100_spider_html_archive_table``.

    ``js`` is serialized with ``json.dumps(ensure_ascii=False)`` and stored
    as ``html`` together with its md5. The row key is
    ``'wln100_as_' + testid``, ``subject`` is read from
    ``info['aft_subid']``, ``request_info`` is stored unchanged, ``source``
    is the fixed value 52 and ``flag`` is passed through. After running the
    insert built by ``html_archive.insert_sql(..., ignore=True)`` the
    connection is committed.
    """
    connection = get_mysql_connection()

    serialized = json.dumps(js, ensure_ascii=False)
    row = {
        'key': 'wln100_as_{}'.format(testid),
        'html': serialized,
        'md5': md5_string(serialized),
        'subject': info['aft_subid'],
        'request_info': request_info,
        'source': 52,
        'flag': flag,
    }
    statement, params = html_archive.insert_sql(
        'wln100_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
    connection.commit()
Beispiel #12
0
def save_html(key, html_string, info, flag=0):
    """Archive one page in ``manfen5_zujuan_spider_html_archive_table``.

    ``info`` is serialized with ``json.dumps(ensure_ascii=False,
    sort_keys=True)``. The row stores ``key``, the HTML and its md5, the
    serialized ``info``, the fixed ``source`` value 80 and ``flag``. The
    insert statement comes from ``html_archive.insert_sql(..., ignore=True)``
    and is run with ``execute``. This function does not call ``commit``
    itself.
    """
    connection = get_mysql_connection()

    info_json = json.dumps(info, ensure_ascii=False, sort_keys=True)

    row = {
        'key': key,
        'html': html_string,
        'md5': md5_string(html_string),
        'info': info_json,
        'source': 80,
        'flag': flag,
    }
    statement, params = html_archive.insert_sql(
        'manfen5_zujuan_spider_html_archive_table', row, ignore=True)
    execute(connection, statement, values=params)
Beispiel #13
0
def convert_url_into_md5(url, base_url):
    """Resolve *url* against *base_url* and describe the result.

    Returns:
        A 4-tuple ``(url, absurl, md5, ext)``: the URL as given, the value of
        ``abs_url(url, base_url)``, ``md5_string`` of that value and
        ``get_ext`` of that value.
    """
    resolved = abs_url(url, base_url)
    return (url, resolved, md5_string(resolved), get_ext(resolved))