def save_base64_img_to_local(html_string, local_dir, source):
    """Persist every base64-embedded <img> in *html_string* to disk and
    rewrite each tag's src to point at its OSS URL.

    Parameters:
        html_string: HTML text possibly containing base64 data-URI images.
        local_dir:   root directory under which images are written.
        source:      source id; used as the sub-directory name and passed
                     to uri2oss.convert.

    Returns:
        The HTML with each matched src replaced (first occurrence only)
        by a src-base64="<oss url>" attribute.

    Raises:
        NotFindSrc: when an <img> tag has no recognizable src.
    """
    source = str(source)
    base64_imgs = find_base64_imgs(html_string)
    # All images of one source land in the same directory, so create it
    # once up front instead of re-checking inside the loop.  The try/except
    # guards the check-then-create race against concurrent workers.
    target_dir = os.path.join(local_dir, source)
    if base64_imgs and not os.path.isdir(target_dir):
        try:
            os.makedirs(target_dir)
        except OSError:
            # Another process may have created it between the check and
            # makedirs; only re-raise if the directory still is missing.
            if not os.path.isdir(target_dir):
                raise
    for img in base64_imgs:
        src, url = get_src(img)
        if src is None:
            raise NotFindSrc(img)
        encode_base64 = get_encode_base64_str(url)
        decode_base64 = compat_base64.b64decode(encode_base64.encode('utf-8'))
        # Content-addressed filename: md5 of the base64 payload text.
        md5_name = md5_string(encode_base64) + DEFAULT_IMG_EXT
        img_path = os.path.join(target_dir, md5_name)
        oss_url = uri2oss.convert(md5_name, source)
        with open(img_path, 'wb') as fd:
            fd.write(decode_base64)
        html_string = html_string.replace(
            src, 'src-base64="{}"'.format(oss_url), 1)
    return html_string
def archive_imgs(absurl, spider_source, spider_url='', table=None, md5=None,
                 image_filename=None, mysql=None, ignore=True,
                 config_file=None):
    """Record an image URL in the archive table.

    Defaults are derived lazily: table falls back to ImageMagic.IMAGE_TABLE,
    md5 to md5_string(absurl), image_filename to md5 + DEFAULT_IMG_EXT, and
    mysql to a fresh CommonMysql connection.  The insert is serialized by
    the module-level lock so only one connection writes at a time.
    """
    if table is None:
        table = ImageMagic.IMAGE_TABLE
    if md5 is None:
        md5 = md5_string(absurl)
    if image_filename is None:
        image_filename = md5 + DEFAULT_IMG_EXT
    if mysql is None:
        mysql = CommonMysql('question_db_offline', config_file=config_file)
    row = {
        'url': absurl,
        'source': spider_source,
        'spider_url': spider_url,
        'md5': md5,
        'image_filename': image_filename,
    }
    _acquireLock()
    try:
        # make sure one connect to be running at one same time
        mysql.insert(table, row, ignore=ignore)
    finally:
        _releaseLock()
def handle_mathml(html_string, uri2oss, url):
    """Convert every <math> (MathML) element in *html_string* into an <img>
    pointing at a rendered PNG of the equivalent LaTeX.

    Parameters:
        html_string: HTML text containing zero or more <math> elements.
        uri2oss:     converter exposing .convert(filename, source_id) -> OSS URL.
        url:         page URL, used only for log context.

    Returns:
        The rewritten HTML, or False when any latex->png render failed.
    """
    img_dir = 'working/latex_imgs/'
    mathmls = get_html_element('<math', html_string)
    latexes = [fix_latex(lt) for lt in to_latexes(mathmls)]
    png_paths = [img_dir + md5_string(latex) + '.png' for latex in latexes]
    png_results = to_pngs(latexes, png_paths, check=False)
    for latex, mathml, png_path, png_result in zip(
            latexes, mathmls, png_paths, png_results):
        if png_result is False:
            # Render failure: abort instead of emitting a broken image.
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning('latex2png:{} {}'.format(url, latex))
            return False
        w, h = get_image_size(png_path)
        # Embed the original LaTeX (base64) so it stays recoverable.
        latex_base64 = compat_base64.b64encode(latex.encode('utf-8')).decode()
        span = '<span data-latex="base64,{}">'.format(latex_base64)
        md5_name = os.path.basename(png_path)
        oss_img_url = uri2oss.convert(md5_name, 56)
        # BUG FIX: the attribute was misspelled "heigh", so browsers
        # silently ignored the intended height.
        img = span + ('<img src="{}" width="{}" height="{}" '
                      'class="afanti_latex"></span>'.format(
                          oss_img_url, w // 2 + 2, h // 2 + 2))
        html_string = html_string.replace(mathml, img)
    return html_string
def save_html(key, html_string, flag=0):
    """Archive *html_string* under *key* in the gzywtk table (source 68)."""
    conn = get_mysql_connection()
    record = {
        'key': key,
        'html': html_string,
        'md5': md5_string(html_string),
        'source': 68,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql(
        'gzywtk_spider_html_archive_table', record, ignore=True)
    execute(conn, sql, values=vals)
def login(username, password):
    """Log in to dz101.com and return the session cookies.

    Performs the site's three-step login flow: fetch the login page to
    establish a session, hit the IsUser endpoint with the username and the
    md5 of the password, then call get_session with the assembled cookies.

    Returns the cookie dict (PHPSESSID, MyName, Automatic_login).
    """
    url = 'http://www.dz101.com/common/Login'
    session = requests.Session()
    # Initial GET establishes PHPSESSID on the session.
    resp = session.get(url, headers=common_headers)
    # The password is transmitted as its md5 hex digest in the query string.
    url = 'http://www.dz101.com/common/IsUser?Step=IsLogin&IsMobile={}&PasswordA={}&PasswordB=undefined&IsVerify=undefined&appstr=Teacher&province=undefined&city=undefined&county=undefined&unit_id=undefined&my_school=undefined&verify_token=undefined'.format(
        username, md5_string(password))
    resp = session.get(url, headers=common_headers)
    cookies = {
        'PHPSESSID': session.cookies.get('PHPSESSID'),
        'MyName': username,
        # NOTE(review): this format string contains no {} placeholders, so
        # the .format(...) arguments are discarded and the cookie value is
        # the literal '******'.  Looks like a redacted template -- confirm
        # the intended format before relying on Automatic_login.
        'Automatic_login': '******'.format(username, md5_string(password))
    }
    url = 'http://www.dz101.com/common/get_session'
    resp = session.get(url, headers=common_headers, cookies=cookies)
    return cookies
def save_html(key, js, flag=0):
    """Serialize *js* to JSON and archive it under *key* (vko, source 74)."""
    conn = get_mysql_connection()
    html_string = json.dumps(js, ensure_ascii=False, sort_keys=True)
    sql, vals = html_archive.insert_sql(
        'vko_spider_html_archive_table',
        {
            'key': key,
            'html': html_string,
            'md5': md5_string(html_string),
            'source': 74,
            'flag': flag,
        },
        ignore=True)
    execute(conn, sql, values=vals)
def extract_image_info_from_base64_image(src):
    """Extract the binary image content from an <img src="data:image...">
    data-URI value.

    Parameters:
        src: the src attribute value, e.g. "data:image/png;base64,...".

    Returns:
        dict with keys 'url' (original src), 'md5' (md5 of the base64
        payload text, not the decoded bytes), 'img_content' (decoded
        bytes) and 'ext' (e.g. '.png', '' if no type captured).

    Raises:
        ValueError: when *src* does not match the data-URI pattern.
            (Replaces the former bare Exception; ValueError is an
            Exception subclass, so existing broad handlers still catch it.)
    """
    match = _re_img_base64.match(src)
    if not match:
        raise ValueError('cannot extract image from src')
    base64_string = match.group('base64_string')
    img_type = match.group('img_type')
    ext = '.' + img_type if img_type else ''
    return {
        'url': src,
        'md5': md5_string(base64_string),
        'img_content': base64.b64decode(base64_string),
        'ext': ext,
    }
def save_html(url, html_string, subj_id, info, flag=0):
    """Archive a dz101 page (source 56) keyed by its URL.

    *info* is JSON-serialized before storage.
    """
    conn = get_mysql_connection()
    record = {
        'key': url,
        'html': html_string,
        'md5': md5_string(html_string),
        'subject': subj_id,
        'source': 56,
        'flag': flag,
        'info': json.dumps(info, ensure_ascii=False),
    }
    sql, vals = html_archive.insert_sql(
        'dz101_spider_html_archive_table', record, ignore=True)
    execute(conn, sql, values=vals)
def save_html(key, cn, flag=0):
    """Archive *cn* under *key* in the wln100 table (source 52).

    Non-string *cn* is JSON-serialized (sorted keys) before storage.
    """
    conn = get_mysql_connection()
    payload = cn if isinstance(cn, str) else json.dumps(
        cn, ensure_ascii=False, sort_keys=True)
    sql, vals = html_archive.insert_sql(
        'wln100_spider_html_archive_table',
        {
            'key': key,
            'html': payload,
            'md5': md5_string(payload),
            'source': 52,
            'flag': flag,
        },
        ignore=True)
    execute(conn, sql, values=vals)
def save_html(js, info, request_info, flag=0):
    """Archive a 17zuoye question document (source 53).

    The row key is derived from the document's _id field.
    """
    conn = get_mysql_connection()
    serialized = json.dumps(js, ensure_ascii=False)
    row_key = '17zuoye_qs_{}'.format(js['_id'])
    sql, vals = html_archive.insert_sql(
        '17zuoye_spider_html_archive_table',
        {
            'key': row_key,
            'html': serialized,
            'md5': md5_string(serialized),
            'subject': info['subject'],
            'request_info': request_info,
            'source': 53,
            'flag': flag,
        },
        ignore=True)
    execute(conn, sql, values=vals)
def save_answer(js, info, request_info, testid, flag=0):
    """Archive a wln100 answer document (source 52) keyed by *testid*.

    Unlike the page-save helpers, this commits the connection explicitly.
    """
    conn = get_mysql_connection()
    serialized = json.dumps(js, ensure_ascii=False)
    row_key = 'wln100_as_{}'.format(testid)
    sql, vals = html_archive.insert_sql(
        'wln100_spider_html_archive_table',
        {
            'key': row_key,
            'html': serialized,
            'md5': md5_string(serialized),
            'subject': info['aft_subid'],
            'request_info': request_info,
            'source': 52,
            'flag': flag,
        },
        ignore=True)
    execute(conn, sql, values=vals)
    conn.commit()
def save_html(key, html_string, info, flag=0):
    """Archive a manfen5 zujuan page (source 80) under *key*.

    *info* is JSON-serialized (sorted keys) before storage.
    """
    conn = get_mysql_connection()
    record = {
        'key': key,
        'html': html_string,
        'md5': md5_string(html_string),
        'info': json.dumps(info, ensure_ascii=False, sort_keys=True),
        'source': 80,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql(
        'manfen5_zujuan_spider_html_archive_table', record, ignore=True)
    execute(conn, sql, values=vals)
def convert_url_into_md5(url, base_url):
    """Resolve *url* against *base_url* and describe the result.

    Returns a 4-tuple: (original url, absolute url, md5 of the absolute
    url, file extension of the absolute url).
    """
    absolute = abs_url(url, base_url)
    return (url, absolute, md5_string(absolute), get_ext(absolute))