def record_questions(rows):
    """Parse archived manfen5 zujuan pages from `rows` and insert the
    resulting question records, committing after each successful insert.

    Each row is (html_id, html, spider_url, info-json); parse failures are
    logged and skipped.
    """
    conn = get_mysql_connection()
    for row in rows:
        html_string, spider_url = row[1], row[2]
        info = json.loads(row[3])
        logging.info(spider_url)
        try:
            cols = parser.parse(html_string, spider_url, info)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue
        if not cols:
            # parser produced nothing usable for this page
            continue
        sql, vals = insert_sql(
            'question_db_offline.manfen5_zujuan_question_20161205',
            cols, ignore=True)
        execute(conn, sql, values=vals)
        conn.commit()
def save_question(cols):
    """Insert one parsed question record into the offline wln100 table.

    Fix: the `insert_sql` call had been commented out, leaving `sql` and
    `vals` undefined, so every call raised NameError at `execute`.
    """
    mysql_conn = get_mysql_connection()
    sql, vals = insert_sql('question_db_offline.wln100_question_20170919',
                           cols, ignore=True)
    execute(mysql_conn, sql, values=vals)
def update_db(table, cols, spider_url):
    """Update `cols` on the row of `table` whose spider_url matches.

    Returns True when the update commits, False on any database error.
    """
    conn = get_mysql_connection()
    sql, vals = update_sql(table, cols, where='where spider_url = %s')
    vals.append(spider_url)
    try:
        execute(conn, sql, values=vals)
        conn.commit()
    except Exception:
        return False
    return True
def test():
    """Smoke-test parser.parse on one archived wln100 question/answer pair.

    Uses the module-level mysql_conn.
    """
    sql = 'select * from wln100_spider_html_archive_table where `key` = "wln100_qs_76285"'
    row = execute(mysql_conn, sql)
    qs_json = json.loads(row[0][3])
    print(qs_json)
    sql = 'select * from wln100_spider_html_archive_table where `key` = "wln100_as_76285"'
    as_json = json.loads(execute(mysql_conn, sql)[0][3])
    cols = parser.parse('url', qs_json, as_json, row[0][2])
    print(json.dumps(cols, indent=4, ensure_ascii=False))
def save_html(key, html_string, flag=0):
    """Archive one gzywtk page (source 68) under `key`, ignoring duplicates."""
    conn = get_mysql_connection()
    record = {
        'key': key,
        'html': html_string,
        'md5': md5_string(html_string),
        'source': 68,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql('gzywtk_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
def save_html(key, js, flag=0):
    """Serialize `js` deterministically (sorted keys) and archive it for vko
    (source 74), ignoring duplicates."""
    conn = get_mysql_connection()
    payload = json.dumps(js, ensure_ascii=False, sort_keys=True)
    record = {
        'key': key,
        'html': payload,
        'md5': md5_string(payload),
        'source': 74,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql('vko_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
def save_html(url, html_string, subj_id, info, flag=0):
    """Archive one dz101 page (source 56) keyed by URL, storing the subject
    id and a JSON-encoded `info` blob alongside it."""
    conn = get_mysql_connection()
    info_json = json.dumps(info, ensure_ascii=False)
    record = {
        'key': url,
        'html': html_string,
        'md5': md5_string(html_string),
        'subject': subj_id,
        'source': 56,
        'flag': flag,
        'info': info_json,
    }
    sql, vals = html_archive.insert_sql('dz101_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
def save_html(key, cn, flag=0):
    """Archive wln100 content (source 52); non-string payloads are first
    JSON-encoded with sorted keys so the stored md5 is deterministic."""
    conn = get_mysql_connection()
    payload = cn if isinstance(cn, str) else json.dumps(
        cn, ensure_ascii=False, sort_keys=True)
    record = {
        'key': key,
        'html': payload,
        'md5': md5_string(payload),
        'source': 52,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql('wln100_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
def is_archived(url):
    """Return the question_id rows recorded for `url` (truthy when the
    question has already been inserted; empty/falsy otherwise)."""
    conn = get_mysql_connection()
    sql = select_sql('question_db_offline.manfen5_zujuan_question_20161205',
                     ('question_id', ),
                     condition='where `spider_url` = %s')
    return execute(conn, sql, values=(url, ))
def save_answer(js, info, request_info, testid, flag=0):
    """Archive one wln100 answer payload (source 52) keyed by test id and
    commit immediately."""
    conn = get_mysql_connection()
    payload = json.dumps(js, ensure_ascii=False)
    record = {
        'key': 'wln100_as_{}'.format(testid),
        'html': payload,
        'md5': md5_string(payload),
        'subject': info['aft_subid'],
        'request_info': request_info,
        'source': 52,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql('wln100_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
    conn.commit()
def is_archived(testid):
    """Return the html_id rows for an unprocessed (flag = 0) archived
    17zuoye question (truthy when present)."""
    conn = get_mysql_connection()
    cmd = 'select html_id from 17zuoye_spider_html_archive_table where `key` = %s and flag = 0'
    key = '17zuoye_qs_{}'.format(testid)
    return execute(conn, cmd, values=(key, ))
def save_html(js, info, request_info, flag=0):
    """Archive one 17zuoye question payload (source 53) keyed by its _id."""
    conn = get_mysql_connection()
    payload = json.dumps(js, ensure_ascii=False)
    record = {
        'key': '17zuoye_qs_{}'.format(js['_id']),
        'html': payload,
        'md5': md5_string(payload),
        'subject': info['subject'],
        'request_info': request_info,
        'source': 53,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql('17zuoye_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
def save_html(key, html_string, info, flag=0):
    """Archive one manfen5 zujuan page (source 80) with its JSON-encoded
    `info` blob (sorted keys for determinism)."""
    conn = get_mysql_connection()
    info_json = json.dumps(info, ensure_ascii=False, sort_keys=True)
    record = {
        'key': key,
        'html': html_string,
        'md5': md5_string(html_string),
        'info': info_json,
        'source': 80,
        'flag': flag,
    }
    sql, vals = html_archive.insert_sql(
        'manfen5_zujuan_spider_html_archive_table', record, ignore=True)
    execute(conn, sql, values=vals)
def is_archived(url):
    """True when an unprocessed (flag = 0) dz101 archive row exists for `url`."""
    conn = get_mysql_connection()
    cmd = 'select html_id from dz101_spider_html_archive_table where `key` = %s and flag = 0'
    return bool(execute(conn, cmd, values=(url, )))
def get_answer_json(wln_qid):
    """Fetch and decode the archived wln100 answer JSON for a question id.

    Uses the module-level mysql_conn. Returns the decoded JSON, or False
    when no answer row is archived (a warning is logged).
    """
    sql = select_sql('wln100_spider_html_archive_table', ('html', ),
                     condition='where `key` = "wln100_as_{}"'.format(wln_qid))
    row = execute(mysql_conn, sql)
    if not row:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling.
        logging.warning('[not answer]:{}'.format(wln_qid))
        return False
    return json.loads(row[0][0])
def is_rendered(table, spider_url):
    """True when the row of `table` matching spider_url has a non-zero
    is_rendered flag.

    Raises IndexError if no row matches (unchanged from the original
    behavior). Fix: the URL is now passed as a bound parameter instead of
    being string-formatted into the SQL, avoiding quoting/injection
    problems and matching the parameterized style of this module's other
    queries.
    """
    conn = get_mysql_connection()
    sql = select_sql(table, ('is_rendered', ),
                     condition='where spider_url = %s')
    rows = execute(conn, sql, values=(spider_url, ))
    return rows[0][0] != 0
def is_as_archived(qid):
    """True when an unprocessed (flag = 0) wln100 answer is archived for qid."""
    conn = get_mysql_connection()
    cmd = 'select html_id from wln100_spider_html_archive_table where `key` = %s and flag = 0'
    return bool(execute(conn, cmd, values=('wln100_as_{}'.format(qid), )))
def record_questions(rows):
    """Parse archived 17zuoye payloads from `rows` and insert the resulting
    question records; parse failures are logged and skipped.

    Each row is (html_id, json-payload, spider_url, aft_subj_id).
    """
    conn = get_mysql_connection()
    for row in rows:
        js = json.loads(row[1])
        spider_url, aft_subj_id = row[2], row[3]
        try:
            cols = parser.parse(spider_url, js, aft_subj_id)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue
        sql, vals = insert_sql('question_db_offline.17zuoye_question_20160719',
                               cols, ignore=True)
        execute(conn, sql, values=vals)
def record_questions(rows):
    """Parse archived gzywtk pages from `rows` and insert the resulting
    question records; parse failures or empty parses are skipped.

    Each row is (html_id, html, spider_url).
    """
    conn = get_mysql_connection()
    for row in rows:
        html_string, spider_url = row[1], row[2]
        try:
            cols = parser.parse(html_string, spider_url)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue
        if not cols:
            continue
        sql, vals = insert_sql('question_db_offline.gzywtk_question_20161109',
                               cols, ignore=True)
        execute(conn, sql, values=vals)
def main():
    """Walk all wln100_qs archive rows in html_id batches of 1000, feeding
    each batch to record_questions until the table is exhausted."""
    conn = get_mysql_connection()
    max_id = 0
    while True:
        condition = ('where html_id > {} and `key` like "wln100_qs%" '
                     'limit 1000').format(max_id)
        sql = select_sql('wln100_spider_html_archive_table',
                         ('html_id', 'html', 'key', 'subject'),
                         condition=condition)
        rows = execute(conn, sql)
        if not rows:
            break
        record_questions(rows)
        # advance the cursor past the last id seen in this batch
        max_id = rows[-1][0]
def test():
    """Parse one archived gzywtk page and dump the parsed columns.

    Uses the module-level mysql_conn.
    """
    condition = 'where `key` = "http://www.gzywtk.com/tmshow/16650.html"'
    sql = select_sql('gzywtk_spider_html_archive_table', ('key', 'html'),
                     condition=condition)
    for url, html_string in execute(mysql_conn, sql):
        cols = parser.parse(html_string, url)
        print(json.dumps(cols, indent=4, ensure_ascii=False))
def test():
    """Parse ten archived dz101 pages and dump the parsed columns.

    Uses the module-level mysql_conn.
    """
    sql = select_sql('dz101_spider_html_archive_table',
                     ('key', 'html', 'subject'),
                     condition='where html_id > 0 limit 10')
    for url, html_string, subject in execute(mysql_conn, sql):
        cols = parser.parse(html_string, url, subject)
        print(json.dumps(cols, indent=4, ensure_ascii=False))
def record_questions(rows):
    """Parse archived vko payloads from `rows` and insert the resulting
    question records; failures are logged and skipped.

    Each row is (html_id, json-payload, spider_url). Fix: the row
    unpacking used to live inside the try block, so when json.loads(row[1])
    raised, the except handler referenced `spider_url` before assignment —
    a NameError on the first row, or a stale URL from the previous
    iteration. The unpacking is now hoisted above the try.
    """
    conn = get_mysql_connection()
    for row in rows:
        spider_url = row[2]
        try:
            js = json.loads(row[1])
            cols = parser.parse(js, spider_url)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue
        if not cols:
            continue
        logging.info(spider_url)
        sql, vals = insert_sql('question_db_offline.vko_question_20161116',
                               cols, ignore=True)
        execute(conn, sql, values=vals)
def test():
    """Parse one archived vko payload and dump the parsed columns.

    Uses the module-level mysql_conn.
    """
    sql = select_sql('vko_spider_html_archive_table', ('key', 'html'),
                     condition='where `key` = "vko_qs_970"')
    for url, raw in execute(mysql_conn, sql):
        cols = parser.parse(json.loads(raw), url)
        print(json.dumps(cols, indent=4, ensure_ascii=False))
def test():
    """Parse one archived manfen5 zujuan page and dump the parsed columns.

    Uses the module-level mysql_conn.
    """
    condition = 'where `key` = "manfen5_zujuan_qs_SYS201409011517434544660993"'
    sql = select_sql('manfen5_zujuan_spider_html_archive_table',
                     ('key', 'html', 'info'),
                     condition=condition)
    for url, html_string, raw_info in execute(mysql_conn, sql):
        cols = parser.parse(html_string, url, json.loads(raw_info))
        print(json.dumps(cols, indent=4, ensure_ascii=False))
def main():
    """Walk the whole 17zuoye archive in html_id batches of 1000, feeding
    each batch to record_questions until the table is exhausted."""
    conn = get_mysql_connection()
    max_id = 0
    while True:
        condition = 'where html_id > {} order by html_id limit 1000'.format(
            max_id)
        sql = select_sql('17zuoye_spider_html_archive_table',
                         ('html_id', 'html', 'key', 'subject'),
                         condition=condition)
        rows = execute(conn, sql)
        if not rows:
            break
        record_questions(rows)
        # advance the cursor past the last id seen in this batch
        max_id = rows[-1][0]
def main():
    """Walk the whole vko archive in html_id batches of 10, feeding each
    batch to record_questions until the table is exhausted."""
    conn = get_mysql_connection()
    max_id = 0
    while True:
        condition = 'where html_id > {} order by html_id limit 10'.format(
            max_id)
        sql = select_sql('vko_spider_html_archive_table',
                         ('html_id', 'html', 'key'),
                         condition=condition)
        rows = execute(conn, sql)
        if not rows:
            break
        record_questions(rows)
        # advance the cursor past the last id seen in this batch
        max_id = rows[-1][0]
    logging.info('# over')
def main():
    """Walk wln100_qs archive rows in html_id batches of 100, resuming past
    a fixed id.

    Batches are best-effort: a failure inside record_questions is logged
    and the scan continues with the next batch. Fixes: the dead trailing
    `pass` is removed, and the batch failure is reported via logging.error
    instead of print, consistent with the rest of this module.
    """
    conn = get_mysql_connection()
    max_id = 28139703  # resume point: rows with html_id <= this are skipped
    while True:
        condition = ('where html_id > {} and `key` like "wln100_qs%" '
                     'limit 100').format(max_id)
        sql = select_sql('wln100_spider_html_archive_table',
                         ('html_id', 'html', 'key', 'subject'),
                         condition=condition)
        rows = execute(conn, sql)
        if not rows:
            break
        try:
            record_questions(rows)
        except Exception as e:
            # best-effort: skip the failing batch and keep scanning
            logging.error('[record_questions] {}'.format(e))
        max_id = rows[-1][0]
async def run(args):
    """Drive rendering over all rows of args.table in id-ordered batches.

    Pages through the table using args.condition (expected to contain one
    '{}' slot that receives the highest html_id seen so far), schedules a
    render_questions task per batch, and bounds the number of in-flight
    batches with a control queue.  Blocks until the queue drains, then
    returns.  Sets the module-level `mysql` / `mysql_conn` globals.
    """
    global mysql
    global mysql_conn
    mysql = CommonMysql(args.db, config_file=args.config_file)
    mysql_conn = mysql.connection()
    # Bounded queue: put() below blocks once ctrl_queue_size batches are
    # in flight, throttling the scheduling loop.
    ctrl_queue = asyncio.queues.Queue(maxsize=args.ctrl_queue_size)
    max_id = 0
    while True:
        sql = select_sql(args.table,
                         COLS_HEADERS + [cs[0] for cs in args.cols],
                         condition=args.condition.format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break
        # Reserve a slot before scheduling; presumably render_questions
        # removes an item from the queue when its batch completes --
        # TODO confirm against render_questions.
        await ctrl_queue.put(None)
        asyncio.ensure_future(
            render_questions(args.table, rows, ctrl_queue, args))
        if args.test:
            # test mode: process a single batch only
            break
        max_id = rows[-1][0]
        logger.info('{} [max_id]:{}'.format(args.table, max_id))
    while True:
        # Wait for all scheduled batches to finish, polling once a minute.
        logger.info('[ctrl_queue.qsize]: {}'.format(ctrl_queue.qsize()))
        if ctrl_queue.qsize() != 0:
            await asyncio.sleep(1 * 60)
        else:
            # over
            break
    logger.info('# over')
def bewitch(self, html_string, spider_url, spider_source, download=None,
            redownload=False, archive_image=None, headers=None, proxy=None,
            exclude_md5s=None, img_ext=None, priority=None):
    """Rewrite image URLs inside html_string to their OSS form and
    optionally queue the originals for download.

    Per-call arguments (download, archive_image, proxy, img_ext, priority)
    default to the corresponding instance attributes when not given.
    Images whose md5 is in self.exclude_md5s are skipped; when
    `redownload` is true, any existing row for the md5 is deleted from
    self.IMAGE_TABLE first so the image is fetched again.

    Returns the rewritten html_string.
    """
    # Fall back to instance-level defaults; the tuple-index form keeps
    # an explicit False distinct from "not provided" (None).
    download = (download, self.download)[download is None]
    archive_image = (archive_image, self.archive_image)[archive_image is None]
    proxy = (proxy or self.proxy)
    img_ext = (img_ext or self.img_ext)
    priority = (priority or self.priority)
    self._check_exclude_md5(exclude_md5s)
    # Rewrite <img> sources to OSS URIs; img_infos lists the originals.
    html_string, img_infos = ImageMagic.img_ossify(html_string,
                                                   spider_source,
                                                   uri2oss=self.uri2oss)
    for (ori_url, absurl, md5, ext) in img_infos:
        # if isinstance(img_ext, compat_str):
        #     image_filename = md5 + img_ext
        # elif img_ext is True:
        #     image_filename = md5 + ext
        # elif img_ext is None:
        #     image_filename = md5
        # else:
        #     image_filename = md5
        # SVGs keep their extension; everything else gets the default one.
        if ext.lower() == '.svg':
            image_filename = md5 + '.svg'
        else:
            image_filename = md5 + DEFAULT_IMG_EXT
        # not save to db
        # images which are in image_archive must been downloaded
        # successfully
        # if archive_image:
        #     ImageMagic.archive_imgs(absurl, spider_source,
        #                             spider_url=spider_url,
        #                             table=self.IMAGE_TABLE,
        #                             md5=md5,
        #                             image_filename=image_filename,
        #                             mysql=self.mysql,
        #                             ignore=True,
        #                             config_file=self.config_file)
        if md5 in self.exclude_md5s:
            continue
        if redownload:
            # Lazily open the MySQL connection the first time it is needed.
            if not self.mysql_conn:
                self.mysql_conn = self.mysql.connection()
            sql = 'delete from {} where `md5` = %s'.format(
                self.IMAGE_TABLE)
            execute(self.mysql_conn, sql, values=(md5, ))
        # send to image_downloader to download
        if download:
            ImageMagic.download_image(
                absurl, spider_source,
                image_filename=image_filename,
                spider_url=spider_url,
                headers=headers,
                proxy=proxy,
                priority=priority,
                queue=self.image_downloader_item_queue,
                config_file=self.config_file)
    return html_string