def is_archived(url):
    """Fetch the archived question row(s) whose ``spider_url`` equals *url*.

    A connection is obtained from ``get_mysql_connection()`` and ``question_id``
    is selected from ``question_db_offline.manfen5_zujuan_question_20161205``.
    The URL is passed to ``execute`` as a bound value rather than being
    interpolated into the SQL text.

    Returns:
        Whatever ``execute`` returns for the query, unmodified. Callers
        presumably treat a truthy result as "already archived" -- confirm
        against the call sites.
    """
    connection = get_mysql_connection()
    query = select_sql(
        'question_db_offline.manfen5_zujuan_question_20161205',
        ('question_id', ),
        condition='where `spider_url` = %s')
    return execute(connection, query, values=(url, ))
def get_answer_json(wln_qid):
    """Load the archived answer payload for a wln100 question id.

    Looks up the row of ``wln100_spider_html_archive_table`` whose ``key`` is
    ``wln100_as_<wln_qid>`` using the module-level ``mysql_conn`` and decodes
    its ``html`` column as JSON.

    Args:
        wln_qid: Question id used to build the archive key.

    Returns:
        The decoded JSON value, or ``False`` when no archive row exists
        (a warning is logged in that case).

    Raises:
        ValueError: if the stored ``html`` value is not valid JSON
            (propagated from ``json.loads``).
    """
    # NOTE(review): the key is interpolated straight into the SQL text; this
    # is only safe while wln_qid comes from trusted, quote-free ids.
    sql = select_sql(
        'wln100_spider_html_archive_table',
        ('html', ),
        condition='where `key` = "wln100_as_{}"'.format(wln_qid))
    row = execute(mysql_conn, sql)
    if not row:
        # logging.warn is a deprecated alias (removed in Python 3.13);
        # logging.warning is the supported spelling.
        logging.warning('[not answer]:{}'.format(wln_qid))
        return False
    return json.loads(row[0][0])
def is_rendered(table, spider_url):
    """Report whether the record for *spider_url* in *table* is rendered.

    Args:
        table: Name of the table to query; it must have ``spider_url`` and
            ``is_rendered`` columns.
        spider_url: URL identifying the record.

    Returns:
        ``False`` when no matching row exists or when the first matching
        row's ``is_rendered`` value equals 0; ``True`` otherwise.
    """
    mysql_conn = get_mysql_connection()
    # Bind the URL as a query value instead of formatting it into the SQL
    # text: URLs may contain quote characters that would otherwise break
    # (or inject into) the statement.
    sql = select_sql(table, ('is_rendered', ),
                     condition='where spider_url = %s')
    rows = execute(mysql_conn, sql, values=(spider_url, ))
    if not rows:
        # No record for this URL, so it cannot have been rendered yet.
        return False
    return rows[0][0] != 0
def test():
    """Manual smoke check for the dz101 parser.

    Reads up to ten rows (``key``, ``html``, ``subject``) from
    ``dz101_spider_html_archive_table`` through the module-level
    ``mysql_conn``, feeds each one to ``parser.parse`` and prints the parsed
    result as indented, non-ASCII-escaped JSON.
    """
    query = select_sql(
        'dz101_spider_html_archive_table',
        ('key', 'html', 'subject'),
        condition='where html_id > 0 limit 10')
    for url, html_string, subject in execute(mysql_conn, query):
        parsed = parser.parse(html_string, url, subject)
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
def main():
    """Record every archived wln100 question page, in batches of 1000.

    Walks ``wln100_spider_html_archive_table`` with keyset pagination on
    ``html_id``: each query fetches rows whose ``html_id`` is greater than
    the last one seen and whose ``key`` starts with ``wln100_qs``, and hands
    the batch to ``record_questions``. Stops at the first empty batch.
    """
    mysql_conn = get_mysql_connection()
    max_id = 0
    while True:
        # ORDER BY html_id is required for the pagination to be correct:
        # without it the server may return rows in any order, so
        # rows[-1][0] is not guaranteed to be the largest id of the batch
        # and rows could be skipped or fetched twice.
        condition = (
            'where html_id > {} and `key` like "wln100_qs%" '
            'order by html_id limit 1000'
        ).format(max_id)
        sql = select_sql(
            'wln100_spider_html_archive_table',
            ('html_id', 'html', 'key', 'subject'),
            condition=condition)
        rows = execute(mysql_conn, sql)
        if not rows:
            break
        record_questions(rows)
        # html_id is the first selected column; the last row holds the
        # highest id of this (ordered) batch.
        max_id = rows[-1][0]
def test():
    """Manual smoke check for the gzywtk parser on one archived page.

    Selects ``key`` and ``html`` for the page stored under the key
    ``http://www.gzywtk.com/tmshow/16650.html`` via the module-level
    ``mysql_conn``, runs ``parser.parse`` on each returned row and prints the
    parsed result as indented, non-ASCII-escaped JSON.
    """
    # Alternative condition for sampling an arbitrary row:
    #   'where html_id > 0 limit 1'
    query = select_sql(
        'gzywtk_spider_html_archive_table',
        ('key', 'html'),
        condition='where `key` = "http://www.gzywtk.com/tmshow/16650.html"')
    for url, html_string in execute(mysql_conn, query):
        parsed = parser.parse(html_string, url)
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
def test():
    """Manual smoke check for the vko parser on one archived record.

    Selects ``key`` and ``html`` for the record keyed ``vko_qs_970`` via the
    module-level ``mysql_conn``. The ``html`` column is decoded as JSON
    before being passed to ``parser.parse``; the parsed result is printed as
    indented, non-ASCII-escaped JSON.
    """
    # Alternative conditions used while debugging:
    #   'where html_id = 11496'
    #   'where html_id > 0 limit 10'
    query = select_sql(
        'vko_spider_html_archive_table',
        ('key', 'html'),
        condition='where `key` = "vko_qs_970"')
    for url, raw_json in execute(mysql_conn, query):
        payload = json.loads(raw_json)
        parsed = parser.parse(payload, url)
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
def test():
    """Manual smoke check for the manfen5_zujuan parser on one record.

    Selects ``key``, ``html`` and ``info`` for the record keyed
    ``manfen5_zujuan_qs_SYS201409011517434544660993`` via the module-level
    ``mysql_conn``. The ``info`` column is decoded as JSON and passed to
    ``parser.parse`` together with the HTML and the key; the parsed result
    is printed as indented, non-ASCII-escaped JSON.
    """
    # Alternative condition for sampling several rows:
    #   'where html_id > 0 limit 10'
    query = select_sql(
        'manfen5_zujuan_spider_html_archive_table',
        ('key', 'html', 'info'),
        condition='where `key` = "manfen5_zujuan_qs_SYS201409011517434544660993"')
    for url, html_string, raw_info in execute(mysql_conn, query):
        info = json.loads(raw_info)
        parsed = parser.parse(html_string, url, info)
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
def main():
    """Record every archived vko page, ten rows at a time.

    Pages through ``vko_spider_html_archive_table`` in ascending ``html_id``
    order, passing each batch of (``html_id``, ``html``, ``key``) rows to
    ``record_questions``. The loop ends at the first empty batch, after
    which ``# over`` is logged.
    """
    connection = get_mysql_connection()
    last_seen_id = 0
    while True:
        condition = 'where html_id > {} order by html_id limit 10'.format(
            last_seen_id)
        query = select_sql(
            'vko_spider_html_archive_table',
            ('html_id', 'html', 'key'),
            condition=condition)
        batch = execute(connection, query)
        if not batch:
            break
        record_questions(batch)
        # html_id is the first selected column; resume after the last row.
        last_seen_id = batch[-1][0]
    logging.info('# over')
def main():
    """Record every archived 17zuoye page, 1000 rows at a time.

    Pages through ``17zuoye_spider_html_archive_table`` in ascending
    ``html_id`` order, passing each batch of (``html_id``, ``html``, ``key``,
    ``subject``) rows to ``record_questions``. The loop ends at the first
    empty batch.
    """
    connection = get_mysql_connection()
    last_seen_id = 0
    while True:
        # To debug a single record, swap the condition for:
        #   'where `key` = "17zuoye_qs_Q_20300538822231"'
        condition = 'where html_id > {} order by html_id limit 1000'.format(
            last_seen_id)
        query = select_sql(
            '17zuoye_spider_html_archive_table',
            ('html_id', 'html', 'key', 'subject'),
            condition=condition)
        batch = execute(connection, query)
        if not batch:
            break
        record_questions(batch)
        # html_id is the first selected column; resume after the last row.
        last_seen_id = batch[-1][0]
def main():
    """Resume recording archived wln100 question pages, 100 rows at a time.

    Walks ``wln100_spider_html_archive_table`` with keyset pagination on
    ``html_id``, starting after id 28139703, restricted to keys beginning
    with ``wln100_qs``. Each batch is handed to ``record_questions``; a
    failing batch is reported with ``print`` and then skipped so that one
    bad batch does not stop the whole run. Stops at the first empty batch.
    """
    mysql_conn = get_mysql_connection()
    # Resume point: the first row processed is html_id 28139704.
    max_id = 28139703
    while True:
        # ORDER BY html_id is required for the pagination to be correct:
        # without it the server may return rows in any order, so
        # rows[-1][0] is not guaranteed to be the largest id of the batch
        # and rows could be skipped or fetched twice.
        condition = (
            'where html_id > {} and `key` like "wln100_qs%" '
            'order by html_id limit 100'
        ).format(max_id)
        sql = select_sql(
            'wln100_spider_html_archive_table',
            ('html_id', 'html', 'key', 'subject'),
            condition=condition)
        rows = execute(mysql_conn, sql)
        if not rows:
            break
        try:
            record_questions(rows)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on.
            print(e)
        # Advance the cursor even after a failure, otherwise the same
        # batch would be retried forever.
        max_id = rows[-1][0]
async def run(args):
    """Page through ``args.table`` and schedule rendering of each batch.

    Sets the module-level ``mysql`` and ``mysql_conn`` from ``args.db`` and
    ``args.config_file``, then repeatedly selects ``COLS_HEADERS`` plus the
    first element of every entry in ``args.cols``. ``args.condition`` is a
    format string that receives the last seen id; the id is taken from the
    first selected column of the last row of each batch.

    For every non-empty batch a token is put on ``ctrl_queue`` (which blocks
    while the queue already holds ``args.ctrl_queue_size`` tokens) and a
    ``render_questions`` task is scheduled. When ``args.test`` is truthy only
    the first batch is scheduled. Afterwards the coroutine polls once a
    minute until the queue is empty.

    NOTE(review): this relies on ``render_questions`` removing one token
    from ``ctrl_queue`` per finished batch -- confirm; if a task dies without
    doing so the final wait loop will not terminate.
    """
    global mysql
    global mysql_conn
    mysql = CommonMysql(args.db, config_file=args.config_file)
    mysql_conn = mysql.connection()

    ctrl_queue = asyncio.Queue(maxsize=args.ctrl_queue_size)
    # The event loop only keeps weak references to tasks, so a task that is
    # not referenced anywhere else may be garbage-collected before it
    # finishes. Hold each task here until it is done.
    pending_tasks = set()

    max_id = 0
    while True:
        sql = select_sql(args.table,
                         COLS_HEADERS + [cs[0] for cs in args.cols],
                         condition=args.condition.format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break

        # Blocks while the queue is full, before the batch is scheduled.
        await ctrl_queue.put(None)
        task = asyncio.ensure_future(
            render_questions(args.table, rows, ctrl_queue, args))
        pending_tasks.add(task)
        task.add_done_callback(pending_tasks.discard)

        if args.test:
            break

        max_id = rows[-1][0]
        logger.info('{} [max_id]:{}'.format(args.table, max_id))

    # Wait for the scheduled batches to drain the queue.
    while True:
        logger.info('[ctrl_queue.qsize]: {}'.format(ctrl_queue.qsize()))
        if ctrl_queue.qsize() == 0:
            break
        await asyncio.sleep(1 * 60)

    logger.info('# over')