Esempio n. 1
0
def is_archived(url):
    """Query the archived-question table for rows whose ``spider_url`` is *url*.

    Args:
        url: Value bound to the ``%s`` placeholder of the ``spider_url`` filter.

    Returns:
        Whatever ``execute`` returns for the ``question_id`` query
        (presumably falsy when nothing matches -- confirm against ``execute``).
    """
    connection = get_mysql_connection()

    table = 'question_db_offline.manfen5_zujuan_question_20161205'
    columns = ('question_id', )
    query = select_sql(table, columns, condition='where `spider_url` = %s')

    return execute(connection, query, values=(url, ))
Esempio n. 2
0
def get_answer_json(wln_qid):
    """Load the archived answer JSON stored for a wln100 question.

    Args:
        wln_qid: Question id; the archive row is looked up under the key
            ``wln100_as_<wln_qid>``.

    Returns:
        The decoded JSON from the ``html`` column of the first matching row,
        or ``False`` (after logging a warning) when no row matches.
    """
    # Bind the key as a query parameter rather than formatting it into the
    # SQL text, so quoting inside the id can neither break nor alter the query.
    sql = select_sql('wln100_spider_html_archive_table', ('html', ),
                     condition='where `key` = %s')
    row = execute(mysql_conn, sql, values=('wln100_as_{}'.format(wln_qid), ))
    if not row:
        # logging.warn is a deprecated alias that newer Python versions drop;
        # logging.warning emits the same message.
        logging.warning('[not answer]:%s', wln_qid)
        return False
    return json.loads(row[0][0])
Esempio n. 3
0
def is_rendered(table, spider_url):
    """Report whether the row for *spider_url* in *table* is flagged as rendered.

    Args:
        table: Name of the table to query; it is expected to expose the
            ``spider_url`` and ``is_rendered`` columns used below.
        spider_url: URL identifying the row to check.

    Returns:
        ``False`` when no row matches or when the first matching row's
        ``is_rendered`` value equals 0; ``True`` otherwise.
    """
    mysql_conn = get_mysql_connection()

    # Bind the URL as a query parameter: URLs may contain quote characters,
    # which would break (or alter) a query built by string formatting.
    sql = select_sql(table, ('is_rendered', ),
                     condition='where spider_url = %s')
    rows = execute(mysql_conn, sql, values=(spider_url, ))

    if not rows:
        # No matching row: treat as "not rendered" instead of failing on
        # rows[0] with an IndexError/TypeError.
        return False
    return rows[0][0] != 0
Esempio n. 4
0
def test():
    """Parse a sample of archived dz101 pages and print each result as JSON.

    Selects ``key``, ``html`` and ``subject`` for up to 10 rows of
    ``dz101_spider_html_archive_table``, passes each row to ``parser.parse``
    and prints the returned value as indented, non-ASCII-preserving JSON.
    """
    query = select_sql('dz101_spider_html_archive_table',
                       ('key', 'html', 'subject'),
                       condition='where html_id > 0 limit 10')

    for record in execute(mysql_conn, query):
        # Column order follows the select list: key, html, subject.
        parsed = parser.parse(record[1], record[0], record[2])
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
Esempio n. 5
0
def main():
    """Feed every ``wln100_qs*`` archive row to ``record_questions`` in batches.

    Pages through ``wln100_spider_html_archive_table`` in ascending
    ``html_id`` order, up to 1000 rows per query, and stops when a query
    returns no rows.
    """
    mysql_conn = get_mysql_connection()
    max_id = 0
    while True:
        # ``order by html_id`` is required for keyset paging: without it the
        # last row of a batch is not guaranteed to carry the largest html_id,
        # so rows could be skipped or processed twice.
        sql = select_sql('wln100_spider_html_archive_table',
                         ('html_id', 'html', 'key', 'subject'),
                         condition=('where html_id > {} and `key` like "wln100_qs%" '
                                    'order by html_id limit 1000').format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break

        record_questions(rows)
        # Next page starts after the highest html_id seen so far.
        max_id = rows[-1][0]
Esempio n. 6
0
def test():
    """Parse one archived gzywtk page and print the result as JSON.

    Selects ``key`` and ``html`` from ``gzywtk_spider_html_archive_table``
    for a single fixed key, passes each returned row to ``parser.parse`` and
    prints the returned value as indented JSON.
    """
    # Alternative sample used during development:
    #   condition='where html_id > 0 limit 1'
    query = select_sql(
        'gzywtk_spider_html_archive_table', ('key', 'html'),
        condition='where `key` = "http://www.gzywtk.com/tmshow/16650.html"')

    for record in execute(mysql_conn, query):
        # Column order follows the select list: key, html.
        parsed = parser.parse(record[1], record[0])
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
Esempio n. 7
0
def test():
    """Parse one archived vko record and print the result as JSON.

    Selects ``key`` and ``html`` from ``vko_spider_html_archive_table`` for
    the key ``vko_qs_970``; the ``html`` column is decoded as JSON before
    being handed to ``parser.parse`` together with the key.
    """
    # Alternative samples used during development:
    #   condition='where html_id = 11496'
    #   condition='where html_id > 0 limit 10'
    query = select_sql('vko_spider_html_archive_table',
                       ('key', 'html'),
                       condition='where `key` = "vko_qs_970"')

    for record in execute(mysql_conn, query):
        key = record[0]
        payload = json.loads(record[1])
        parsed = parser.parse(payload, key)
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
Esempio n. 8
0
def test():
    """Parse one archived manfen5_zujuan page and print the result as JSON.

    Selects ``key``, ``html`` and ``info`` from
    ``manfen5_zujuan_spider_html_archive_table`` for a single fixed key; the
    ``info`` column is decoded as JSON and passed to ``parser.parse`` along
    with the HTML and the key.
    """
    # Alternative sample used during development:
    #   condition='where html_id > 0 limit 10'
    wanted_key = 'where `key` = "manfen5_zujuan_qs_SYS201409011517434544660993"'
    query = select_sql(
        'manfen5_zujuan_spider_html_archive_table', ('key', 'html', 'info'),
        condition=wanted_key)

    for record in execute(mysql_conn, query):
        key = record[0]
        page_html = record[1]
        extra = json.loads(record[2])
        parsed = parser.parse(page_html, key, extra)
        print(json.dumps(parsed, indent=4, ensure_ascii=False))
Esempio n. 9
0
def main():
    """Walk ``vko_spider_html_archive_table`` in ``html_id`` order, 10 rows per query.

    Every non-empty batch is handed to ``record_questions``; the following
    query starts after the ``html_id`` of that batch's last row. Logs
    ``# over`` once a query comes back empty.
    """
    connection = get_mysql_connection()

    def next_batch(after_id):
        # One page of rows whose html_id is strictly greater than after_id.
        query = select_sql(
            'vko_spider_html_archive_table', ('html_id', 'html', 'key'),
            condition='where html_id > {} order by html_id limit 10'.format(
                after_id))
        return execute(connection, query)

    batch = next_batch(0)
    while batch:
        record_questions(batch)
        batch = next_batch(batch[-1][0])

    logging.info('# over')
Esempio n. 10
0
def main():
    """Walk ``17zuoye_spider_html_archive_table`` in ``html_id`` order, 1000 rows per query.

    Every non-empty batch is handed to ``record_questions``; paging resumes
    after the ``html_id`` of the batch's last row and ends with the first
    empty result.
    """
    connection = get_mysql_connection()

    columns = ('html_id', 'html', 'key', 'subject')
    page_condition = 'where html_id > {} order by html_id limit 1000'

    last_seen_id = 0
    while True:
        query = select_sql(
            '17zuoye_spider_html_archive_table',
            columns,
            condition=page_condition.format(last_seen_id))
        batch = execute(connection, query)
        if not batch:
            return

        record_questions(batch)
        last_seen_id = batch[-1][0]
Esempio n. 11
0
def main():
    """Feed ``wln100_qs*`` archive rows to ``record_questions``, resuming mid-table.

    Pages through ``wln100_spider_html_archive_table`` in ascending
    ``html_id`` order, up to 100 rows per query, starting after the
    hard-coded resume id, and stops when a query returns no rows. A batch
    whose processing raises is logged and skipped so the scan keeps going.
    """
    # Function-scope import so the block does not depend on the module's
    # import list.
    import logging

    mysql_conn = get_mysql_connection()
    # Resume point: only rows with html_id greater than this are processed.
    max_id = 28139703
    while True:
        # ``order by html_id`` is required for keyset paging: without it the
        # last row of a batch is not guaranteed to carry the largest html_id,
        # so rows could be skipped or processed twice.
        sql = select_sql(
            'wln100_spider_html_archive_table',
            ('html_id', 'html', 'key', 'subject'),
            condition=('where html_id > {} and `key` like "wln100_qs%" '
                       'order by html_id limit 100').format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break

        try:
            record_questions(rows)
        except Exception:
            # Best-effort by design: one bad batch must not stop the scan,
            # but keep the traceback instead of printing only the message.
            logging.exception(
                'record_questions failed for batch ending at html_id %s',
                rows[-1][0])
        # Advance past this batch whether or not it was recorded.
        max_id = rows[-1][0]
Esempio n. 12
0
async def run(args):
    """Page through ``args.table`` and schedule ``render_questions`` for each batch.

    Sets the module globals ``mysql`` and ``mysql_conn``, then repeatedly
    selects rows using ``args.condition`` formatted with the first column of
    the previous batch's last row (starting from 0). Before scheduling each
    batch a ``None`` token is put on a bounded queue of size
    ``args.ctrl_queue_size``; presumably ``render_questions`` removes a token
    when it finishes -- confirm against its implementation. With
    ``args.test`` set, only the first batch is scheduled. Finally polls once
    a minute until the queue is empty and logs ``# over``.

    Args:
        args: Object providing ``db``, ``config_file``, ``ctrl_queue_size``,
            ``table``, ``cols``, ``condition`` and ``test``.
    """
    global mysql
    global mysql_conn

    mysql = CommonMysql(args.db, config_file=args.config_file)
    mysql_conn = mysql.connection()

    ctrl_queue = asyncio.Queue(maxsize=args.ctrl_queue_size)

    last_id = 0
    while True:
        query = select_sql(args.table,
                           COLS_HEADERS + [col[0] for col in args.cols],
                           condition=args.condition.format(last_id))
        batch = execute(mysql_conn, query)
        if not batch:
            break

        # Blocks here while the queue is full, limiting in-flight batches.
        await ctrl_queue.put(None)
        asyncio.ensure_future(
            render_questions(args.table, batch, ctrl_queue, args))

        if args.test:
            break

        last_id = batch[-1][0]
        logger.info('{} [max_id]:{}'.format(args.table, last_id))

    # Wait for the queue to drain, reporting its size on every check.
    while True:
        logger.info('[ctrl_queue.qsize]: {}'.format(ctrl_queue.qsize()))
        if ctrl_queue.qsize() == 0:
            break
        await asyncio.sleep(60)

    logger.info('# over')