Beispiel #1
0
def record_questions(rows):
    """Parse archived manfen5 pages and insert-ignore the questions.

    Each row is (id, html, spider_url, info_json); commits after every insert.
    """
    conn = get_mysql_connection()

    for row in rows:
        html_string, spider_url = row[1], row[2]
        info = json.loads(row[3])

        logging.info(spider_url)

        try:
            parsed = parser.parse(html_string, spider_url, info)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue

        if not parsed:
            continue

        sql, vals = insert_sql(
            'question_db_offline.manfen5_zujuan_question_20161205',
            parsed,
            ignore=True)
        execute(conn, sql, values=vals)
        conn.commit()
Beispiel #2
0
def save_question(cols):
    """Insert-ignore one parsed question into the wln100 offline table."""
    conn = get_mysql_connection()

    sql, vals = insert_sql('question_db_offline.wln100_question_20170919',
                           cols, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #3
0
def update_db(table, cols, spider_url):
    """Update `cols` on the row matching `spider_url` in `table`.

    Returns:
        True on successful execute+commit, False on any failure.
    """
    mysql_conn = get_mysql_connection()

    sql, vals = update_sql(table, cols, where='where spider_url = %s')
    vals.append(spider_url)
    try:
        execute(mysql_conn, sql, values=vals)
        mysql_conn.commit()
        return True
    except Exception as err:
        # Previously swallowed silently; log so failures are diagnosable
        # while keeping the boolean best-effort contract for callers.
        logging.error('[update_db] {}, {}'.format(err, spider_url))
        return False
Beispiel #4
0
def test():
    """Ad-hoc check: parse one archived wln100 question/answer pair.

    Relies on a module-level ``mysql_conn`` being already connected.
    """
    qs_sql = 'select * from wln100_spider_html_archive_table where `key` = "wln100_qs_76285"'
    qs_rows = execute(mysql_conn, qs_sql)
    qs_json = json.loads(qs_rows[0][3])
    print(qs_json)

    as_sql = 'select * from wln100_spider_html_archive_table where `key` = "wln100_as_76285"'
    as_json = json.loads(execute(mysql_conn, as_sql)[0][3])

    cols = parser.parse('url', qs_json, as_json, qs_rows[0][2])
    print(json.dumps(cols, indent=4, ensure_ascii=False))
Beispiel #5
0
def save_html(key, html_string, flag=0):
    """Archive one gzywtk page (insert-ignore, keyed on `key`)."""
    conn = get_mysql_connection()

    record = dict(key=key,
                  html=html_string,
                  md5=md5_string(html_string),
                  source=68,
                  flag=flag)
    sql, vals = html_archive.insert_sql('gzywtk_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #6
0
def save_html(key, js, flag=0):
    """Serialize `js` deterministically and archive it under `key` (vko)."""
    conn = get_mysql_connection()

    serialized = json.dumps(js, ensure_ascii=False, sort_keys=True)
    record = dict(key=key,
                  html=serialized,
                  md5=md5_string(serialized),
                  source=74,
                  flag=flag)
    sql, vals = html_archive.insert_sql('vko_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #7
0
def save_html(url, html_string, subj_id, info, flag=0):
    """Archive one dz101 page keyed on its URL; `info` is stored as JSON."""
    conn = get_mysql_connection()

    record = dict(key=url,
                  html=html_string,
                  md5=md5_string(html_string),
                  subject=subj_id,
                  source=56,
                  flag=flag,
                  info=json.dumps(info, ensure_ascii=False))
    sql, vals = html_archive.insert_sql('dz101_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #8
0
def save_html(key, cn, flag=0):
    """Archive `cn` (a string, or any JSON-serializable object) under `key`."""
    conn = get_mysql_connection()

    if not isinstance(cn, str):
        cn = json.dumps(cn, ensure_ascii=False, sort_keys=True)

    record = dict(key=key,
                  html=cn,
                  md5=md5_string(cn),
                  source=52,
                  flag=flag)
    sql, vals = html_archive.insert_sql('wln100_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #9
0
def is_archived(url):
    """Return matching question_id rows for `url` (truthy when archived)."""
    conn = get_mysql_connection()

    sql = select_sql('question_db_offline.manfen5_zujuan_question_20161205',
                     ('question_id', ),
                     condition='where `spider_url` = %s')
    return execute(conn, sql, values=(url, ))
Beispiel #10
0
def save_answer(js, info, request_info, testid, flag=0):
    """Archive one wln100 answer payload and commit immediately."""
    conn = get_mysql_connection()

    payload = json.dumps(js, ensure_ascii=False)
    record = dict(key='wln100_as_{}'.format(testid),
                  html=payload,
                  md5=md5_string(payload),
                  subject=info['aft_subid'],
                  request_info=request_info,
                  source=52,
                  flag=flag)
    sql, vals = html_archive.insert_sql('wln100_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
    conn.commit()
Beispiel #11
0
def is_archived(testid):
    """Return archive rows for the 17zuoye question id (truthy if present)."""
    conn = get_mysql_connection()

    cmd = 'select html_id from 17zuoye_spider_html_archive_table where `key` = %s and flag = 0'
    return execute(conn, cmd, values=('17zuoye_qs_{}'.format(testid), ))
Beispiel #12
0
def save_html(js, info, request_info, flag=0):
    """Archive one 17zuoye question JSON, keyed on the document's `_id`."""
    conn = get_mysql_connection()

    payload = json.dumps(js, ensure_ascii=False)
    record = dict(key='17zuoye_qs_{}'.format(js['_id']),
                  html=payload,
                  md5=md5_string(payload),
                  subject=info['subject'],
                  request_info=request_info,
                  source=53,
                  flag=flag)
    sql, vals = html_archive.insert_sql('17zuoye_spider_html_archive_table',
                                        record, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #13
0
def save_html(key, html_string, info, flag=0):
    """Archive one manfen5 page; `info` is stored as sorted JSON."""
    conn = get_mysql_connection()

    record = dict(key=key,
                  html=html_string,
                  md5=md5_string(html_string),
                  info=json.dumps(info, ensure_ascii=False, sort_keys=True),
                  source=80,
                  flag=flag)
    sql, vals = html_archive.insert_sql(
        'manfen5_zujuan_spider_html_archive_table', record, ignore=True)
    execute(conn, sql, values=vals)
Beispiel #14
0
def is_archived(url):
    """True when a flag-0 dz101 archive row exists for `url`."""
    conn = get_mysql_connection()

    cmd = 'select html_id from dz101_spider_html_archive_table where `key` = %s and flag = 0'
    return bool(execute(conn, cmd, values=(url, )))
Beispiel #15
0
def get_answer_json(wln_qid):
    """Fetch the archived answer JSON for a wln100 question id.

    Returns:
        The decoded JSON object, or False when no archive row exists
        (kept as False, not None, for existing callers).
    """
    # Parameterized instead of string-formatting wln_qid into the SQL,
    # matching the %s style used by the sibling is_*archived helpers.
    sql = select_sql('wln100_spider_html_archive_table', ('html', ),
                     condition='where `key` = %s')
    row = execute(mysql_conn, sql, values=('wln100_as_{}'.format(wln_qid), ))
    if not row:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning('[not answer]:{}'.format(wln_qid))
        return False
    return json.loads(row[0][0])
Beispiel #16
0
def is_rendered(table, spider_url):
    """Return True when the row for `spider_url` has is_rendered != 0.

    Returns False when no row matches (the original raised IndexError).
    """
    mysql_conn = get_mysql_connection()

    # Parameterized to avoid SQL injection via spider_url (was built with
    # str.format), matching the %s style used elsewhere in this module.
    sql = select_sql(table, ('is_rendered', ),
                     condition='where spider_url = %s')

    rows = execute(mysql_conn, sql, values=(spider_url, ))
    if not rows:
        return False
    return rows[0][0] != 0
Beispiel #17
0
def is_as_archived(qid):
    """True when the wln100 answer for `qid` is already archived (flag 0)."""
    conn = get_mysql_connection()

    cmd = 'select html_id from wln100_spider_html_archive_table where `key` = %s and flag = 0'
    return bool(execute(conn, cmd, values=('wln100_as_{}'.format(qid), )))
Beispiel #18
0
def record_questions(rows):
    """Parse archived 17zuoye JSON rows and insert-ignore the questions."""
    conn = get_mysql_connection()

    for row in rows:
        payload = json.loads(row[1])
        spider_url = row[2]
        aft_subj_id = row[3]

        try:
            cols = parser.parse(spider_url, payload, aft_subj_id)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue

        sql, vals = insert_sql('question_db_offline.17zuoye_question_20160719',
                               cols, ignore=True)
        execute(conn, sql, values=vals)
Beispiel #19
0
def record_questions(rows):
    """Parse archived gzywtk pages and insert-ignore the questions."""
    conn = get_mysql_connection()

    for row in rows:
        html_string, spider_url = row[1], row[2]

        try:
            cols = parser.parse(html_string, spider_url)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue

        if not cols:
            continue

        sql, vals = insert_sql('question_db_offline.gzywtk_question_20161109',
                               cols, ignore=True)
        execute(conn, sql, values=vals)
Beispiel #20
0
def main():
    """Page through wln100 question archives by html_id, 1000 per batch."""
    conn = get_mysql_connection()

    max_id = 0
    while True:
        sql = select_sql('wln100_spider_html_archive_table',
                         ('html_id', 'html', 'key', 'subject'),
                         condition='where html_id > {} and `key` like "wln100_qs%" limit 1000'.format(max_id))
        rows = execute(conn, sql)
        if not rows:
            break

        record_questions(rows)
        # Keyset pagination: resume after the last html_id seen.
        max_id = rows[-1][0]
Beispiel #21
0
def test():
    """Ad-hoc check: parse a single archived gzywtk page."""
    sql = select_sql(
        'gzywtk_spider_html_archive_table', ('key', 'html'),
        condition='where `key` = "http://www.gzywtk.com/tmshow/16650.html"')
    for url, html_string in execute(mysql_conn, sql):
        cols = parser.parse(html_string, url)
        print(json.dumps(cols, indent=4, ensure_ascii=False))
Beispiel #22
0
def test():
    """Ad-hoc check: parse ten archived dz101 pages."""
    sql = select_sql('dz101_spider_html_archive_table',
                     ('key', 'html', 'subject'),
                     condition='where html_id > 0 limit 10')
    for url, html_string, subject in execute(mysql_conn, sql):
        cols = parser.parse(html_string, url, subject)
        print(json.dumps(cols, indent=4, ensure_ascii=False))
Beispiel #23
0
def record_questions(rows):
    """Parse archived vko JSON rows and insert-ignore the questions."""
    mysql_conn = get_mysql_connection()

    for row in rows:
        # Bind spider_url BEFORE the try: the except handler logs it, and
        # the original raised NameError there whenever json.loads(row[1])
        # failed before spider_url was assigned.
        spider_url = row[2]
        try:
            js = json.loads(row[1])
            cols = parser.parse(js, spider_url)
        except Exception as err:
            logging.error('[parser.parse] {}, {}'.format(err, spider_url))
            continue

        if not cols:
            continue

        logging.info(spider_url)

        sql, vals = insert_sql('question_db_offline.vko_question_20161116',
                               cols,
                               ignore=True)
        execute(mysql_conn, sql, values=vals)
Beispiel #24
0
def test():
    """Ad-hoc check: parse one archived vko question."""
    sql = select_sql('vko_spider_html_archive_table',
                     ('key', 'html'),
                     condition='where `key` = "vko_qs_970"')
    for url, raw in execute(mysql_conn, sql):
        cols = parser.parse(json.loads(raw), url)
        print(json.dumps(cols, indent=4, ensure_ascii=False))
Beispiel #25
0
def test():
    """Ad-hoc check: parse one archived manfen5 page."""
    sql = select_sql(
        'manfen5_zujuan_spider_html_archive_table', ('key', 'html', 'info'),
        condition=
        'where `key` = "manfen5_zujuan_qs_SYS201409011517434544660993"')
    for url, html_string, raw_info in execute(mysql_conn, sql):
        cols = parser.parse(html_string, url, json.loads(raw_info))
        print(json.dumps(cols, indent=4, ensure_ascii=False))
Beispiel #26
0
def main():
    """Page through all 17zuoye archives ordered by html_id, 1000 per batch."""
    conn = get_mysql_connection()

    max_id = 0
    while True:
        sql = select_sql(
            '17zuoye_spider_html_archive_table',
            ('html_id', 'html', 'key', 'subject'),
            condition='where html_id > {} order by html_id limit 1000'.format(
                max_id))
        rows = execute(conn, sql)
        if not rows:
            break

        record_questions(rows)
        # Keyset pagination: resume after the last html_id seen.
        max_id = rows[-1][0]
Beispiel #27
0
def main():
    """Page through vko archives ordered by html_id, ten per batch."""
    conn = get_mysql_connection()

    max_id = 0
    while True:
        sql = select_sql(
            'vko_spider_html_archive_table', ('html_id', 'html', 'key'),
            condition='where html_id > {} order by html_id limit 10'.format(
                max_id))
        rows = execute(conn, sql)
        if not rows:
            break

        record_questions(rows)
        max_id = rows[-1][0]

    logging.info('# over')
Beispiel #28
0
def main():
    """Resume the wln100 question import from a fixed html_id checkpoint."""
    mysql_conn = get_mysql_connection()
    # Checkpoint: resume just past html_id 28139703 (hard-coded restart point).
    max_id = 28139703
    while True:
        sql = select_sql(
            'wln100_spider_html_archive_table',
            ('html_id', 'html', 'key', 'subject'),
            condition='where html_id > {} and `key` like "wln100_qs%" limit 100'
            .format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break

        try:
            record_questions(rows)
        except Exception as e:
            # Log (rather than print) so batch failures land in the log,
            # consistent with the rest of this module; still best-effort —
            # the loop moves on to the next batch.
            logging.error('[record_questions] {}'.format(e))
        max_id = rows[-1][0]
Beispiel #29
0
async def run(args):
    """Page through `args.table` and fan out render_questions() tasks,
    throttled by a fixed-size control queue; returns when all work drains."""
    global mysql
    global mysql_conn

    mysql = CommonMysql(args.db, config_file=args.config_file)
    mysql_conn = mysql.connection()

    # Each in-flight render_questions() task holds one queue slot; put()
    # blocks when full, capping concurrency at args.ctrl_queue_size.
    ctrl_queue = asyncio.queues.Queue(maxsize=args.ctrl_queue_size)

    max_id = 0
    while True:
        # args.condition is formatted with max_id for keyset pagination —
        # assumes it contains a '{}' placeholder; TODO confirm at call sites.
        sql = select_sql(args.table,
                         COLS_HEADERS + [cs[0] for cs in args.cols],
                         condition=args.condition.format(max_id))
        rows = execute(mysql_conn, sql)
        if not rows:
            break

        # Acquire a slot, then schedule the batch without awaiting it.
        await ctrl_queue.put(None)
        asyncio.ensure_future(
            render_questions(args.table, rows, ctrl_queue, args))

        if args.test:
            break

        # First selected column is presumably the pagination id — verify
        # against COLS_HEADERS.
        max_id = rows[-1][0]
        logger.info('{} [max_id]:{}'.format(args.table, max_id))

    # Poll until outstanding tasks have drained the control queue.
    while True:
        logger.info('[ctrl_queue.qsize]: {}'.format(ctrl_queue.qsize()))
        if ctrl_queue.qsize() != 0:
            await asyncio.sleep(1 * 60)
        else:
            # over
            break

    logger.info('# over')
Beispiel #30
0
    def bewitch(self,
                html_string,
                spider_url,
                spider_source,
                download=None,
                redownload=False,
                archive_image=None,
                headers=None,
                proxy=None,
                exclude_md5s=None,
                img_ext=None,
                priority=None):
        """Rewrite image URLs in `html_string` to their OSS form and
        optionally queue the originals for (re)download.

        Args:
            html_string: raw page HTML to process.
            spider_url: page URL, recorded with each image download request.
            spider_source: source identifier passed through to ImageMagic.
            download: overrides self.download when not None.
            redownload: when True, delete existing image-table rows for each
                md5 so the downloader fetches the image again.
            archive_image: overrides self.archive_image when not None
                (currently unused below — the archiving code is commented out).
            headers, proxy, priority: download request options; proxy and
                priority fall back to instance defaults when falsy.
            exclude_md5s: extra md5s to skip (merged via _check_exclude_md5
                — presumably into self.exclude_md5s; verify in that helper).
            img_ext: candidate filename extension (currently unused — the
                filename logic below is hard-coded; see commented-out block).

        Returns:
            The rewritten HTML string.
        """
        # (value, default)[cond] selects default when the argument is None —
        # equivalent to `x if x is not None else self.x`.
        download = (download, self.download)[download is None]
        archive_image = (archive_image,
                         self.archive_image)[archive_image is None]
        proxy = (proxy or self.proxy)
        img_ext = (img_ext or self.img_ext)
        priority = (priority or self.priority)
        self._check_exclude_md5(exclude_md5s)

        # img_infos: one (ori_url, absurl, md5, ext) tuple per image found.
        html_string, img_infos = ImageMagic.img_ossify(html_string,
                                                       spider_source,
                                                       uri2oss=self.uri2oss)

        for (ori_url, absurl, md5, ext) in img_infos:

            # if isinstance(img_ext, compat_str):
            # image_filename = md5 + img_ext
            # elif img_ext is True:
            # image_filename = md5 + ext
            # elif img_ext is None:
            # image_filename = md5
            # else:
            # image_filename = md5

            # SVG keeps its extension; everything else gets the default.
            if ext.lower() == '.svg':
                image_filename = md5 + '.svg'
            else:
                image_filename = md5 + DEFAULT_IMG_EXT

            # not save to db
            # images which are in image_archive must been downloaded
            # successfully
            # if archive_image:
            # ImageMagic.archive_imgs(absurl, spider_source,
            # spider_url=spider_url,
            # table=self.IMAGE_TABLE,
            # md5=md5,
            # image_filename=image_filename,
            # mysql=self.mysql,
            # ignore=True,
            # config_file=self.config_file)

            if md5 in self.exclude_md5s:
                continue

            if redownload:
                # Lazily open the MySQL connection only when a delete is
                # actually needed.
                if not self.mysql_conn:
                    self.mysql_conn = self.mysql.connection()

                sql = 'delete from {} where `md5` = %s'.format(
                    self.IMAGE_TABLE)
                execute(self.mysql_conn, sql, values=(md5, ))

            # send to image_downloader to download
            if download:
                ImageMagic.download_image(
                    absurl,
                    spider_source,
                    image_filename=image_filename,
                    spider_url=spider_url,
                    headers=headers,
                    proxy=proxy,
                    priority=priority,
                    queue=self.image_downloader_item_queue,
                    config_file=self.config_file)
        return html_string