def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                publisher = get_tld(url)
                publisher = publisher[0:publisher.index((
                    '.'))] if publisher.__contains__('.') else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))

            if values.__len__() != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def push_queue_items():
    inert_sql = """
    insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
    select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
    select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = inert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page

        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id

    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Beispiel #3
0
def put_queue_inner():

    # 统计queue符合条件的记录数
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # # 统计internally表的符合条件的总记录数
    # count_inner_sql='''
    # select count(*) from web_seed_internally where status=0;
    # '''
    #
    # # web_seed_internally 表的记录
    # select_inner_limit_sql='''
    # select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    # '''

    # 插入queue表记录
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # web_seed_internally status
    update_sql = '''
    update web_seed_internally set status=1 where md5=%s and a_md5=%s;
        '''
    try:
        # redis_tmp 数据
        redis_d = RedisUtill()
        db_uitl = DBUtil(_ZZ_DB)

        ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
        port = '6379'
        list = []
        total_num = 0
        is_get_lock = redis_d.get_lock('seed_lock', 10)
        logger = LogUtil().get_base_logger()

        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_uitl.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            logger.info("queue has %d records,not insert!" % total_num1)
            return None

        logger.info("正在获取锁...")
        if is_get_lock:
            logger.info("获取到锁")
            start_time = time.time()

            def scan_limit_to_queue_table(host, port, cursor, match, count):
                r = redis.Redis(host, port)
                rs = r.scan(cursor, match, count)
                # 新游标
                next_num = rs[0]
                # print rs
                li = rs[1]
                for i in li:
                    if i.__contains__('a_url'):
                        list.append(i)

                # 递归出口
                if next_num == 0:
                    return None
                scan_limit_to_queue_table(host, port, next_num, match, count)

            for ip in ips:
                scan_limit_to_queue_table(ip, port, 0, 'seed_temp*', 100)

            # 分页插入queue表
            redis_result = []
            up_inner = []
            delete_list = []
            for k in list:
                if k.__contains__('a_url'):

                    # 确定同一MD5的其他参数的key
                    param = k.replace('a_url', 'param')
                    md5 = k.replace('a_url', 'md5')
                    a_md5 = k.replace('a_url', 'a_md5')

                    action = redis_d.get_value_for_key(k)
                    params = redis_d.get_value_for_key(param)
                    redis_result.append((2, action, params))

                    md5_val = redis_d.get_value_for_key(md5)
                    a_md5_val = redis_d.get_value_for_key(a_md5)
                    up_inner.append((md5_val, a_md5_val))
                    # 添加要删除的列表
                    delete_list.append(k)
                    delete_list.append(param)
                    delete_list.append(md5)
                    delete_list.append(a_md5)
                    total_num += 1

                # 批量插入queue
                if (len(redis_result) == 5):
                    db_uitl.executemany(insert_queue_sql, redis_result)
                    db_uitl.executemany(update_sql, up_inner)
                    redis_result = []
                    up_inner = []
            # 提交不满五个的最后一组
            db_uitl.executemany(insert_queue_sql, redis_result)
            db_uitl.executemany(update_sql, up_inner)
            # 删除redis_tmp
            redis_d.delete_batch(delete_list)

            redis_d.release('seed_lock')
            logger.info("释放锁")
        else:
            logger.info('其他线程正在处理,获取锁超过最大超时时间,退出处理逻辑 ')

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("total_num:%d, run_time:%.2f" % (total_num, run_time))

    except Exception, err:
        db_uitl.rollback()
        redis_d.release('seed_lock')
        traceback.print_exc(err)
Beispiel #4
0
def put_queue(page_show_num):

    db_util = DBUtil(_ZZ_DB)
    # 统计queue符合条件的记录数
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # 统计web_seed表的符合条件的总记录数
    count_seed_sql = '''
    select count(*) from web_seed where status=0;
    '''

    # 分页查询web_seed 表的记录
    select_seed_limit_sql = '''
    select id,url,category from web_seed where status=0 limit %s,%s;
    '''

    # 插入queue表记录
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # 更新web_seed表中的 status
    update_sql = '''
    update web_seed set status=1 where id in(%s);
    '''

    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None

        start_time = time.time()

        # 统计web_seed 表符合条件的总记录数
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]

        # 计算分多少页查询
        page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 else total_num2 / page_show_num + 1

        # 分页查询
        ids = []

        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print sql_params
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)

            list1 = []

            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["url"]
                params = row["category"]
                type = 1
                list1.append((type, action, params))

            # 批量插入queue
            db_util.executemany(insert_queue_sql, list1)

        # 更新 status=1
        db_util.execute_no_commit(update_sql % ",".join(ids))

        db_util.commit()

        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)

    except Exception, err:
        db_util.rollback()
        traceback.print_exc(err)
Beispiel #5
0
def put_seed():

    # 统计seed符合条件的记录数
    count_queue_sql = '''
    select count(*) from web_seed where status=%s and fail_times < %s;
    '''

    # 统计web_seed表的符合条件的总记录数
    count_exter_sql = '''
    select count(*) from web_seed_externally where status=0;
    '''

    # web_seed_externally 表的记录
    select_exter_limit_sql = '''
    select id,a_url,a_md5,a_host,param from web_seed_externally where status=0 limit %s,%s;
    '''

    # 插入seed表记录
    insert_seed_sql = '''
    insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''

    # web_seed_internally status
    update_sql = '''
    update web_seed_externally set status=1 where id in(%s);
     '''
    db_uitl = DBUtil(_ZZ_DB)

    sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
    res1 = db_uitl.read_one(count_queue_sql, sql_params)
    total_num1 = res1[0]
    if total_num1 != 0:
        print "queue has %d records,not insert!" % total_num1
        return None

    start_time = time.time()

    res2 = db_uitl.read_one(count_exter_sql)
    total_num2 = res2[0]

    # 计算分多少页查询
    page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
        "LIMIT_NUM"] == 0 else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1

    # hu = HtmlUtil()
    # u = Util()
    # 分页插入queue表
    try:
        ids = []
        for i in range(0, page_num):

            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_uitl.read_dict(select_exter_limit_sql, sql_params)

            list1 = []

            for row in res3:
                id = row["id"]
                ids.append(str(id))

                url = row["a_url"]
                domain = get_tld(url)
                # host = hu.get_url_host(url)

                # md5 = u.get_md5(url)
                host = row["a_host"]
                md5 = row["a_md5"]
                category = row["param"]
                list1.append((url, md5, domain, host, category))
            # 批量插入queue
            db_uitl.executemany(insert_seed_sql, list1)

        # 更新status = 1
        db_uitl.execute(update_sql % ",".join(ids))

    except Exception, err:
        db_uitl.rollback()
        traceback.print_exc(err)
Beispiel #6
0
def put_queue_inner():

    # 统计queue符合条件的记录数
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # 统计internally表的符合条件的总记录数
    count_inner_sql = '''
    select count(*) from web_seed_internally where status=0;
    '''

    # web_seed_internally 表的记录
    select_inner_limit_sql = '''
    select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''

    # 插入queue表记录
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # web_seed_internally status
    update_sql = '''
    update web_seed_internally set status=1 where id in(%s);
        '''
    db_uitl = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_uitl.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None

        start_time = time.time()

        res2 = db_uitl.read_one(count_inner_sql)
        total_num2 = res2[0]

        # 计算分多少页查询
        page_num = total_num2 / _QUEUE_ZZ[
            "LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
                "LIMIT_NUM"] == 0 else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1

        # 分页插入queue表
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_uitl.read_dict(select_inner_limit_sql, sql_params)

            list1 = []

            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["a_url"]
                params1 = row["param"]
                type = 2
                list1.append((type, action, params1))
            # 批量插入queue
            db_uitl.executemany(insert_queue_sql, list1)

        # 更新status = 1
        db_uitl.execute(update_sql % ",".join(ids))
        db_uitl.commit()

        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)

    except Exception, err:
        db_uitl.rollback()
        traceback.print_exc(err)
Beispiel #7
0
def put_seed_to_queue(page_show_num):
    '''
    采用分页查询种子表数据,批量导入到hainiu_queue
    :param page_show_num: 一次查询条数
    '''
    # 统计hainiu_queue 未处理的记录数
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """

    # 统计种子表符合条件的总记录数
    select_seed_count_sql = """
    select count(*) from hainiu_web_seed where status=0;
    """

    # 分页查询种子表数据SQL
    select_seed_limit_sql = """
    select url, md5, domain, host, category from hainiu_web_seed
    where status=0 limit %s,%s;
     """

    # insert hainiu_queue sql
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        #1) 统计hainiu_queue 未处理的记录数
        sql_params = [1]
        # res1 是 ()
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None

        start_time = time.time()

        #2) 统计种子表符合条件的总记录数
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]

        # 计算有多少页
        page_num = seed_count / page_show_num if seed_count % page_show_num == 0 \
            else seed_count / page_show_num + 1

        # 分页查询
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            # ({},{},{},{},{})
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            # 插入队列表的数据
            insert_queue_values = []

            params_dict = {}
            for row in res3:
                # url, md5, domain, host, category
                act = row['url']
                md5 = row['md5']
                domain = row['domain']
                host = row['host']
                category = row['category']
                params_dict['md5'] = md5
                params_dict['domain'] = domain
                params_dict['host'] = host
                params_dict['category'] = category

                params_json = json.dumps(params_dict,
                                         ensure_ascii=False,
                                         encoding='utf-8')

                insert_queue_values.append((1, act, params_json))
            # 把查询的数据批量插入到队列表
            db_util.executemany(insert_queue_sql, insert_queue_values)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (seed_count, run_time))

    except Exception, e:
        logger.exception(e)
def put_inner_to_queue():
    '''

    '''
    page_show_num = 10
    # 统计hainiu_queue 未处理的记录数
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # 统计内链接表符合条件的总记录数
    select_inner_count_sql = """
    select count(*) from hainiu_web_seed_internally where status=0;
    """
    # 分页查询内链接表
    select_inner_limit_sql = """
    select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally WHERE
    status=0 limit 0,%s;
    """
    # 插入hainiu_queue表
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # 更新内链接表的status状态
    update_inner_status_sql = """
    update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # 统计hainiu_queue 未处理的记录数
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None
        # 统计内链接表符合条件的总记录数
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]

        # 计算有多少页
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1
        start_time = time.time()
        # 分页查询
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # 插入队列表的记录
            insert_queue_record = []
            # param字典
            param_dict = {}
            # inner表内要进行更新的记录
            update_innner_status_record = []
            for row in res3:
                # md5,a_url,a_md5,domain,a_host,a_title
                md5 = row['md5']
                a_url = row['a_url']
                a_md5 = row['a_md5']
                domain = row['domain']
                a_host = row['a_host']
                a_title = row['a_title']
                # param数据
                param_dict['md5'] = md5
                param_dict['a_md5'] = a_md5
                param_dict['domain'] = domain
                param_dict['a_host'] = a_host
                param_dict['a_title'] = a_title

                param_json = json.dumps(param_dict,
                                        ensure_ascii=False,
                                        encoding='utf-8')
                # 将数据放入列表
                insert_queue_record.append((2, a_url, param_json))
                update_innner_status_record.append((a_md5, md5))

            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql,
                                update_innner_status_record)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (inner_count, run_time))

    except Exception, e:
        traceback.print_exc(e)
        db_util.rollback()