Example 1
    def queue_items(self):
        '''
        Fetch the messages to be processed from the queue, wrap them as consumer actions, then update the queue status
        :return:            list of wrapped consumer actions
        '''

        # Optionally skip rows that this machine has already failed on, filtered by machine IP
        # select_queue_sql = """
        # select id,action,params from hainiu_queue where type=1 and fail_ip <>'%s' and fail_times<=%s
        # limit 0,%s for update;
        # """

        select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times<=%s
        limit 0,%s for update;
        """

        update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
        """
        return_list = []
        try:
            d = DBUtil(config._HAINIU_DB)
            # u = Util()
            # ip = u.get_local_ip()
            # sql = select_queue_sql % (self.fail_times,ip,self.limit)
            sql = select_queue_sql % (self.fail_times, self.limit)
            select_dict = d.read_dict(sql)
            if len(select_dict) == 0:
                return return_list

            query_ids = []
            for record in select_dict:
                id = record["id"]
                action = record["action"]
                params = record["params"]
                query_ids.append(str(id))
                c = HainiuConsumer(id, action, params)
                return_list.append(c)

            ids = ",".join(query_ids)
            sql = update_queue_sql % ids
            d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
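
Example 1's queue_items claims work with MySQL's SELECT ... FOR UPDATE and then flips the rows' type so other workers skip them. Below is a minimal sketch of that claim pattern written directly against the MySQLdb driver rather than the project's DBUtil wrapper; the connection settings are placeholders and the table and column names simply mirror the example.

import MySQLdb

def claim_queue_rows(limit=10, max_fail_times=3):
    # Hypothetical connection settings; autocommit is off by default in MySQLdb,
    # so the row locks taken by FOR UPDATE are held until commit().
    conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='secret',
                           db='hainiucrawler', charset='utf8')
    rows = []
    try:
        cursor = conn.cursor()
        # Lock the candidate rows; a second worker running the same statement
        # blocks here until this transaction commits.
        cursor.execute("select id, action, params from hainiu_queue "
                       "where type=1 and fail_times<=%s limit %s for update",
                       (max_fail_times, limit))
        rows = cursor.fetchall()
        if rows:
            ids = ",".join(str(r[0]) for r in rows)
            # Mark the claimed rows so other workers no longer select them.
            cursor.execute("update hainiu_queue set type=0 where id in (%s)" % ids)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()
    return rows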
Example 2
 def success_action(self, values):
     #1) Record the seed url's latest internal/external crawl counts (used to verify whether the crawl succeeded);
     #2) Delete the successfully crawled url from the hainiu_queue table;
     seed_update_sql = """
     update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5=%s;
     """
     queue_delete_sql = """
     delete from hainiu_queue where id=%s
     """
     db_util = DBUtil(_HAINIU_DB)
     try:
         sql_param = [values[1], values[2], values[3]]
         db_util.execute_no_commit(seed_update_sql, sql_param)
         sql_param = [values[0]]
         db_util.execute_no_commit(queue_delete_sql, sql_param)
         db_util.commit()
     except Exception, e:
         traceback.print_exc(e)
         db_util.rollback()
Example 3
 def success_action(self, values):
     db_util = DBUtil(_HAINIU_DB)
     time_util = TimeUtil()
     #1) Delete the successfully downloaded url from the hainiu_queue table;
     queue_delete_sql = """
     delete from hainiu_queue where id=%s
     """
     #2) Update the last-update time of the internal-link table;
     inner_update_sql = """
     update hainiu_web_seed_internally set update_time= %s where a_md5=%s AND
     md5=%s
     """
     update_time = time_util.get_timestamp()
     try:
         sql_param = [values[0]]
         db_util.execute_no_commit(queue_delete_sql, sql_param)
         sql_param = [update_time, values[1], values[2]]
         db_util.execute_no_commit(inner_update_sql, sql_param)
         db_util.commit()
     except Exception, e:
         traceback.print_exc(e)
         db_util.rollback()
Example 4
    def queue_items(self):
        '''
        Use a pessimistic lock + a transaction + a status update so that multiple
        machines fetch data serially, and return the rows wrapped as a list of
        HainiuConsumerAction instances.
        '''
        select_sql = """
        select id,action,params
        from hainiu_queue where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s limit %s for update;
        """

        # Update SQL - built by string concatenation
        update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """
        c_actions = []
        # Collect the ids that will be updated
        ids = []
        db_util = DBUtil(_HAINIU_DB)
        try:
            # sql_params = [1, 0, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
            # Parameters for the query that filters out the failing IP
            ip = Util().get_local_ip()
            sql_params = [
                1, 0, ip, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'],
                _QUEUE_NEWS_FIND['LIMIT_NUM']
            ]
            # ({},{})
            res1 = db_util.read_dict(select_sql, sql_params)
            for row in res1:
                id = row['id']
                ids.append(str(id))
                act = row['action']
                params = row['params']
                c_action = NewsFindConsumerAction(id, act, params)
                c_actions.append(c_action)

            if len(ids) > 0:
                db_util.execute_no_commit(update_sql % ",".join(ids))

            db_util.commit()
        except Exception, e:
            db_util.rollback()
            traceback.print_exc(e)
        return c_actions
Example 5
def push_queue_items():
    inert_sql = """
    insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
    select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
    select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = inert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)

        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page

        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id

    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example 6
    def queue_items(self):

        # Query that filters out this machine's IP
        select_sql='''
        select id, action, params from web_queue where type=%s
        and is_work=%s and fail_ip != %s and fail_times < %s limit 0, %s for update;
        '''

        update_sql='''
        update web_queue set is_work=1 where id in(%s);
        '''
        db_util = DBUtil(_ZZ_DB)

        try:
            ip = Util().get_local_ip()
            sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]

            res = db_util.read_dict(select_sql, sql_params)
            actions = []

            ids = []
            for row in res:
                id = row["id"]
                ids.append(str(id))
                action = row["action"]
                params = row["params"]

                # Wrap into an action object
                c_action = WebConsumerAction(id, action, params)
                actions.append(c_action)

            if len(actions) != 0:
                # Set is_work=1 on the claimed rows
                db_util.execute_no_commit(update_sql % ",".join(ids))

            db_util.commit()

        except Exception, err:
            actions = []
            db_util.rollback()
            traceback.print_exc(err)

        return actions
Example 7
    def success_action(self, values):
        '''
        After the message action has been processed successfully, delete the message from the queue middleware, marking it as fully processed

        :param values:      the result of processing the message action
        '''
        delete_sql = """
           delete from hainiu_queue where id=%s
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[0]
            sql = delete_sql % id
            d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
Example 8
def create_seed():
    sql = """
    insert into web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    catetory = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, catetory)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example 9
def create_seed():
    url = "https://www.autohome.com.cn/all"
    catetory = "汽车"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status) values
    ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)

    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, catetory)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example 10
    def success_action(self, values):
        # Delete the corresponding queue record
        del_sql = '''
        delete from web_queue where id =%s;
        '''
        # Update the seed table status
        update_sql = '''
        update web_seed set last_crawl_time=%s,last_crawl_internally=%s,last_crawl_externally=%s where md5 =%s;
        '''
        db_util = DBUtil(_ZZ_DB)

        try:
            # Delete from the queue table
            id = values[0]
            sql_param = [id]
            db_util.execute(del_sql, sql_param)
            # Update the seed table
            # [(1574519076,), 95, 7, '824e29a21f2a02379f78b0675d1fc5eb']
            sql_param = [values[2], values[3], values[4], values[1]]
            db_util.execute(update_sql, sql_param)
        except Exception, err:
            db_util.rollback()
            traceback.print_exc(err)
Example 11
    def action(self):
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            out_json_srt = ''
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = self.get_format_url(a,host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if a_set.__contains__(a_href):
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                if a_title != '':
                    a_param['title'] = a_title
                    out_json_srt = json.dumps(a_param,ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)

                if a_host.__contains__(domain):
                    in_values.append(insert_values)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=update_time;
            """
            try:
                d = DBUtil(config._HAINIU_DB)
                d.execute_no_commit("set NAMES utf8mb4;")
                if in_values.__len__() != 0:
                    sql = insert_sql.replace('<table>',in_table)
                    d.executemany_no_commit(sql,in_values)
                if ex_values.__len__() != 0:
                    sql = insert_sql.replace('<table>',ex_table)
                    d.executemany_no_commit(sql,ex_values)
                d.commit()
            except:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                d.rollback()
            finally:
                d.close()

        except:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        return super(self.__class__, self).result(is_success, [main_md5,self.url,a_href,in_values.__len__(),ex_values.__len__(),self.queue_id])
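
Example 11 classifies every harvested link as internal or external by checking whether the link's host contains the seed's domain. A standalone sketch of the same idea using the standard-library urlparse is shown below; the endswith check is a slightly stricter variant and the helper name is made up for illustration.

from urlparse import urlparse  # urllib.parse on Python 3

def is_internal_link(seed_domain, a_href):
    # A link counts as internal when its host falls under the seed domain,
    # e.g. news.sina.com.cn is internal to sina.com.cn.
    a_host = urlparse(a_href).netloc
    return a_host == seed_domain or a_host.endswith('.' + seed_domain)

# is_internal_link('sina.com.cn', 'https://news.sina.com.cn/china/')  -> True
# is_internal_link('sina.com.cn', 'https://www.autohome.com.cn/all')  -> False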
Example 12
 def queue_items(self):
     select_queue_sql = """
         select id,action,params from hainiu_queue where type=1 and fail_times <= %s limit 0,%s for UPDATE;
     """
     update_queue_sql = """
         update hainiu_queue set type=0 where id in (%s);
     """
     list = []
     try:
         d = DBUtil(config._HAINIU_DB)
         sql = select_queue_sql % (self.fail_times,self.limit)
         tuple = d.read_tuple(sql)
         if len(tuple) == 0:
             return list
         queue_ids = ''
         for t in tuple:
             queue_id = t[0]
             url = t[1]
             param = '' if t[2] is None else t[2]
             queue_ids += str(queue_id) + ','
             c = NewsFindConsumer(url, param, queue_id)
             list.append(c)
         queue_ids = queue_ids[:-1]
         d.execute(update_queue_sql % (queue_ids))
     except:
         self.rl.exception()
         d.rollback()
         d.commit()
     finally:
         d.close()
     return list
Example 13
 def fail_action(self, values):
     update_sql = """
         update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql_1 = """
         update hainiu_queue set type=1 where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
     """
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[5]
         u = Util()
         ip = u.get_local_ip()
         sql = update_sql % (ip, id)
         d.execute_no_commit(sql)
         main_md5 = values[0]
         sql = update_hainiu_news_seed_sql % (ip, main_md5)
         d.execute_no_commit(sql)
         if (self.try_num == Consumer._WORK_TRY_NUM):
             sql = update_sql_1 % (id)
             d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
         d.commit()
     finally:
         d.close()
Example 14
 def success_action(self, values):
     delete_sql = """
         delete from hainiu_queue where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
     try:
         d = DBUtil(config._HAINIU_DB)
         id = values[5]
         sql = delete_sql % id
         d.execute_no_commit(sql)
         sql = update_hainiu_news_seed_sql % (values[3],values[4],values[0])
         d.execute_no_commit(sql)
         d.commit()
     except:
         self.rl.exception()
         self.rl.error(sql)
         d.rollback()
         d.commit()
     finally:
         d.close()
Example 15
#-*- encoding: utf-8 -*-
'''
db_test.py
Created on 2019/6/25 11:14
Copyright (c) 2019/6/25, Hainiu Academy. All rights reserved.
@author: 潘牛
'''
from commons.util.db_util import DBUtil
from configs.config import _HAINIU_DB
db_util = DBUtil(_HAINIU_DB)

# Set the connection character set to utf8mb4
db_util.execute_no_commit("set NAMES utf8mb4;")

# Test execute(self, sql, params=None):
# sql = """
# insert into hainiu_queue (type,action,params) values (1, 'www.hainiubl.com', 'aa');
# """
# db_util.execute(sql)

# String concatenation (not recommended)
# sql = """
# insert into hainiu_queue (type,action,params) values (%d, '%s', '%s');
# """ % (1, 'www.hainiubl.com', 'aa')
# db_util.execute(sql)


# -------------------------------------
# Test execute(self, sql, params != None):
# SQL placeholders (recommended usage)
# sql = """
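
The comments above recommend SQL placeholders over string concatenation. As a rough illustration of what the truncated test presumably continues with, a placeholder-based insert written against the raw MySQLdb driver might look like the sketch below; the connection settings are placeholders and DBUtil's own execute signature is not reproduced here.

import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='secret',
                       db='hainiucrawler', charset='utf8')
try:
    cursor = conn.cursor()
    # The driver escapes and quotes each parameter itself, so no manual
    # string concatenation is needed and the values cannot break the SQL.
    sql = "insert into hainiu_queue (type, action, params) values (%s, %s, %s)"
    cursor.execute(sql, (1, 'www.hainiubl.com', 'aa'))
    conn.commit()
finally:
    conn.close()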
Example 16
def put_queue_inner():

    # Count matching records in the queue table
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # # Count matching records in the internally table
    # count_inner_sql='''
    # select count(*) from web_seed_internally where status=0;
    # '''
    #
    # # Records from web_seed_internally
    # select_inner_limit_sql='''
    # select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    # '''

    # Insert records into the queue table
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # web_seed_internally status
    update_sql = '''
    update web_seed_internally set status=1 where md5=%s and a_md5=%s;
        '''
    try:
        # redis_tmp data
        redis_d = RedisUtill()
        db_uitl = DBUtil(_ZZ_DB)

        ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
        port = '6379'
        list = []
        total_num = 0
        is_get_lock = redis_d.get_lock('seed_lock', 10)
        logger = LogUtil().get_base_logger()

        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_uitl.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            logger.info("queue has %d records,not insert!" % total_num1)
            return None

        logger.info("正在获取锁...")
        if is_get_lock:
            logger.info("获取到锁")
            start_time = time.time()

            def scan_limit_to_queue_table(host, port, cursor, match, count):
                r = redis.Redis(host, port)
                rs = r.scan(cursor, match, count)
                # New cursor
                next_num = rs[0]
                # print rs
                li = rs[1]
                for i in li:
                    if i.__contains__('a_url'):
                        list.append(i)

                # Recursion exit
                if next_num == 0:
                    return None
                scan_limit_to_queue_table(host, port, next_num, match, count)

            for ip in ips:
                scan_limit_to_queue_table(ip, port, 0, 'seed_temp*', 100)

            # Insert into the queue table page by page
            redis_result = []
            up_inner = []
            delete_list = []
            for k in list:
                if k.__contains__('a_url'):

                    # Work out the keys of the other fields that share this MD5
                    param = k.replace('a_url', 'param')
                    md5 = k.replace('a_url', 'md5')
                    a_md5 = k.replace('a_url', 'a_md5')

                    action = redis_d.get_value_for_key(k)
                    params = redis_d.get_value_for_key(param)
                    redis_result.append((2, action, params))

                    md5_val = redis_d.get_value_for_key(md5)
                    a_md5_val = redis_d.get_value_for_key(a_md5)
                    up_inner.append((md5_val, a_md5_val))
                    # Collect the keys to delete
                    delete_list.append(k)
                    delete_list.append(param)
                    delete_list.append(md5)
                    delete_list.append(a_md5)
                    total_num += 1

                # Bulk insert into queue
                if (len(redis_result) == 5):
                    db_uitl.executemany(insert_queue_sql, redis_result)
                    db_uitl.executemany(update_sql, up_inner)
                    redis_result = []
                    up_inner = []
            # Flush the last batch of fewer than five
            db_uitl.executemany(insert_queue_sql, redis_result)
            db_uitl.executemany(update_sql, up_inner)
            # Delete the redis_tmp keys
            redis_d.delete_batch(delete_list)

            redis_d.release('seed_lock')
            logger.info("释放锁")
        else:
            logger.info('another worker holds the lock; acquisition exceeded the max timeout, skipping this run')

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("total_num:%d, run_time:%.2f" % (total_num, run_time))

    except Exception, err:
        db_uitl.rollback()
        redis_d.release('seed_lock')
        traceback.print_exc(err)
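
put_queue_inner walks the Redis keyspace with SCAN under a simple distributed lock before moving the temporary seed keys into MySQL. The sketch below shows just the scan-and-lock part written directly against redis-py instead of the project's RedisUtill wrapper; the key pattern, lock name and timeout are assumptions.

import redis

def scan_matching_keys(host, port=6379, match='seed_temp*', count=100):
    # Iterate the keyspace with SCAN instead of recursion: keep calling
    # scan() with the returned cursor until it comes back as 0.
    r = redis.Redis(host=host, port=port)
    cursor, keys = 0, []
    while True:
        cursor, batch = r.scan(cursor=cursor, match=match, count=count)
        keys.extend(batch)
        if cursor == 0:
            return keys

def with_seed_lock(r, work, timeout=10):
    # Cheap distributed lock: SET NX with an expiry, so a crashed worker
    # cannot hold the lock forever.
    if not r.set('seed_lock', '1', nx=True, ex=timeout):
        return False
    try:
        work()
    finally:
        r.delete('seed_lock')
    return True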
Example 17
    def fail_action(self, values):
        '''
        After the message action fails, increase the message's failure count in the queue middleware and record the executing machine's IP.
        If this machine has reached its maximum number of failed attempts, set the message back to the unprocessed state so that other machines can try it again.

        :param values:      the result of processing the message action
        '''
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if (self.try_num == Consumer._WORK_TRY_NUM):
                sql = update_sql_1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
Example 18
def put_seed_to_queue(page_show_num):
    '''
    Page through the seed table and import the rows into hainiu_queue in batches
    :param page_show_num: number of rows fetched per query
    '''
    # Count unprocessed records in hainiu_queue
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """

    # Count matching records in the seed table
    select_seed_count_sql = """
    select count(*) from hainiu_web_seed where status=0;
    """

    # Paged query over the seed table
    select_seed_limit_sql = """
    select url, md5, domain, host, category from hainiu_web_seed
    where status=0 limit %s,%s;
     """

    # insert hainiu_queue sql
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("news_find_queue", "news_find_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        #1) Count unprocessed records in hainiu_queue
        sql_params = [1]
        # res1 is a tuple like ()
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None

        start_time = time.time()

        #2) Count matching records in the seed table
        res2 = db_util.read_one(select_seed_count_sql)
        seed_count = res2[0]

        # Work out how many pages there are
        page_num = seed_count / page_show_num if seed_count % page_show_num == 0 \
            else seed_count / page_show_num + 1

        # Paged query
        for i in range(page_num):
            sql_params = [i * page_show_num, page_show_num]
            # ({},{},{},{},{})
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            # Rows to insert into the queue table
            insert_queue_values = []

            params_dict = {}
            for row in res3:
                # url, md5, domain, host, category
                act = row['url']
                md5 = row['md5']
                domain = row['domain']
                host = row['host']
                category = row['category']
                params_dict['md5'] = md5
                params_dict['domain'] = domain
                params_dict['host'] = host
                params_dict['category'] = category

                params_json = json.dumps(params_dict,
                                         ensure_ascii=False,
                                         encoding='utf-8')

                insert_queue_values.append((1, act, params_json))
            # Bulk insert the queried rows into the queue table
            db_util.executemany(insert_queue_sql, insert_queue_values)

        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (seed_count, run_time))

    except Exception, e:
        logger.exception(e)
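
Several of these loaders compute the page count with the same integer-division expression. The tiny sketch below spells out the ceiling division they perform, assuming Python 2 semantics where / on two ints already truncates.

def page_count(total, page_size):
    # Ceiling division: one extra page whenever there is a remainder,
    # e.g. total=23, page_size=10 -> 3 pages (10, 10 and 3 rows).
    return total / page_size if total % page_size == 0 else total / page_size + 1

# An equivalent form that avoids the branch:
# page_count = (total + page_size - 1) / page_size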
Example 19
def put_queue(page_show_num):

    db_util = DBUtil(_ZZ_DB)
    # Count matching records in the queue table
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # Count matching records in the web_seed table
    count_seed_sql = '''
    select count(*) from web_seed where status=0;
    '''

    # Paged query over the web_seed table
    select_seed_limit_sql = '''
    select id,url,category from web_seed where status=0 limit %s,%s;
    '''

    # Insert records into the queue table
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # Update status in the web_seed table
    update_sql = '''
    update web_seed set status=1 where id in(%s);
    '''

    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None

        start_time = time.time()

        # Count matching records in the web_seed table
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]

        # Work out how many pages to query
        page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 else total_num2 / page_show_num + 1

        # Paged query
        ids = []

        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print sql_params
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)

            list1 = []

            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["url"]
                params = row["category"]
                type = 1
                list1.append((type, action, params))

            # Bulk insert into queue
            db_util.executemany(insert_queue_sql, list1)

        # Set status=1 on the imported seeds
        db_util.execute_no_commit(update_sql % ",".join(ids))

        db_util.commit()

        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)

    except Exception, err:
        db_util.rollback()
        traceback.print_exc(err)
Example 20
def put_seed():

    # Count matching records in the seed table
    count_queue_sql = '''
    select count(*) from web_seed where status=%s and fail_times < %s;
    '''

    # Count matching records in the web_seed_externally table
    count_exter_sql = '''
    select count(*) from web_seed_externally where status=0;
    '''

    # Records from web_seed_externally
    select_exter_limit_sql = '''
    select id,a_url,a_md5,a_host,param from web_seed_externally where status=0 limit %s,%s;
    '''

    # Insert records into the seed table
    insert_seed_sql = '''
    insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''

    # Update status in web_seed_externally
    update_sql = '''
    update web_seed_externally set status=1 where id in(%s);
     '''
    db_uitl = DBUtil(_ZZ_DB)

    sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
    res1 = db_uitl.read_one(count_queue_sql, sql_params)
    total_num1 = res1[0]
    if total_num1 != 0:
        print "queue has %d records,not insert!" % total_num1
        return None

    start_time = time.time()

    res2 = db_uitl.read_one(count_exter_sql)
    total_num2 = res2[0]

    # Work out how many pages to query
    page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
        "LIMIT_NUM"] == 0 else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1

    # hu = HtmlUtil()
    # u = Util()
    # Insert into the seed table page by page
    try:
        ids = []
        for i in range(0, page_num):

            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_uitl.read_dict(select_exter_limit_sql, sql_params)

            list1 = []

            for row in res3:
                id = row["id"]
                ids.append(str(id))

                url = row["a_url"]
                domain = get_tld(url)
                # host = hu.get_url_host(url)

                # md5 = u.get_md5(url)
                host = row["a_host"]
                md5 = row["a_md5"]
                category = row["param"]
                list1.append((url, md5, domain, host, category))
            # Bulk insert into the seed table
            db_uitl.executemany(insert_seed_sql, list1)

        # Set status = 1
        db_uitl.execute(update_sql % ",".join(ids))

    except Exception, err:
        db_uitl.rollback()
        traceback.print_exc(err)
Example 21
def put_queue_inner():

    # Count matching records in the queue table
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # Count matching records in the internally table
    count_inner_sql = '''
    select count(*) from web_seed_internally where status=0;
    '''

    # Records from web_seed_internally
    select_inner_limit_sql = '''
    select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''

    # Insert records into the queue table
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # web_seed_internally status
    update_sql = '''
    update web_seed_internally set status=1 where id in(%s);
        '''
    db_uitl = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_uitl.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None

        start_time = time.time()

        res2 = db_uitl.read_one(count_inner_sql)
        total_num2 = res2[0]

        # Work out how many pages to query
        page_num = total_num2 / _QUEUE_ZZ[
            "LIMIT_NUM"] if total_num2 % _QUEUE_ZZ[
                "LIMIT_NUM"] == 0 else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1

        # Insert into the queue table page by page
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_uitl.read_dict(select_inner_limit_sql, sql_params)

            list1 = []

            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["a_url"]
                params1 = row["param"]
                type = 2
                list1.append((type, action, params1))
            # Bulk insert into queue
            db_uitl.executemany(insert_queue_sql, list1)

        # Set status = 1
        db_uitl.execute(update_sql % ",".join(ids))
        db_uitl.commit()

        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)

    except Exception, err:
        db_uitl.rollback()
        traceback.print_exc(err)
Example 22
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                publisher = get_tld(url)
                publisher = publisher[0:publisher.index((
                    '.'))] if publisher.__contains__('.') else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))

            if values.__len__() != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
Example 23
def redis2Hdfs():

    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(_ZZ_DB)

        start = 0
        is_finish = True
        host_set = set()

        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        #local_xpath_file_path = '/user/zengqingyong17/spark/xpath_cache_file' + time_str
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str

        starttime = time.clock()
        r = redis.Redis('nn1.hadoop', '6379', db=6)
        while is_finish:
            values = set()
            rs = r.scan(start, "total_z:*", 10)
            # New cursor
            start = rs[0]
            if start == 0:
                is_finish = False
            # print rs
            for i in rs[1]:
                host = i.split(":")[1]
                total_key = i
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)

                # Take the top entries (ranks 0-1) in descending score order
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"

                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    txpath_num_1 = None
                    if txpath.__len__() == 2:
                        # Score of txpath[1] in txpath_key
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                # Get all members of fxpath_key
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    # print 'fxpath:%s' % fxpath
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")

        # Upload to the XPath config file directory on HDFS
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' % (host_set.__len__(), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
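
redis2Hdfs reads per-host XPath statistics back out of Redis sorted sets (the later xpath_config_file example does the same). Below is a compact sketch of just that read using redis-py's zrevrange and zscore; the key names mirror the example, while the helper name and the 0.8-ratio note are only illustrative.

import redis

def top_xpaths(r, host, top_n=2):
    # Members of the sorted set are xpath strings, scores are hit counts.
    txpath_key = 'txpath_z:%s' % host
    total = int(r.get('total_z:%s' % host) or 0)
    result = []
    for xpath in r.zrevrange(txpath_key, 0, top_n - 1):
        count = r.zscore(txpath_key, xpath) or 0
        # The example keeps an xpath when it covers >= 80% of the host's pages
        # or when its absolute count passes a fixed threshold.
        ratio = count / float(total) if total else 0.0
        result.append((xpath, int(count), ratio))
    return result

# r = redis.Redis('nn1.hadoop', 6379, db=6)
# top_xpaths(r, 'news.sina.com.cn')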
Example 24
 def fail_action(self, values):
     ip = Util().get_local_ip()
     db_util = DBUtil(_HAINIU_DB)
     #1) Record the failure count and failing IP in the hainiu_queue table;
     # is_success,self.id,len(inner_list),len(exter_list),md5
     queue_update_sql1 = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
     """
     #2) When this machine's failure count reaches its configured maximum number of
     # retries, set is_work = 0 on the corresponding hainiu_queue record so that
     # other machines can retry it;
     queue_update_sql2 = """
     update hainiu_queue set is_work=0 where id=%s;
     """
     #3) Update the seed table's failure count and failing IP. Queue rows are not
     # deleted, because the target site may simply have banned this IP; later a
     # script can reset the status, failure count and failing IP of the failed
     # queue rows so the crawl can be retried.
     seed_update_sql = """
     update hainiu_web_seed set  fail_times=fail_times+1,fail_ip=%s where md5=%s
     """
     try:
         sql_params = [ip, values[0]]
         db_util.execute_no_commit(queue_update_sql1, sql_params)
         # Compare the failure count against the limit
         if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
             sql_params = [self.id]
             db_util.execute_no_commit(queue_update_sql2, sql_params)
         sql_params = [ip, values[3]]
         db_util.execute_no_commit(seed_update_sql, sql_params)
         db_util.commit()
     except Exception, e:
         traceback.print_exc(e)
         db_util.rollback()
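
The fail_action variants all do the same bookkeeping: bump the failure counter together with the failing IP, and only when the local retry budget is exhausted hand the row back to the pool. A minimal, driver-agnostic sketch of that decision is below; the cursor argument and the MAX_RETRY_TIMES constant are hypothetical stand-ins for the project's DBUtil and config values.

MAX_RETRY_TIMES = 3  # hypothetical per-machine retry budget

def fail_bookkeeping(cursor, queue_id, seed_md5, ip, current_retry_num):
    # Always record the failure on both the queue row and the seed row.
    cursor.execute("update hainiu_queue set fail_times=fail_times+1, fail_ip=%s "
                   "where id=%s", (ip, queue_id))
    cursor.execute("update hainiu_web_seed set fail_times=fail_times+1, fail_ip=%s "
                   "where md5=%s", (ip, seed_md5))
    # Only on the machine's last local attempt is the row handed back
    # (is_work=0, or type=1 in the older variant) so another machine can retry.
    if current_retry_num == MAX_RETRY_TIMES - 1:
        cursor.execute("update hainiu_queue set is_work=0 where id=%s", (queue_id,))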
Example 25
    def action(self, *values):

        # SQL to insert into the internal-link table
        insert_seed_internally='''
        insert into web_seed_internally
        (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
        '''
        # SQL to insert into the external-link table
        insert_seed_externally='''
        insert into web_seed_externally
        (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status)
        values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=VALUES(update_time);
        '''

        # Time helper
        a_time = TimeUtil()
        db_util = DBUtil(_ZZ_DB)
        # redis_d = RedisUtill()
        total_count = 0
        in_count = 0
        ex_count = 0
        try:
            # Parse the main page's info
            hu = HtmlUtil()

            domain = hu.get_url_domain(self.act)
            host = hu.get_url_host(self.act)
            u = Util()
            md5 = u.get_md5(self.act)

            # Parse the <a> tag info
            r = RequestUtil()
            # Request the url via phantomjs; returns the page including its AJAX-loaded content
            html = r.http_get_phandomjs(self.act)
            # BeautifulSoup: a third-party Python library for extracting data from HTML or XML
            soup = BeautifulSoup(html, 'lxml')
            # Set of <a> links already seen
            aset = set()
            # Get the host
            a_host = hu.get_url_host(self.act)

            # a_docs = soup.find_all("a",href=re.compile("^(/|.*"+domain+")"))
            a_docs = soup.find_all("a")

            for a in a_docs:

                total_count += 1
                # Get the <a> tag's href
                a_url = hu.get_format_url(self.act,a,a_host)
                # Get the <a> tag's text
                a_title = a.get_text().strip()
                if a_url == '' or a_title == '':
                    continue

                if aset.__contains__(a_url):
                    continue
                aset.add(a_url)

                # Get the <a> tag's host
                a_host = hu.get_url_host(a_url)

                # Get the md5 of the <a> tag's href url
                a_md5 = u.get_md5(a_url)

                # Get the xpath of the <a> tag
                a_xpath = hu.get_dom_parent_xpath_js_new(a)

                create_time = a_time.get_timestamp()
                create_day = int(a_time.now_day(format='%Y%m%d'))
                create_hour = int(a_time.now_hour())

                params_sql = [self.act,md5,self.params,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,create_time,0]
                if re.compile("^(/|.*"+domain+")").match(a_url) is not None:
                    db_util.execute(insert_seed_internally, params_sql)
                #
                #     # redis
                #     redis_md5 = u.get_md5(md5+"\001"+a_md5)
                #     find_key = redis_d.get_value_for_key('seed:%s:a_url' % redis_md5)
                #     if find_key == None:
                #         # url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time,status
                #         dicts = {'seed:%s:param' % redis_md5 :self.params, 'seed:%s:a_url' % redis_md5 : a_url,
                #                 'seed:%s:md5' % redis_md5 : md5, 'seed:%s:a_md5' % redis_md5 :a_md5}
                #
                #         dicts_temp = {'seed_temp:%s:param' % redis_md5 :self.params,'seed_temp:%s:a_url' % redis_md5 : a_url,
                #                     'seed_temp:%s:md5' % redis_md5 : md5, 'seed_temp:%s:a_md5' % redis_md5 : a_md5}
                #         redis_d.set_batch_datas(dicts)
                #         redis_d.set_batch_datas(dicts_temp)

                    in_count += 1
                else:
                    db_util.execute(insert_seed_externally, params_sql)
                    ex_count += 1

            r.close_phandomjs()
        except Exception, err:
            db_util.rollback()
            traceback.print_exc(err)
Example 26
def put_inner_to_queue():
    '''
    Page through hainiu_web_seed_internally and import the rows into hainiu_queue in batches.
    '''
    page_show_num = 10
    # Count unprocessed records in hainiu_queue
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Count matching records in the internal-link table
    select_inner_count_sql = """
    select count(*) from hainiu_web_seed_internally where status=0;
    """
    # Paged query over the internal-link table
    select_inner_limit_sql = """
    select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally WHERE
    status=0 limit 0,%s;
    """
    # Insert into the hainiu_queue table
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # Update the status of the internal-link table
    update_inner_status_sql = """
    update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # Count unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None
        # Count matching records in the internal-link table
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]

        # Work out how many pages there are
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1
        start_time = time.time()
        # Paged query
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # Records to insert into the queue table
            insert_queue_record = []
            # param dict
            param_dict = {}
            # Records in the inner table whose status needs updating
            update_innner_status_record = []
            for row in res3:
                # md5,a_url,a_md5,domain,a_host,a_title
                md5 = row['md5']
                a_url = row['a_url']
                a_md5 = row['a_md5']
                domain = row['domain']
                a_host = row['a_host']
                a_title = row['a_title']
                # param data
                param_dict['md5'] = md5
                param_dict['a_md5'] = a_md5
                param_dict['domain'] = domain
                param_dict['a_host'] = a_host
                param_dict['a_title'] = a_title

                param_json = json.dumps(param_dict,
                                        ensure_ascii=False,
                                        encoding='utf-8')
                # Append the row data to the lists
                insert_queue_record.append((2, a_url, param_json))
                update_innner_status_record.append((a_md5, md5))

            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql,
                                update_innner_status_record)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (inner_count, run_time))

    except Exception, e:
        traceback.print_exc(e)
        db_util.rollback()
Example 27
    def fail_action(self, values):
        # On every failure, update the failing IP and the failure count
        update_sql1='''
        update web_queue set fail_ip = %s , fail_times = fail_times + 1 where id = %s;
        '''
        # When the failure count reaches each machine's max retries, set is_work=0 on the record so it can be retried
        update_sql2='''
        update web_queue set is_work = 0 where id = %s;
        '''
        # Update the seed table status
        update_seed_sql = '''
        update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
        '''
        # Update the externally table status
        update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
        '''

        db_util = DBUtil(_ZZ_DB)

        try:
            id = values[0]
            ip = Util().get_local_ip()
            # Update the failing IP and failure count on every failure
            # queue table
            sql_params = [ip, id]
            db_util.execute_no_commit(update_sql1, sql_params)
            # seed table
            sql_params = [ip, values[1]]
            db_util.execute(update_seed_sql, sql_params)
            # externally table
            db_util.execute(update_exter_sql, sql_params)

            if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
                db_util.execute_no_commit(update_sql2 % id)

            db_util.commit()

        except Exception,err:
            db_util.rollback()
            traceback.print_exc(err)
Example 28
    def action(self):
        # Crawl the qualifying url from hainiu_queue and collect all <a> tag urls on the requested page
        r = RequestUtil()
        hu = HtmlUtil()
        u = Util()
        #
        is_success = True
        db_util = DBUtil(_HAINIU_DB)
        time_util = TimeUtil()
        # Lists for the internal- and external-link tables
        inner_list = []
        exter_list = []
        # Get the seed's md5
        md5 = u.get_md5(self.act)
        try:
            # Request the url via phantomjs; returns the page including its AJAX-loaded content
            html = r.http_get_phandomjs(self.act)
            # BeautifulSoup: a third-party Python library for extracting data from HTML or XML
            soup = BeautifulSoup(html, 'lxml')
            # List of <a> link DOM objects
            a_docs = soup.find_all("a")
            if len(a_docs) == 0:
                is_success = False
            aset = set()
            # Get the seed's domain
            domain = hu.get_url_domain(self.act)
            # Get the seed's host
            host = hu.get_url_host(self.act)

            # Times (create_time, create_day, create_hour, update_time)
            # create_time=time_util.get_timestamp()
            #
            # create_day = int(time_util.now_day().replace('-', ''))
            # create_hour=int(time_util.now_hour())
            # update_time=create_time
            create_time = time_util.get_timestamp()
            # Date in year-month-day format
            create_day = int(time_util.now_day(format='%Y%m%d'))
            # Hour
            create_hour = int(time_util.now_hour())
            update_time = create_time

            # params_json = json.dumps(self.params, ensure_ascii=False, encoding='utf-8')

            for a_doc in a_docs:
                # Get the <a> tag's href
                a_href = hu.get_format_url(self.act, a_doc, host)
                # Get the <a> tag's text
                a_title = a_doc.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if aset.__contains__(a_href):
                    continue
                aset.add(a_href)
                # Get the <a> tag's host
                a_host = hu.get_url_host(a_href)

                # Get the md5 of the <a> tag's href url
                a_md5 = u.get_md5(a_href)

                # Get the xpath of the <a> tag
                a_xpath = hu.get_dom_parent_xpath_js_new(a_doc)
                # One row of data
                row_data = (self.act, md5, self.params, domain, host, a_href,
                            a_md5, a_host, a_xpath, a_title, create_time,
                            create_day, create_hour, update_time)
                if a_href.__contains__(domain):
                    inner_list.append(row_data)
                else:
                    exter_list.append(row_data)
            # Parse and store the links into the internal- or external-link table; when a
            # url already exists, only an update is performed (so linked pages are not
            # crawled repeatedly).
            if len(inner_list) > 0:
                inner_insert_sql = """
              insert into hainiu_web_seed_internally
              (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
              create_day,create_hour,update_time)
              values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
              ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
                db_util.executemany_no_commit(inner_insert_sql, inner_list)
            if len(exter_list) > 0:
                exter_insert_sql = """
              insert into hainiu_web_seed_externally
              (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,create_time,
              create_day,create_hour,update_time)
              values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
              ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
                db_util.executemany_no_commit(exter_insert_sql, exter_list)
            db_util.commit()
        except Exception, e:
            is_success = False
            db_util.rollback()
            traceback.print_exc(e)
Example 29
def xpath_config_file():
    select_xpath_rule_sql = """select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0"""
    rl = LogUtil().get_base_logger()
    try:
        # _HAINIU_DB = {'HOST': '192.168.137.190', 'USER': '******', 'PASSWD': '12345678', 'DB': 'hainiucrawler',
        #             'CHARSET': 'utf8', 'PORT': 3306}
        d = DBUtil(config._HAINIU_DB)
        # d = DBUtil(_HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        # r = redis.Redis('redis.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")

        time_str = t.now_time(format='%Y%m%d%H%M%S')
        # local_xpath_file_path = '/Users/leohe/Data/input/xpath_cache_file/xpath_file' + time_str
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str

        start_cursor = 0
        is_finish = True
        starttime = time.clock()
        host_set = set()

        while is_finish:
            values = set()
            limit = r.scan(start_cursor, 'total:*', 10)
            if limit[0] == 0:
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)

                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                if txpath:
                    # print 'txpath:%s' % txpath
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    txpath_num_1 = None
                    if txpath.__len__() == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0

                    # print 'txpath_max_num:%s' % txpath_num
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format %
                                       (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format %
                                       (host, txpath[1], 'true', '0'))
                            host_set.add(host)

                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    # print 'fxpath:%s' % fxpath
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                    host_set.add(host)

                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format %
                                   (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format %
                                   (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)

            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # Upload to the XPath config file directory on HDFS
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('total host %s,action time %s\'s' %
                (host_set.__len__(), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
Example 30
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return


        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = selec_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param = param if param is not None else ''
                values.append((url,param))
                id = l[2]
                id_values.append(str(id))
            if id_values.__len__() != 0:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql,values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total,worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()