def fail_action(self, values):
        '''
        Record a failed message action in the ``hainiu_queue`` table.

        Increments ``fail_times`` and stores this machine's IP in ``fail_ip``
        for the queue row. When ``self.try_num`` equals
        ``Consumer._WORK_TRY_NUM`` the row's ``type`` is additionally set to 1;
        per the original author's note this is meant to hand the message back
        so that other machines can try it.

        Both statements run in one transaction. Any error is logged through
        ``self.rl`` and the transaction is rolled back; nothing is re-raised.

        :param values:      result of the message action; ``values[0]`` is the
                            id of the ``hainiu_queue`` row
        '''
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
            update hainiu_queue set type=1 where id=%s
        """
        # Bound before the try block so that the error handler and the cleanup
        # code never hit an unbound name when the failure happens early
        # (e.g. while opening the connection).
        sql = ''
        d = None
        try:
            d = DBUtil(config._HAINIU_DB)
            queue_id = values[0]
            ip = Util().get_local_ip()
            sql = update_sql % (ip, queue_id)
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql_1 % queue_id
                d.execute_no_commit(sql)
            d.commit()
        except Exception:
            self.rl.exception()
            self.rl.error(sql)
            if d is not None:
                d.rollback()
        finally:
            if d is not None:
                d.close()
Beispiel #2
0
def put_inner_to_queue():
    '''
    Import pending download tasks from Redis into the ``hainiu_queue`` table.

    Nothing is imported while ``hainiu_queue`` still holds 5 or more
    unprocessed type-2 rows (``is_work=0`` and ``fail_times=0``). Otherwise,
    for every address in the module-level ``ips``:

    1. ``scan_limit_to_queue_table`` is called with the pattern ``down:*``
       and a fresh ``key_list`` (presumably it appends the matching keys to
       that list - confirm against its definition);
    2. the values of those keys are read from Redis and inserted as type-2
       queue rows (``action`` = the value's ``a_url``, ``params`` = the raw
       JSON value);
    3. the insert is committed and only then are the keys deleted from Redis.

    Errors are printed with ``traceback`` and the open transaction is rolled
    back; the database connection is always closed.
    '''
    # Number of hainiu_queue rows that have not been processed yet
    select_queue_count_sql = """
    select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # Insert into hainiu_queue
    insert_queue_sql = """
    insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """

    redis_util = RedisUtill()
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        db_util.execute_no_commit("set NAMES utf8mb4;")
        # Skip the import while the queue still has enough pending work
        res1 = db_util.read_one(select_queue_count_sql, [2])
        queue_count = res1[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue 有 %d 条未处理的记录,不需要导入!" % queue_count)
            return None
        inner_count = 0
        for ip in ips:
            key_list = []
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)
            if not key_list:
                # Nothing found on this node: avoid an empty batch insert
                # and an empty Redis delete.
                continue

            inner_count = inner_count + len(key_list)
            # Fetch the values that belong to the collected keys
            values = redis_util.get_values_batch_keys(key_list)
            # Build the rows for hainiu_queue
            insert_queue_record = []
            for value in values:
                queue_param = json.loads(value)
                insert_queue_record.append((2, queue_param['a_url'], value))

            db_util.executemany_no_commit(insert_queue_sql,
                                          insert_queue_record)
            db_util.commit()
            # Only after a successful commit are the keys removed from Redis
            redis_util.delete_batch(key_list)

        run_time = time.time() - start_time
        logger.info("本地导入 %d 条数据, 用时 %.2f 秒" % (inner_count, run_time))

    except Exception:
        traceback.print_exc()
        db_util.rollback()
    finally:
        db_util.close()
    def queue_items(self):
        '''
        Claim a batch of pending ``hainiu_queue`` rows and wrap them as actions.

        Rows are read with ``select ... for update`` and marked ``is_work=1``
        in the same transaction, so that several machines take rows one after
        another instead of grabbing the same ones. Rows whose ``fail_ip`` is
        this machine's IP, or whose ``fail_times`` reached
        ``_QUEUE_NEWS_FIND['MAX_FAIL_TIMES']``, are not selected.

        :return: list of ``NewsFindConsumerAction`` instances, one per claimed
                 row; an empty list when nothing was claimed or when the
                 transaction failed and was rolled back
        '''
        select_sql = """
        select id,action,params
        from hainiu_queue where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s limit %s for update;
        """

        # The id list is spliced in as text because the length of the
        # IN (...) list varies; the ids come straight from the select above.
        update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """
        c_actions = []
        # Ids of the selected rows, used for the is_work update
        ids = []
        db_util = DBUtil(_HAINIU_DB)
        try:
            ip = Util().get_local_ip()
            sql_params = [
                1, 0, ip, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'],
                _QUEUE_NEWS_FIND['LIMIT_NUM']
            ]
            res1 = db_util.read_dict(select_sql, sql_params)
            for row in res1:
                queue_id = row['id']
                ids.append(str(queue_id))
                c_actions.append(
                    NewsFindConsumerAction(queue_id, row['action'],
                                           row['params']))

            if ids:
                db_util.execute_no_commit(update_sql % ",".join(ids))

            db_util.commit()
        except Exception:
            db_util.rollback()
            traceback.print_exc()
            # The rows were not marked is_work=1, so they must not be handed
            # out as claimed work.
            c_actions = []
        finally:
            db_util.close()
        return c_actions
 def success_action(self, values):
     '''
     Record a successful crawl of a seed URL and remove it from the queue.

     In one transaction:
     1) store the crawl counters and ``last_crawl_time=now()`` on the
        ``hainiu_web_seed`` row (used to check whether the last crawl worked);
     2) delete the crawled URL's row from ``hainiu_queue``.

     Errors are printed with ``traceback`` and rolled back; the connection is
     always closed.

     :param values: action result; ``values[0]`` is the queue row id,
                    ``values[1]`` / ``values[2]`` are written to
                    ``last_crawl_internally`` / ``last_crawl_externally`` and
                    ``values[3]`` is the seed md5
     '''
     seed_update_sql = """
     update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5=%s;
     """
     queue_delete_sql = """
     delete from hainiu_queue where id=%s
     """
     db_util = DBUtil(_HAINIU_DB)
     try:
         db_util.execute_no_commit(seed_update_sql,
                                   [values[1], values[2], values[3]])
         db_util.execute_no_commit(queue_delete_sql, [values[0]])
         db_util.commit()
     except Exception:
         traceback.print_exc()
         db_util.rollback()
     finally:
         db_util.close()
Beispiel #5
0
    def queue_items(self):
        '''
        Claim a batch of pending ``web_queue`` rows and wrap them as actions.

        Rows are read with ``select ... for update`` and marked ``is_work=1``
        in the same transaction. Rows whose ``fail_ip`` equals this machine's
        IP, or whose ``fail_times`` reached ``_QUEUE_ZZ["MAX_FAIL_TIMES"]``,
        are skipped.

        :return: list of ``WebConsumerAction`` instances, one per claimed row;
                 an empty list when nothing was claimed or when the
                 transaction failed and was rolled back
        '''
        # Query that excludes rows which last failed on this machine's IP
        select_sql='''
        select id, action, params from web_queue where type=%s
        and is_work=%s and fail_ip != %s and fail_times < %s limit 0, %s for update;
        '''

        # The id list is spliced in as text because its length varies
        update_sql='''
        update web_queue set is_work=1 where id in(%s);
        '''
        actions = []
        db_util = DBUtil(_ZZ_DB)

        try:
            ip = Util().get_local_ip()
            sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]

            res = db_util.read_dict(select_sql, sql_params)

            ids = []
            for row in res:
                queue_id = row["id"]
                ids.append(str(queue_id))
                # Wrap the row as a consumer action
                actions.append(
                    WebConsumerAction(queue_id, row["action"], row["params"]))

            if actions:
                # Mark the claimed rows as being worked on
                db_util.execute_no_commit(update_sql % ",".join(ids))

            db_util.commit()

        except Exception:
            # Nothing was claimed if the transaction did not commit
            actions = []
            db_util.rollback()
            traceback.print_exc()
        finally:
            db_util.close()
        return actions
Beispiel #6
0
    def fail_action(self, values):
        '''
        Record a failed action in the queue, seed and external-link tables.

        In one transaction this increments ``fail_times`` and stores this
        machine's IP as ``fail_ip`` on the ``web_queue`` row, on the
        ``web_seed`` row and on the ``web_seed_externally`` row. When this
        machine has used up its retries (``self.current_retry_num`` equals
        ``_QUEUE_ZZ["C_RETRY_TIMES"] - 1``) the queue row is reset to
        ``is_work = 0`` so it can be retried.

        Errors are printed with ``traceback`` and rolled back; the connection
        is always closed.

        :param values: action result; ``values[0]`` is the ``web_queue`` row id
                       and ``values[1]`` is the md5 matched against
                       ``web_seed.md5`` and ``web_seed_externally.a_md5``
        '''
        # Every failure updates the failing IP and the failure count
        update_sql1='''
        update web_queue set fail_ip = %s , fail_times = fail_times + 1 where id = %s;
        '''
        # Once this machine's retry limit is reached, reset is_work=0 so the
        # row can be retried
        update_sql2='''
        update web_queue set is_work = 0 where id = %s;
        '''
        # Failure bookkeeping on the seed table
        update_seed_sql = '''
        update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
        '''
        # Failure bookkeeping on the external-link table
        update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
        '''

        db_util = DBUtil(_ZZ_DB)

        try:
            queue_id = values[0]
            ip = Util().get_local_ip()
            # Queue table
            db_util.execute_no_commit(update_sql1, [ip, queue_id])
            # Seed and external-link tables. execute_no_commit is used for
            # every statement so that all of them belong to the single
            # transaction committed below (execute() presumably commits on its
            # own, which would make a later rollback incomplete).
            seed_params = [ip, values[1]]
            db_util.execute_no_commit(update_seed_sql, seed_params)
            db_util.execute_no_commit(update_exter_sql, seed_params)

            if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
                db_util.execute_no_commit(update_sql2, [queue_id])

            db_util.commit()

        except Exception:
            db_util.rollback()
            traceback.print_exc()
        finally:
            db_util.close()
Beispiel #7
0
 def success_action(self, values):
     '''
     Remove a finished row from the queue and record the crawl on its seed.

     In one transaction the ``hainiu_queue`` row is deleted and the
     ``hainiu_web_seed`` row gets its crawl counters and
     ``last_crawl_time=now()`` updated. Errors are logged through ``self.rl``
     and rolled back; nothing is re-raised.

     :param values: action result; ``values[5]`` is the queue row id,
                    ``values[0]`` the seed md5, ``values[3]`` / ``values[4]``
                    are written to ``last_crawl_internally`` /
                    ``last_crawl_externally``
     '''
     delete_sql = """
         delete from hainiu_queue where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
     # Bound up front so the error handler and cleanup never hit an unbound
     # name when the failure happens before they are assigned.
     sql = ''
     d = None
     try:
         d = DBUtil(config._HAINIU_DB)
         queue_id = values[5]
         sql = delete_sql % queue_id
         d.execute_no_commit(sql)
         sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
         d.execute_no_commit(sql)
         d.commit()
     except Exception:
         self.rl.exception()
         self.rl.error(sql)
         # A rollback ends the transaction; no commit is needed afterwards.
         if d is not None:
             d.rollback()
     finally:
         if d is not None:
             d.close()
Beispiel #8
0
 def fail_action(self, values):
     '''
     Record a failed action on the queue row and on its seed.

     In one transaction ``fail_times`` is incremented and this machine's IP is
     stored as ``fail_ip`` on both the ``hainiu_queue`` row and the
     ``hainiu_web_seed`` row. When ``self.try_num`` equals
     ``Consumer._WORK_TRY_NUM`` the queue row's ``type`` is also set to 1.
     Errors are logged through ``self.rl`` and rolled back; nothing is
     re-raised.

     :param values: action result; ``values[5]`` is the queue row id and
                    ``values[0]`` the seed md5
     '''
     update_sql = """
         update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
     """
     update_sql_1 = """
         update hainiu_queue set type=1 where id=%s;
     """
     update_hainiu_news_seed_sql = """
         update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
     """
     # Bound up front so the error handler and cleanup never hit an unbound
     # name when the failure happens before they are assigned.
     sql = ''
     d = None
     try:
         d = DBUtil(config._HAINIU_DB)
         queue_id = values[5]
         ip = Util().get_local_ip()
         sql = update_sql % (ip, queue_id)
         d.execute_no_commit(sql)
         main_md5 = values[0]
         sql = update_hainiu_news_seed_sql % (ip, main_md5)
         d.execute_no_commit(sql)
         if self.try_num == Consumer._WORK_TRY_NUM:
             sql = update_sql_1 % (queue_id)
             d.execute_no_commit(sql)
         d.commit()
     except Exception:
         self.rl.exception()
         self.rl.error(sql)
         # A rollback ends the transaction; no commit is needed afterwards.
         if d is not None:
             d.rollback()
     finally:
         if d is not None:
             d.close()
 def fail_action(self, values):
     '''
     Record a failed crawl on the queue row and on the seed table.

     The three statements below run in one transaction. Errors are printed
     with ``traceback`` and rolled back; the connection is always closed.

     :param values: action result; ``values[0]`` is the ``hainiu_queue`` row id
                    and ``values[3]`` the md5 of the ``hainiu_web_seed`` row
     '''
     ip = Util().get_local_ip()
     db_util = DBUtil(_HAINIU_DB)
     # 1) Record the failure count and the failing IP on hainiu_queue
     queue_update_sql1 = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
     """
     # 2) Once this machine has reached its own retry limit, reset the row to
     #    is_work = 0 so that other machines can retry it
     queue_update_sql2 = """
     update hainiu_queue set is_work=0 where id=%s;
     """
     # 3) Update the failure count / failing IP on the seed table. The queue
     #    row is kept on purpose: the target site may have blocked this IP,
     #    and a separate script can later reset the failed rows for another
     #    attempt.
     seed_update_sql = """
     update hainiu_web_seed set  fail_times=fail_times+1,fail_ip=%s where md5=%s
     """
     try:
         db_util.execute_no_commit(queue_update_sql1, [ip, values[0]])
         # Last retry allowed on this machine?
         if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
             db_util.execute_no_commit(queue_update_sql2, [self.id])
         db_util.execute_no_commit(seed_update_sql, [ip, values[3]])
         db_util.commit()
     except Exception:
         traceback.print_exc()
         db_util.rollback()
     finally:
         db_util.close()
 def fail_action(self, values):
     '''
     Record a failed download on the queue row and on the internal-link table.

     The three statements below run in one transaction. Errors are printed
     with ``traceback`` and rolled back; the connection is always closed.

     :param values: action result; ``values[0]`` is the ``hainiu_queue`` row
                    id, ``values[1]`` / ``values[2]`` are matched against
                    ``md5`` / ``a_md5`` of ``hainiu_web_seed_internally``
     '''
     ip = Util().get_local_ip()
     db_util = DBUtil(_HAINIU_DB)
     # 1) Record the failure count and the failing IP on the queue table
     queue_update_sql1 = """
     update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
     """
     # 2) Once this machine has reached its own retry limit, reset the row to
     #    is_work = 0 so that other machines can retry it
     queue_update_sql2 = """
     update hainiu_queue set is_work=0 where id=%s;
     """
     # 3) Update failure count / failing IP on the internal-link table; the
     #    queue row itself is not deleted
     inner_update_sql = """
     update hainiu_web_seed_internally set  fail_times=fail_times+1,fail_ip=%s where md5=%s and a_md5=%s
     """
     try:
         # 1)
         db_util.execute_no_commit(queue_update_sql1, [ip, values[0]])
         # 2) Last retry allowed on this machine?
         if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
             db_util.execute_no_commit(queue_update_sql2, [self.id])
         # 3)
         db_util.execute_no_commit(inner_update_sql,
                                   [ip, values[1], values[2]])
         db_util.commit()
     except Exception:
         db_util.rollback()
         traceback.print_exc()
     finally:
         db_util.close()
 def success_action(self, values):
     '''
     Remove a downloaded URL from the queue and touch its internal-link row.

     Both statements run in one transaction. Errors are printed with
     ``traceback`` and rolled back; the connection is always closed.

     :param values: action result; ``values[0]`` is the ``hainiu_queue`` row
                    id, ``values[1]`` / ``values[2]`` are matched against
                    ``a_md5`` / ``md5`` of ``hainiu_web_seed_internally``
     '''
     db_util = DBUtil(_HAINIU_DB)
     time_util = TimeUtil()
     # 1) Delete the successfully downloaded URL from hainiu_queue
     queue_delete_sql = """
     delete from hainiu_queue where id=%s
     """
     # 2) Refresh the last-update time of the internal-link row
     inner_update_sql = """
     update hainiu_web_seed_internally set update_time= %s where a_md5=%s AND
     md5=%s
     """
     update_time = time_util.get_timestamp()
     try:
         db_util.execute_no_commit(queue_delete_sql, [values[0]])
         db_util.execute_no_commit(inner_update_sql,
                                   [update_time, values[1], values[2]])
         db_util.commit()
     except Exception:
         traceback.print_exc()
         db_util.rollback()
     finally:
         db_util.close()
Beispiel #12
0
    def action(self):
        '''
        Crawl ``self.url``, extract its links and store them as seed rows.

        The page is fetched through ``RequestUtil.http_get_phandomjs`` and
        parsed with BeautifulSoup. Every ``<a>`` element that yields a
        non-empty URL (via ``self.get_format_url``) and a non-empty text is
        collected once per URL. A link whose host contains the page's
        ``get_tld`` domain goes to ``hainiu_web_seed_internally``, every other
        link to ``hainiu_web_seed_externally``. Both batches are inserted in
        one transaction.

        Any error is logged through ``self.rl`` and turns the result into a
        failure; nothing is re-raised.

        :return: whatever the parent class's ``result`` returns for
                 ``(is_success, [main_md5, self.url, last processed a_href,
                 number of internal links, number of external links,
                 self.queue_id])``
        '''
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        now_time = datetime.now()
        update_time = int(time.mktime(now_time.timetuple()))
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = get_tld(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = self.get_format_url(a, host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                # Keep only the first occurrence of every URL
                if a_href in a_set:
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                # a_title is known to be non-empty at this point
                a_param['title'] = a_title
                out_json_srt = json.dumps(a_param, ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                # NOTE(review): these values are passed as bound parameters to
                # executemany_no_commit below, so the extra
                # MySQLdb.escape_string looks like double escaping - confirm
                # before changing, since it affects what is stored.
                insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath, create_time, create_day, create_hour, update_time, status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)

                if domain in a_host:
                    in_values.append(insert_values)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
                insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
                      values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=update_time;
            """
            # Bound up front so the error handler and cleanup never hit an
            # unbound name when the failure happens before they are assigned.
            sql = ''
            d = None
            try:
                d = DBUtil(config._HAINIU_DB)
                d.execute_no_commit("set NAMES utf8mb4;")
                if in_values:
                    sql = insert_sql.replace('<table>', in_table)
                    d.executemany_no_commit(sql, in_values)
                if ex_values:
                    sql = insert_sql.replace('<table>', ex_table)
                    d.executemany_no_commit(sql, ex_values)
                d.commit()
            except Exception:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                if d is not None:
                    d.rollback()
            finally:
                if d is not None:
                    d.close()

        except Exception:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        # NOTE(review): super(self.__class__, self) recurses forever if this
        # class is ever subclassed; the class name is not visible here, so the
        # call is left unchanged.
        return super(self.__class__, self).result(is_success, [main_md5, self.url, a_href, len(in_values), len(ex_values), self.queue_id])
Beispiel #13
0
def put_queue(page_show_num):
    '''
    Copy all unprocessed seeds (``web_seed.status=0``) into ``web_queue``.

    Nothing is done while ``web_queue`` still has rows with ``is_work=0`` and
    ``fail_times`` below ``_QUEUE_ZZ["MAX_FAIL_TIMES"]``. Otherwise the seeds
    are read page by page, inserted as type-1 queue rows (``action`` = seed
    url, ``params`` = seed category) and finally marked ``status=1``. The
    inserts and the status update share one transaction, committed at the end.

    Errors are printed with ``traceback`` and rolled back; the connection is
    always closed.

    :param page_show_num: number of seed rows to read per page
    :return: None
    '''
    db_util = DBUtil(_ZZ_DB)
    # Count the queue rows that are still waiting
    count_queue_sql = '''
    select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''

    # Count the seed rows that qualify
    count_seed_sql = '''
    select count(*) from web_seed where status=0;
    '''

    # Paged read of the seed rows
    select_seed_limit_sql = '''
    select id,url,category from web_seed where status=0 limit %s,%s;
    '''

    # Insert into the queue table
    insert_queue_sql = '''
    insert into web_queue (type,action,params) values(%s,%s,%s);
    '''

    # Mark seeds as queued; the id list is spliced in as text because its
    # length varies
    update_sql = '''
    update web_seed set status=1 where id in(%s);
    '''

    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print("queue has %d records,not insert!" % total_num1)
            return None

        start_time = time.time()

        # Total number of qualifying seed rows
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]

        # Number of pages, rounded up
        full_pages, remainder = divmod(total_num2, page_show_num)
        page_num = full_pages + 1 if remainder else full_pages

        ids = []

        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print(sql_params)
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)

            queue_rows = []

            for row in res3:
                ids.append(str(row["id"]))
                queue_rows.append((1, row["url"], row["category"]))

            # Batch insert without committing, so that the inserts and the
            # status update below succeed or fail together
            if queue_rows:
                db_util.executemany_no_commit(insert_queue_sql, queue_rows)

        # Mark the copied seeds; an empty id list would produce "in()",
        # which is not valid SQL
        if ids:
            db_util.execute_no_commit(update_sql % ",".join(ids))

        db_util.commit()

        run_time = time.time() - start_time
        print("total_num:%d, run_time:%.2f" % (total_num2, run_time))

    except Exception:
        db_util.rollback()
        traceback.print_exc()
    finally:
        db_util.close()
Beispiel #14
0
#-*- encoding: utf-8 -*-
'''
db_test.py
Created on 2019/6/25 11:14
Copyright (c) 2019/6/25, 海牛学院版权所有.
@author: 潘牛
'''
from commons.util.db_util import DBUtil
from configs.config import _HAINIU_DB
# Open a DBUtil handle on the _HAINIU_DB configuration for the manual checks
# below.
db_util = DBUtil(_HAINIU_DB)

# Set the connection character set to utf8mb4
db_util.execute_no_commit("set NAMES utf8mb4;")

# Test execute(self,sql,params = None):
# sql = """
# insert into hainiu_queue (type,action,params) values (1, 'www.hainiubl.com', 'aa');
# """
# db_util.execute(sql)

# String interpolation (not recommended)
# sql = """
# insert into hainiu_queue (type,action,params) values (%d, '%s', '%s');
# """ % (1, 'www.hainiubl.com', 'aa')
# db_util.execute(sql)


# -------------------------------------
# Test execute(self,sql,params != None):
# SQL placeholders (recommended usage)
# sql = """