def fail_action(self, values):
    '''
    After a message action fails, increase the message's failure count in the
    queue middleware and record the IP of the machine that ran it.
    If this machine has reached its maximum retry count, set the message's
    status back to unprocessed so that other machines can try it again.

    :param values: the result of the message action
    '''
    update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
        update hainiu_queue set type=1 where id=%s;
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[0]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        if self.try_num == Consumer._WORK_TRY_NUM:
            sql = update_sql_1 % id
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def put_inner_to_queue():
    '''
    Import pending download keys from Redis into hainiu_queue, unless the
    queue already has enough unprocessed records.
    '''
    redis_util = RedisUtill()
    page_show_num = 10
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # insert into hainiu_queue
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    db_util.execute_no_commit("set NAMES utf8mb4;")
    try:
        # count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, no import needed!" % queue_count)
            return None
        inner_count = 0
        # `ips` and `port` are the Redis node list and port, taken from module-level config
        for ip in ips:
            key_list = []
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)
            inner_count += len(key_list)
            # fetch the value list from Redis by the key list
            values = redis_util.get_values_batch_keys(key_list)
            # import into hainiu_queue
            insert_queue_record = []
            for value in values:
                queue_param = json.loads(value)
                a_url = queue_param['a_url']
                insert_queue_record.append((2, a_url, value))
            db_util.executemany_no_commit(insert_queue_sql, insert_queue_record)
            db_util.commit()
            # delete the imported keys from Redis
            redis_util.delete_batch(key_list)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records locally, took %.2f seconds" % (inner_count, run_time))
    except Exception:
        traceback.print_exc()
        db_util.rollback()

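# `scan_limit_to_queue_table` is called above but not defined in this section.
# A minimal sketch of what it might look like, assuming the plain redis-py
# client (the name, signature, and use of redis-py here are assumptions, not
# this project's actual helper):
import redis

def scan_limit_to_queue_table(ip, port, cursor, match, count, key_list):
    # run one SCAN step against a single Redis node and append the keys
    # matching the pattern (e.g. 'down:*') to key_list
    client = redis.StrictRedis(host=ip, port=port)
    _, keys = client.scan(cursor=cursor, match=match, count=count)
    key_list.extend(keys)
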
def queue_items(self):
    '''
    Use a pessimistic lock (select ... for update) plus a transaction plus a
    status update so that multiple machines take rows serially, then wrap the
    rows into a list of consumer-action (HainiuConsumerAction) instances and
    return it.
    '''
    select_sql = """
        select id,action,params from hainiu_queue
        where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s
        limit %s for update;
    """
    # update SQL, built by string formatting
    update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
    """
    c_actions = []
    # collects the ids used by the update
    ids = []
    db_util = DBUtil(_HAINIU_DB)
    try:
        # sql_params = [1, 0, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
        # parameters for the ip-shielding variant of the query
        ip = Util().get_local_ip()
        sql_params = [
            1, 0, ip,
            _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'],
            _QUEUE_NEWS_FIND['LIMIT_NUM']
        ]
        # read_dict returns the rows as dicts, e.g. ({...}, {...})
        res1 = db_util.read_dict(select_sql, sql_params)
        for row in res1:
            id = row['id']
            ids.append(str(id))
            act = row['action']
            params = row['params']
            c_action = NewsFindConsumerAction(id, act, params)
            c_actions.append(c_action)
        if len(ids) > 0:
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception:
        db_util.rollback()
        traceback.print_exc()
    return c_actions

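# Design note: `select ... for update` takes InnoDB row locks, so while one
# consumer's transaction is open, other machines running the same query block
# instead of grabbing the same rows; once is_work=1 is committed, the other
# consumers' filters skip those rows. A standalone sketch of the same
# claim-rows pattern, assuming raw MySQLdb (the connection settings are
# illustrative only):
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='hainiu')
cur = conn.cursor()
try:
    cur.execute("select id from hainiu_queue where is_work=0 limit 5 for update;")
    ids = [str(r[0]) for r in cur.fetchall()]
    if ids:
        cur.execute("update hainiu_queue set is_work=1 where id in (%s);" % ",".join(ids))
    conn.commit()  # committing releases the row locks for the other machines
except Exception:
    conn.rollback()
finally:
    conn.close()
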
def success_action(self, values):
    # 1) record the seed url's last successful crawl counts (used later to
    #    verify that the crawl really succeeded);
    # 2) delete the successfully crawled url from hainiu_queue;
    seed_update_sql = """
        update hainiu_web_seed
        set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now()
        where md5=%s;
    """
    queue_delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    db_util = DBUtil(_HAINIU_DB)
    try:
        sql_param = [values[1], values[2], values[3]]
        db_util.execute_no_commit(seed_update_sql, sql_param)
        sql_param = [values[0]]
        db_util.execute_no_commit(queue_delete_sql, sql_param)
        db_util.commit()
    except Exception:
        traceback.print_exc()
        db_util.rollback()

def queue_items(self):
    # query variant that shields rows this machine has already failed on, by ip
    select_sql = '''
        select id, action, params from web_queue
        where type=%s and is_work=%s and fail_ip != %s and fail_times < %s
        limit 0, %s for update;
    '''
    update_sql = '''
        update web_queue set is_work=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    actions = []
    try:
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]
        res = db_util.read_dict(select_sql, sql_params)
        ids = []
        for row in res:
            id = row["id"]
            ids.append(str(id))
            action = row["action"]
            params = row["params"]
            # wrap the row into an action object
            c_action = WebConsumerAction(id, action, params)
            actions.append(c_action)
        if len(actions) != 0:
            # mark the taken rows with is_work=1
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception:
        actions = []
        db_util.rollback()
        traceback.print_exc()
    return actions

def fail_action(self, values):
    # on every failure, update the failing ip and the failure count
    update_sql1 = '''
        update web_queue set fail_ip = %s, fail_times = fail_times + 1 where id = %s;
    '''
    # when the failure count reaches this machine's max retry count, set the
    # row's is_work = 0 so that another machine can retry it
    update_sql2 = '''
        update web_queue set is_work = 0 where id = %s;
    '''
    # update the seed table's status
    update_seed_sql = '''
        update web_seed set fail_times=fail_times + 1,fail_ip=%s where md5 =%s;
    '''
    # update the externally table's status
    update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times + 1,fail_ip=%s where a_md5 =%s;
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        id = values[0]
        ip = Util().get_local_ip()
        # queue table: update the failing ip and failure count
        sql_params = [ip, id]
        db_util.execute_no_commit(update_sql1, sql_params)
        # seed table -- note execute() here (unlike the execute_no_commit calls
        # above), which presumably commits on its own instead of joining the
        # surrounding transaction
        sql_params = [ip, values[1]]
        db_util.execute(update_seed_sql, sql_params)
        # externally table
        db_util.execute(update_exter_sql, sql_params)
        if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
            db_util.execute_no_commit(update_sql2 % id)
        db_util.commit()
    except Exception:
        db_util.rollback()
        traceback.print_exc()

def success_action(self, values):
    delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    update_hainiu_news_seed_sql = """
        update hainiu_web_seed
        set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now()
        where md5="%s";
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        sql = delete_sql % id
        d.execute_no_commit(sql)
        sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
        d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def fail_action(self, values):
    update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
        update hainiu_queue set type=1 where id=%s;
    """
    update_hainiu_news_seed_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        main_md5 = values[0]
        sql = update_hainiu_news_seed_sql % (ip, main_md5)
        d.execute_no_commit(sql)
        if self.try_num == Consumer._WORK_TRY_NUM:
            sql = update_sql_1 % id
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def fail_action(self, values):
    ip = Util().get_local_ip()
    db_util = DBUtil(_HAINIU_DB)
    # 1) record the failure count and failing ip on the hainiu_queue row;
    #    values layout: is_success, self.id, len(inner_list), len(exter_list), md5
    queue_update_sql1 = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
    """
    # 2) when this machine's failure count reaches its configured max retry
    #    count, set the hainiu_queue row's is_work = 0 so other machines can retry it;
    queue_update_sql2 = """
        update hainiu_queue set is_work=0 where id=%s;
    """
    # 3) update the seed table's failure count and failing ip; the queue rows
    #    are not deleted, because the target site may simply have banned this
    #    ip -- at some point a script can reset the failed rows' status,
    #    failure count, and failing ip and try crawling them again.
    seed_update_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip=%s where md5=%s;
    """
    try:
        sql_params = [ip, values[0]]
        db_util.execute_no_commit(queue_update_sql1, sql_params)
        # compare the failure count against the retry limit
        if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
            sql_params = [self.id]
            db_util.execute_no_commit(queue_update_sql2, sql_params)
        sql_params = [ip, values[3]]
        db_util.execute_no_commit(seed_update_sql, sql_params)
        db_util.commit()
    except Exception:
        traceback.print_exc()
        db_util.rollback()

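# A sketch of the reset script mentioned in comment 3) above -- reopen failed
# queue rows so they get crawled again. This is an assumption of how such a
# script might look, not project code; the DBUtil usage mirrors this section:
reset_sql = """
    update hainiu_queue set is_work=0, fail_times=0, fail_ip='' where fail_times >= %s;
"""
db_util = DBUtil(_HAINIU_DB)
try:
    db_util.execute(reset_sql, [_QUEUE_NEWS_FIND['MAX_FAIL_TIMES']])
finally:
    db_util.close()
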
def fail_action(self, values):
    ip = Util().get_local_ip()
    db_util = DBUtil(_HAINIU_DB)
    # 1) record the failure count and failing ip on the queue row;
    queue_update_sql1 = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
    """
    # 2) when this machine's failure count reaches its configured max retry
    #    count, set the hainiu_queue row's is_work = 0 so other machines can retry it;
    queue_update_sql2 = """
        update hainiu_queue set is_work=0 where id=%s;
    """
    # 3) update the internal-link table's failure count and failing ip; the
    #    queue rows are not deleted;
    inner_update_sql = """
        update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip=%s
        where md5=%s and a_md5=%s;
    """
    try:
        # 1)
        sql_params = [ip, values[0]]
        db_util.execute_no_commit(queue_update_sql1, sql_params)
        # 2) compare the failure count against the retry limit
        if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
            sql_params = [self.id]
            db_util.execute_no_commit(queue_update_sql2, sql_params)
        # 3)
        sql_params = [ip, values[1], values[2]]
        db_util.execute_no_commit(inner_update_sql, sql_params)
        db_util.commit()
    except Exception:
        db_util.rollback()
        traceback.print_exc()

def success_action(self, values):
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()
    # 1) delete the successfully downloaded url from hainiu_queue;
    queue_delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    # 2) refresh the internal-link table's last update time;
    inner_update_sql = """
        update hainiu_web_seed_internally set update_time=%s where a_md5=%s and md5=%s;
    """
    update_time = time_util.get_timestamp()
    try:
        sql_param = [values[0]]
        db_util.execute_no_commit(queue_delete_sql, sql_param)
        sql_param = [update_time, values[1], values[2]]
        db_util.execute_no_commit(inner_update_sql, sql_param)
        db_util.commit()
    except Exception:
        traceback.print_exc()
        db_util.rollback()

def action(self):
    is_success = True
    t = TimeUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    in_values = []
    ex_values = []
    a_href = ''
    main_md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    try:
        # fetch the page through PhantomJS and collect every <a> tag
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        soup = BeautifulSoup(html, 'lxml')
        a_docs = soup.find_all("a")
        a_set = set()
        a_param = {}
        out_json_srt = ''
        status = 0
        host = hu.get_url_host(self.url)
        for a in a_docs:
            a_href = self.get_format_url(a, host)
            a_title = a.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_href in a_set:
                continue
            a_set.add(a_href)
            req = urllib2.Request(url=a_href)
            a_host = req.get_host() if req.get_host() is not None else ''
            a_md5 = u.get_md5(a_href)
            if a_title != '':
                a_param['title'] = a_title
                out_json_srt = json.dumps(a_param, ensure_ascii=False)
            a_xpath = hu.get_dom_parent_xpath_js(a)
            insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath,
                             create_time, create_day, create_hour, update_time, status,
                             MySQLdb.escape_string(self.url),
                             MySQLdb.escape_string(a_href),
                             MySQLdb.escape_string(a_title),
                             out_json_srt)
            # links whose host contains the seed's domain are internal, the rest external
            if domain in a_host:
                in_values.append(insert_values)
            else:
                ex_values.append(insert_values)
        in_table = 'hainiu_web_seed_internally'
        ex_table = 'hainiu_web_seed_externally'
        insert_sql = """
            insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE update_time=update_time;
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            d.execute_no_commit("set NAMES utf8mb4;")
            if len(in_values) != 0:
                sql = insert_sql.replace('<table>', in_table)
                d.executemany_no_commit(sql, in_values)
            if len(ex_values) != 0:
                sql = insert_sql.replace('<table>', ex_table)
                d.executemany_no_commit(sql, ex_values)
            d.commit()
        except:
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
    except:
        is_success = False
        self.rl.exception()
    finally:
        r.close_phandomjs()
    return super(self.__class__, self).result(
        is_success,
        [main_md5, self.url, a_href, len(in_values), len(ex_values), self.queue_id])

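# Quick illustration of the internal/external split above, with made-up values:
# if get_tld returns 'hainiubl.com' for the seed url, then a link host such as
# 'www.hainiubl.com' contains the domain and goes to hainiu_web_seed_internally,
# while 'news.other.com' does not and goes to hainiu_web_seed_externally.
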
def put_queue(page_show_num):
    db_util = DBUtil(_ZZ_DB)
    # count the queue rows that match the condition
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the matching rows in web_seed
    count_seed_sql = '''
        select count(*) from web_seed where status=0;
    '''
    # query web_seed page by page
    select_seed_limit_sql = '''
        select id,url,category from web_seed where status=0 limit %s,%s;
    '''
    # insert queue rows
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # update status in web_seed
    update_sql = '''
        update web_seed set status=1 where id in(%s);
    '''
    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records, not inserting!" % total_num1
            return None
        start_time = time.time()
        # count the matching rows in web_seed
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]
        # work out how many pages to query
        page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 \
            else total_num2 / page_show_num + 1
        # query page by page
        ids = []
        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print sql_params
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["url"]
                params = row["category"]
                type = 1
                list1.append((type, action, params))
            # batch-insert into the queue
            db_util.executemany(insert_queue_sql, list1)
        # set status=1 on the imported seeds
        db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception:
        db_util.rollback()
        traceback.print_exc()

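# Worked example of the paging math above: with total_num2 = 23 seeds and
# page_show_num = 10, 23 % 10 != 0, so page_num = 23 / 10 + 1 = 3 (Python 2
# integer division), producing limit 0,10 / limit 10,10 / limit 20,10.
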
#-*- encoding: utf-8 -*-
'''
db_test.py
Created on 2019/6/25 11:14
Copyright (c) 2019/6/25, 海牛学院. All rights reserved.
@author: 潘牛
'''
from commons.util.db_util import DBUtil
from configs.config import _HAINIU_DB

db_util = DBUtil(_HAINIU_DB)
# set the character set to utf8mb4
db_util.execute_no_commit("set NAMES utf8mb4;")

# test execute(self, sql, params=None):
# sql = """
#     insert into hainiu_queue (type,action,params) values (1, 'www.hainiubl.com', 'aa');
# """
# db_util.execute(sql)

# string concatenation (not recommended)
# sql = """
#     insert into hainiu_queue (type,action,params) values (%d, '%s', '%s');
# """ % (1, 'www.hainiubl.com', 'aa')
# db_util.execute(sql)

# -------------------------------------
# test execute(self, sql, params != None):
# sql placeholders (recommended)
# sql = """
#     insert into hainiu_queue (type,action,params) values (%s, %s, %s);
# """
# db_util.execute(sql, (1, 'www.hainiubl.com', 'aa'))