def queue_items(self):
    select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times <= %s
        limit 0,%s for update;
    """
    update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
    """
    list = []
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = select_queue_sql % (self.fail_times, self.limit)
        tuple = d.read_tuple(sql)
        if len(tuple) == 0:
            return list
        queue_ids = ''
        for t in tuple:
            queue_id = t[0]
            url = t[1]
            param = '' if t[2] is None else t[2]
            queue_ids += str(queue_id) + ','
            c = NewsFindConsumer(url, param, queue_id)
            list.append(c)
        queue_ids = queue_ids[:-1]
        d.execute(update_queue_sql % queue_ids)
    except:
        self.rl.exception()
        d.rollback()
    finally:
        d.close()
    return list

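All of these snippets go through a small `DBUtil` wrapper whose definition is not shown. The following is a minimal sketch of the interface they rely on, assuming MySQLdb underneath; the config keys follow the commented `_HAINIU_DB` example in `xpath_config_file` further down, and everything else here is an assumption rather than the project's actual implementation.

import MySQLdb
import MySQLdb.cursors

class DBUtil(object):
    """Minimal sketch (assumed) of the DB helper used by the snippets below."""

    def __init__(self, config):
        # config example (from a comment further down):
        # {'HOST': ..., 'USER': ..., 'PASSWD': ..., 'DB': ..., 'CHARSET': 'utf8', 'PORT': 3306}
        self.conn = MySQLdb.connect(
            host=config['HOST'], user=config['USER'], passwd=config['PASSWD'],
            db=config['DB'], port=config['PORT'], charset=config['CHARSET'])
        self.cursor = self.conn.cursor()

    def read_one(self, sql, params=None):
        self.cursor.execute(sql, params)
        return self.cursor.fetchone()

    def read_tuple(self, sql, params=None):
        self.cursor.execute(sql, params)
        return self.cursor.fetchall()

    def read_dict(self, sql, params=None):
        # rows as dicts keyed by column name
        cursor = self.conn.cursor(MySQLdb.cursors.DictCursor)
        cursor.execute(sql, params)
        return cursor.fetchall()

    def execute_no_commit(self, sql, params=None):
        self.cursor.execute(sql, params)

    def execute(self, sql, params=None):
        self.cursor.execute(sql, params)
        self.conn.commit()

    def executemany_no_commit(self, sql, seq):
        self.cursor.executemany(sql, seq)

    def executemany(self, sql, seq):
        self.cursor.executemany(sql, seq)
        self.conn.commit()

    def commit(self):
        self.conn.commit()

    def rollback(self):
        self.conn.rollback()

    def close(self):
        self.cursor.close()
        self.conn.close()
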
def fail_action(self, values):
    '''
    After a message action fails, increase that message's failure count in the
    queue middleware and record the IP of the machine that executed it.
    If this machine's maximum retry count is reached, set the message back to
    the unprocessed state so that other machines can try to process it again.

    :param values: result of the message action
    '''
    update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
        update hainiu_queue set type=1 where id=%s;
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[0]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        if self.try_num == Consumer._WORK_TRY_NUM:
            sql = update_sql_1 % id
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def fail_action(self, values):
    update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
        update hainiu_queue set type=1 where id=%s;
    """
    update_hainiu_news_seed_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        main_md5 = values[0]
        sql = update_hainiu_news_seed_sql % (ip, main_md5)
        d.execute_no_commit(sql)
        if self.try_num == Consumer._WORK_TRY_NUM:
            sql = update_sql_1 % id
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def fail_action(self, values):
    ip = Util().get_local_ip()
    db_util = DBUtil(_HAINIU_DB)
    # 1) record the failure count and failing IP in the queue table
    queue_update_sql1 = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
    """
    # 2) when this machine reaches its configured maximum retry count, set
    #    is_work=0 on the hainiu_queue record so other machines can retry it
    queue_update_sql2 = """
        update hainiu_queue set is_work=0 where id=%s;
    """
    # 3) update the failure count and failing IP in the internal-link table;
    #    the queue record itself is not deleted
    inner_update_sql = """
        update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip=%s
        where md5=%s and a_md5=%s;
    """
    try:
        # 1)
        sql_params = [ip, values[0]]
        db_util.execute_no_commit(queue_update_sql1, sql_params)
        # 2) compare the failure count against the retry limit
        if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
            sql_params = [self.id]
            db_util.execute_no_commit(queue_update_sql2, sql_params)
        sql_params = [ip, values[1], values[2]]
        db_util.execute_no_commit(inner_update_sql, sql_params)
        db_util.commit()
    except Exception, e:
        db_util.rollback()
        traceback.print_exc()

def fail_action(self, values):
    ip = Util().get_local_ip()
    db_util = DBUtil(_HAINIU_DB)
    # 1) record the failure count and failing IP in hainiu_queue;
    #    result layout: is_success, self.id, len(inner_list), len(exter_list), md5
    queue_update_sql1 = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip=%s where id=%s;
    """
    # 2) when this machine reaches its configured maximum retry count, set
    #    is_work=0 on the hainiu_queue record so other machines can retry it
    queue_update_sql2 = """
        update hainiu_queue set is_work=0 where id=%s;
    """
    # 3) update the failure count and failing IP in the seed table; the queue
    #    record is not deleted, since the failure may be the target site banning
    #    our IP. At some later time a script can reset the status, failure count
    #    and failing IP of failed queue records and crawl them again.
    seed_update_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip=%s where md5=%s;
    """
    try:
        sql_params = [ip, values[0]]
        db_util.execute_no_commit(queue_update_sql1, sql_params)
        # compare the failure count against the retry limit
        if self.current_retry_num == _QUEUE_NEWS_FIND['C_RETRY_TIMES'] - 1:
            sql_params = [self.id]
            db_util.execute_no_commit(queue_update_sql2, sql_params)
        sql_params = [ip, values[3]]
        db_util.execute_no_commit(seed_update_sql, sql_params)
        db_util.commit()
    except Exception, e:
        traceback.print_exc()
        db_util.rollback()

def put_inner_to_queue():
    redis_util = RedisUtill()
    page_show_num = 10
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # insert into the hainiu_queue table
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    db_util.execute_no_commit("set NAMES utf8mb4;")
    try:
        # count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        start_time = time.time()
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, no import needed!" % queue_count)
            return None
        inner_count = 0
        for ip in ips:
            key_list = []
            scan_limit_to_queue_table(ip, port, 0, 'down:*', 20, key_list)
            inner_count += len(key_list)
            # fetch the values for the key list from Redis
            values = redis_util.get_values_batch_keys(key_list)
            # import into the hainiu_queue table
            insert_queue_record = []
            for value in values:
                queue_param = json.loads(value)
                a_url = queue_param['a_url']
                insert_queue_record.append((2, a_url, value))
            db_util.executemany_no_commit(insert_queue_sql, insert_queue_record)
            db_util.commit()
            # delete the imported keys from Redis
            redis_util.delete_batch(key_list)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records locally, took %.2f seconds" % (inner_count, run_time))
    except Exception, e:
        traceback.print_exc()
        db_util.rollback()

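`put_inner_to_queue` relies on module-level `ips`/`port` settings and a `scan_limit_to_queue_table` helper that are not shown in this snippet. A plausible minimal sketch, modeled on the recursive SCAN collector that `put_queue_inner` defines inline further below; the node addresses and the `key_list` output parameter are assumptions.

import redis

# assumed Redis node settings (not shown in the snippet above)
ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
port = '6379'

def scan_limit_to_queue_table(host, port, cursor, match, count, key_list):
    """Recursively SCAN one Redis node and collect every key matching
    `match` into `key_list` (mirrors the inline helper in put_queue_inner)."""
    r = redis.Redis(host, port)
    rs = r.scan(cursor, match, count)
    next_cursor = rs[0]
    key_list.extend(rs[1])
    # a returned cursor of 0 means the scan has wrapped around: stop recursing
    if next_cursor == 0:
        return None
    scan_limit_to_queue_table(host, port, next_cursor, match, count, key_list)
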
def queue_items(self):
    '''
    Take the messages to process out of the queue, wrap them as consumer
    actions, then update the queue state.

    :return: list of wrapped consumer actions
    '''
    # This variant uses the machine IP to stop a machine from re-fetching
    # messages it has already failed on:
    # select_queue_sql = """
    #     select id,action,params from hainiu_queue where type=1 and fail_ip<>'%s' and fail_times<=%s
    #     limit 0,%s for update;
    # """
    select_queue_sql = """
        select id,action,params from hainiu_queue where type=1 and fail_times<=%s
        limit 0,%s for update;
    """
    update_queue_sql = """
        update hainiu_queue set type=0 where id in (%s);
    """
    return_list = []
    try:
        d = DBUtil(config._HAINIU_DB)
        # u = Util()
        # ip = u.get_local_ip()
        # sql = select_queue_sql % (ip, self.fail_times, self.limit)
        sql = select_queue_sql % (self.fail_times, self.limit)
        select_dict = d.read_dict(sql)
        if len(select_dict) == 0:
            return return_list
        query_ids = []
        for record in select_dict:
            id = record["id"]
            action = record["action"]
            params = record["params"]
            query_ids.append(str(id))
            c = HainiuConsumer(id, action, params)
            return_list.append(c)
        ids = ",".join(query_ids)
        sql = update_queue_sql % ids
        d.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
    return return_list

def push_queue_items():
    count_news_seed_queue_sql = """
        select count(*) from hainiu_queue where type=3 and fail_times=0;
    """
    insert_news_seed_internally_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(3,%s,%s);
    """
    count_news_seed_internally_sql = """
        select count(*) from hainiu_web_seed_internally where status=0 for update;
    """
    select_news_seed_internally_sql = """
        select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;
    """
    update_news_seed_internally_sql = """
        update hainiu_web_seed_internally set status=1 where id in (%s);
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % queue_total)
            return
        starttime = time.clock()
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            # always read from offset 0: each pass flips status to 1, so the
            # remaining status=0 rows shift to the front of the result set
            sql = select_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param = param if param is not None else ''
                values.append((url, param))
                id = l[2]
                id_values.append(str(id))
            if len(id_values) != 0:
                random.shuffle(values)
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % ids
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()

def push_queue_items():
    count_news_seed_sql = """
        select count(*) from hainiu_web_seed where status=0;
    """
    select_news_seed_sql = """
        select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;
    """
    insert_news_seed_queue_items_sql = """
        insert into hainiu_queue (type,action,params) values(1,%s,%s);
    """
    count_news_seed_queue_sql = """
        select count(*) from hainiu_queue where type=1 and fail_times=0;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' % queue_total)
            return
        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                publisher = get_tld(url)
                # keep everything before the first dot: "sina.com.cn" -> "sina"
                publisher = publisher[0:publisher.index('.')] if '.' in publisher else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if len(values) != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('push news_find queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()

def queue_items(self):
    '''
    Use a pessimistic lock + transaction + status update so that multiple
    machines take data serially, and wrap the rows into a list of
    HainiuConsumerAction instances to return.
    '''
    select_sql = """
        select id,action,params from hainiu_queue
        where type=%s and is_work=%s and fail_ip!=%s and fail_times<%s
        limit %s for update;
    """
    # update SQL, built by string formatting
    update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
    """
    c_actions = []
    # collects the ids for the update
    ids = []
    db_util = DBUtil(_HAINIU_DB)
    try:
        # variant without the IP filter:
        # sql_params = [1, 0, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'], _QUEUE_NEWS_FIND['LIMIT_NUM']]
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_NEWS_FIND['MAX_FAIL_TIMES'],
                      _QUEUE_NEWS_FIND['LIMIT_NUM']]
        # result shape: ({}, {})
        res1 = db_util.read_dict(select_sql, sql_params)
        for row in res1:
            id = row['id']
            ids.append(str(id))
            act = row['action']
            params = row['params']
            c_action = NewsFindConsumerAction(id, act, params)
            c_actions.append(c_action)
        if len(ids) > 0:
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception, e:
        db_util.rollback()
        traceback.print_exc()
    return c_actions

def success_action(self, values):
    # 1) record the seed url's last successful crawl counts (used to verify
    #    that the final crawl succeeded);
    # 2) delete the successfully crawled url from the hainiu_queue table;
    seed_update_sql = """
        update hainiu_web_seed
        set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now()
        where md5=%s;
    """
    queue_delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    db_util = DBUtil(_HAINIU_DB)
    try:
        sql_param = [values[1], values[2], values[3]]
        db_util.execute_no_commit(seed_update_sql, sql_param)
        sql_param = [values[0]]
        db_util.execute_no_commit(queue_delete_sql, sql_param)
        db_util.commit()
    except Exception, e:
        traceback.print_exc()
        db_util.rollback()

def success_action(self, values):
    '''
    After a message action succeeds, delete the message from the queue
    middleware to mark it as fully processed.

    :param values: result of the message action
    '''
    delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[0]
        sql = delete_sql % id
        d.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def fail_action(self, values):
    # on every failure, update the failing IP and the failure count
    update_sql1 = '''
        update web_queue set fail_ip=%s, fail_times=fail_times+1 where id=%s;
    '''
    # when the failure count reaches this machine's maximum retry count,
    # set is_work=0 on the record so it can be retried elsewhere
    update_sql2 = '''
        update web_queue set is_work=0 where id=%s;
    '''
    # update the seed table state
    update_seed_sql = '''
        update web_seed set fail_times=fail_times+1,fail_ip=%s where md5=%s;
    '''
    # update the externally table state
    update_exter_sql = '''
        update web_seed_externally set fail_times=fail_times+1,fail_ip=%s where a_md5=%s;
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        id = values[0]
        ip = Util().get_local_ip()
        # queue table: failing IP and failure count
        sql_params = [ip, id]
        db_util.execute_no_commit(update_sql1, sql_params)
        # seed table
        sql_params = [ip, values[1]]
        db_util.execute(update_seed_sql, sql_params)
        # externally table (reuses the same [ip, md5] parameters)
        db_util.execute(update_exter_sql, sql_params)
        if self.current_retry_num == _QUEUE_ZZ["C_RETRY_TIMES"] - 1:
            db_util.execute_no_commit(update_sql2 % id)
        db_util.commit()
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()

def queue_items(self):
    # query variant that skips records this machine's IP already failed on
    select_sql = '''
        select id, action, params from web_queue
        where type=%s and is_work=%s and fail_ip != %s and fail_times < %s
        limit 0, %s for update;
    '''
    update_sql = '''
        update web_queue set is_work=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        ip = Util().get_local_ip()
        sql_params = [1, 0, ip, _QUEUE_ZZ["MAX_FAIL_TIMES"], _QUEUE_ZZ["LIMIT_NUM"]]
        res = db_util.read_dict(select_sql, sql_params)
        actions = []
        ids = []
        for row in res:
            id = row["id"]
            ids.append(str(id))
            action = row["action"]
            params = row["params"]
            # wrap as a consumer action object
            c_action = WebConsumerAction(id, action, params)
            actions.append(c_action)
        if len(actions) != 0:
            # mark the taken rows with is_work=1
            db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
    except Exception, err:
        actions = []
        db_util.rollback()
        traceback.print_exc()
    return actions

def success_action(self, values):
    delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    update_hainiu_news_seed_sql = """
        update hainiu_web_seed
        set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now()
        where md5="%s";
    """
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        sql = delete_sql % id
        d.execute_no_commit(sql)
        sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
        d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()

def success_action(self, values):
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()
    # 1) delete the successfully downloaded url from the hainiu_queue table
    queue_delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    # 2) update the internal-link table's last update time
    inner_update_sql = """
        update hainiu_web_seed_internally set update_time=%s where a_md5=%s and md5=%s;
    """
    update_time = time_util.get_timestamp()
    try:
        sql_param = [values[0]]
        db_util.execute_no_commit(queue_delete_sql, sql_param)
        sql_param = [update_time, values[1], values[2]]
        db_util.execute_no_commit(inner_update_sql, sql_param)
        db_util.commit()
    except Exception, e:
        traceback.print_exc()
        db_util.rollback()

def push_queue_items():
    insert_sql = """
        insert into hainiu_queue (type,params,action) values(1,%s,%s);
    """
    count_sql = """
        select count(1) from hainiu_queue where type=1;
    """
    select_sql = """
        select id from hainiu_queue where type=1 limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = insert_sql
        insert_list = [("aaa", "bbb"), ("dffddf", "awwee")]
        d.executemany(sql, insert_list)
        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue_total", queue_total
        page_size = 10
        page = (queue_total / page_size) + 1
        print "page", page
        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            for record in select_list:
                id = record[0]
                print id
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()

def success_action(self, values):
    # delete the corresponding queue record
    del_sql = '''
        delete from web_queue where id=%s;
    '''
    # update the seed table state
    update_sql = '''
        update web_seed
        set last_crawl_time=%s,last_crawl_internally=%s,last_crawl_externally=%s
        where md5=%s;
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        # delete from the queue table
        id = values[0]
        sql_param = [id]
        db_util.execute(del_sql, sql_param)
        # update the seed table
        # e.g. values: [(1574519076,), 95, 7, '824e29a21f2a02379f78b0675d1fc5eb']
        sql_param = [values[2], values[3], values[4], values[1]]
        db_util.execute(update_sql, sql_param)
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()

def create_seed():
    sql = """
        insert into web_seed (url,md5,domain,host,category,status)
        values ('%s','%s','%s','%s','%s',0);
    """
    url = "https://news.sina.com.cn/"
    category = "新闻"
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._ZZ_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()

def create_seed():
    url = "https://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
        insert into hainiu_web_seed (url,md5,domain,host,category,status)
        values ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()

def xpath_config_file():
    select_xpath_rule_sql = """
        select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0;
    """
    rl = LogUtil().get_base_logger()
    try:
        # _HAINIU_DB = {'HOST': '192.168.137.190', 'USER': '******', 'PASSWD': '12345678',
        #               'DB': 'hainiucrawler', 'CHARSET': 'utf8', 'PORT': 3306}
        d = DBUtil(config._HAINIU_DB)
        # d = DBUtil(_HAINIU_DB)
        r = redis.Redis('nn1.hadoop', 6379, db=6)
        # r = redis.Redis('redis.hadoop', 6379, db=6)
        f = FileUtil()
        t = TimeUtil()
        c = Client("http://nn1.hadoop:50070")
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        # local_xpath_file_path = '/Users/leohe/Data/input/xpath_cache_file/xpath_file' + time_str
        local_xpath_file_path = '/home/qingniu/xpath_cache_file/xpath_file' + time_str
        start_cursor = 0
        is_finish = True
        starttime = time.clock()
        host_set = set()
        while is_finish:
            values = set()
            limit = r.scan(start_cursor, 'total:*', 10)
            # a returned cursor of 0 means this is the last batch
            if limit[0] == 0:
                is_finish = False
            start_cursor = limit[0]
            for h in limit[1]:
                host = h.split(":")[1]
                total_key = h
                txpath_key = 'txpath:%s' % host
                fxpath_key = 'fxpath:%s' % host
                total = r.get(total_key)
                # the two highest-scoring text xpaths for this host
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 1:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 1:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '0'))
                        host_set.add(host)
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # upload to the xpath config file directory on HDFS
        c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('total host %s,action time %s\'s' % (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()

def put_inner_to_queue():
    page_show_num = 10
    # count the unprocessed records in hainiu_queue
    select_queue_count_sql = """
        select count(*) from hainiu_queue where type=%s and is_work=0 and fail_times=0;
    """
    # count the qualifying records in the internal-link table
    select_inner_count_sql = """
        select count(*) from hainiu_web_seed_internally where status=0;
    """
    # page through the internal-link table
    select_inner_limit_sql = """
        select md5,a_url,a_md5,domain,a_host,a_title from hainiu_web_seed_internally
        where status=0 limit 0,%s;
    """
    # insert into the hainiu_queue table
    insert_queue_sql = """
        insert into hainiu_queue (type,action,params) values (%s, %s, %s);
    """
    # update the status in the internal-link table
    update_inner_status_sql = """
        update hainiu_web_seed_internally set status=1 where a_md5=%s and md5=%s;
    """
    logger = LogUtil().get_logger("download_news_queue", "download_news_queue")
    db_util = DBUtil(_HAINIU_DB)
    try:
        # count the unprocessed records in hainiu_queue
        sql_params = [2]
        res1 = db_util.read_one(select_queue_count_sql, sql_params)
        queue_count = res1[0]
        if queue_count >= 5:
            logger.info("hainiu_queue has %d unprocessed records, no import needed!" % queue_count)
            return None
        # count the qualifying records in the internal-link table
        res2 = db_util.read_one(select_inner_count_sql)
        inner_count = res2[0]
        # work out the number of pages
        page_num = inner_count / page_show_num if inner_count % page_show_num == 0 \
            else inner_count / page_show_num + 1
        start_time = time.time()
        # page through the records
        for page in range(page_num):
            sql_params = [page_show_num]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            # records to insert into the queue table
            insert_queue_record = []
            # inner-table records whose status needs updating
            update_inner_status_record = []
            for row in res3:
                md5 = row['md5']
                a_url = row['a_url']
                a_md5 = row['a_md5']
                domain = row['domain']
                a_host = row['a_host']
                a_title = row['a_title']
                # param payload stored in hainiu_queue.params
                param_dict = {
                    'md5': md5,
                    'a_md5': a_md5,
                    'domain': domain,
                    'a_host': a_host,
                    'a_title': a_title,
                }
                param_json = json.dumps(param_dict, ensure_ascii=False, encoding='utf-8')
                insert_queue_record.append((2, a_url, param_json))
                update_inner_status_record.append((a_md5, md5))
            db_util.executemany(insert_queue_sql, insert_queue_record)
            db_util.executemany(update_inner_status_sql, update_inner_status_record)
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("imported %d records locally, took %.2f seconds" % (inner_count, run_time))
    except Exception, e:
        traceback.print_exc()
        db_util.rollback()

def put_queue_inner():
    # count the qualifying records in the queue table
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # # count the qualifying records in the internally table
    # count_inner_sql = '''
    #     select count(*) from web_seed_internally where status=0;
    # '''
    # # records of the web_seed_internally table
    # select_inner_limit_sql = '''
    #     select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    # '''
    # insert into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # web_seed_internally status
    update_sql = '''
        update web_seed_internally set status=1 where md5=%s and a_md5=%s;
    '''
    try:
        # staging data lives in redis_tmp keys
        redis_d = RedisUtill()
        db_util = DBUtil(_ZZ_DB)
        ips = ['192.168.235.136', '192.168.235.137', '192.168.235.138']
        port = '6379'
        list = []
        total_num = 0
        is_get_lock = redis_d.get_lock('seed_lock', 10)
        logger = LogUtil().get_base_logger()
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            logger.info("queue has %d records,not insert!" % total_num1)
            return None
        start_time = time.time()
        logger.info("acquiring lock...")
        if is_get_lock:
            logger.info("lock acquired")

            def scan_limit_to_queue_table(host, port, cursor, match, count):
                r = redis.Redis(host, port)
                rs = r.scan(cursor, match, count)
                # new cursor
                next_num = rs[0]
                li = rs[1]
                for i in li:
                    if 'a_url' in i:
                        list.append(i)
                # recursion stops when the cursor wraps back to 0
                if next_num == 0:
                    return None
                scan_limit_to_queue_table(host, port, next_num, match, count)

            for ip in ips:
                scan_limit_to_queue_table(ip, port, 0, 'seed_temp*', 100)
            # insert into the queue table in batches
            redis_result = []
            up_inner = []
            delete_list = []
            for k in list:
                if 'a_url' in k:
                    # derive the other keys that share this record's MD5
                    param = k.replace('a_url', 'param')
                    md5 = k.replace('a_url', 'md5')
                    a_md5 = k.replace('a_url', 'a_md5')
                    action = redis_d.get_value_for_key(k)
                    params = redis_d.get_value_for_key(param)
                    redis_result.append((2, action, params))
                    md5_val = redis_d.get_value_for_key(md5)
                    a_md5_val = redis_d.get_value_for_key(a_md5)
                    up_inner.append((md5_val, a_md5_val))
                    # queue the keys for deletion
                    delete_list.append(k)
                    delete_list.append(param)
                    delete_list.append(md5)
                    delete_list.append(a_md5)
                    total_num += 1
                    # flush to the queue table every 5 records
                    if len(redis_result) == 5:
                        db_util.executemany(insert_queue_sql, redis_result)
                        db_util.executemany(update_sql, up_inner)
                        redis_result = []
                        up_inner = []
            # flush the final batch of fewer than 5 records
            db_util.executemany(insert_queue_sql, redis_result)
            db_util.executemany(update_sql, up_inner)
            # delete the redis_tmp keys
            redis_d.delete_batch(delete_list)
            redis_d.release('seed_lock')
            logger.info("lock released")
        else:
            logger.info('another thread is processing; lock wait exceeded the maximum timeout, skipping')
        end_time = time.time()
        run_time = end_time - start_time
        logger.info("total_num:%d, run_time:%.2f" % (total_num, run_time))
    except Exception, err:
        db_util.rollback()
        redis_d.release('seed_lock')
        traceback.print_exc()

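The `RedisUtill` wrapper used above (and in `put_inner_to_queue`) is also not shown. A minimal sketch of the methods these jobs call, assuming a single-node redis-py client; the host/port defaults and the lock key prefix are assumptions, and `get_lock` here is the usual SET NX EX idiom rather than the project's actual implementation.

import redis

class RedisUtill(object):
    """Minimal sketch (assumed) of the Redis helper used by these jobs."""

    def __init__(self, host='nn1.hadoop', port=6379, db=6):
        self.r = redis.Redis(host, port, db=db)

    def get_value_for_key(self, key):
        return self.r.get(key)

    def get_values_batch_keys(self, keys):
        # MGET returns values in the same order as the keys
        return self.r.mget(keys) if keys else []

    def delete_batch(self, keys):
        if keys:
            self.r.delete(*keys)

    def get_lock(self, name, timeout):
        # SET NX EX: only the caller that created the key gets the lock,
        # and the key expires after `timeout` seconds
        return bool(self.r.set('lock:%s' % name, 1, nx=True, ex=timeout))

    def release(self, name):
        self.r.delete('lock:%s' % name)
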
def put_queue_inner():
    # count the qualifying records in the queue table
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the qualifying records in the internally table
    count_inner_sql = '''
        select count(*) from web_seed_internally where status=0;
    '''
    # records of the web_seed_internally table
    select_inner_limit_sql = '''
        select id,a_url,param from web_seed_internally where status=0 limit %s,%s;
    '''
    # insert into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # web_seed_internally status
    update_sql = '''
        update web_seed_internally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    try:
        sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None
        start_time = time.time()
        res2 = db_util.read_one(count_inner_sql)
        total_num2 = res2[0]
        # work out the number of pages
        page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ["LIMIT_NUM"] == 0 \
            else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1
        # insert into the queue table page by page
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_util.read_dict(select_inner_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["a_url"]
                params1 = row["param"]
                type = 2
                list1.append((type, action, params1))
            # batch insert into the queue table
            db_util.executemany(insert_queue_sql, list1)
        # set status=1 on the source rows
        db_util.execute(update_sql % ",".join(ids))
        db_util.commit()
        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()

sql = """ insert into hainiu_queue (type,action,params) values (%s, %s, %s); """ try: params = [1, 'www.hainiubl.com', "ff"] db_util.execute_no_commit(sql, params) time.sleep(5) 1/0 params = [1, 'www.hainiubl.com', "gg"] db_util.execute_no_commit(sql, params) time.sleep(5) db_util.commit() except Exception, e: print e db_util.rollback() finally: db_util.close() # ------------------------------------- # 悲观锁 + 事务
def redis2Hdfs():
    select_xpath_rule_sql = """
        select host,xpath,type from stream_extract_xpath_rule where host='%s' and status=0;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(_ZZ_DB)
        start = 0
        is_finish = True
        host_set = set()
        f = FileUtil()
        t = TimeUtil()
        time_str = t.now_time(format='%Y%m%d%H%M%S')
        # local_xpath_file_path = '/user/zengqingyong17/spark/xpath_cache_file' + time_str
        local_xpath_file_path = 'E:/python_workspaces/data/xpath/xpath_file' + time_str
        starttime = time.clock()
        r = redis.Redis('nn1.hadoop', '6379', db=6)
        while is_finish:
            values = set()
            rs = r.scan(start, "total_z:*", 10)
            # new cursor; 0 means the scan has wrapped around
            start = rs[0]
            if start == 0:
                is_finish = False
            for i in rs[1]:
                host = i.split(":")[1]
                total_key = i
                txpath_key = 'txpath_z:%s' % host
                fxpath_key = 'fxpath_z:%s' % host
                total = r.get(total_key)
                # the two highest-scoring xpaths, in descending score order
                txpath = r.zrevrange(txpath_key, 0, 1)
                row_format = "%s\t%s\t%s\t%s"
                if txpath:
                    txpath_num = int(r.zscore(txpath_key, txpath[0]))
                    txpath_num_1 = None
                    if len(txpath) == 2:
                        # score of txpath[1] in txpath_key
                        txpath_num_1 = int(r.zscore(txpath_key, txpath[1]))
                        txpath_num_1 = txpath_num_1 if txpath_num_1 is not None else 0
                    if txpath_num / float(total) >= 0.8:
                        values.add(row_format % (host, txpath[0], 'true', '0'))
                        host_set.add(host)
                    else:
                        if txpath_num >= 100:
                            values.add(row_format % (host, txpath[0], 'true', '0'))
                            host_set.add(host)
                        if txpath_num_1 is not None and txpath_num_1 >= 100:
                            values.add(row_format % (host, txpath[1], 'true', '0'))
                            host_set.add(host)
                # all members of fxpath_key
                fxpath = r.smembers(fxpath_key)
                if fxpath:
                    for fx in fxpath:
                        values.add(row_format % (host, fx, 'false', '1'))
                        host_set.add(host)
                sql = select_xpath_rule_sql % host
                list_rule = d.read_tuple(sql)
                for rule in list_rule:
                    type = rule[2]
                    if type == 0:
                        values.add(row_format % (rule[0], rule[1], 'true', '2'))
                        host_set.add(host)
                    elif type == 1:
                        values.add(row_format % (rule[0], rule[1], 'false', '3'))
                        host_set.add(host)
            f.write_file_line_pattern(local_xpath_file_path, values, "a")
        # upload to the xpath config file directory on HDFS
        # c.upload("/user/qingniu/xpath_cache_file/", local_xpath_file_path)
        endtime = time.clock()
        worksec = int(round(endtime - starttime))
        rl.info('total host %s,action time %s\'s' % (len(host_set), worksec))
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()

def action(self):
    is_success = True
    t = TimeUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    in_values = []
    ex_values = []
    a_href = ''
    main_md5 = u.get_md5(self.url)
    now_time = datetime.now()
    update_time = int(time.mktime(now_time.timetuple()))
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    try:
        html = r.http_get_phandomjs(self.url)
        domain = get_tld(self.url)
        soup = BeautifulSoup(html, 'lxml')
        a_docs = soup.find_all("a")
        a_set = set()
        a_param = {}
        out_json_srt = ''
        status = 0
        host = hu.get_url_host(self.url)
        for a in a_docs:
            a_href = self.get_format_url(a, host)
            a_title = a.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_href in a_set:
                continue
            a_set.add(a_href)
            req = urllib2.Request(url=a_href)
            a_host = req.get_host() if req.get_host() is not None else ''
            a_md5 = u.get_md5(a_href)
            if a_title != '':
                a_param['title'] = a_title
                out_json_srt = json.dumps(a_param, ensure_ascii=False)
            a_xpath = hu.get_dom_parent_xpath_js(a)
            insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath,
                             create_time, create_day, create_hour, update_time, status,
                             MySQLdb.escape_string(self.url),
                             MySQLdb.escape_string(a_href),
                             MySQLdb.escape_string(a_title),
                             out_json_srt)
            if domain in a_host:
                in_values.append(insert_values)
            else:
                ex_values.append(insert_values)
        in_table = 'hainiu_web_seed_internally'
        ex_table = 'hainiu_web_seed_externally'
        insert_sql = """
            insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,
            create_hour,update_time,status,url,a_url,a_title,param)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE update_time=update_time;
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            d.execute_no_commit("set NAMES utf8mb4;")
            if len(in_values) != 0:
                sql = insert_sql.replace('<table>', in_table)
                d.executemany_no_commit(sql, in_values)
            if len(ex_values) != 0:
                sql = insert_sql.replace('<table>', ex_table)
                d.executemany_no_commit(sql, ex_values)
            d.commit()
        except:
            is_success = False
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
    except:
        is_success = False
        self.rl.exception()
    finally:
        r.close_phandomjs()
    return super(self.__class__, self).result(
        is_success,
        [main_md5, self.url, a_href, len(in_values), len(ex_values), self.queue_id])

def put_seed():
    # count the qualifying records in the seed table
    count_queue_sql = '''
        select count(*) from web_seed where status=%s and fail_times < %s;
    '''
    # count the qualifying records in web_seed_externally
    count_exter_sql = '''
        select count(*) from web_seed_externally where status=0;
    '''
    # records of the web_seed_externally table
    select_exter_limit_sql = '''
        select id,a_url,a_md5,a_host,param from web_seed_externally where status=0 limit %s,%s;
    '''
    # insert into the seed table
    insert_seed_sql = '''
        insert into web_seed (url,md5,domain,host,category) values (%s,%s,%s,%s,%s);
    '''
    # web_seed_externally status
    update_sql = '''
        update web_seed_externally set status=1 where id in(%s);
    '''
    db_util = DBUtil(_ZZ_DB)
    sql_params = [0, _QUEUE_ZZ['MAX_FAIL_TIMES']]
    res1 = db_util.read_one(count_queue_sql, sql_params)
    total_num1 = res1[0]
    if total_num1 != 0:
        print "queue has %d records,not insert!" % total_num1
        return None
    start_time = time.time()
    res2 = db_util.read_one(count_exter_sql)
    total_num2 = res2[0]
    # work out the number of pages
    page_num = total_num2 / _QUEUE_ZZ["LIMIT_NUM"] if total_num2 % _QUEUE_ZZ["LIMIT_NUM"] == 0 \
        else total_num2 / _QUEUE_ZZ["LIMIT_NUM"] + 1
    # hu = HtmlUtil()
    # u = Util()
    # insert into the seed table page by page
    try:
        ids = []
        for i in range(0, page_num):
            sql_params = [i * _QUEUE_ZZ["LIMIT_NUM"], _QUEUE_ZZ["LIMIT_NUM"]]
            res3 = db_util.read_dict(select_exter_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                url = row["a_url"]
                domain = get_tld(url)
                # host = hu.get_url_host(url)
                # md5 = u.get_md5(url)
                host = row["a_host"]
                md5 = row["a_md5"]
                category = row["param"]
                list1.append((url, md5, domain, host, category))
            # batch insert into the seed table
            db_util.executemany(insert_seed_sql, list1)
        # set status=1 on the source rows
        db_util.execute(update_sql % ",".join(ids))
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()

def put_queue(page_show_num):
    db_util = DBUtil(_ZZ_DB)
    # count the qualifying records in the queue table
    count_queue_sql = '''
        select count(*) from web_queue where is_work=%s and fail_times < %s;
    '''
    # count the qualifying records in the web_seed table
    count_seed_sql = '''
        select count(*) from web_seed where status=0;
    '''
    # page through the web_seed table
    select_seed_limit_sql = '''
        select id,url,category from web_seed where status=0 limit %s,%s;
    '''
    # insert into the queue table
    insert_queue_sql = '''
        insert into web_queue (type,action,params) values(%s,%s,%s);
    '''
    # update the status in the web_seed table
    update_sql = '''
        update web_seed set status=1 where id in(%s);
    '''
    try:
        sql_params = [0, _QUEUE_ZZ["MAX_FAIL_TIMES"]]
        res1 = db_util.read_one(count_queue_sql, sql_params)
        total_num1 = res1[0]
        if total_num1 != 0:
            print "queue has %d records,not insert!" % total_num1
            return None
        start_time = time.time()
        # count the qualifying records in the web_seed table
        res2 = db_util.read_one(count_seed_sql)
        total_num2 = res2[0]
        # work out the number of pages
        page_num = total_num2 / page_show_num if total_num2 % page_show_num == 0 \
            else total_num2 / page_show_num + 1
        # page through the records
        ids = []
        for i in range(0, page_num):
            sql_params = [i * page_show_num, page_show_num]
            print sql_params
            res3 = db_util.read_dict(select_seed_limit_sql, sql_params)
            list1 = []
            for row in res3:
                id = row["id"]
                ids.append(str(id))
                action = row["url"]
                params = row["category"]
                type = 1
                list1.append((type, action, params))
            # batch insert into the queue table
            db_util.executemany(insert_queue_sql, list1)
        # set status=1 on the source rows
        db_util.execute_no_commit(update_sql % ",".join(ids))
        db_util.commit()
        end_time = time.time()
        run_time = end_time - start_time
        print "total_num:%d, run_time:%.2f" % (total_num2, run_time)
    except Exception, err:
        db_util.rollback()
        traceback.print_exc()

def action(self):
    # crawl the qualifying url from hainiu_queue and collect every <a> tag
    # url on the requested page
    r = RequestUtil()
    hu = HtmlUtil()
    u = Util()
    is_success = True
    db_util = DBUtil(_HAINIU_DB)
    time_util = TimeUtil()
    # internal- and external-link row lists
    inner_list = []
    exter_list = []
    # md5 of the seed url
    md5 = u.get_md5(self.act)
    try:
        # request the url through phantomjs so the returned html includes the
        # page's ajax-loaded content
        html = r.http_get_phandomjs(self.act)
        # BeautifulSoup: third-party library for extracting data from HTML/XML
        soup = BeautifulSoup(html, 'lxml')
        # list of <a> dom objects
        a_docs = soup.find_all("a")
        if len(a_docs) == 0:
            is_success = False
        aset = set()
        # domain of the seed
        domain = hu.get_url_domain(self.act)
        # host of the seed
        host = hu.get_url_host(self.act)
        # timestamps: create_time, create_day, create_hour, update_time
        create_time = time_util.get_timestamp()
        # yyyymmdd
        create_day = int(time_util.now_day(format='%Y%m%d'))
        # hour of day
        create_hour = int(time_util.now_hour())
        update_time = create_time
        # params_json = json.dumps(self.params, ensure_ascii=False, encoding='utf-8')
        for a_doc in a_docs:
            # href of the <a> tag, normalized against the seed url
            a_href = hu.get_format_url(self.act, a_doc, host)
            # text content of the <a> tag
            a_title = a_doc.get_text().strip()
            if a_href == '' or a_title == '':
                continue
            if a_href in aset:
                continue
            aset.add(a_href)
            # host of the link
            a_host = hu.get_url_host(a_href)
            # md5 of the link url
            a_md5 = u.get_md5(a_href)
            # xpath of the <a> tag
            a_xpath = hu.get_dom_parent_xpath_js_new(a_doc)
            # one row of data
            row_data = (self.act, md5, self.params, domain, host, a_href,
                        a_md5, a_host, a_xpath, a_title, create_time,
                        create_day, create_hour, update_time)
            if domain in a_href:
                inner_list.append(row_data)
            else:
                exter_list.append(row_data)
        # store into the internal- or external-link table; if the url already
        # exists, only update it (so linked pages are not crawled twice)
        if len(inner_list) > 0:
            inner_insert_sql = """
                insert into hainiu_web_seed_internally
                (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,
                create_time,create_day,create_hour,update_time)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            db_util.executemany_no_commit(inner_insert_sql, inner_list)
        if len(exter_list) > 0:
            exter_insert_sql = """
                insert into hainiu_web_seed_externally
                (url,md5,param,domain,host,a_url,a_md5,a_host,a_xpath,a_title,
                create_time,create_day,create_hour,update_time)
                values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            db_util.executemany_no_commit(exter_insert_sql, exter_list)
        db_util.commit()
    except Exception, e:
        is_success = False
        db_util.rollback()
        traceback.print_exc()
    finally:
        r.close_phandomjs()
    # result layout consumed by the success/fail actions above, as documented
    # in fail_action: is_success, self.id, len(inner_list), len(exter_list), md5
    return super(self.__class__, self).result(
        is_success, [self.id, len(inner_list), len(exter_list), md5])