def success_action(self): """ 删除队列的数据信息 """ sql = "delete from hainiu_queue where id=%s" % self.id try: db_util = DBUtil(db_config) db_util.execute(sql) except Exception, message: self.logger.exception(message)
def fail_action(self): """ 1)更新失败次数、设置失败ip 2)如果失败次数达到了当前机器的最大失败次数,将is_work更新0; """ update_sql1 = """ update hainiu_queue set fail_times=fail_times+1, fail_ip='%s' where id=%s and fail_times < %s; """ update_sql2 = """ update hainiu_queue set is_work=0 where id=%s; """ try: db_util = DBUtil(db_config) u = Util() ip = u.get_local_ip() db_util.execute_no_commit(update_sql1 % (ip, self.id, self.max_fail_times)) num_1 = self.current_retry_times + 1 self.logger.info("self.current_retry_times1==> %d" % num_1) if self.current_retry_times +1 == Consumer._MAX_RETRY_TIMES \ and self.current_retry_times + 1 <self.max_fail_times: db_util.execute_no_commit(update_sql2 % self.id) db_util.commit() except Exception, message: self.logger.exception(message)
def queue_items(self): #多台机器的时候,查询带上 fail_ip != ip # select_sql = """ # select id, action, params from hainiu_queue \ # where type='1' and is_work = 0 and fail_ip != '%s' and fail_times < %d limit 0, %d for update; # """ #行锁 select_sql = """ select id, action, params from hainiu_queue \ where type='1' and is_work = 0 and fail_times < %d limit 0, %d for update; """ update_sql = """ update hainiu_queue set is_work=1 where id in (%s); """ list = [] try: db_util = DBUtil(db_config) #多个行 result = db_util.read_dict(select_sql % (self.max_fail_times, self.limit_num)) ids = [] for row_dict in result: id = row_dict['id'] action = row_dict['action'] params = row_dict['params'] c_action = HainiuConsumerAction(id, action, params, self.max_fail_times) list.append(c_action) #[1,2,3,4] ids.append(str(id)) if len(ids) != 0: ids = ','.join(ids) db_util.execute_no_commit(update_sql % ids) db_util.commit() except Exception, message: db_util.rollback_close() self.logger.exception(message)
def put_queue(self, show_num): select_count_sql = """ select count(*) from hainiu_web_seed where status = 0; """ select_limit_sql = """ select url, category from hainiu_web_seed where status = 0 limit %s, %s; """ insert_sql = """ insert into hainiu_queue (type, action, params) values (%s, %s, %s); """ db_util = DBUtil(db_config) try: #计算总数 total_num = db_util.read_one(select_count_sql) #计算总页数 page_num = total_num[0]/show_num if total_num[0] % show_num == 0 else total_num[0]/show_num + 1 i = 0 while i < page_num: limit_1 = i * show_num limit_2 = show_num print '%d , %d' % (limit_1, limit_2) sql = select_limit_sql % (limit_1, limit_2) print "select_limit_sql==> %s " % sql i += 1 #分页查询结果 result = db_util.read_dict(sql) values = [] for row_dict in result: url = row_dict['url'] category = row_dict['category'] #[(1, 'url1', 'c1'),(1,'url2','c2')] values.append((1, url, category)) print "insert values ==> %s" % values #将查询的结果进行批量insert插入 db_util.executemany(insert_sql, values) #time.sleep(5) except Exception, message: traceback.print_exc(message)
def push_queue_items():
    """
    Move pending internal seeds (hainiu_web_seed_internally, status=0) into
    hainiu_queue as type-3 download items, page by page, flipping each
    batch's status to 1 as it is queued. Skips entirely if unprocessed
    type-3 queue rows remain from a previous run.
    """
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        # Previous queue batch not drained yet -> do not enqueue more.
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info(
                'last download_page queue not finish,last queue %s unFinish' %
                (queue_total))
            return
        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # Offset is always 0: each pass flips the fetched rows' status
            # to 1 below, so the next select sees the following batch.
            sql = selec_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''
                id = l[2]
                # Queue param carries the source row id: "<id>##<param>".
                param = '%s##%s' % (str(id), param1)
                values.append((url, param))
                id_values.append(str(id))
            if id_values.__len__() != 0:
                d.executemany_no_commit(
                    insert_news_seed_internally_queue_items_sql,
                    values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                # NOTE(review): execute() presumably commits, finalizing both
                # the inserts and the status update together — confirm DBUtil.
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push seed_internally queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def action(self): is_success = True t = TimeUtil() u = Util() hu = HtmlUtil() r = RequestUtil() redis_util = RedisUtill() redis_dict_values = {} # redis_dict_keys = [] in_values = [] ex_values = [] a_href = '' main_md5 = u.get_md5(self.url) update_time = t.get_timestamp() print update_time create_time = update_time create_day = int(t.now_day().replace('-', '')) create_hour = int(t.now_hour()) try: html = r.http_get_phandomjs(self.url) domain = hu.get_url_domain(self.url) soup = BeautifulSoup(html, 'lxml') a_docs = soup.find_all("a") a_set = set() a_param = {} out_json_srt = '' status = 0 host = hu.get_url_host(self.url) for a in a_docs: a_href = hu.get_format_url(a, host) a_title = a.get_text().strip() if a_href == '' or a_title == '': continue if a_set.__contains__(a_href): continue a_set.add(a_href) req = urllib2.Request(url=a_href) a_host = req.get_host() if req.get_host() is not None else '' a_md5 = u.get_md5(a_href) if a_title != '': a_param['title'] = a_title out_json_srt = json.dumps(a_param, ensure_ascii=False) a_xpath = hu.get_dom_parent_xpath_js(a) insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath, create_time, create_day, create_hour, update_time, status, MySQLdb.escape_string(self.url), MySQLdb.escape_string(a_href), MySQLdb.escape_string(a_title), out_json_srt) # print insert_values if a_host.__contains__(domain): in_values.append(insert_values) dict_exist_key = "exist:%s" % a_md5 redis_dict_values[dict_exist_key] = a_href redis_dict_keys.append(dict_exist_key) else: ex_values.append(insert_values) in_table = 'hainiu_web_seed_internally' ex_table = 'hainiu_web_seed_externally' insert_sql = """ insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=values(update_time) ; """ try: d = DBUtil(config._HAINIU_DB) #设置会话字符集为 utf8mb4 d.execute_no_commit("set NAMES utf8mb4;") if 
in_values.__len__() != 0: sql = insert_sql.replace('<table>', in_table) d.executemany_no_commit(sql, in_values) #拿key去redis查是否存在 exist:a_md5,得到这些key对应的values,也就是url列表 redis_exist_values = redis_util.get_values_batch_keys( redis_dict_keys) #将存在的values列表转换成exist:a_md5形式 redis_exist_keys = [ "exist:%s" % u.get_md5(rev) for rev in redis_exist_values if rev != None ] #判断本次入库的数据中那些是在redis中存在的,如果不存在就生成down:a_md5 exits:a_md5这两个key放到redis中 redis_dict_down_values = {} for key, value in redis_dict_values.items(): if key not in redis_exist_keys: redis_dict_down_values["down:%s" % u.get_md5(value)] = value redis_dict_down_values[key] = value if redis_dict_down_values.__len__() != 0: redis_util.set_batch_datas(redis_dict_down_values) if ex_values.__len__() != 0: sql = insert_sql.replace('<table>', ex_table) d.executemany_no_commit(sql, ex_values) d.commit() except: is_success = False self.rl.exception() self.rl.error(sql) d.rollback() finally: d.close() except: is_success = False self.rl.exception() finally: r.close_phandomjs() return super(self.__class__, self).result(is_success, [ main_md5, self.url, a_href, in_values.__len__(), ex_values.__len__(), self.queue_id ])
def push_queue_items(): # 符合 写入的种子的队列数据的数量 count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;""" # 生成写入队列数据 条件: type=3 insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);""" # 日志 rl = LogUtil().get_base_logger() redisdb = RedisUtill() try: # 开始时间 starttime = time.clock() redis_data_statu = True # 线程锁 lock_key = 'get_news_seed_internally_data' sql = "" total_all = 0 d = DBUtil(config._HAINIU_DB) d.execute_no_commit("set NAMES utf8mb4;") #符合 写入的种子的队列数据的数量 --- 之前的队列数据还没有处理完,所以不重新写队列数据到队列中 sql = count_news_seed_queue_sql queue_total = d.read_one(sql)[0] if queue_total != 0: rl.info( 'last download_page queue not finish,last queue %s unFinish' % (queue_total)) # return while redis_data_statu: is_lock = redisdb.get_conn().exists(lock_key) if is_lock == False: #锁上线程 --- 10 秒失效 lockd = redisdb.get_lock(lock_key, 10) if lockd == False: rl.info('无法获取线程锁,退出采集下载queue线程 ') continue ips = config._REDIS_CLUSTER_CONFIG['IPS'] port = config._REDIS_CLUSTER_CONFIG['PORT'] def scan_limit_to_queue_table(host, port, cursor, match, count): total_num = 0 r = redis.Redis(host, port) rs = r.scan(cursor, match, count) next_num = rs[0] key_list = [] value_list = [] for k in rs[1]: key_list.append(k) total_num += 1 # print key_list print total_num values = redisdb.get_values_batch_keys(key_list) for v in values: value_list.append((v, '')) print value_list sql = insert_news_seed_internally_queue_items_sql d.executemany(sql, value_list) redisdb.delete_batch(rs[1]) if next_num == 0: return total_num return total_num + scan_limit_to_queue_table( host, port, next_num, match, count) total_num = 0 for ip in ips: total_num += scan_limit_to_queue_table( ip, port, 0, 'down:*', 10) print '======' print total_num if total_num > 0: break redisdb.release(lock_key) else: rl.info('其他线程正在处理,请等待 ') time.sleep(0.3) endtime = time.time() # 一共执行的时间 worksec = int(round((endtime - starttime))) # 日志 rl.info( 'push 
seed_internally queue finish,total items %s,action time %s\'s' % (total_all, worksec)) except: rl.exception() rl.error(sql) d.rollback() finally: redisdb.release(lock_key) d.close()
def fail_action(self, values):
    """
    Record a failed crawl for a type-1 queue item.

    values layout (see success_action/action): values[0] = seed md5,
    values[5] = queue row id. Increments fail counters on both the queue
    row and the seed row; once the local retry counter reaches
    Consumer._MAX_RETRY_TIMES the queue row is reset to type=1.

    FIX: the except path called d.commit() right after d.rollback(),
    committing an empty transaction — the stray commit is removed; `sql`
    is pre-initialized so the error log cannot raise NameError.
    """
    update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
        update hainiu_queue set type=1 where id=%s;
    """
    update_hainiu_news_seed_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
    """
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        main_md5 = values[0]
        sql = update_hainiu_news_seed_sql % (ip, main_md5)
        d.execute_no_commit(sql)
        if (self.current_retry_times == Consumer._MAX_RETRY_TIMES):
            sql = update_sql_1 % (id)
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
def queue_items(self):
    """
    Claim up to self.limit pending type-1 rows from hainiu_queue (row
    locks via "for update"), mark them is_work=1, and return them as
    NewsFindConsumer instances.

    FIX: the except path called d.commit() right after d.rollback()
    (an empty commit) — removed; builtin names `list`/`tuple` are no
    longer shadowed.
    """
    # Multi-machine variant (kept from original note): add
    #   "and fail_ip <> '%s'" to skip rows that failed on this host.
    select_queue_sql = """
        select id,action,params from hainiu_queue where
        type=1 and is_work =0 and fail_times <=%s
        limit 0,%s for update;
    """
    update_queue_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
    """
    consumers = []
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = select_queue_sql % (self.fail_times, self.limit)
        rows = d.read_tuple(sql)
        if len(rows) == 0:
            return consumers
        queue_ids = ''
        for row in rows:
            queue_id = row[0]
            url = row[1]
            param = '' if row[2] is None else row[2]
            queue_ids += str(queue_id) + ','
            consumers.append(NewsFindConsumer(url, param, queue_id))
        queue_ids = queue_ids[:-1]
        d.execute(update_queue_sql % (queue_ids))
    except:
        self.rl.exception()
        d.rollback()
    finally:
        d.close()
    return consumers
def success_action(self, values):
    """
    Finalize a successful type-1 crawl: delete the queue row and stamp the
    seed with its internal/external link counts and crawl time.

    values layout (from action's result): values[0] = seed md5,
    values[3] = internal count, values[4] = external count,
    values[5] = queue row id.

    FIX: the except path called d.commit() right after d.rollback()
    (an empty commit) — removed; `sql` is pre-initialized so the error
    log cannot raise NameError when DBUtil construction fails.
    """
    delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    update_hainiu_news_seed_sql = """
        update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";"""
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[5]
        sql = delete_sql % id
        # TODO: queue rows kept during testing (original note)
        d.execute_no_commit(sql)
        sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
        d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
def push_queue_items():
    """
    Enqueue every pending seed (hainiu_web_seed, status=0) into
    hainiu_queue as a type-1 news-find item, paging 1000 rows at a time.
    The queue param is a JSON blob with the seed's category and a
    publisher name derived from the URL's registered domain.

    FIX: `sql` is initialized before the try block — previously, if the
    very first query failed, `rl.error(sql)` in the except handler raised
    NameError and masked the real exception.
    """
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        sql = count_news_seed_queue_sql
        # Previous batch not drained yet -> skip this round.
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' %
                    (queue_total))
            return
        starttime = time.clock()
        sql = count_news_seed_sql
        total = long(d.read_one(sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            rows = d.read_tuple(sql)
            values = []
            for row in rows:
                url = row[0]
                # "example.com" -> "example"; keep as-is when no dot.
                publisher = get_fld(url)
                publisher = publisher[0:publisher.index('.')] \
                    if publisher.__contains__('.') else publisher
                param = {}
                param['category'] = row[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if values.__len__() != 0:
                # Shuffle so one publisher's URLs don't cluster in the queue.
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info(
            'push news_find queue finish,total items %s,action time %s\'s' %
            (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def action(self):
    """
    Download self.url with PhantomJS, extract the page title, archive the
    HTML to a time-bucketed file, and upsert a status row into
    hainiu_web_page. Returns the superclass result wrapper:
    (is_success, [md5, update_time, queue_id]).

    NOTE: the `values` list is built strictly positionally to match the
    insert statement's column order — do not reorder the appends.
    """
    is_success = True
    t = TimeUtil()
    file_util = FileUtil()
    u = Util()
    hu = HtmlUtil()
    r = RequestUtil()
    values = []
    md5 = u.get_md5(self.url)
    update_time = t.get_timestamp()
    create_time = update_time
    create_day = int(t.now_day().replace('-', ''))
    create_hour = int(t.now_hour())
    now_minute = int(t.now_min())
    # Snap the current minute down to a 5-minute bucket boundary.
    for i in xrange(60,-5,-5):
        if now_minute>=i:
            now_minute=i
            break
    # Format as yyyyMMddHHmm, e.g. 201903181505 (zero-pad minutes < 10).
    now_minute = t.now_time(format='%Y%m%d%H') + ('0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))
    # Positional insert values: url, md5, create_time, create_day,
    # create_hour, domain (placeholder, set after fetch), param, update_time.
    values.append(MySQLdb.escape_string(self.url))
    values.append(md5)
    values.append(create_time)
    values.append(create_day)
    values.append(create_hour)
    values.append('')
    values.append(MySQLdb.escape_string(self.param))
    values.append(update_time)
    try:
        html = r.http_get_phandomjs(self.url)
        domain = hu.get_url_domain(self.url)
        values[5] = domain  # fill the domain placeholder
        soup = BeautifulSoup(html, 'lxml')
        title_doc = soup.find('title')
        title = title_doc.contents[0] if title_doc is not None and len(title_doc.contents) == 1 else ''
        host = hu.get_url_host(self.url)
        values.append(host)
        values.append(MySQLdb.escape_string(title))
        # Kafka publishing disabled (kept from original); is_success stays
        # True so the HTML is archived locally instead.
        # k = KafkaUtil(config._KAFKA_CONFIG)
        # html = html.replace(content._SEQ1,'').replace(content._SEQ2,content._SEQ4)
        # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
        # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
        # push_str = bytes(push_str)
        # is_success = k.push_message(push_str)
        if is_success:
            self.save_file(create_time,file_util,now_minute,u,self.url,html)
        else:
            self.logger.error("kafka push error")
    except:
        is_success = False
        # Keep positional alignment: empty host and title placeholders.
        values.append('')
        values.append('')
        self.logger.exception()
    finally:
        r.close_phandomjs()
    try:
        if is_success:
            values.append(1)  # status=1: fetched OK
            insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s) on DUPLICATE KEY UPDATE 
                update_time=values(update_time);
            """
        else:
            # status=2: failed; also record which machine failed.
            ip = u.get_local_ip()
            values.append(ip)
            values.append(2)
            insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s) on DUPLICATE KEY UPDATE
                fail_times=fail_times+1,fail_ip=values(fail_ip);
            """
        d = DBUtil(config._HAINIU_DB)
        sql = insert_web_page_sql % tuple(values)
        d.execute(sql)
    except:
        is_success = False
        self.logger.exception()
        self.logger.error(sql)
        d.rollback()
        d.commit()
    finally:
        d.close()
    return super(self.__class__, self).result(is_success, [md5,update_time,self.queue_id])
def fail_action(self, values):
    """
    Record a failed download for a type-3 queue item.

    values layout (see success_action): values[0] = link a_md5,
    values[1] = update_time, values[2] = queue row id. Increments fail
    counters on both the queue row and the internal-seed row; once the
    local retry counter reaches Consumer._MAX_RETRY_TIMES the queue row
    is released (is_work=0).

    FIX: the except path called d.commit() right after d.rollback(),
    committing an empty transaction — the stray commit is removed; `sql`
    is pre-initialized so the error log cannot raise NameError.
    """
    update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql_1 = """
        update hainiu_queue set is_work=0 where id=%s;
    """
    update_hainiu_news_internally_sql = """
        update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where a_md5="%s";
    """
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[2]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        sql = update_hainiu_news_internally_sql % (ip, values[1], values[0])
        d.execute_no_commit(sql)
        if (self.current_retry_times == Consumer._MAX_RETRY_TIMES):
            sql = update_sql_1 % (id)
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.logger.exception()
        self.logger.error(sql)
        d.rollback()
    finally:
        d.close()
def success_action(self, values):
    """
    Finalize a successful type-3 download: delete the queue row and stamp
    the internal-seed row's update_time.

    values layout (from action's result [md5, update_time, queue_id] and
    as consumed by fail_action): values[0] = link a_md5,
    values[1] = update_time, values[2] = queue row id.

    FIX 1: the seed update used values[2] (the queue row id) as the
    update_time value — it now uses values[1], matching fail_action.
    FIX 2: the except path called d.commit() right after d.rollback()
    (an empty commit) — removed; `sql` is pre-initialized so the error
    log cannot raise NameError.
    """
    delete_sql = """
        delete from hainiu_queue where id=%s;
    """
    update_hainiu_news_internally_sql = """
        update hainiu_web_seed_internally set update_time=%s where a_md5="%s";
    """
    sql = ''
    try:
        d = DBUtil(config._HAINIU_DB)
        id = values[2]
        sql = delete_sql % id
        # TODO: queue rows kept during testing (original note)
        d.execute_no_commit(sql)
        sql = update_hainiu_news_internally_sql % (values[1], values[0])
        d.execute_no_commit(sql)
        d.commit()
    except:
        self.logger.exception()
        self.logger.error(sql)
        d.rollback()
    finally:
        d.close()