class HainiuConsumerAction(ConsumerAction):
    def __init__(self, id, act, params, max_fail_times):
        super(self.__class__, self).__init__()
        self.id = id
        self.act = act
        self.params = params
        self.max_fail_times = max_fail_times
        self.logger = LogUtil().get_logger("HainiuConsumerAction", "HainiuConsumerAction")

    def action(self):
        print 'id=%s, action=%s, params=%s' % (self.id, self.act, self.params)
        return self.result(True, [self.id, self.act, self.params])

    def success_action(self):
        """
        Delete this item's record from the queue table.
        """
        sql = "delete from hainiu_queue where id=%s" % self.id
        try:
            db_util = DBUtil(db_config)
            db_util.execute(sql)
        except Exception, message:
            self.logger.exception(message)
        finally:
            db_util.close()
def __init__(self, queue, p_action, name, p_sleep_time, c_max_num, c_max_sleep_time, c_retry_times):
    """
    Producer thread initialization parameters.
    :param queue: the work queue
    :param p_action: producer action instance
    :param name: thread name
    :param p_sleep_time: how long the producer sleeps between production rounds
    :param c_max_num: maximum number of consumer threads
    :param c_max_sleep_time: maximum sleep time between consumer work cycles
    :param c_retry_times: maximum retry times of a consumer action
    """
    super(self.__class__, self).__init__()
    self.queue = queue
    self.p_action = p_action
    self.name = name
    self.p_sleep_time = p_sleep_time
    self.c_max_num = c_max_num
    self.c_max_sleep_time = c_max_sleep_time
    self.c_retry_times = c_retry_times
    # verify that p_action is a ProducerAction instance; raise if it is not
    if not isinstance(self.p_action, ProducerAction):
        raise Exception("%s is not ProducerAction instance" % type(self.p_action).__name__)
    # initialize logger
    self.logger = LogUtil().get_logger("producer_%s" % self.name, "producer_%s" % self.name)
def __init__(self, url, param, queue_id, pro_flag, queue_name):
    ConsumerAction.__init__(self)
    self.url = url[:-1] if url.endswith('/') else url
    self.param = param
    self.queue_id = queue_id
    self.pro_flag = pro_flag
    self.queue_name = queue_name
    self.logger = LogUtil().get_logger('consumer', 'consumer' + queue_name)
def __init__(self, id, act, params, max_fail_times):
    super(self.__class__, self).__init__()
    self.id = id
    self.act = act
    self.params = params
    self.max_fail_times = max_fail_times
    self.logger = LogUtil().get_logger("HainiuConsumerAction", "HainiuConsumerAction")
def __init__(self, queue, name, max_sleep_time, retry_times):
    super(self.__class__, self).__init__()
    self.queue = queue
    self.name = name
    self.max_sleep_time = max_sleep_time
    self.retry_times = retry_times
    Consumer._MAX_RETRY_TIMES = retry_times
    # initialize logger
    self.logger = LogUtil().get_logger("consumer_%s" % self.name, "consumer_%s" % self.name)
def push_queue_items():
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    count_news_seed_internally_sql = """select count(*) from hainiu_web_seed_internally where status=0 for update;"""
    selec_news_seed_internally_sql = """select a_url,param,id from hainiu_web_seed_internally where status=0 limit %s,%s;"""
    update_news_seed_internally_sql = """update hainiu_web_seed_internally set status=1 where id in (%s);"""
    rl = LogUtil().get_base_logger()
    try:
        sql = ""
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        d = DBUtil(config._HAINIU_DB)
        total = long(d.read_one(count_news_seed_internally_sql)[0])
        page_size = 2
        page = total / page_size
        for i in range(0, page + 1):
            # always select from offset 0: rows pushed in the previous round
            # are updated to status=1 and drop out of the result set
            sql = selec_news_seed_internally_sql % (0, page_size)
            list = d.read_tuple(sql)
            values = []
            id_values = []
            for l in list:
                url = l[0]
                url = url if url is not None else ''
                param = l[1]
                param1 = param if param is not None else ''
                id = l[2]
                param = '%s##%s' % (str(id), param1)
                values.append((url, param))
                id_values.append(str(id))
            if id_values.__len__() != 0:
                d.executemany_no_commit(insert_news_seed_internally_queue_items_sql, values)
                ids = ','.join(id_values)
                sql = update_news_seed_internally_sql % (ids)
                d.execute(sql)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
class HainiuProducerAction(ProducerAction):
    def __init__(self, max_fail_times, limit_num):
        super(self.__class__, self).__init__()
        self.max_fail_times = max_fail_times
        self.limit_num = limit_num
        self.logger = LogUtil().get_logger('HainiuProducerAction', 'HainiuProducerAction')

    def queue_items(self):
        # when running on multiple machines, add "fail_ip != ip" to the query:
        # select_sql = """
        # select id, action, params from hainiu_queue \
        # where type='1' and is_work = 0 and fail_ip != '%s' and fail_times < %d limit 0, %d for update;
        # """

        # row lock via "for update"
        select_sql = """
        select id, action, params from hainiu_queue \
        where type='1' and is_work = 0 and fail_times < %d limit 0, %d for update;
        """
        update_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """
        list = []
        try:
            db_util = DBUtil(db_config)
            # read multiple rows as dicts
            result = db_util.read_dict(select_sql % (self.max_fail_times, self.limit_num))
            ids = []
            for row_dict in result:
                id = row_dict['id']
                action = row_dict['action']
                params = row_dict['params']
                c_action = HainiuConsumerAction(id, action, params, self.max_fail_times)
                list.append(c_action)
                # e.g. [1,2,3,4]
                ids.append(str(id))
            if len(ids) != 0:
                ids = ','.join(ids)
                db_util.execute_no_commit(update_sql % ids)
            db_util.commit()
        except Exception, message:
            db_util.rollback_close()
            self.logger.exception(message)
        finally:
            db_util.close()
        return list
class DownLoadProducer(ProducerAction):
    def __init__(self, limit, pro_flag, fail_times, queue_name):
        self.limit = limit
        self.fail_times = fail_times
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        # multi-machine variant: also filter on fail_ip <> local ip
        # select_queue_sql = """
        # select id,action,params from hainiu_queue where
        # type=3 and is_work =0 and fail_times <=%s and fail_ip <> '%s'
        # limit 0,%s for update;
        # """
        select_queue_sql = """
        select id,action,params from hainiu_queue where
        type=3 and is_work =0 and fail_times <=%s
        limit 0,%s for update;
        """
        update_queue_sql = """
        update hainiu_queue set is_work=1 where id in (%s);
        """
        list = []
        try:
            d = DBUtil(config._HAINIU_DB)
            sql = select_queue_sql % (self.fail_times, self.limit)
            tuple = d.read_tuple(sql)
            if len(tuple) == 0:
                return list
            queue_ids = ''
            for t in tuple:
                queue_id = t[0]
                url = t[1]
                param = '' if t[2] is None else t[2]
                queue_ids += str(queue_id) + ','
                c = DownLoadConsumer(url, param, queue_id, self.pro_flag, self.queue_name)
                list.append(c)
            queue_ids = queue_ids[:-1]
            d.execute(update_queue_sql % (queue_ids))
        except:
            self.rl.exception()
            d.rollback()
            d.commit()
        finally:
            d.close()
        return list
def push_queue_items():
    count_news_seed_sql = """select count(*) from hainiu_web_seed where status=0;"""
    select_news_seed_sql = """select url,category,last_crawl_time from hainiu_web_seed where status=0 limit %s,%s;"""
    insert_news_seed_queue_items_sql = """insert into hainiu_queue (type,action,params) values(1,%s,%s);"""
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=1 and fail_times=0;"""
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._HAINIU_DB)
        queue_total = d.read_one(count_news_seed_queue_sql)[0]
        if queue_total != 0:
            rl.info('last news_find queue not finish,last queue %s unFinish' % (queue_total))
            return

        starttime = time.clock()
        total = long(d.read_one(count_news_seed_sql)[0])
        page_size = 1000
        page = total / page_size
        for i in range(0, page + 1):
            sql = select_news_seed_sql % (i * page_size, page_size)
            list = d.read_tuple(sql)
            values = []
            for l in list:
                url = l[0]
                publisher = get_fld(url)
                publisher = publisher[0:publisher.index(('.'))] if publisher.__contains__('.') else publisher
                param = {}
                param['category'] = l[1]
                param['publisher'] = publisher
                param = json.dumps(param, ensure_ascii=False)
                values.append((url, param))
            if values.__len__() != 0:
                random.shuffle(values)
                d.executemany(insert_news_seed_queue_items_sql, values)
        endtime = time.clock()
        worksec = int(round((endtime - starttime)))
        rl.info('push news_find queue finish,total items %s,action time %s\'s' % (total, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
class DemoConsumerAction(ConsumerAction):
    def __init__(self, name):
        super(self.__class__, self).__init__()
        self.name = name
        self.logger = LogUtil().get_logger("DemoConsumerAction", 'DemoConsumerAction')

    def action(self):
        self.logger.info('consume %s' % self.name)
        flag = True
        return self.result(flag, [self.name])

    def success_action(self):
        print 'success_op() ==> %s' % self.name

    def fail_action(self):
        print 'fail_op() ==> %s' % self.name
class Producer(threading.Thread):
    """
    Producer thread.
    """

    def __init__(self, queue, p_action, name, p_sleep_time, c_max_num, c_max_sleep_time, c_retry_times):
        """
        Producer thread initialization parameters.
        :param queue: the work queue
        :param p_action: producer action instance
        :param name: thread name
        :param p_sleep_time: how long the producer sleeps between production rounds
        :param c_max_num: maximum number of consumer threads
        :param c_max_sleep_time: maximum sleep time between consumer work cycles
        :param c_retry_times: maximum retry times of a consumer action
        """
        super(self.__class__, self).__init__()
        self.queue = queue
        self.p_action = p_action
        self.name = name
        self.p_sleep_time = p_sleep_time
        self.c_max_num = c_max_num
        self.c_max_sleep_time = c_max_sleep_time
        self.c_retry_times = c_retry_times
        # verify that p_action is a ProducerAction instance; raise if it is not
        if not isinstance(self.p_action, ProducerAction):
            raise Exception("%s is not ProducerAction instance" % type(self.p_action).__name__)
        # initialize logger
        self.logger = LogUtil().get_logger("producer_%s" % self.name, "producer_%s" % self.name)

    def run(self):
        list = []
        while True:
            try:
                # record the start time
                start_time = time.time()
                # if the list is empty, call p_action.queue_items() to
                # produce a list of ConsumerAction subclass instances
                if len(list) == 0:
                    list = self.p_action.queue_items()
                # how many actions were produced this round
                total_num = len(list)
                self.logger.info(
                    "queue.name=【producer_%s】, current time produce %d "
                    "actions" % (self.name, total_num))
                while True:
                    # list drained, go back and produce again
                    if len(list) == 0:
                        break
                    # only put into the queue while the number of unfinished
                    # tasks is at or below the maximum number of consumer threads
                    if self.queue.unfinished_tasks <= self.c_max_num:
                        c_action = list.pop()
                        self.queue.put(c_action)
                # record the end time
                end_time = time.time()
                run_time = end_time - start_time
                # actions produced per minute
                if run_time == 0:
                    rate = total_num
                else:
                    rate = round(float(total_num * 60) / run_time, 2)
                self.logger.info(
                    "queue.name=【producer_%s】, total_num=%d,"
                    " produce %d actions/min, sleep_time=%d" %
                    (self.name, total_num, rate, self.p_sleep_time))
                # sleep before the next production round
                time.sleep(self.p_sleep_time)
            except Exception, message:
                self.logger.exception(message)
class Consumer(threading.Thread):
    _MAX_RETRY_TIMES = 0

    def __init__(self, queue, name, max_sleep_time, retry_times):
        super(self.__class__, self).__init__()
        self.queue = queue
        self.name = name
        self.max_sleep_time = max_sleep_time
        self.retry_times = retry_times
        Consumer._MAX_RETRY_TIMES = retry_times
        # initialize logger
        self.logger = LogUtil().get_logger("consumer_%s" % self.name, "consumer_%s" % self.name)

    def run(self):
        while True:
            try:
                # if the queue is empty, sleep for a while and check again
                if self.queue.empty():
                    time.sleep(self.max_sleep_time)
                    continue
                # record the start time
                start_time = time.time()
                # take an action out of the queue
                action = self.queue.get()
                action.consumer_thread_name = self.name
                # call action() to do the actual consuming
                result = action.action()
                rs = 'SUCCESS' if result[0] else 'FAIL'
                # record the end time
                end_time = time.time()
                # pick a random sleep time
                random_sleep_time = round(random.uniform(0.2, self.max_sleep_time), 2)
                run_time = end_time - start_time
                self.logger.info(
                    "queue.name=【consumer_%s】, run_time=%d, sleep_time=%d, retry_times=%d, "
                    " result=%s, detail=%s" %
                    (self.name, run_time, random_sleep_time,
                     action.current_retry_times, rs, result[1:]))
                # on failure, retry while the retry count is below the maximum
                if not result[0] and action.current_retry_times < self.retry_times:
                    action.current_retry_times += 1
                    self.queue.put(action)
                # mark the task done whether it succeeded or failed
                self.queue.task_done()
                # random sleep before the next round
                time.sleep(random_sleep_time)
            except Exception, message:
                self.logger.exception(message)
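For reference, a minimal sketch of how the framework above is typically wired together: one Producer thread fills a Queue with ConsumerAction instances and several Consumer threads drain it. The DemoProducerAction class and the parameter values here are illustrative assumptions, not part of the code above; only DemoConsumerAction, Producer and Consumer come from the sections shown earlier.

# Minimal wiring sketch (Python 2). DemoProducerAction is hypothetical;
# it assumes the classes defined above are in scope.
import Queue


class DemoProducerAction(ProducerAction):
    # hypothetical producer action: emit five DemoConsumerAction items per round
    def queue_items(self):
        return [DemoConsumerAction('item_%s' % i) for i in range(5)]


if __name__ == '__main__':
    q = Queue.Queue()
    # one producer thread feeding the queue, producing every 5 seconds
    p = Producer(q, DemoProducerAction(), 'demo', 5, 3, 2, 3)
    p.start()
    # three consumer threads draining the queue
    for i in range(3):
        Consumer(q, 'demo_%s' % i, 2, 3).start()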
class DownLoadConsumer(ConsumerAction):
    def __init__(self, url, param, queue_id, pro_flag, queue_name):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.pro_flag = pro_flag
        self.queue_name = queue_name
        self.logger = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        t = TimeUtil()
        file_util = FileUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        values = []
        md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        now_minute = int(t.now_min())
        # round the minute down to a 5-minute interval
        for i in xrange(60, -5, -5):
            if now_minute >= i:
                now_minute = i
                break
        # format as yyyyMMddHHmm, e.g. 201903181505
        now_minute = t.now_time(format='%Y%m%d%H') + ('0%s' % (str(now_minute)) if now_minute < 10 else str(now_minute))

        values.append(MySQLdb.escape_string(self.url))
        values.append(md5)
        values.append(create_time)
        values.append(create_day)
        values.append(create_hour)
        values.append('')
        values.append(MySQLdb.escape_string(self.param))
        values.append(update_time)
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)
            values[5] = domain

            soup = BeautifulSoup(html, 'lxml')
            title_doc = soup.find('title')
            title = title_doc.contents[0] if title_doc is not None and len(title_doc.contents) == 1 else ''

            host = hu.get_url_host(self.url)
            values.append(host)
            values.append(MySQLdb.escape_string(title))

            # k = KafkaUtil(config._KAFKA_CONFIG)
            # html = html.replace(content._SEQ1,'').replace(content._SEQ2,content._SEQ4)
            # push_str = content._SEQ3.join(('%s','%s')) % (self.url,html)
            # push_str = content._SEQ3.join(('%s','%s')) % (u.get_md5(push_str),push_str)
            # push_str = bytes(push_str)
            # is_success = k.push_message(push_str)

            if is_success:
                self.save_file(create_time, file_util, now_minute, u, self.url, html)
            else:
                self.logger.error("kafka push error")
        except:
            is_success = False
            values.append('')
            values.append('')
            self.logger.exception()
        finally:
            r.close_phandomjs()

        try:
            if is_success:
                values.append(1)
                insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                title,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s",%s)
                on DUPLICATE KEY UPDATE update_time=values(update_time);
                """
            else:
                ip = u.get_local_ip()
                values.append(ip)
                values.append(2)
                insert_web_page_sql = """
                insert into hainiu_web_page (url,md5,create_time,create_day,create_hour,domain,param,update_time,host,
                title,fail_ip,status) values ("%s","%s",%s,%s,%s,"%s","%s",%s,"%s","%s","%s",%s)
                on DUPLICATE KEY UPDATE fail_times=fail_times+1,fail_ip=values(fail_ip);
                """
            d = DBUtil(config._HAINIU_DB)
            sql = insert_web_page_sql % tuple(values)
            d.execute(sql)
        except:
            is_success = False
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()
        return super(self.__class__, self).result(is_success, [md5, update_time, self.queue_id])

    def success_action(self, values):
        delete_sql = """
        delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_internally_sql = """
        update hainiu_web_seed_internally set update_time=%s where a_md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            # values = [md5, update_time, queue_id]
            id = values[2]
            sql = delete_sql % id
            # TODO: do not delete from the queue table while testing
            d.execute_no_commit(sql)

            sql = update_hainiu_news_internally_sql % (values[1], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except:
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
        update hainiu_queue set is_work=0 where id=%s;
        """
        update_hainiu_news_internally_sql = """
        update hainiu_web_seed_internally set fail_times=fail_times+1,fail_ip="%s",update_time=%s where a_md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[2]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)

            sql = update_hainiu_news_internally_sql % (ip, values[1], values[0])
            d.execute_no_commit(sql)

            # when the maximum retry count is reached, mark the queue item as no longer being worked on
            if (self.current_retry_times == Consumer._MAX_RETRY_TIMES):
                sql = update_sql_1 % (id)
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.logger.exception()
            self.logger.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def save_file(self, create_time, file_util, now_minute, u, url, html):
        # e.g. downloadnews_1_one_201903181505
        # TODO: single-machine debugging of the file download
        # self.consumer_thread_name = "downloadnews"
        # html_file_path_cache[self.consumer_thread_name] = 'downloadnews_one_201903211115'
        now_file_name = '%s_%s_%s' % (self.consumer_thread_name, self.pro_flag, now_minute)
        # get last_file_name from the file cache dict, keyed by the current thread name
        last_file_name = u.get_dict_value(html_file_path_cache, self.consumer_thread_name)
        print 'last_file_name==>%s' % last_file_name
        print 'now_file_name==>%s' % now_file_name
        # store now_file_name back into the dict under the current thread name
        html_file_path_cache[self.consumer_thread_name] = now_file_name
        # e.g. /tmp/python/hainiu_cralwer/data/tmp/downloadnews_1_one
        tmp_path = config._LOCAL_DATA_DIR % ('%s/%s_%s' % ('tmp', self.consumer_thread_name, self.pro_flag))
        # records are separated by a newline by default
        start_char = content._SEQ2
        # when writing the first record of a new file, do not prepend the separator
        if last_file_name is None or now_file_name != last_file_name:
            start_char = ''
            # if the previous tmp file exists and has data, move it to the done
            # directory and rename it
            if os.path.exists(tmp_path) and os.path.getsize(tmp_path) > 0:
                # e.g. /tmp/python/hainiu_cralwer/data/done/downloadnews_1_one_201903181505_1545376668
                done_path = config._LOCAL_DATA_DIR % ('%s/%s_%s' % ('done', now_file_name, create_time))
                shutil.move(tmp_path, done_path)
        # otherwise keep appending records to the current file
        html = html.replace(content._SEQ1, '').replace(content._SEQ2, content._SEQ4)
        record_str = content._SEQ3.join(('%s', '%s')) % (url, html)
        record_str = content._SEQ3.join(('%s', '%s')) % (u.get_md5(record_str), record_str)
        html_record_format_str = start_char + record_str
        file_util.write_file_content_pattern(tmp_path, html_record_format_str, pattern='a')
def __init__(self, max_fail_times, limit_num):
    super(self.__class__, self).__init__()
    self.max_fail_times = max_fail_times
    self.limit_num = limit_num
    self.logger = LogUtil().get_logger('HainiuProducerAction', 'HainiuProducerAction')
def __init__(self, name):
    super(self.__class__, self).__init__()
    self.name = name
    self.logger = LogUtil().get_logger("DemoConsumerAction", 'DemoConsumerAction')
def push_queue_items():
    # number of queue rows for these seeds that are still pending
    count_news_seed_queue_sql = """select count(*) from hainiu_queue where type=3 and fail_times=0;"""
    # insert queue rows with type=3
    insert_news_seed_internally_queue_items_sql = """insert into hainiu_queue (type,action,params) values(3,%s,%s);"""
    # logger
    rl = LogUtil().get_base_logger()
    redisdb = RedisUtill()
    try:
        # start time
        starttime = time.clock()
        redis_data_statu = True
        # lock key shared between workers
        lock_key = 'get_news_seed_internally_data'
        sql = ""
        total_all = 0
        d = DBUtil(config._HAINIU_DB)
        d.execute_no_commit("set NAMES utf8mb4;")

        # if the previous batch of queue rows has not been processed yet,
        # do not push new rows into the queue
        sql = count_news_seed_queue_sql
        queue_total = d.read_one(sql)[0]
        if queue_total != 0:
            rl.info('last download_page queue not finish,last queue %s unFinish' % (queue_total))
            # return

        while redis_data_statu:
            is_lock = redisdb.get_conn().exists(lock_key)
            if is_lock == False:
                # acquire the lock; it expires after 10 seconds
                lockd = redisdb.get_lock(lock_key, 10)
                if lockd == False:
                    rl.info('could not acquire the lock, exit this download queue push round')
                    continue

                ips = config._REDIS_CLUSTER_CONFIG['IPS']
                port = config._REDIS_CLUSTER_CONFIG['PORT']

                def scan_limit_to_queue_table(host, port, cursor, match, count):
                    total_num = 0
                    r = redis.Redis(host, port)
                    rs = r.scan(cursor, match, count)
                    next_num = rs[0]
                    key_list = []
                    value_list = []
                    for k in rs[1]:
                        key_list.append(k)
                        total_num += 1
                    # print key_list
                    print total_num
                    values = redisdb.get_values_batch_keys(key_list)
                    for v in values:
                        value_list.append((v, ''))
                    print value_list

                    sql = insert_news_seed_internally_queue_items_sql
                    d.executemany(sql, value_list)
                    redisdb.delete_batch(rs[1])
                    if next_num == 0:
                        return total_num
                    return total_num + scan_limit_to_queue_table(host, port, next_num, match, count)

                total_num = 0
                for ip in ips:
                    total_num += scan_limit_to_queue_table(ip, port, 0, 'down:*', 10)

                print '======'
                print total_num
                total_all += total_num
                if total_num > 0:
                    break
                redisdb.release(lock_key)
            else:
                rl.info('another worker is processing, waiting')
                time.sleep(0.3)

        endtime = time.clock()
        # total elapsed time
        worksec = int(round((endtime - starttime)))
        rl.info('push seed_internally queue finish,total items %s,action time %s\'s' % (total_all, worksec))
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        redisdb.release(lock_key)
        d.close()
def __init__(self, limit, fail_times):
    self.limit = limit
    self.fail_times = fail_times
    self.rl = LogUtil().get_logger('NewsFindProducer', 'NewsFindProducer')
def __init__(self, limit, pro_flag, fail_times, queue_name):
    self.limit = limit
    self.fail_times = fail_times
    self.pro_flag = pro_flag
    self.queue_name = queue_name
    self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)
def __init__(self, url, param, queue_id):
    ConsumerAction.__init__(self)
    self.url = url[:-1] if url.endswith('/') else url
    self.param = param
    self.queue_id = queue_id
    self.rl = LogUtil().get_logger('NewsFindConsumer', 'NewsFindConsumer')
class NewsFindConsumer(ConsumerAction):
    def __init__(self, url, param, queue_id):
        ConsumerAction.__init__(self)
        self.url = url[:-1] if url.endswith('/') else url
        self.param = param
        self.queue_id = queue_id
        self.rl = LogUtil().get_logger('NewsFindConsumer', 'NewsFindConsumer')

    def action(self):
        is_success = True
        t = TimeUtil()
        u = Util()
        hu = HtmlUtil()
        r = RequestUtil()
        redis_util = RedisUtill()

        redis_dict_values = {}
        redis_dict_keys = []

        in_values = []
        ex_values = []
        a_href = ''
        main_md5 = u.get_md5(self.url)
        update_time = t.get_timestamp()
        print update_time
        create_time = update_time
        create_day = int(t.now_day().replace('-', ''))
        create_hour = int(t.now_hour())
        try:
            html = r.http_get_phandomjs(self.url)
            domain = hu.get_url_domain(self.url)

            soup = BeautifulSoup(html, 'lxml')
            a_docs = soup.find_all("a")
            a_set = set()
            a_param = {}
            out_json_srt = ''
            status = 0
            host = hu.get_url_host(self.url)

            for a in a_docs:
                a_href = hu.get_format_url(a, host)
                a_title = a.get_text().strip()
                if a_href == '' or a_title == '':
                    continue
                if a_set.__contains__(a_href):
                    continue
                a_set.add(a_href)

                req = urllib2.Request(url=a_href)
                a_host = req.get_host() if req.get_host() is not None else ''
                a_md5 = u.get_md5(a_href)

                if a_title != '':
                    a_param['title'] = a_title
                    out_json_srt = json.dumps(a_param, ensure_ascii=False)

                a_xpath = hu.get_dom_parent_xpath_js(a)
                insert_values = (main_md5, domain, host, a_md5, a_host, a_xpath,
                                 create_time, create_day, create_hour, update_time, status,
                                 MySQLdb.escape_string(self.url),
                                 MySQLdb.escape_string(a_href),
                                 MySQLdb.escape_string(a_title),
                                 out_json_srt)
                # print insert_values

                if a_host.__contains__(domain):
                    in_values.append(insert_values)
                    dict_exist_key = "exist:%s" % a_md5
                    redis_dict_values[dict_exist_key] = a_href
                    redis_dict_keys.append(dict_exist_key)
                else:
                    ex_values.append(insert_values)

            in_table = 'hainiu_web_seed_internally'
            ex_table = 'hainiu_web_seed_externally'
            insert_sql = """
            insert into <table> (md5,domain,host,a_md5,a_host,a_xpath,create_time,create_day,create_hour,update_time,status,url,a_url,a_title,param)
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE update_time=values(update_time);
            """
            try:
                d = DBUtil(config._HAINIU_DB)
                # set the session character set to utf8mb4
                d.execute_no_commit("set NAMES utf8mb4;")
                if in_values.__len__() != 0:
                    sql = insert_sql.replace('<table>', in_table)
                    d.executemany_no_commit(sql, in_values)

                    # look up the exist:a_md5 keys in redis to get the urls they map to
                    redis_exist_values = redis_util.get_values_batch_keys(redis_dict_keys)
                    # turn the urls that already exist back into exist:a_md5 keys
                    redis_exist_keys = ["exist:%s" % u.get_md5(rev) for rev in redis_exist_values if rev != None]
                    # for rows not yet in redis, create both down:a_md5 and exist:a_md5 keys
                    redis_dict_down_values = {}
                    for key, value in redis_dict_values.items():
                        if key not in redis_exist_keys:
                            redis_dict_down_values["down:%s" % u.get_md5(value)] = value
                            redis_dict_down_values[key] = value
                    if redis_dict_down_values.__len__() != 0:
                        redis_util.set_batch_datas(redis_dict_down_values)

                if ex_values.__len__() != 0:
                    sql = insert_sql.replace('<table>', ex_table)
                    d.executemany_no_commit(sql, ex_values)
                d.commit()
            except:
                is_success = False
                self.rl.exception()
                self.rl.error(sql)
                d.rollback()
            finally:
                d.close()
        except:
            is_success = False
            self.rl.exception()
        finally:
            r.close_phandomjs()

        return super(self.__class__, self).result(is_success, [
            main_md5, self.url, a_href, in_values.__len__(), ex_values.__len__(), self.queue_id
        ])

    def success_action(self, values):
        delete_sql = """
        delete from hainiu_queue where id=%s;
        """
        update_hainiu_news_seed_sql = """
        update hainiu_web_seed set last_crawl_internally=%s,last_crawl_externally=%s,last_crawl_time=now() where md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[5]
            sql = delete_sql % id
            # TODO: do not delete from the queue table while testing
            d.execute_no_commit(sql)

            sql = update_hainiu_news_seed_sql % (values[3], values[4], values[0])
            d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
        update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql_1 = """
        update hainiu_queue set type=1 where id=%s;
        """
        update_hainiu_news_seed_sql = """
        update hainiu_web_seed set fail_times=fail_times+1,fail_ip="%s" where md5="%s";
        """
        try:
            d = DBUtil(config._HAINIU_DB)
            id = values[5]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)

            main_md5 = values[0]
            sql = update_hainiu_news_seed_sql % (ip, main_md5)
            d.execute_no_commit(sql)

            # when the maximum retry count is reached, set the queue item back to type=1
            if (self.current_retry_times == Consumer._MAX_RETRY_TIMES):
                sql = update_sql_1 % (id)
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
            d.commit()
        finally:
            d.close()