def push_queue_items():
    insert_sql = """
        insert into hainiu_queue(type,params,action) values (1,%s,%s);
    """
    count_sql = """
        select count(1) from hainiu_web_seed;
    """
    select_sql = """
        select url,category from hainiu_web_seed limit %s,%s;
    """
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._OGC_DB)
        sql = count_sql
        queue_total = d.read_one(sql)[0]
        print "queue total", queue_total
        page_size = 1
        # ceiling division so a partial last page is not skipped
        page = (queue_total + page_size - 1) / page_size
        for i in range(0, page):
            sql = select_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            print "page", i
            insert_list = []
            for record in select_list:
                url = record[0]
                category = record[1]
                insert_list.append((category, url))
                print url, category
            d.executemany(insert_sql, insert_list)
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
class KafkaUtil:
    __kafka_connect_cache = {}
    __lock = threading.Lock()

    def __init__(self, kafka_conf):
        host_list = kafka_conf['HOST'].split(',')
        random.shuffle(host_list)
        host_str = ",".join(host_list)
        self.cache_key = "_".join((host_str, kafka_conf['TOPIC']))
        self.host = host_str
        self.topic = kafka_conf['TOPIC']
        self.rl = LogUtil().get_logger('consumer', 'consumer_kafka')

    def push_message(self, message):
        self.__lock.acquire()
        is_success = True
        try:
            u = Util()
            producer = u.get_dict_value(self.__kafka_connect_cache, self.cache_key)
            if producer is None:
                client = KafkaClient(hosts=self.host)
                topic = client.topics[self.topic]
                producer = topic.get_producer()
                self.__kafka_connect_cache[self.cache_key] = producer
            producer.produce(message)
        except:
            is_success = False
            # drop the broken producer so the next call reconnects
            self.__kafka_connect_cache.pop(self.cache_key, None)
            self.rl.error('kafka push error cacheKey is %s' % (self.cache_key))
            self.rl.exception()
        finally:
            # release in finally so an exception cannot leave the lock held
            self.__lock.release()
        return is_success
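# Usage sketch (illustrative, not from the source): DownloadActionConsumer later
# in this section constructs KafkaUtil with config._KAFKA_CONFIG, so the config
# is assumed to be a dict carrying 'HOST' (comma-separated broker list) and
# 'TOPIC'. push_message() caches one producer per host/topic pair and evicts it
# from the cache on failure, so simply retrying rebuilds the connection.
def _demo_push_kafka(message):
    kafka_util = KafkaUtil(config._KAFKA_CONFIG)
    if not kafka_util.push_message(message):
        # the first failure evicted the cached producer; a retry reconnects
        return kafka_util.push_message(message)
    return True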
class Producer(threading.Thread):
    def __init__(self, queue, action, name, max_num, sleep_time,
                 work_sleep_time, work_try_num):
        super(self.__class__, self).__init__()
        self.queue = queue
        self.action = action
        self.name = name
        self.max_num = max_num
        self.sleep_time = sleep_time
        self.work_sleep_time = work_sleep_time
        self.work_try_num = work_try_num
        self.rl = LogUtil().get_logger("producer", "producer" + self.name)
        if not isinstance(action, base_producer_action.ProducerAction):
            raise Exception("action does not extend ProducerAction")

    def run(self):
        action_list = []
        while True:
            try:
                start_time = time.clock()
                if len(action_list) == 0:
                    action_list = self.action.queue_items()
                total_items = len(action_list)
                self.rl.info('get queue %s total items is %s' % (self.name, total_items))
                while True:
                    if len(action_list) == 0:
                        break
                    # only feed the queue while the consumers are keeping up
                    unfinished_tasks = self.queue.unfinished_tasks
                    if unfinished_tasks <= self.max_num:
                        action = action_list.pop()
                        self.queue.put(action)
                end_time = time.clock()
                work_time = int(round(end_time - start_time))
                work_mins = work_time / 60
                self.rl.info('put queue %s total items is %s, total time is %s\'s, (at %s items per min)' %
                             (self.name, total_items, work_time,
                              int(total_items) if work_mins == 0
                              else round(float(total_items) / work_mins, 2)))
                time.sleep(self.sleep_time)
            except:
                self.rl.exception()

    def start_work(self):
        # spin up max_num consumer threads, give them a head start, then start producing
        for i in range(0, self.max_num):
            qc = queue_consumer.Consumer(self.queue, self.name + "_" + str(i),
                                         self.work_sleep_time, self.work_try_num)
            qc.start()
        time.sleep(5)
        self.start()
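# Wiring sketch (parameter values are illustrative, not from the source):
# start_work() launches max_num Consumer threads against the shared queue,
# waits 5 seconds, then starts the producer loop that keeps the queue topped
# up from action.queue_items(). Any ProducerAction subclass works here;
# NewsFindQueueProducer is defined later in this section.
import Queue

def _demo_start_pipeline():
    work_queue = Queue.Queue()
    action = NewsFindQueueProducer(limit=5, fail_times=3)
    p = Producer(work_queue, action, 'find', max_num=5,
                 sleep_time=5, work_sleep_time=2, work_try_num=3)
    p.start_work()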
class NewsFindActionProducer(ProducerAction):
    def __init__(self, limit, fail_times):
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        ip = Util().get_local_ip()
        select_seed_sql = """
            select id,url,category,domain,host,last_crawl_time from hainiu_web_seed
            where fail_times<=%s and locate('%s',fail_ip)=0 and status=0
            limit 0,%s for update;
        """
        update_queue_sql = """
            update hainiu_web_seed set status=1,last_crawl_time='%s' where id in (%s);
        """
        return_list = []
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_seed_sql % (self.fail_times, ip, self.limit)
            select_dict = d.read_dict(sql)
            query_ids = []
            t = TimeUtil()
            for each in select_dict:
                id = each['id']
                url = each['url']
                category = each['category']
                domain = each['domain']
                host = each['host']
                last_crawl_time = each['last_crawl_time']
                # take seeds that were last crawled more than an hour ago, or never crawled at all
                if last_crawl_time is None or \
                        int(t.str2timestamp(str(last_crawl_time)[:13], '%Y-%m-%d %H')) <= \
                        int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'),
                                            format='%Y-%m-%d %H')):
                    query_ids.append(str(id))
                    action = url
                    params = category
                    c = NewsFindActionConsumer(id, action, params)
                    return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % (t.now_time(), ids)
                print t.now_time(), ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
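# The freshness test above truncates both timestamps to the hour
# ('%Y-%m-%d %H', i.e. the first 13 characters of 'YYYY-mm-dd HH:MM:SS') and
# picks a seed when its last crawl hour is at or before the previous hour.
# A stdlib-only sketch of the same rule (assuming TimeUtil behaves this way):
import time

def crawled_before_last_hour(last_crawl_time):
    if last_crawl_time is None:
        return True
    last_hour = int(time.mktime(time.strptime(str(last_crawl_time)[:13], '%Y-%m-%d %H')))
    prev_hour = int(time.mktime(time.strptime(
        time.strftime('%Y-%m-%d %H', time.localtime(time.time() - 3600)), '%Y-%m-%d %H')))
    return last_hour <= prev_hour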
def push_queue_items():
    rl = LogUtil().get_base_logger()
    page_size = 10
    insert_seed_sql = """
        insert into hainiu_queue(type,params,action) values (0,%s,%s);
    """
    count_seed_sql = """
        select count(1) from hainiu_web_seed;
    """
    select_seed_sql = """
        select id,url,category,last_crawl_time from hainiu_web_seed
        where status=0 limit %s,%s for update;
    """
    update_queue_sql = """
        update hainiu_web_seed set last_crawl_time='%s' where id in (%s);
    """
    t = TimeUtil()
    try:
        d = DBUtil(config._OGC_DB)
        sql = count_seed_sql
        queue_total = d.read_one(count_seed_sql)[0]
        # ceiling division so a partial last page is not skipped
        page_num = (queue_total + page_size - 1) / page_size
        query_ids = []
        print page_num, page_size
        for i in range(0, page_num):
            sql = select_seed_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            insert_list = []
            for record in select_list:
                id = record[0]
                url = record[1]
                category = record[2]
                last_crawl_time = record[3]
                # take seeds that were last crawled more than an hour ago, or never crawled at all
                if last_crawl_time is None or \
                        int(t.str2timestamp(str(last_crawl_time)[:13], '%Y-%m-%d %H')) <= \
                        int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'),
                                            format='%Y-%m-%d %H')):
                    insert_list.append((category, url))
                    query_ids.append(str(id))
            d.executemany(insert_seed_sql, insert_list)
        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % (t.now_time(), ids)
            print t.now_time(), ids
            d.execute(sql)
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
def push_queue_items():
    redis = RedisUtil()
    rl = LogUtil().get_base_logger()
    # queue type 2: a to-download link pulled from redis;
    # type 3: a consumer has already taken it for download
    insert_queue_sql = """
        insert into hainiu_queue(type,params,action) values (2,'from redis','%s');
    """
    try:
        db = DBUtil(config._OGC_DB)
        redis_len = len(redis.get_conn().keys())
        page_size = 10
        # ceiling division so the last partial page is scanned too
        page_num = (redis_len + page_size - 1) / page_size
        for i in range(0, page_num):
            redis_key = redis.kyes_limit_scan(pattern="down*", limit=page_size * (i + 1), cursor=0)
            if len(redis_key) != 0:
                redis_value = redis.get_values_batch_keys(redis_key)
                for each in redis_value:
                    print each
                    sql = insert_queue_sql % (each)
                    db.execute_no_commit(sql)
                db.commit()
                # delete the consumed keys so later scans do not re-read them
                redis.delete_batch(redis_key)
        # non-paginated alternative: fetch every "down*" key at once
        # redis_key = redis.get_conn().keys("down*")
        # if len(redis_key) != 0:
        #     redis_value = redis.get_values_batch_keys(redis_key)
        #     for each in redis_value:
        #         sql = insert_queue_sql % (each[5:])
        #         db.execute_no_commit(sql)
        #     db.commit()
        #     redis.delete_batch(redis_key)
    except:
        rl.exception()
        rl.error(insert_queue_sql)
        db.rollback()
    finally:
        db.close()
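# Key scheme recap (taken from call_beautiful later in this section): every
# newly found internal link is stored under two string keys,
#   "exist:<md5(url)>"  - permanent dedup marker
#   "down:<md5(url)>"   - pending-download marker, deleted once pushed to hainiu_queue
# so the function above only drains the "down:*" keys while "exist:*" keeps a
# link from being queued twice. A minimal sketch with redis-py directly
# (assumes a local redis; RedisUtil is the project's wrapper around this):
import redis as redis_py

def _demo_mark_link(a_href, md5):
    conn = redis_py.StrictRedis()
    if not conn.keys("exist:" + md5):
        conn.mset({"exist:" + md5: a_href, "down:" + md5: a_href})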
class NewsFindQueueProducer(ProducerAction):
    def __init__(self, limit, fail_times):
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        ip = Util().get_local_ip()
        select_queue_sql = """
            select id,action,params from hainiu_queue
            where type=0 and fail_times<=%s and locate('%s',fail_ip)=0
            limit 0,%s for update;
        """
        # type=1: the url has already been handed to a consumer
        update_queue_sql = """
            update hainiu_queue set type=1 where id in (%s);
        """
        return_list = []
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_queue_sql % (self.fail_times, ip, self.limit)
            select_dict = d.read_dict(sql)
            print select_dict
            query_ids = []
            for each in select_dict:
                id = each['id']
                url = each['action']
                category = each['params']
                query_ids.append(str(id))
                c = NewsFindQueueConsumer(id, url, category)
                return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
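# Assumed hainiu_queue.type lifecycle, pieced together from the snippets in
# this section (the source never states it in one place):
#   0 - seed url waiting for a news-find consumer
#   1 - taken by a news-find consumer (the update above)
#   2 - downloadable link imported from redis, waiting for a download consumer
#   3 - taken by a download consumer (see DownloadActionProducer below)
HAINIU_QUEUE_TYPES = {
    0: 'news-find task waiting',
    1: 'news-find task taken',
    2: 'download task waiting',
    3: 'download task taken',
}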
class DownloadActionProducer(ProducerAction):
    def __init__(self, limit, fail_times):
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger("producer", "producer" + queue_name)

    def queue_items(self):
        ip = Util().get_local_ip()
        select_queue_sql = """
            select id,action,params from hainiu_queue
            where fail_times<=%s and locate('%s',fail_ip)=0 and type=2
            limit 0,%s for update;
        """
        # type=3: already taken by a download consumer process
        update_queue_sql = """
            update hainiu_queue set type=3 where id in (%s);
        """
        return_list = []
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_queue_sql % (self.fail_times, ip, self.limit)
            select_dict = d.read_dict(sql)
            query_ids = []
            for each in select_dict:
                id = each['id']
                action = each['action']
                params = each['params']
                query_ids.append(str(id))
                c = DownloadActionConsumer(id, action, params)
                return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
def send_sms(self, content, phone=config._ALERT_PHONE):
    """Send an alert SMS to the given phone with the given content."""
    l = LogUtil().get_base_logger()
    try:
        send_url = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % (phone)
        # the SMS gateway expects GBK-encoded content
        send_url += urllib.urlencode(
            {'content': content.decode('utf-8').encode('gbk')})
        r = urllib2.urlopen(send_url).read()
        if '0-OK' != r:
            l.error("SMS send failed, server returned status: %s, phone: %s, content: %s"
                    % (r, phone, content))
            return False
    except:
        l.exception()
        return False
    return True
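# Standalone sketch of the same query-string construction (the host and the
# 'command' value are copied verbatim from send_sms above; the phone number
# is illustrative). urlencode percent-escapes the GBK bytes for the gateway.
import urllib

def _demo_sms_url(phone, content):
    base = 'http://send.sms.hainiu.com:8080/s?command=cralwer&phone=%s&' % phone
    return base + urllib.urlencode({'content': content.decode('utf-8').encode('gbk')})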
class OGCProducer(ProducerAction):
    def __init__(self, limit, fail_times):
        super(self.__class__, self).__init__()
        self.limit = limit
        self.fail_times = fail_times
        self.rl = LogUtil().get_logger('producer', 'producer' + queue_name)

    def queue_items(self):
        select_queue_sql = """
            select id,action,params from hainiu_queue
            where type=1 and is_work=0 and fail_times<=%s
            limit 0,%s for update;
        """
        update_queue_sql = """
            update hainiu_queue set is_work=1 where id in (%s);
        """
        return_list = []
        try:
            d = DBUtil(config._OGC_DB)
            sql = select_queue_sql % (self.fail_times, self.limit)
            select_dict = d.read_dict(sql)
            query_ids = []
            for record in select_dict:
                id = record['id']
                action = record['action']
                params = record['params']
                query_ids.append(str(id))
                c = OGCConsumer(id, action, params)
                return_list.append(c)
            if query_ids:
                ids = ','.join(query_ids)
                sql = update_queue_sql % ids
                d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()
        return return_list
def create_seed():
    url = "http://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
        insert into hainiu_web_seed (url,md5,domain,host,category,status)
        values ('%s','%s','%s','%s','%s',0);
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._OGC_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
class OGCConsumer(ConsumerAction):
    def __init__(self, id, ac, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.ac = ac
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            print self.ac, self.params
        except:
            is_success = False
            self.rl.exception()
        # alternative way of building the result tuple
        return super(self.__class__, self).result(is_success, [self.id, self.ac, self.params])

    def success_action(self, values):
        delete_sql = """
            delete from hainiu_queue where id=%s;
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            sql = delete_sql % id
            d.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            d.rollback()
        finally:
            d.close()

    def fail_action(self, values):
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
            update hainiu_queue set is_work=0 where id=%s;
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            # after the last local retry, hand the row back by clearing is_work
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
            d.rollback()
        finally:
            d.close()
class OGCConsumerAction(base_consumer_action.ConsumerAction):
    def __init__(self, text):
        super(self.__class__, self).__init__()
        self.text = text
        self.rl = LogUtil().get_base_logger()

    def action(self):
        result = True
        r_test = ''
        try:
            # the actual consume work goes here
            r_test = "OGC" + str(self.text)
        except:
            result = False
            self.rl.exception()
        return self.result(result, [r_test])

    def fail_action(self, values):
        if self.try_num >= queue_consumer.Consumer._WORK_TRY_NUM:
            pass

    def success_action(self, values):
        pass
class Consumer(threading.Thread):
    _WORK_TRY_NUM = 0

    def __init__(self, queue, name, sleep_time, work_try_num):
        super(self.__class__, self).__init__()
        self.queue = queue
        self.name = name
        self.sleep_time = sleep_time
        self.work_try_num = work_try_num
        Consumer._WORK_TRY_NUM = work_try_num
        self.rl = LogUtil().get_logger(
            'consumer', 'consumer' + self.name[:self.name.find("_")])

    def run(self):
        while True:
            try:
                # queue.get() blocks until an item is available
                action = self.queue.get()
                if not isinstance(action, base_consumer_action.ConsumerAction):
                    raise Exception("action does not extend ConsumerAction")
                sleep_time = random.randint(0, self.sleep_time * 10) * 0.1
                time.sleep(sleep_time)
                action.consumer_thread_name = self.name
                start_time = time.clock()
                re = action.action()
                end_time = time.clock()
                work_time = int(round(end_time - start_time))
                self.rl.info(("queue name %s finish, sleep time %s's, action time %s's, "
                              "action retry %s times, result:%s") %
                             (self.name, sleep_time, work_time, action.try_num,
                              re.__str__() if re is not None else ""))
                # a failed action is re-queued until it has been tried work_try_num times
                if not re[0] and action.try_num < self.work_try_num:
                    action.try_num += 1
                    self.queue.put(action)
                self.queue.task_done()
            except:
                self.rl.exception()
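# End-to-end sketch (illustrative): pairing this Consumer with the simple
# OGCConsumerAction above. A failed action is re-queued until try_num reaches
# work_try_num, then fail_action() fires; queue.join() returns once every
# put() has been matched by a task_done().
import Queue

def _demo_consume_once():
    q = Queue.Queue()
    c = Consumer(q, 'demo_0', sleep_time=1, work_try_num=3)
    c.setDaemon(True)  # don't keep the process alive for the demo thread
    c.start()
    q.put(OGCConsumerAction('hello'))
    q.join()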
class DownloadActionConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger("consumer", "consumer" + queue_name)

    def action(self):
        is_success = True
        try:
            # consume the link handed over from hainiu_queue: download the page,
            # save it to a local file, push it to kafka and record it in hainiu_web_page
            r = RequestUtil()
            hu = HtmlUtil()
            u = Util()
            f = FileUtil()
            t = TimeUtil()
            db = DBUtil(config._OGC_DB)
            html = r.http_get_phandomjs(self.url)
            r.close_phandomjs()
            charset = hu.get_doc_charset(etree.HTML(html))
            html = html.decode(charset).encode(sys.getfilesystemencoding())
            title = get_title(html).decode(sys.getfilesystemencoding())
            html_string = str(html).replace('\n', '').replace('\r\n', '')
            md5_html_string = u.get_md5(html_string)
            base_path = config._LOCAL_DATA_DIR % os.sep + 'done'
            file_path = config._LOCAL_DATA_DIR % os.sep + 'done' + os.sep + md5_html_string
            # write the page to the local archive
            f.create_path(base_path)
            f.write_file_content(file_path, md5_html_string + "\001" + html_string)
            # push the page to kafka
            kafka_util = KafkaUtil(config._KAFKA_CONFIG)
            kafka_util.push_message(html_string)
            try:
                # record the result in hainiu_web_page
                insert_web_page_sql = """
                    insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
                    create_day,create_hour,update_time)
                    values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
                """
                create_time = int(t.str2timestamp(t.now_time()))
                create_day = int(t.now_day().replace("-", ""))
                create_hour = int(t.now_hour())
                update_time = int(t.str2timestamp(t.now_time()))
                sql = insert_web_page_sql % (
                    self.url, md5_html_string, "{title:" + self.params + "}",
                    get_fld(self.url), hu.get_url_host(self.url), title,
                    create_time, create_day, create_hour, update_time)
                db.execute(sql)
            except:
                self.rl.exception()
                self.rl.error(sql)
                db.rollback()
            finally:
                db.close()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__, self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # on success, delete the record from hainiu_queue
        delete_queue_sql = """
            delete from hainiu_queue where id in (%s);
        """
        try:
            sql = delete_queue_sql % values[0]
            db = DBUtil(config._OGC_DB)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        print "come in fail_action"
        # on failure, restore type to 2 and bump fail_times
        update_sql = """
            update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
            update hainiu_queue set type=2 where id=%s;
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            # once the per-machine retry limit is hit, hand the row back to the queue
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
            d.rollback()
        finally:
            d.close()
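# The local archive written above is one file per page, named by the html's
# md5 and containing "<md5>\001<html>" (newlines are stripped from the html
# first, so the file is a single record). A matching reader sketch (path
# handling assumed):
def read_done_file(file_path):
    with open(file_path) as f:
        md5_html_string, html_string = f.read().split("\001", 1)
    return md5_html_string, html_string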
class NewsFindActionConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.ac = action
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            # consume the seed handed over from hainiu_web_seed: crawl it for urls
            # and insert them into hainiu_queue; on success reset the seed's status
            # to 0 so it gets crawled again in the next hour
            print self.ac, self.params, self.id
            time.sleep(5)
            insert_sql = """
                insert into hainiu_queue(type,params,action) values (0,'%s','%s');
            """
            update_queue_sql = """
                update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
            """
            rl = LogUtil().get_base_logger()
            try:
                print "entered consumer thread"
                db = DBUtil(config._OGC_DB)
                sql = insert_sql % (self.params, self.ac)
                print sql
                db.execute(sql)
            except:
                rl.exception()
                rl.error(insert_sql)
                rl.error(update_queue_sql)
                db.rollback()
            finally:
                db.close()
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__, self).result(is_success, [self.id, self.ac, self.params])

    def success_action(self, values):
        print "success"
        update_queue_sql = """
            update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        try:
            sql = update_queue_sql % (TimeUtil().now_time(), values[0])
            db = DBUtil(config._OGC_DB)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        update_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        # once the retry limit is hit, reset the seed so other machines can take it
        update_sql1 = """
            update hainiu_web_seed set status=0,last_crawl_time='' where id=%s;
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
            d.rollback()
        finally:
            d.close()
def call_beautiful(url):
    '''
    Fetch the given url, extract every anchor on the page, record internal
    links in redis and hainiu_web_seed_internally, and external links in
    hainiu_web_seed_externally.
    :param url:
    :return:
    '''
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host, charset)
        if a_href and a.text:
            print a.text
            print a_href
            xpath = hu.get_dom_parent_xpath_js(a)
            create_time = int(t.str2timestamp(t.now_time()))
            create_day = int(t.now_day().replace("-", ""))
            create_hour = int(t.now_hour())
            update_time = int(t.str2timestamp(t.now_time()))
            if get_fld(a_href) == domain:
                print a_href
                # internal link: record it in redis
                redis_conn = RedisUtil().get_conn()
                redis = RedisUtil()
                key1 = "exist:" + u.get_md5(a_href)
                print redis_conn.keys(key1)
                if not redis_conn.keys(key1):
                    key2 = "down:" + u.get_md5(a_href)
                    dicts = {key1: a_href, key2: a_href}
                    redis.set_batch_datas(dicts)
                # also persist the link details to mysql
                try:
                    db = DBUtil(config._OGC_DB)
                    insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                        values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")
                        on duplicate key update update_time=update_time+1;
                    """
                    sql = insert_internal_sql % (
                        url, u.get_md5(url), "{title:" + a.text + "}", domain, host,
                        a_href, u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                        create_time, create_day, create_hour, update_time)
                    db.execute(sql)
                except:
                    rl.exception()
                    rl.error(sql)
                    db.rollback()
                finally:
                    db.close()
            else:
                # external link: only recorded in mysql, never crawled
                db = DBUtil(config._OGC_DB)
                insert_external_sql = """
                    insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                    a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                    values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")
                    on duplicate key update update_time=update_time+1;
                """
                sql = insert_external_sql % (
                    url, u.get_md5(url), a.text, domain, host,
                    a_href, u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                    create_time, create_day, create_hour, update_time)
                try:
                    db.execute(sql)
                except:
                    rl.exception()
                    rl.error(sql)
                    db.rollback()
                finally:
                    db.close()
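# get_format_url() is referenced above but not shown in this section. Judging
# by its arguments it resolves an anchor's href against the page url/host and
# handles the page charset; the stand-in below is hypothetical and the real
# helper may differ:
import urlparse

def get_format_url_sketch(page_url, a, host, charset):
    href = a.get('href', '').strip()
    if not href or href.startswith('javascript') or href.startswith('#'):
        return None
    if isinstance(href, unicode):
        href = href.encode(charset, 'ignore')
    return urlparse.urljoin(page_url, href)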
class NewsFindQueueConsumer(ConsumerAction):
    def __init__(self, id, action, params):
        super(self.__class__, self).__init__()
        self.id = id
        self.url = action
        self.params = params
        self.rl = LogUtil().get_logger('consumer', 'consumer' + queue_name)

    def action(self):
        is_success = True
        try:
            # consume the link handed over from hainiu_web_seed: crawl it and put
            # found urls into redis; a link that already exists is skipped, a new
            # one is later inserted into hainiu_queue
            rl = LogUtil().get_base_logger()
            try:
                print "entered consumer thread"
                call_beautiful(self.url)
            except:
                rl.exception()
            finally:
                pass
        except:
            is_success = False
            self.rl.exception()
        return super(self.__class__, self).result(is_success, [self.id, self.url, self.params])

    def success_action(self, values):
        # on success the hainiu_queue row should be deleted; for easier testing
        # we only update the status for now and will switch to delete later
        update_queue_sql = """
            update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        try:
            sql = update_queue_sql % (TimeUtil().now_time(), self.id)
            db = DBUtil(config._OGC_DB)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()

    def fail_action(self, values):
        # on failure, bump fail_times and record the failing ip, and reset the
        # status so other threads can pick the seed up again
        update_sql = """
            update hainiu_web_seed set fail_times=fail_times+1,fail_ip='%s' where id=%s;
        """
        update_sql1 = """
            update hainiu_web_seed set status=0,last_crawl_time='' where id=%s;
        """
        try:
            d = DBUtil(config._OGC_DB)
            id = values[0]
            u = Util()
            ip = u.get_local_ip()
            sql = update_sql % (ip, id)
            d.execute_no_commit(sql)
            if self.try_num == Consumer._WORK_TRY_NUM:
                sql = update_sql1 % id
                d.execute_no_commit(sql)
            d.commit()
        except:
            self.rl.error(sql)
            self.rl.exception()
            d.rollback()
        finally:
            d.close()
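# Overall flow recap, as a runnable wiring sketch (thread counts and timings
# are illustrative, not from the source). Four steps cooperate through mysql
# and redis:
#   1. create_seed / push_queue_items  : hainiu_web_seed -> hainiu_queue (type 0)
#   2. NewsFindQueueProducer/Consumer  : crawl seeds, new internal links -> redis "down:*"
#   3. push_queue_items (redis variant): redis -> hainiu_queue (type 2)
#   4. DownloadActionProducer/Consumer : download pages -> local file + kafka + hainiu_web_page
import Queue

def _demo_run_crawler():
    find_queue = Queue.Queue()
    Producer(find_queue, NewsFindQueueProducer(limit=5, fail_times=3),
             'find', max_num=5, sleep_time=5, work_sleep_time=2, work_try_num=3).start_work()
    down_queue = Queue.Queue()
    Producer(down_queue, DownloadActionProducer(limit=5, fail_times=3),
             'down', max_num=5, sleep_time=5, work_sleep_time=2, work_try_num=3).start_work()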