def action(self):
    is_success = True
    try:
        # Consume step: take the link handed over from hainiu_web_seed and push it
        # into hainiu_queue to be crawled.
        # On success the hainiu_web_seed status is reset to 0 so the seed will be
        # crawled again in the next hour.
        print self.ac, self.params, self.id
        time.sleep(5)
        insert_sql = """
        insert into hainiu_queue(type,params,action) values (0,'%s','%s');
        """
        update_queue_sql = """
        update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
        """
        rl = LogUtil().get_base_logger()
        try:
            print "entered consumer thread"
            db = DBUtil(config._OGC_DB)
            print insert_sql
            print self.params, self.ac
            sql = insert_sql % (self.params, self.ac)
            print sql
            db.execute(sql)
        except:
            rl.exception()
            rl.error(insert_sql)
            rl.error(update_queue_sql)
            db.rollback()
        finally:
            db.close()
    except:
        is_success = False
        self.rl.exception()
    return super(self.__class__, self).result(is_success, [self.id, self.ac, self.params])
def fail_action(self, values):
    print "come in fail_action"
    # On failure, increment fail_times and, once the retry limit is hit,
    # restore the record type to 2.
    update_sql = """
    update hainiu_queue set fail_times=fail_times+1,fail_ip='%s' where id=%s;
    """
    update_sql1 = """
    update hainiu_queue set type=2 where id=%s
    """
    try:
        d = DBUtil(config._OGC_DB)
        id = values[0]
        u = Util()
        ip = u.get_local_ip()
        sql = update_sql % (ip, id)
        d.execute_no_commit(sql)
        # This machine has used up its retry attempts; give the record up
        # so it is no longer worked on here.
        if self.try_num == Consumer._WORK_TRY_NUM:
            sql = update_sql1 % id
            d.execute_no_commit(sql)
        d.commit()
    except:
        self.rl.error(sql)
        self.rl.exception()
    finally:
        d.close()
def success_action(self, values):
    # On success the record should be deleted from hainiu_queue. The status-update
    # variant below was only used to make testing easier and was replaced by the delete.
    # update_queue_sql = """
    # update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
    # """
    # try:
    #     sql = update_queue_sql % (TimeUtil().now_time(), self.id)
    #     db = DBUtil(config._OGC_DB)
    #     db.execute(sql)
    # except:
    #     self.rl.exception()
    #     self.rl.error(sql)
    #     db.rollback()
    # finally:
    #     db.close()
    delete_queue_sql = """
    delete from hainiu_queue where id in (%s);
    """
    try:
        sql = delete_queue_sql % values[0]
        db = DBUtil(config._OGC_DB)
        db.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        db.rollback()
    finally:
        db.close()
def queue_items(self):
    select_queue_sql = """
    select id,action,params from hainiu_queue where type=1 and is_work=0
    and fail_times<=%s limit 0,%s for update;
    """
    update_queue_sql = """
    update hainiu_queue set is_work=1 where id in (%s);
    """
    return_list = []
    try:
        d = DBUtil(config._OGC_DB)
        sql = select_queue_sql % (self.fail_times, self.limit)
        select_dict = d.read_dict(sql)
        query_ids = []
        for record in select_dict:
            id = record['id']
            action = record['action']
            params = record['params']
            query_ids.append(str(id))
            c = OGCConsumer(id, action, params)
            return_list.append(c)
        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % ids
            d.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
    return return_list
def action(self):
    is_success = True
    try:
        # Consume step: crawl the url handed over from hainiu_queue, record the result
        # in hainiu_web_page, save the page to a local file and push it to kafka.
        r = RequestUtil()
        hu = HtmlUtil()
        u = Util()
        f = FileUtil()
        t = TimeUtil()
        db = DBUtil(config._OGC_DB)
        html = r.http_get_phandomjs(self.url)
        r.close_phandomjs()
        charset = hu.get_doc_charset(etree.HTML(html))
        html = html.decode(charset).encode(sys.getfilesystemencoding())
        title = get_title(html).decode(sys.getfilesystemencoding())
        html_string = str(html).replace('\n', '').replace('\r\n', '')
        md5_html_string = u.get_md5(html_string)
        base_path = config._LOCAL_DATA_DIR % os.sep + 'done'
        file_path = config._LOCAL_DATA_DIR % os.sep + 'done' + os.sep + md5_html_string
        # write the page to a local file: "<md5>\001<html>"
        f.create_path(base_path)
        f.write_file_content(file_path, md5_html_string + "\001" + html_string)
        # push the page to kafka
        kafka_util = KafkaUtil(config._KAFKA_CONFIG)
        kafka_util.push_message(html_string)
        try:
            # record the crawl result in hainiu_web_page
            insert_web_page_sql = """
            insert into hainiu_web_page (url,md5,param,domain,host,title,create_time,
            create_day,create_hour,update_time)
            values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s");
            """
            create_time = int(t.str2timestamp(t.now_time()))
            create_day = int(t.now_day().replace("-", ""))
            create_hour = int(t.now_hour())
            update_time = int(t.str2timestamp(t.now_time()))
            sql = insert_web_page_sql % (
                self.url, md5_html_string, "{title:" + self.params + "}",
                get_fld(self.url), hu.get_url_host(self.url), title,
                create_time, create_day, create_hour, update_time)
            db.execute(sql)
        except:
            self.rl.exception()
            self.rl.error(sql)
            db.rollback()
        finally:
            db.close()
    except:
        is_success = False
        self.rl.exception()
    return super(self.__class__, self).result(is_success, [self.id, self.url, self.params])
def push_queue_items():
    rl = LogUtil().get_base_logger()
    page_size = 10
    insert_seed_sql = """
    insert into hainiu_queue(type,params,action) values (0,%s,%s);
    """
    count_seed_sql = """
    select count(1) from hainiu_web_seed;
    """
    select_seed_sql = """
    select id,url,category,last_crawl_time from hainiu_web_seed
    where status=0 limit %s,%s for update;
    """
    update_queue_sql = """
    update hainiu_web_seed set last_crawl_time='%s' where id in (%s);
    """
    t = TimeUtil()
    try:
        d = DBUtil(config._OGC_DB)
        queue_total = d.read_one(count_seed_sql)[0]
        page_num = queue_total / page_size + 1
        query_ids = []
        print page_num, page_size
        for i in range(0, page_num):
            sql = select_seed_sql % (i * page_size, page_size)
            select_list = d.read_tuple(sql)
            insert_list = []
            for record in select_list:
                id = record[0]
                url = record[1]
                category = record[2]
                last_crawl_time = record[3]
                # Only seeds that were never crawled, or whose last crawl is more
                # than one hour old, are pushed back into the queue.
                if last_crawl_time is None or \
                        int(t.str2timestamp(str(last_crawl_time)[:13], '%Y-%m-%d %H')) <= \
                        int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'), format='%Y-%m-%d %H')):
                    insert_list.append((category, url))
                    query_ids.append(str(id))
            d.executemany(insert_seed_sql, insert_list)
        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % (t.now_time(), ids)
            print t.now_time(), ids
            d.execute(sql)
    except:
        rl.exception()
        rl.error(sql)
        d.rollback()
    finally:
        d.close()
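# A minimal driver sketch (an assumption, not part of the project): since
# push_queue_items() only re-queues seeds whose last_crawl_time is at least an
# hour old, running it once per hour is enough for a single-machine test.
if __name__ == '__main__':
    import time
    while True:
        push_queue_items()
        time.sleep(60 * 60)  # wake up once an hour and refill hainiu_queue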
def success_action(self, values):
    print "success"
    update_queue_sql = """
    update hainiu_web_seed set status=0,last_crawl_time='%s' where id in (%s);
    """
    try:
        sql = update_queue_sql % (TimeUtil().now_time(), values[0])
        db = DBUtil(config._OGC_DB)
        db.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        db.rollback()
    finally:
        db.close()
def success_action(self, values):
    # On success, delete the record from hainiu_queue.
    delete_queue_sql = """
    delete from hainiu_queue where id in (%s);
    """
    try:
        sql = delete_queue_sql % values[0]
        db = DBUtil(config._OGC_DB)
        db.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        db.rollback()
    finally:
        db.close()
def success_action(self, values):
    delete_sql = """
    delete from hainiu_queue where id=%s
    """
    try:
        d = DBUtil(config._OGC_DB)
        id = values[0]
        sql = delete_sql % id
        d.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
def queue_items(self):
    ip = Util().get_local_ip()
    select_seed_sql = """
    select id,url,category,domain,host,last_crawl_time from hainiu_web_seed
    where fail_times<=%s and locate('%s',fail_ip)=0 and status=0 limit 0,%s for update;
    """
    update_queue_sql = """
    update hainiu_web_seed set status=1,last_crawl_time='%s' where id in (%s);
    """
    return_list = []
    try:
        d = DBUtil(config._OGC_DB)
        sql = select_seed_sql % (self.fail_times, ip, self.limit)
        select_dict = d.read_dict(sql)
        # print select_dict
        query_ids = []
        t = TimeUtil()
        for each in select_dict:
            id = each['id']
            url = each['url']
            category = each['category']
            domain = each['domain']
            host = each['host']
            last_crawl_time = each['last_crawl_time']
            # Only seeds that were never crawled, or whose last crawl is more
            # than one hour old, are handed to a consumer.
            if last_crawl_time is None or \
                    int(t.str2timestamp(str(last_crawl_time)[:13], '%Y-%m-%d %H')) <= \
                    int(t.str2timestamp(t.get_dif_time(hour=-1, format='%Y-%m-%d %H'), format='%Y-%m-%d %H')):
                query_ids.append(str(id))
                action = url
                params = category
                c = NewsFindActionConsumer(id, action, params)
                return_list.append(c)
        if query_ids:
            ids = ','.join(query_ids)
            sql = update_queue_sql % (t.now_time(), ids)
            print t.now_time(), ids
            d.execute(sql)
    except:
        self.rl.exception()
        self.rl.error(sql)
        d.rollback()
    finally:
        d.close()
    return return_list
def create_seed():
    url = "http://www.autohome.com.cn/all"
    category = "汽车"
    sql = """
    insert into hainiu_web_seed (url,md5,domain,host,category,status)
    values ('%s','%s','%s','%s','%s',0)
    """
    hu = HtmlUtil()
    domain = get_tld(url)
    host = hu.get_url_host(url)
    u = Util()
    md5 = u.get_md5(url)
    rl = LogUtil().get_base_logger()
    try:
        d = DBUtil(config._OGC_DB)
        sql = sql % (url, md5, domain, host, category)
        d.execute(sql)
    except:
        rl.exception()
        d.rollback()
    finally:
        d.close()
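# Usage sketch (hypothetical, assuming this module is run directly): create_seed()
# only inserts the single hard-coded autohome seed, so run it once to bootstrap
# hainiu_web_seed and let push_queue_items() pick the row up on its next pass.
if __name__ == '__main__':
    create_seed()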
def queue_items(self): ip = Util().get_local_ip() select_queue_sql = """ select id,action,params from hainiu_queue where type=0 and fail_times<=%s and locate('%s',fail_ip)=0 limit 0,%s for update; """ #type=1意思是url已经分配给消费者了 update_queue_sql = """ update hainiu_queue set type=1 where id in (%s); """ return_list = [] try: d = DBUtil(config._OGC_DB) sql = select_queue_sql % (self.fail_times, ip, self.limit) select_dict = d.read_dict(sql) print select_dict query_ids = [] for each in select_dict: id = each['id'] url = each['action'] category = each['params'] query_ids.append(str(id)) c = NewsFindQueueConsumer(id, url, category) return_list.append(c) if query_ids: ids = ','.join(query_ids) sql = update_queue_sql % ids d.execute(sql) except: self.rl.exception() self.rl.error(sql) d.rollback() finally: d.close() return return_list
def queue_items(self): ip = Util().get_local_ip() select_queue_sql = """ select id,action,params from hainiu_queue where fail_times<=%s and locate('%s',fail_ip)=0 and type=2 limit 0,%s for update; """ #type=3 已被消费者进程拿取过了 update_queue_sql = """ update hainiu_queue set type=3 where id in (%s); """ return_list = [] try: d = DBUtil(config._OGC_DB) sql = select_queue_sql % (self.fail_times, ip, self.limit) select_dict = d.read_dict(sql) query_ids = [] t = TimeUtil() for each in select_dict: id = each['id'] action = each['action'] params = each['params'] query_ids.append(str(id)) c = DownloadActionConsumer(id, action, params) return_list.append(c) if query_ids: ids = ','.join(query_ids) sql = update_queue_sql % ids d.execute(sql) except: self.rl.exception() self.rl.error(sql) d.rollback() finally: d.close() return return_list
def call_beautiful(url):
    '''
    Given a url, fetch the page and extract every <a> link. Internal links are
    recorded in redis and hainiu_web_seed_internally; external links are only
    recorded in hainiu_web_seed_externally.
    :param url:
    :return:
    '''
    # url = 'http://roll.news.qq.com'
    r = RequestUtil()
    hu = HtmlUtil()
    t = TimeUtil()
    html = r.http_get_phandomjs(url)
    charset = hu.get_doc_charset(etree.HTML(html))
    domain = get_fld(url)
    host = hu.get_url_host(url)
    u = Util()
    rl = LogUtil().get_base_logger()
    print "domain:", domain, ":host:", host
    soup = BeautifulSoup(html, 'lxml')
    a_docs = soup.find_all("a")
    for a in a_docs:
        a_href = get_format_url(url, a, host, charset)
        if a_href and a.text:
            print a.text
            print a_href
            xpath = hu.get_dom_parent_xpath_js(a)
            create_time = int(t.str2timestamp(t.now_time()))
            create_day = int(t.now_day().replace("-", ""))
            create_hour = int(t.now_hour())
            update_time = int(t.str2timestamp(t.now_time()))
            if get_fld(a_href) == domain:
                print a_href
                # Internal link: record it in redis ...
                redis_conn = RedisUtil().get_conn()
                redis = RedisUtil()
                key1 = "exist:" + u.get_md5(a_href)
                print redis_conn.keys(key1)
                if not redis_conn.keys(key1):
                    key2 = "down:" + u.get_md5(a_href)
                    dicts = {key1: a_href, key2: a_href}
                    redis.set_batch_datas(dicts)
                    # ... and store it in mysql (hainiu_web_seed_internally) as well
                    try:
                        db = DBUtil(config._OGC_DB)
                        insert_internal_sql = """
                        insert into hainiu_web_seed_internally (url,md5,param,domain,host,a_url,a_md5,
                        a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                        values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")
                        on duplicate key update update_time=update_time+1;
                        """
                        sql = insert_internal_sql % (
                            url, u.get_md5(url), "{title:" + a.text + "}", domain, host,
                            a_href, u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                            create_time, create_day, create_hour, update_time)
                        db.execute(sql)
                    except:
                        rl.exception()
                        rl.error(sql)
                        db.rollback()
                    finally:
                        db.close()
            else:
                # External link: only written to mysql, this part is never crawled.
                db = DBUtil(config._OGC_DB)
                insert_external_sql = """
                insert into hainiu_web_seed_externally (url,md5,param,domain,host,a_url,a_md5,
                a_host,a_xpath,a_title,create_time,create_day,create_hour,update_time)
                values("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s")
                on duplicate key update update_time=update_time+1;
                """
                sql = insert_external_sql % (
                    url, u.get_md5(url), a.text, domain, host,
                    a_href, u.get_md5(a_href), hu.get_url_host(a_href), xpath, a.text,
                    create_time, create_day, create_hour, update_time)
                try:
                    db.execute(sql)
                except:
                    rl.exception()
                    rl.error(sql)
                    db.rollback()
                finally:
                    db.close()
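# Hypothetical smoke test (the seed url is the same one create_seed() inserts):
# fetch one listing page and write its internal/external links away.
if __name__ == '__main__':
    call_beautiful('http://www.autohome.com.cn/all')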