def crawl(self, url, is_again=True):
    """Fetch *url* through the current proxy and parse a uid/uname from it.

    On any fetch/parse failure the shared proxy is rotated (guarded by the
    module-level lock so concurrent threads don't rotate twice) and the
    crawl is retried once; a second failure pushes *url* onto
    ``self.second_url_queue`` for later processing.

    :param url: page url to fetch
    :param is_again: True on the first attempt; the retry passes False
    :return: parsed uid or uname string, or "" when nothing was parsed
    """
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    uid_or_uname = ""
    try:
        page = craw_object.get_page()
        uid_or_uname = page_parser_from_search_for_uid(page)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        print(traceback.format_exc())
        # Rotate the proxy only if no other thread already rotated it;
        # the lock serializes the check-then-delete.
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
        finally:
            # release in finally so a failing log call cannot leak the lock
            crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        self.second_url_queue.put(url)
        return uid_or_uname
    return uid_or_uname
def crawl(self):
    """Search weibo.cn for real-time, original-only posts matching ``self.keyword``.

    Keeps retrying — rotating the proxy and sleeping a short random time —
    until the search yields a non-empty result list.  The original version
    retried via unbounded recursion; this uses a loop so persistent
    failures cannot overflow the stack, with identical behavior otherwise.

    :return: non-empty list of parsed weibo entries
    """
    # search form: real-time ('sort' = time) and original posts only ('hasori')
    self.data['advancedfilter'] = '1'
    self.data['keyword'] = self.keyword
    self.data['hasori'] = '1'
    # optional form fields left unset: 'nick', 'starttime', 'endtime'
    self.data['sort'] = 'time'
    self.data['smblog'] = '搜索'
    url = 'http://weibo.cn/search/'
    while True:
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)
        weibo_list = []
        try:
            page = craw_object.get_page_with_form(self.data)
            weibo_list = page_parser_from_search(page)
        except Exception:  # narrowed from bare except
            print(traceback.format_exc())
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(
                self.name + " proxy exception , change proxy !")
            time.sleep(int(random.random() * 10))
            continue
        if weibo_list:
            return weibo_list
        # empty result: assume a bad proxy and try again with a fresh one
        loginer.del_proxy()
        WeiboSearchLog().get_scheduler_logger().warning(
            self.name + " get nothing, change proxy !")
        time.sleep(int(random.random() * 10))
def crawl(self, url, is_again=True):
    """Fetch *url* through the current proxy and parse a uid/uname from it.

    On any fetch/parse failure the shared proxy is rotated (guarded by the
    module-level lock so concurrent threads don't rotate twice) and the
    crawl is retried once; a second failure pushes *url* onto
    ``self.second_url_queue`` for later processing.

    :param url: page url to fetch
    :param is_again: True on the first attempt; the retry passes False
    :return: parsed uid or uname string, or "" when nothing was parsed
    """
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    uid_or_uname = ""
    try:
        page = craw_object.get_page()
        uid_or_uname = page_parser_from_search_for_uid(page)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        print(traceback.format_exc())
        # Rotate the proxy only if no other thread already rotated it;
        # the lock serializes the check-then-delete.
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
        finally:
            # release in finally so a failing log call cannot leak the lock
            crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        self.second_url_queue.put(url)
        return uid_or_uname
    return uid_or_uname
def crawl(self, url, is_again=True):
    """Fetch a search-result page and parse it into a list of weibo entries.

    On exception — or an empty result that is not a legitimate "sina has
    nothing" page — the shared proxy is rotated under the module lock and
    the crawl retried once; a second failure queues *url* on
    ``self.second_url_queue``.  On the first result page the total hit
    count is used to enqueue the follow-up page urls.

    :param url: search-result page url (ends with a ``=<page>`` parameter)
    :param is_again: True on the first attempt; the retry passes False
    :return: list of parsed weibo entries (possibly empty)
    """
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    weibo_list = []
    try:
        page = craw_object.get_page()
        weibo_list = page_parser_from_search(page)
    except Exception:  # narrowed from bare except
        print(traceback.format_exc())
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        try:
            # only rotate if another thread hasn't already done so
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
        finally:
            # release in finally so a failing log call cannot leak the lock
            crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        self.second_url_queue.put(url)
        return weibo_list
    if len(weibo_list) == 0:
        # sina genuinely has no results for this query -> not a proxy problem
        if zero_aviable_check_validity(page):
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " get nothing, sina does not have ! " + url)
            return weibo_list
        if weibo_guangchang_forbidden(page):
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " get nothing, forbidden ! ! " + url)
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " get nothing, change proxy ! " + url)
        finally:
            crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        self.second_url_queue.put(url)
        return weibo_list
    # first result page carries the total hit count -> enqueue follow-up pages
    if int(url[url.rfind('=') + 1:]) == 1:
        total_num = weibo_list[0].all_weibo_num
        self.put_second_and_more_url_queue(total_num, url)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " crawl success! " + url)
    return weibo_list
def crawl(self, url, is_again=True, two_again=True):
    """Fetch a comment page and parse it into a list of comment entries.

    On exception — or an empty result that is not the legitimate
    "nobody commented yet" page — the shared proxy is rotated under the
    module lock and the crawl retried.  ``is_again``/``two_again``
    bound the retries to three attempts in total.

    :param url: comment page url to fetch
    :param is_again: True on the first attempt
    :param two_again: True until the second retry has been consumed
    :return: list of parsed comment entries (possibly empty)
    """
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    comment_list = []
    page = ""
    try:
        page = craw_object.get_page()
        # parse the page into individual comment entries
        comment_list = page_parser_from_search_for_comment(page)
    except Exception:  # narrowed from bare except
        print(traceback.format_exc())
        crawl_comment.del_proxy_lock.acquire()
        try:
            # only rotate if another thread hasn't already done so
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
        finally:
            # release in finally so a failing log call cannot leak the lock
            crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        if two_again:
            return self.crawl(url, is_again=False, two_again=False)
        return comment_list
    if len(comment_list) == 0:
        # nobody has commented on this weibo yet -> genuinely empty, not a proxy issue
        if no_one_commented(page):
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " nobody commented !")
            return comment_list
        crawl_comment.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " comment_list is 0 , change proxy !")
        finally:
            crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        if two_again:
            return self.crawl(url, is_again=False, two_again=False)
        return comment_list
    return comment_list
def crawl(self, url, is_again=True):
    """Fetch a search-result page and parse it into a list of weibo entries.

    On exception — or an empty result that is not a legitimate "sina has
    nothing" page — the shared proxy is rotated under the module lock and
    the crawl retried once; a second failure queues *url* on
    ``self.second_url_queue``.  On the first result page the total hit
    count is used to enqueue the follow-up page urls.

    :param url: search-result page url (ends with a ``=<page>`` parameter)
    :param is_again: True on the first attempt; the retry passes False
    :return: list of parsed weibo entries (possibly empty)
    """
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    weibo_list = []
    try:
        page = craw_object.get_page()
        weibo_list = page_parser_from_search(page)
    except Exception:  # narrowed from bare except
        print(traceback.format_exc())
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        try:
            # only rotate if another thread hasn't already done so
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
        finally:
            # release in finally so a failing log call cannot leak the lock
            crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        self.second_url_queue.put(url)
        return weibo_list
    if len(weibo_list) == 0:
        # sina genuinely has no results for this query -> not a proxy problem
        if zero_aviable_check_validity(page):
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " get nothing, sina does not have ! " + url)
            return weibo_list
        if weibo_guangchang_forbidden(page):
            WeiboSearchLog().get_scheduler_logger().info(
                self.name + " get nothing, forbidden ! ! " + url)
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " get nothing, change proxy ! " + url)
        finally:
            crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        self.second_url_queue.put(url)
        return weibo_list
    # first result page carries the total hit count -> enqueue follow-up pages
    if int(url[url.rfind('=') + 1:]) == 1:
        total_num = weibo_list[0].all_weibo_num
        self.put_second_and_more_url_queue(total_num, url)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " crawl success! " + url)
    return weibo_list
def crawl(self, url, is_again=True, two_again=True):
    """Fetch a repost page and parse it into a list of repost entries.

    On exception or an empty result the shared proxy is rotated under the
    module lock and the crawl retried.  ``is_again``/``two_again`` bound
    the retries to three attempts in total.

    :param url: repost page url to fetch
    :param is_again: True on the first attempt
    :param two_again: True until the second retry has been consumed
    :return: list of parsed repost entries (possibly empty)
    """
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
    repost_list = []
    page = ""
    try:
        page = craw_object.get_page()
        # parse the page into individual repost entries
        repost_list = page_parser_from_search_for_repost(page)
    except Exception:  # narrowed from bare except
        print(traceback.format_exc())
        crawl_repost.del_proxy_lock.acquire()
        try:
            # only rotate if another thread hasn't already done so
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
        finally:
            # release in finally so a failing log call cannot leak the lock
            crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        if two_again:
            return self.crawl(url, is_again=False, two_again=False)
        return repost_list
    if len(repost_list) == 0:
        # NOTE: unlike the comment crawler there is no "nobody reposted yet"
        # page check here (it was commented out in the original), so an
        # empty list always triggers a proxy rotation and retry.
        crawl_repost.del_proxy_lock.acquire()
        try:
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " repost_list is 0 , change proxy !")
        finally:
            crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        if two_again:
            return self.crawl(url, is_again=False, two_again=False)
        return repost_list
    return repost_list
def crawl(self):
    """Search weibo.cn for real-time, original-only posts matching ``self.keyword``.

    Keeps retrying — rotating the proxy and sleeping a short random time —
    until the search yields a non-empty result list.  The original version
    retried via unbounded recursion; this uses a loop so persistent
    failures cannot overflow the stack, with identical behavior otherwise.

    :return: non-empty list of parsed weibo entries
    """
    # search form: real-time ('sort' = time) and original posts only ('hasori')
    self.data['advancedfilter'] = '1'
    self.data['keyword'] = self.keyword
    self.data['hasori'] = '1'
    # optional form fields left unset: 'nick', 'starttime', 'endtime'
    self.data['sort'] = 'time'
    self.data['smblog'] = '搜索'
    url = 'http://weibo.cn/search/'
    while True:
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)
        weibo_list = []
        try:
            page = craw_object.get_page_with_form(self.data)
            weibo_list = page_parser_from_search(page)
        except Exception:  # narrowed from bare except
            print(traceback.format_exc())
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(
                self.name + " proxy exception , change proxy !")
            time.sleep(int(random.random() * 10))
            continue
        if weibo_list:
            return weibo_list
        # empty result: assume a bad proxy and try again with a fresh one
        loginer.del_proxy()
        WeiboSearchLog().get_scheduler_logger().warning(
            self.name + " get nothing, change proxy !")
        time.sleep(int(random.random() * 10))