def crawl(self, url, is_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

    uid_or_uname = ""
    try:
        page = craw_object.get_page()
        uid_or_uname = page_parser_from_search_for_uid(page)
    except:
        print traceback.format_exc()
        crawl_set_time_with_keyword.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_set_time_with_keyword.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            self.second_url_queue.put(url)
            return uid_or_uname
    return uid_or_uname
def crawl(self, url, is_again=True, two_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

    comment_list = []
    page = ""
    try:
        page = craw_object.get_page()
        comment_list = page_parser_from_search_for_comment(page)  # parse the page into individual comments
    except:
        print traceback.format_exc()
        crawl_comment.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list

    if len(comment_list) == 0:
        # nobody has commented on this weibo yet!
        if no_one_commented(page):
            WeiboSearchLog().get_scheduler_logger().info(self.name + " nobody commented !")
            return comment_list
        crawl_comment.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " comment_list is 0 , change proxy !")
        crawl_comment.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return comment_list
    else:
        return comment_list
def crawl(self, uid_or_nickname, is_again=False):
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    url = ''
    # if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or \
    #    len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
    #     WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
    #     return "nothing"
    quote_uid_or_nickname = ""
    try:
        quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
    except:
        print traceback.format_exc()
        print uid_or_nickname
    url = "http://weibo.cn/" + uid_or_nickname + "/info"
    # if quote_uid_or_nickname == uid_or_nickname:
    #     url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    # else:
    #     url = "http://weibo.cn/n/" + quote_uid_or_nickname
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

    user_info = ""
    try:
        page = craw_object.get_page()
        user_info = page_parser_from_search_for_UserInfo(page, url)
    except:
        if is_again:
            # retry once with the original identifier, not the built url
            return self.crawl(uid_or_nickname, is_again=False)
        else:
            return user_info
    return user_info
def crawl(self, uid_or_nickname, is_again=False):
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
    url = ''
    # skip users that are already stored in the database
    if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or \
       len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
        WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
        return "nothing"
    quote_uid_or_nickname = ""
    try:
        quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
    except:
        print traceback.format_exc()
        print uid_or_nickname
    # url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    if quote_uid_or_nickname == uid_or_nickname:
        # plain ASCII uid: query the profile directly
        url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    else:
        # non-ASCII nickname: go through the /n/ redirect with the url-quoted form
        url = "http://weibo.cn/n/" + quote_uid_or_nickname
    # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

    user_info = ""
    try:
        page = craw_object.get_page()
        user_info = page_parser_from_search_for_UserInfo(page, url)
    except:
        if is_again:
            # retry once with the original identifier, not the built url
            return self.crawl(uid_or_nickname, is_again=False)
        else:
            return user_info
    return user_info
def crawl(self, url, is_again=True, two_again=True):
    loginer = Loginer()
    cookie = loginer.get_cookie()
    proxy = loginer.get_proxy()
    craw_object = Crawler_with_proxy(url, cookie, proxy)
    WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

    repost_list = []
    page = ""
    try:
        page = craw_object.get_page()
        repost_list = page_parser_from_search_for_repost(page)  # parse the page into individual reposts
    except:
        print traceback.format_exc()
        crawl_repost.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
        crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return repost_list

    if len(repost_list) == 0:
        # nobody has commented on this weibo yet!
        # if no_one_commented(page):
        #     WeiboSearchLog().get_scheduler_logger().info(self.name + " nobody commented !")
        #     return repost_list
        crawl_repost.del_proxy_lock.acquire()
        if proxy == loginer.get_proxy():
            loginer.del_proxy()
            WeiboSearchLog().get_scheduler_logger().warning(self.name + " repost_list is 0 , change proxy !")
        crawl_repost.del_proxy_lock.release()
        if is_again:
            return self.crawl(url, is_again=False)
        else:
            if two_again:
                return self.crawl(url, is_again=False, two_again=False)
            return repost_list
    else:
        return repost_list
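# All of the crawl() variants above share the same scaffold: fetch a page through
# Crawler_with_proxy with the current cookie/proxy, and on failure drop the shared
# proxy (under a lock) and retry a bounded number of times before giving up.
# The helper below is a minimal, hypothetical sketch of that retry pattern in
# isolation -- retry_fetch, its parameters and the fallback value are illustrative
# names, not part of this project.
import traceback


def retry_fetch(fetch, url, retries=2, fallback=None, on_failure=None):
    """Call fetch(url); on exception run on_failure() and retry up to `retries` more times."""
    for _ in range(retries + 1):
        try:
            return fetch(url)
        except Exception:
            print traceback.format_exc()
            if on_failure is not None:
                on_failure()  # e.g. rotate the proxy before the next attempt
    return fallback

# Example (illustrative only):
#     repost_list = retry_fetch(
#         lambda u: page_parser_from_search_for_repost(Crawler_with_proxy(u, cookie, proxy).get_page()),
#         url, fallback=[])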