Example 1
    def crawl(self, url, is_again=True):
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        uid_or_uname = ""
        try:
            page = craw_object.get_page()

            uid_or_uname = page_parser_from_search_for_uid(page)
        except:
            print traceback.format_exc()
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
            crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                self.second_url_queue.put(url)
                return uid_or_uname
        return uid_or_uname
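
Every example in this set handles a failed fetch the same way: the worker takes a lock shared by all crawl threads, checks that the proxy it just used is still the one the Loginer would hand out, and only then discards it, so several threads failing on the same dead proxy do not each throw away a different (possibly healthy) one. A minimal standalone sketch of that compare-and-invalidate step, reusing the Loginer and logger interfaces seen above but with a hypothetical helper name, could look like this:

import threading

del_proxy_lock = threading.Lock()  # shared by every crawl worker, as in the examples

def invalidate_proxy_if_current(loginer, used_proxy, logger, worker_name):
    # Drop the proxy only if it is still the one the pool would serve;
    # another worker may already have rotated it away.
    with del_proxy_lock:
        if used_proxy == loginer.get_proxy():
            loginer.del_proxy()
            logger.warning(worker_name + " proxy exception , change proxy !")

Using "with" here also guarantees the lock is released even if the logging call raises, which the bare acquire()/release() pair in the examples does not.
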
Example 2
    def crawl(self, url, is_again=True):
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

        uid_or_uname = ""
        try:
            page = craw_object.get_page()

            uid_or_uname = page_parser_from_search_for_uid(page)
        except:
            print traceback.format_exc()
            crawl_set_time_with_keyword.del_proxy_lock.acquire()
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
            crawl_set_time_with_keyword.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                self.second_url_queue.put(url)
                return uid_or_uname
        return uid_or_uname
Example 3
    def crawl(self, url, is_again=True, two_again=True):
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        comment_list = []
        page = ""
        try:
            page = craw_object.get_page()
            comment_list = page_parser_from_search_for_comment(page)  # parse the page into individual comment entries
        except:
            print traceback.format_exc()
            crawl_comment.del_proxy_lock.acquire()
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " proxy exception , change proxy !")
            crawl_comment.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                if two_again:
                    return self.crawl(url, is_again=False, two_again=False)
                return comment_list

        if len(comment_list) == 0:
            # No one has commented on this Weibo post yet!
            if no_one_commented(page):
                WeiboSearchLog().get_scheduler_logger().info(
                    self.name + " nobody commented !")
                return comment_list

            crawl_comment.del_proxy_lock.acquire()
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(
                    self.name + " comment_list is 0 , change proxy !")
            crawl_comment.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                if two_again:
                    return self.crawl(url, is_again=False, two_again=False)
                return comment_list
        else:
            return comment_list
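
Example 3 bounds its retries with the is_again and two_again flags, which allows at most three attempts before the (possibly empty) comment_list is returned. An equivalent way to express the same bound is an explicit loop; the sketch below is only an illustration, with a hypothetical fetch_comments callable standing in for get_page() plus the comment parser:

def crawl_with_retries(url, fetch_comments, max_attempts=3):
    # fetch_comments(url) is a stand-in for get_page() followed by
    # page_parser_from_search_for_comment(); it is assumed to raise on
    # network or proxy errors and to return a list of comments otherwise.
    for _ in range(max_attempts):
        try:
            comments = fetch_comments(url)
            if comments:
                return comments
        except Exception:
            pass  # the real code rotates the proxy here, as in the examples
    return []
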
Example 4
    def crawl(self, uid_or_nickname, is_again=False):

        # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
        url = ''
        #         if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or\
        #             len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
        #             WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
        #             return "nothing"

        quote_uid_or_nickname = ""
        try:
            quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
        except:
            print traceback.format_exc()
            print uid_or_nickname

        url = "http://weibo.cn/" + uid_or_nickname + "/info"

        #         if quote_uid_or_nickname == uid_or_nickname:
        #             url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
        #         else:
        #             url = "http://weibo.cn/n/" + quote_uid_or_nickname

        # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()

        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name +
                                                     " start to crawl ! " +
                                                     url)

        user_info = ""
        try:
            page = craw_object.get_page()

            user_info = page_parser_from_search_for_UserInfo(page, url)
        except:
            if is_again:
                return self.crawl(uid_or_nickname, is_again=False)  # retry with the original identifier, not the built URL
            else:
                return user_info

        return user_info
Example 5
    def crawl(self, uid_or_nickname, is_again=False):
        
        # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
        url = ''
        if len(UserInfo_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(nickname=str(uid_or_nickname)))) != 0 or\
            len(Bie_Ming_store.objects(Q(uid_or_uname=str(uid_or_nickname)) | Q(bie_ming=str(uid_or_nickname)))) != 0:
            WeiboSearchLog().get_scheduler_logger().info("already in the database : " + uid_or_nickname)
            return "nothing"
        
        quote_uid_or_nickname = ""
        try:
            quote_uid_or_nickname = quote_plus(str(uid_or_nickname.strip()))
        except:
            print traceback.format_exc()
            print uid_or_nickname
        
#         url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
            
        if quote_uid_or_nickname == uid_or_nickname:
            url = "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
        else:
            url = "http://weibo.cn/n/" + quote_uid_or_nickname
        
        # $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
        
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        
        craw_object = Crawler_with_proxy(url, cookie, proxy)
        
        WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)
        
        user_info = ""
        try:
            page = craw_object.get_page()
            
            user_info = page_parser_from_search_for_UserInfo(page, url)
        except:
            if is_again:
                return self.crawl(uid_or_nickname, is_again=False)  # retry with the original identifier, not the built URL
            else:
                return user_info

        return user_info
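
Example 5 picks between the two weibo.cn URL forms by checking whether quote_plus changes the input: a plain ASCII uid or user name comes back from percent-encoding unchanged, while a Chinese nickname does not. A small sketch of just that decision, assuming Python 2's urllib and a made-up helper name:

from urllib import quote_plus  # urllib.parse.quote_plus on Python 3

def profile_url(uid_or_nickname):
    quoted = quote_plus(str(uid_or_nickname.strip()))
    if quoted == uid_or_nickname:
        # ASCII uid / user name: link to the profile page directly
        return "http://weibo.cn/" + uid_or_nickname + "?f=search_0"
    # non-ASCII nickname: go through the /n/ redirect with the encoded name
    return "http://weibo.cn/n/" + quoted
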
Example 6
    def crawl(self, url, is_again=True, two_again=True):
        loginer = Loginer()
        cookie = loginer.get_cookie()
        proxy = loginer.get_proxy()
        craw_object = Crawler_with_proxy(url, cookie, proxy)

        WeiboSearchLog().get_scheduler_logger().info(self.name + " start to crawl ! " + url)

        repost_list = []
        page = ""
        try:
            page = craw_object.get_page()
            repost_list = page_parser_from_search_for_repost(page)  # parse the page into individual repost entries
        except:
            print traceback.format_exc()
            crawl_repost.del_proxy_lock.acquire()
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(self.name + " proxy exception , change proxy !")
            crawl_repost.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                if two_again:
                    return self.crawl(url, is_again=False, two_again=False)
                return repost_list

        if len(repost_list) == 0:
            # No one has commented on this Weibo post yet!
            #             if no_one_commented(page):
            #                 WeiboSearchLog().get_scheduler_logger().info(self.name + " nobody commented !")
            #                 return repost_list;

            crawl_repost.del_proxy_lock.acquire()
            if proxy == loginer.get_proxy():
                loginer.del_proxy()
                WeiboSearchLog().get_scheduler_logger().warning(self.name + " repost_list is 0 , change proxy !")
            crawl_repost.del_proxy_lock.release()
            if is_again:
                return self.crawl(url, is_again=False)
            else:
                if two_again:
                    return self.crawl(url, is_again=False, two_again=False)
                return repost_list
        else:
            return repost_list