Example #1
def local(db='file', folder=None, uids=[]):
    global give_ups
    
    # reuse an existing cookie file when possible, otherwise log in and create one
    create = create_cookie_file()
    fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
    if create:
        fetcher.login(cookie_filename=cookie_file)
    while give_ups > 0:
        while len(tokens) == 0:
            if give_ups > 0:
                time.sleep(1)  # back off instead of busy-waiting for a free token
            else:
                return
        
        token = tokens.pop()
        cb = callback(token)
        
        if len(uids) == 0:
            give_ups = 0
        else:
            uid = uids.pop()
            
            try:
                crawler = UserCrawler(uid, is_uid=True, fetcher=fetcher, 
                                      fetch_fans=False, callbacks=cb, span=False)
                uid = crawler.uid
                if db == 'file' and folder is not None:
                    storage = FileStorage(uid, folder)
                elif db == 'mongo':
                    storage = MongoStorage(uid)
                else:
                    raise ValueError('db must be "file" or "mongo"; '
                                     'when db is "file", the folder parameter '
                                     'must also be given.')
                
                if storage.crawled: 
                    storage.complete()
                    cb()
                    continue
                else:
                    crawler.set_storage(storage)
                    crawler.start()
            except Exception, e:
                cb()
                # raise e
                logger.exception(e)
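Example #1 relies on several module-level names that are not part of the snippet: give_ups, tokens, callback, create_cookie_file, cookie_file, account, pwd and logger, while CnFetcher, UserCrawler, FileStorage and MongoStorage come from the crawler project itself. The following is only a hedged sketch of what that missing plumbing could look like; every definition here is an assumption made for readability, not the project's actual code.

# assumed module-level plumbing -- not part of the original snippet
import os
import logging

logger = logging.getLogger(__name__)

account, pwd = 'your-account', 'your-password'   # hypothetical credentials
cookie_file = 'weibo.cookies'                    # hypothetical cookie path

give_ups = 1    # counts down; the workers stop when it reaches 0
tokens = []     # shared pool of rate-limit tokens

def create_cookie_file():
    # assumption: True means no cookie file exists yet, so a fresh login is needed
    return not os.path.exists(cookie_file)

def callback(token):
    # assumption: returns a function that puts the token back into the pool
    def cb():
        tokens.append(token)
    return cb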
Example #2
def local(uids=[]):

    fetcher = CnFetcher()
    fetcher.login()

    # after a URLError the same uid is retried instead of popping a new one
    connection_error = False

    while len(uids) > 0 or connection_error:
        if not connection_error:
            uid = uids.pop()
        try:
            crawler = UserCrawler(uid, fetcher)
            crawler.run()
            connection_error = False
        except URLError, e:
            logger.exception(e)
            connection_error = True
            time.sleep(10)
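Example #2 is a stripped-down variant of the same loop: it pops one uid at a time and, after a URLError, sleeps for ten seconds and retries the same uid instead of moving on. The snippet assumes time, logger and URLError are available at module level (CnFetcher and UserCrawler again come from the crawler project); a minimal sketch of those imports and a hypothetical call, assuming the uids are plain Weibo uid strings:

# assumed imports and driver for Example #2 -- not shown in the snippet
import time
import logging
from urllib2 import URLError

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    # hypothetical uids, purely for illustration
    local(uids=['1234567890', '2345678901'])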
Example #3
def dc():
    def run_callbacks(callbacks):
        for callback in callbacks:
            callback()
    
    global give_ups
    
    try:
        create = create_cookie_file()
        fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
        if create:
            fetcher.login(cookie_filename=cookie_file)
        while give_ups > 0:
            n = 0
            while len(tokens) == 0:
                if give_ups > 0:
                    n += 1
                    time.sleep(n)  # back off while waiting for a free token
                else:
                    return
            
            token = tokens.pop()
            cb = callback(token)
            
            soc = create_socket()
            try:
                data = json.loads(soc.recv(buf_size))
                if data is None:
                    time.sleep(15)
                    cb()
                    continue
                elif len(data) == 0:
                    give_ups -= 1
                    continue
                
                user = data['user']
                is_uid = data['is_uid']
                crawled = data.get('crawled', False)
                follow = data.get('follow', None)
                
                # monitor callback
                register_heartbeat(user)()
                register_rm_cb = register_heartbeat(user, True)
                
                # success / error callbacks
                success_callbacks = (register_rm_cb, reset_error_callback)
                error_callbacks = (error_callback, register_rm_cb)
                
                try:
                    crawler = UserCrawler(user, is_uid=is_uid, fetcher=fetcher, 
                                          fetch_fans=follow is None, 
                                          callbacks=cb, 
                                          success_callbacks=success_callbacks,
                                          error_callbacks=error_callbacks)
                    # the user does not exist
                    if crawler.user_not_exist or crawler.uid == 'attention':
                        cb()
                        run_callbacks(success_callbacks)
                        continue
                    
                    uid = crawler.uid
                    storage = MongoStorage(uid, follow, user=user)
                    
                    if crawled or storage.crawled: 
                        cb()
                        run_callbacks(success_callbacks)
                        storage.close()
                        continue
                    else:
                        crawler.set_storage(storage)
                        crawler.start()
                except Exception, e:
                    cb()
                    run_callbacks(error_callbacks)
                    # raise e
                    logger.exception(e)
            finally:
                soc.close()
    finally:
        # when the run is over, stop the heartbeat
        stop_heartbeat()
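dc() takes its work items from a socket rather than from a uids list: each recv is expected to decode to a JSON object carrying at least user and is_uid, with optional crawled and follow fields, while a JSON null tells the worker to wait and an empty object tells it to give up one slot. A hedged sketch of such a message, with placeholder values, inferred purely from the fields the snippet reads:

# hypothetical job message for dc() -- format inferred from the snippet above
import json

job = {
    'user': '1234567890',   # uid or screen name of the account to crawl
    'is_uid': True,         # True if 'user' is a uid, False if it is a screen name
    'crawled': False,       # optional: True lets dc() skip an already-crawled user
    'follow': None,         # optional: when set, fetch_fans is turned off
}
payload = json.dumps(job)   # what the peer of create_socket() would send

# json.dumps(None) -> 'null' : dc() sleeps 15 seconds and asks again
# json.dumps({})   -> '{}'   : dc() decrements give_ups and moves on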
Example #4
class UserCrawler(threading.Thread):
    def __init__(self, user, is_uid=None, 
                 storage=None, fetcher=None, 
                 fetch_fans=True, span=True, 
                 # dozens of callbacks
                 callbacks=None,  
                 success_callbacks=None, 
                 error_callbacks=None,
                 ):
        super(UserCrawler, self).__init__()
        
        logger.info('fetch user: %s' % user)
        # decide whether 'user' is a numeric uid or a nickname/custom domain
        if is_uid is True:
            self.uid = user
        elif is_uid is False:
            self.uid = None
        else:
            try:
                int(user)
                self.uid = user
            except ValueError:
                self.uid = None
        if self.uid is not None:
            self.url = 'http://weibo.cn/u/%s' % self.uid
        else:
            self.url = 'http://weibo.cn/%s' % user
        if fetcher is None:
            self.fetcher = CnFetcher(account, pwd)
            self.fetcher.login()
        else:
            self.fetcher = fetcher
        self.storage = storage
        
        self.user_not_exist = False
        html = self._fetch(self.url)
        if html is None:
            self.user_not_exist = True
        elif self.uid is None:
            parser = CnWeiboParser(html, user, self.storage)
            self.uid = parser.get_uid()
        self.fetch_fans = fetch_fans
        self.span = span
        self.error = False
        self.callbacks = callbacks
        self.success_callbacks = success_callbacks
        self.error_callbacks = error_callbacks
        
    def _check_user_exist(self, html):
        # if the user does not exist or has been blocked by Weibo, return False directly
        # (the Chinese notice below says the user's status is abnormal and the
        # page is temporarily unavailable)
        if u'抱歉,您当前访问的用户状态异常,暂时无法访问。' in html:
            self.error = True
            self.user_not_exist = True
            return False
        return True
        
    def _fetch(self, url):
        html = self.fetcher.fetch(url)
        if not self._check_user_exist(html):
            return
        right = check_page_right(html)
        tries = 0
        while not right and tries <= 6:
            # wrong page (e.g. an expired session): re-login, then back off,
            # 10-30 seconds for the first tries and 10-60 minutes afterwards
            time.sleep(10)
            self.fetcher.login()
            sec = 10 * (tries + 1) if tries <= 2 else (
                600 * (tries - 2) if tries < 6 else 3600)
            time.sleep(sec)
            html = self.fetcher.fetch(url)
            if not self._check_user_exist(html):
                return
            right = check_page_right(html)
            if right:
                return html
            tries += 1
        if not right:
            # all retries exhausted without getting a valid page
            self.error = True
        return html
        
    @property
    def info_link(self):
        return 'http://weibo.cn/%s/info' % self.uid
    
    @property
    def follow_link(self):
        return 'http://weibo.cn/%s/follow' % self.uid
    
    @property
    def fav_link(self):
        return 'http://weibo.cn/%s/fans' % self.uid
    
    def set_storage(self, storage):
        self.storage = storage
    
    def _crawl(self, url, parser_cls):
        def start(url):
            html = self._fetch(url)
            parser = parser_cls(html, self.uid, self.storage)
            return parser.parse()
        
        error = None
        for i in range(3):
            try:
                return start(url)
            except urllib2.HTTPError, e:
                if e.code == 404:
                    self.error = True
                else:
                    error = e
            except urllib2.URLError, e:
                error = e
            # back off briefly before the next retry
            time.sleep(i * 5)
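Since UserCrawler subclasses threading.Thread, driving it by hand follows the usual start/join pattern. A minimal usage sketch built only from calls that already appear in the examples above; the credentials and the uid are placeholders:

# minimal usage sketch -- placeholder account, pwd and uid
fetcher = CnFetcher(account, pwd)
fetcher.login()

crawler = UserCrawler('1234567890', is_uid=True, fetcher=fetcher,
                      fetch_fans=False, span=False)
if not crawler.user_not_exist:
    storage = MongoStorage(crawler.uid)
    if storage.crawled:
        storage.complete()       # already crawled elsewhere: just mark it done
    else:
        crawler.set_storage(storage)
        crawler.start()          # threading.Thread.start() runs the crawl
        crawler.join()           # wait for the crawl to finish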