Example #1
 def valid_cookie(self, html=''):
     html = str(html)
     
     if not html:
         url     = 'http://weibo.cn/kaifulee'
         headers = self.get_headers(url)
         html    = self.get_content_head(url, headers=headers)
     
     if not html:
         msg = 'Error in cookie: need relogin.'
         logger.info(msg)
         write_message(msg, self.window)
         
         self.clear_cookie(self.cookie_file)
         
         return False
     
     if u'登录' in html:
         if not self.login():
             msg = 'In valid_cookie: relogin failed.'
             logger.info(msg)
             write_message(msg, self.window)
             
             self.clear_cookie(self.cookie_file)
             
             return False
     
     gsid = None
     for c in self.cj:
         if c.name.startswith('gsid') and c.domain == '.weibo.cn':
             gsid = c.value
     
     self.login_params = {'gsid': gsid, 'vt': '4', 'lret': '1'}
     return True
Example #2
    def _fetch_msg_repost(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
        
        page_right = self._check_page_right(html)

        if page_right is None:
            # keep the two-tuple shape that callers unpack
            return None, None
        
        if page_right:
            return html, num_pages
        
        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' %(tries, sec), self.window)
            time.sleep(sec)
            
            html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
            page_right = self._check_page_right(html)
            
            if page_right:
                return html, num_pages
            
            tries += 1
        
        return None, None
Example #3
    def _fetch(self, url):
        html = self.fetcher.fetch(url)

        page_right = self._check_page_right(html)

        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html = self.fetcher.fetch(url)
            page_right = self._check_page_right(html)

            if page_right:
                return html

            tries += 1

        return None
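The retry loop above is repeated almost verbatim in _fetch, _fetch_msg_repost, and _fetch_msg_comment. A hypothetical helper on the same class could factor it out; this is only a sketch, assuming fetch_fn returns the page HTML directly (the tuple-returning fetch_msg_* variants would need a thin adapter) and that time and write_message are in scope as in the examples:

    def _fetch_with_retry(self, fetch_fn, *args):
        # fetch_fn: one of the self.fetcher.fetch* callables used above
        for tries in range(11):
            html = fetch_fn(*args)
            if self._check_page_right(html):
                return html

            # got the login page: refresh the cookie, back off, retry
            self.fetcher.check_cookie()
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                          self.window)
            time.sleep(sec)
        return None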
Example #4
 def get_servertime(self):
     url = ('http://login.sina.com.cn/sso/prelogin.php?entry=account'
            '&callback=sinaSSOController.preloginCallBack&su=&rsakt=mod'
            '&client=ssologin.js(v1.4.5)&_=%s' % self.get_milli_time())
     
     headers = self.get_headers(url)
     headers['Accept']  = '*/*'
     headers['Referer'] = 'http://weibo.com/'
     del headers['Accept-encoding']
     
     result = {}
     req = self.pack_request(url, headers)
     
     for _ in range(3):
         data = None
         
         try:
             with contextlib.closing(urllib2.urlopen(req)) as resp:
                 data = resp.read()
                 
             p = re.compile(r'\((.*)\)')
         
             json_data = p.search(data).group(1)
             data      = json.loads(json_data)
             
             result['servertime'] = str(data['servertime'])
             result['nonce']      = data['nonce']
             result['rsakv']      = str(data['rsakv'])
             result['pubkey']     = str(data['pubkey'])
             self.pcid            = str(data['pcid'])
             break
         except Exception as e:
             msg = 'Get servertime error. %s' % str(e)
             logger.info(msg)
             write_message(msg, self.window)
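For reference, the prelogin endpoint replies with a JSONP-wrapped object, which the regex above unwraps before json.loads; an illustrative payload (all values made up) looks like:

    sinaSSOController.preloginCallBack({"retcode":0,"servertime":1362041092,
        "pcid":"gz-0123abcd","nonce":"IRQZA1","pubkey":"EB2A38...","rsakv":"1330428213"})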
Example #5
 def get_login_form(self):
     url = 'http://3g.sina.com.cn/prog/wapsite/sso/login.php?ns=1&revalid=2&backURL=http%3A%2F%2Fweibo.cn%2F&backTitle=%D0%C2%C0%CB%CE%A2%B2%A9&vt='
     
     headers = self.get_headers(url)
     headers['Accept'] = '*/*'
     headers['Referer']= 'http://weibo.cn'
     del headers['Accept-encoding']
      
     req = self.pack_request(url, headers)
      
     rand     = None
     passwd_s = None 
     vk       = None 
     for _ in range(3):
         try:
             data = None
              
             with contextlib.closing(urllib2.urlopen(req)) as resp:
                 data = resp.read()
              
             doc      = HTML.fromstring(data)
             rand     = doc.xpath('//form/@action')[0]
             passwd_s = doc.xpath("//input[@type='password']/@name")[0]
             vk       = doc.xpath("//input[@name='vk']/@value")[0]
                  
             return rand, passwd_s, vk 
         except Exception as e:
             msg = 'get login form error: %s' % str(e)
             logger.info(msg)
             write_message(msg, self.window)
Example #6
 def check_cookie(self, user=None, pwd=None, soft_path=None):
     if user is None or pwd is None:
         user = self.username
         pwd  = self.password
     
     assert(user is not None and pwd is not None)
     
     if soft_path is None:
         soft_path = self.soft_path
         
     login_ok = True
     
     self.cookie_file = os.path.join(soft_path, settings.COMWEIBO_COOKIE)
     if os.path.exists(self.cookie_file):
         msg = 'cookie exists.'
         write_message(msg)
         
         with open(self.cookie_file, 'r') as f:
             cookie_data = f.read()
         
         if 'Set-Cookie' not in cookie_data:
             msg = 'but does not contain a valid cookie.'
             write_message(msg)
             
             login_ok = self.login(user, pwd)
     else:
         login_ok = self.login(user, pwd)
         
     if login_ok:
         return self.valid_cookie()
     else:
         return False            
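The 'Set-Cookie' substring test works because cookielib.LWPCookieJar writes files in libwww-perl format, where every stored cookie occupies one Set-Cookie3: line; a file holding a valid session looks roughly like this (cookie name and attributes are illustrative):

    #LWP-Cookies-2.0
    Set-Cookie3: gsid_CTandWM="..."; path="/"; domain=".weibo.cn"; path_spec; expires="..."; version=0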
Example #7
 def save_verify_code(self, url):
     try:
         cookie_str = ''
         for cookie in self.cj.as_lwp_str(True, True).split('\n'):
             cookie = cookie.split(';')[0]
             cookie = cookie.replace('\"', '').replace('Set-Cookie3: ', ' ').strip()+';'
             cookie_str += cookie
         
         headers = self.get_headers(url)
         headers['Accept']  = 'image/png,image/*;q=0.8,*/*;q=0.5'
         headers['Referer'] = 'http://weibo.com/'
         headers['Cookie']  = cookie_str
         del headers['Accept-encoding']
         
         req = self.pack_request(url, headers)
         content = self.urlopen_read(req)
         
         with open(os.path.join(self.soft_path, 'pin.png'), 'wb') as f:
             f.write(content)
     except Exception as e:
         msg = 'Save verify code error. %s' % str(e)
         logger.info(msg)
         write_message(msg, self.window)
Example #8
    def _fetch_msg_comment(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)

        page_right = self._check_page_right(html)

        if page_right is None:
            # keep the two-tuple shape that callers unpack
            return None, None

        if page_right:
            return html, num_pages

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
            page_right = self._check_page_right(html)

            if page_right:
                return html, num_pages

            tries += 1

        return None, None
Example #9
 def fetch_msg_comments(self, msg_id, page=1):
     url = 'http://weibo.com/aj/comment/big?_wv=5'
     
     headers = self.get_headers(url)
     headers['Accept']  = '*/*'
     headers['Referer'] = 'http://weibo.com/'
     del headers['Accept-encoding']
     
     body = {
         '__rnd'   : str(int(time.time() * 1000)),
         '_t'      : '0',
         'id'      : msg_id,
         'page'    : page            
     }        
     
     url = url + '&' + urllib.urlencode(body)
     req = self.pack_request(url, headers)
     resp_text = self.urlopen_read(req)
     
     try:
         resp = json.loads(resp_text)
         if resp['code'] == '100000':
             data = resp['data']
             html = data['html']
             pg   = int(data['page']['totalpage'])
             
             return html, pg
         else:
             msg = resp['msg']
             logger.info(msg)
             write_message(msg, self.window)
             return None, None
     except ValueError:
         return resp_text, None
Example #10
 def _fetch(self, url):
     html = self.fetcher.fetch(url)
     
     page_right = self._check_page_right(html)
     
     if page_right:
         return html
     
     tries = 0
     while not page_right and tries <= 10:
         time.sleep(10)
         self.fetcher.check_cookie()
         
         sec = (tries + 1) * 10
         write_message('_fetch trying: %s, sleep: %s seconds' %(tries, sec), self.window)
         time.sleep(sec)
         
         html = self.fetcher.fetch(url)
         page_right = self._check_page_right(html)
         
         if page_right:
             return html
         
         tries += 1
     
     return None
Example #11
 def _crawl(parser, uid, page, num_pages=''):
     msg = 'Crawl user(%s)\'s fans-page: %s:%s' %(self.uid, num_pages, page)
     write_message(msg, self.window)
     
     url  = 'http://weibo.com/%s/fans?page=%s' %(uid, page)
     html = self._fetch(url, query=settings.QUERY_FANS)
     try:
         pq_doc = pq(html)
         return parser.parse(pq_doc)
     except Exception:
         return 0
Example #12
        def _crawl(parser, uid, page, num_pages=""):
            msg = "Crawl user(%s)'s weibos-page: %s:%s" % (self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
Example #13
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
Example #14
        def _crawl(parser, uid, page, num_pages=""):
            msg = "Crawl user(%s)'s follows-page: %s:%s" % (self.uid, num_pages, page)
            write_message(msg, self.window)

            url = "http://weibo.com/%s/follow?page=%s" % (uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
Example #15
 def _crawl(parser, msg_id, page, num_pages=''):
     msg = 'Crawl message(%s)\'s comments-page:%s:%s' %(msg_id, num_pages, page)
     write_message(msg, self.window)
 
     html, num_pages = self._fetch_msg_comment(msg_id, page)
     try:
         pq_doc = pq(html)
         parser.parse(pq_doc)
     except Exception:
         pass
     
     return num_pages
Example #16
        def _crawl(parser, msg_id, page, num_pages=""):
            msg = "Crawl message(%s)'s reposts-page:%s:%s" % (self.msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_repost(msg_id, page)

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
            except Exception:
                pass

            return num_pages
Example #17
    def do_login(self, login_user, login_pwd, door=''):

        try:
            username = login_user
            pwd = login_pwd

            url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'

            postdata = {
                'entry': 'weibo',
                'gateway': '1',
                'from': '',
                'savestate': '7',
                'userticket': '1',
                'pagerefer': '',
                'ssosimplelogin': '******',
                'vsnf': '1',
                'vsnval': '',
                'service': 'miniblog',
                'pwencode': 'rsa2',
                'rsakv': self.rsakv,
                'encoding': 'utf-8',
                'url':
                'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
                'returntype': 'META',
                'prelt': '26',
            }
            postdata['servertime'] = self.servertime
            postdata['nonce'] = self.nonce
            postdata['su'] = self.get_user(username)
            postdata['sp'] = self.get_pwd(pwd, self.servertime,
                                          self.nonce).lower()

            # when a verification-code login is required
            if door:
                postdata['pcid'] = self.pcid
                postdata['door'] = door.lower()

            headers = self.get_headers(url)
            headers['Referer'] = 'http://weibo.com/'

            req = self.pack_request(url, headers, postdata)

            text = self.urlopen_read(req)
            return text
        except Exception as e:
            msg = 'Error in do_login. %s' % str(e)
            logger.info(msg)
            write_message(msg, self.window)
Example #18
 def _crawl(parser, uid, page, num_pages='?'):
     msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
     write_message(msg, self.window)
 
     html = self._fetch_weibo(uid, page)
     
     if html is None:
         return None
     
     try:
         pq_doc = pq(html)
         return parser.parse(pq_doc)
     except Exception:
         return None
Example #19
    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            url  = 'http://weibo.com/%s/follow?page=%s' %(uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
        
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %(self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            
            return

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
        
        start_time = time.time()
        
        parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
        
        num_pages = _crawl(parser, self.uid, page=1)
        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'Due to Sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
        
                num_pages = settings.PAGE_LIMIT
        
        pages = range(2, num_pages + 1)
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
                
            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #20
 def _crawl(parser, uid, page, num_pages='?'):
     msg = 'Crawl user(%s)\'s fans-page: %s:%s' %(self.uid, num_pages, page)
     write_message(msg, self.window)
     
     url  = 'http://weibo.cn/%s/fans?page=%s' %(uid, page)
     html = self._fetch(url)
     
     if html is None:
         return None
     
     try:
         pq_doc = pq(html)
         return parser.parse(pq_doc)
     except Exception:
         return None
Example #21
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid,
                                                            num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return None
Example #22
 def do_login(self, login_user, login_pwd, door=''):
     
     try:
         username = login_user
         pwd      = login_pwd
         
         url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
         
         postdata = {
             'entry'         : 'weibo',
             'gateway'       : '1',
             'from'          : '',
             'savestate'     : '7',
             'userticket'    : '1',
             'pagerefer'     : '',
             'ssosimplelogin': '******',
             'vsnf'          : '1',
             'vsnval'        : '',
             'service'       : 'miniblog',
             'pwencode'      : 'rsa2',
             'rsakv'         : self.rsakv,
             'encoding'      : 'utf-8',
             'url'           : 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
             'returntype'    : 'META',
             'prelt'         : '26',
         }
         postdata['servertime'] = self.servertime
         postdata['nonce']      = self.nonce
         postdata['su']         = self.get_user(username)
         postdata['sp']         = self.get_pwd(pwd, self.servertime, self.nonce).lower()
        
         # when a verification-code login is required
         if door:
             postdata['pcid'] = self.pcid
             postdata['door'] = door.lower()
         
         headers = self.get_headers(url)
         headers['Referer'] = 'http://weibo.com/'
         
         req = self.pack_request(url, headers, postdata)
         
         text = self.urlopen_read(req)
         return text
     except Exception as e:
         msg = 'Error in do_login. %s' % str(e)
         logger.info(msg)
         write_message(msg, self.window)
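get_user and get_pwd are not among these examples. For ssologin.js(v1.4.5) clients, su is customarily the base64 of the URL-quoted account name and sp an RSA encryption of servertime, nonce and password; the following is a sketch under that assumption, using the third-party rsa package and assuming the prelogin pubkey was kept on self:

    import base64
    import binascii
    import urllib

    import rsa  # third-party package


    def get_user(self, username):
        # su: base64 of the URL-quoted account name
        return base64.b64encode(urllib.quote(username))

    def get_pwd(self, password, servertime, nonce):
        # sp: RSA-encrypt 'servertime\tnonce\npassword' with the server key
        key = rsa.PublicKey(int(self.pubkey, 16), 65537)
        message = '%s\t%s\n%s' % (servertime, nonce, password)
        return binascii.b2a_hex(rsa.encrypt(message, key))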
Example #23
    def _check_page_right(self, html):
        '''
        Check whether the page was fetched before or after login.
        '''

        if html is None:
            return False

        if len(html) == 0:
            msg = 'weibo has been redesigned; the expected markup has changed'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        return u'<title>' not in html
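Note the tri-state return: True for a logged-in page, False for the login page (or html=None), and None when the html is empty, meaning the markup changed and retrying cannot help. A caller sketch that handles all three cases explicitly, mirroring _fetch_msg_repost above:

    page_right = self._check_page_right(html)
    if page_right is None:
        return None, None      # markup changed: retrying will not help
    if page_right:
        return html, num_pages
    # False: we got the login page; refresh the cookie and retry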
Example #24
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages,
                                                          page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return None
Example #25
 def _crawl(parser, msg_id, page, num_pages='?'):
     msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
     write_message(msg, self.window)
 
     html, num_pages = self._fetch_msg_repost(msg_id, page)
     
     if html is None:
         return None
     
     try:
         pq_doc = pq(html)
         parser.parse(pq_doc)
         
         return num_pages
     except Exception:
         return None
Example #26
 def _check_page_right(self, html):
     '''
     Check whether the page was fetched before or after login.
     '''
     
     if html is None:
         return False
     
     if len(html) == 0:
         msg = 'weibo has been redesigned; the expected markup has changed'
         logger.info(msg)
         write_message(msg, self.window)
         
         return None
     
     return u'<title>' not in html
Example #27
    def login(self, login_user=None, login_pwd=None):
        if self.username is None or self.password is None:
            self.username = login_user
            self.password = login_pwd
        
        assert self.username is not None and self.password is not None
            
        rand, passwd_s, vk = self.get_login_form()
        postdata = {
            'mobile'   : self.username,
            passwd_s   : self.password,
            'remember' : 'on',
            'backURL'  : 'http://weibo.cn/',
            'backTitle': '新浪微博',
            'vk'       : vk,
            'submit'   : '登录',
            'encoding' : 'utf-8'
        }
        
        url  = 'http://3g.sina.com.cn/prog/wapsite/sso/' + rand
        headers = self.get_headers(url)
        req  = self.pack_request(url, headers, postdata)
        page = self.urlopen_read(req)

        link = HTML.fromstring(page).xpath("//a/@href")[0]
        if not link.startswith('http://'): 
            link = 'http://weibo.cn/%s' % link
        
        headers = self.get_headers(link)
        req = self.pack_request(link, headers)
        self.urlopen_read(req)
        
        link = urldecode(link)
        
        try:
            self.login_params = urldecode(link['u'])
            self.cj.save(self.cookie_file, True, True)
            
            msg = 'weibo.cn: login succeeded.'
            write_message(msg)
            
            return True
        except KeyError:
            msg = 'Login failed: it may be caused by a wrong username/password.\nPlease check.'
            logger.info(msg)
            write_message(msg, self.window)
            return False
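urldecode is called above (urldecode(link), then link['u']) but is not shown in these examples; judging from the call sites it parses a URL's query string into a dict. A minimal Python 2 sketch under that assumption:

    import urlparse

    def urldecode(url):
        # split off the query string and return its parameters as a dict
        query = urlparse.urlparse(url).query
        return dict(urlparse.parse_qsl(query))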
Example #28
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id,
                                                               num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_repost(msg_id, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)

                return num_pages
            except Exception:
                return None
Example #29
 def check_user(self, uid):
     url = 'http://weibo.cn/u/%s' %(uid)
     
     headers = self.get_headers(url)
     headers['Accept'] = '*/*'
     headers['Referer']= 'http://weibo.cn/'
     
     tries = 10
     for _ in range(tries):
         try:
             if self.login_params is None:
                 if not self.check_cookie():
                     continue
     
             params = urldecode(url)
             params.update(self.login_params)
             url = '%s?%s' % (url.split('?', 1)[0], urllib.urlencode(params))
     
             req = self.pack_request(url, headers)
             
             self.n_connections += 1
             
             page = None
             with contextlib.closing(urllib2.urlopen(req)) as resp:
                 if resp.info().get('Content-Encoding') == 'gzip':
                     page = self.gzip_data(resp.read())
                 else:
                     page = resp.read()
                      
             #not login
             if u'登录' in page:
                 if not self.check_cookie():
                     msg = 'Error in check user: login failed.'
                     write_message(msg, self.window)
                     
                     return None
             
             return not (u'用户不存在' in page or 'User does not exists' in page
                         or u'抱歉,您当前访问的用户状态异常,暂时无法访问。' in page)
         except Exception as e:
             msg = 'Error in check_user: exit Exception. %s' % str(e)
             logger.info(msg)
             write_message(msg, self.window)
             
             return None  
Example #30
 def fetch(self, url, query):
     headers = self.get_headers(url)
     headers['Accept']  = '*/*'
     headers['Referer'] = 'http://weibo.com/'
     
     req = self.pack_request(url, headers)
     page = self.urlopen_read(req)
     
     if "$CONFIG['allowConnect'] = 'false'" in page or "$CONFIG['allowConnect']='false'" in page:
         msg = 'Requests too frequent: temporarily blocked by Sina.'
         logger.info(msg)
         write_message(msg, self.window)
         
         return None
         
     doc = self.extract_content(page, query)
                 
     return doc     
Example #31
 def _get_first_part(headers, body, url):
     body['__rnd']    = str(int(time.time() * 1000))
     body['pre_page'] = body['page'] - 1
     
     url = url + urllib.urlencode(body)
     req = self.pack_request(url, headers)
     page = self.urlopen_read(req)
     
     try:
         resp = json.loads(page)
         if resp['code'] == "100000":
             return resp['data']
         else:
             msg = resp['msg']
             logger.info(msg)
             write_message(msg, self.window)
             return None
     except ValueError:
         return page
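Note that url = url + urllib.urlencode(body) silently assumes the incoming url already ends with '?' or '&'; a safer join would pick the separator explicitly (sketch):

    sep = '&' if '?' in url else '?'
    url = url + sep + urllib.urlencode(body)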
Example #32
 def redo_login(self, login_url):
     login_ok = False
     
     try:
         headers = self.get_headers(login_url)
         headers['Referer'] = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
         
         req = self.pack_request(login_url, headers)
         if self.urlopen_read(req) is not None:
             self.cj.save(self.cookie_file, True, True)
         
             msg = 'login success'
             write_message(msg)
             login_ok = True
     except Exception as e:
         msg = 'Error in redo_login. %s' % str(e)
         logger.info(msg)
         write_message(msg, self.window)
     
     # report whether the relogin succeeded
     return login_ok
Example #33
    def urlopen_read(self, req):
        tries = 10
         
        for i in range(tries):
            try:
                self.n_connections += 1
                 
                page = None
                with contextlib.closing(urllib2.urlopen(req)) as resp:
                    if resp.info().get('Content-Encoding') == 'gzip':
                        page = self.gzip_data(resp.read())
                    else:
                        page = resp.read()
                return page
            except Exception as e:
                # only urllib2.HTTPError carries .code; don't assume it
                if getattr(e, 'code', None) == 404:
                    msg = 'Error in urlopen_read: %s.' %str(e)
                    write_message(msg, self.window)
                
                    return None
                
                if i < tries - 1:
                    sec = (i + 1) * 5
                    msg = ('Error in urlopen_read: %s\nTake a rest: %s seconds, and retry.'
                           %(str(e), sec))
                    write_message(msg, self.window)
 
                    time.sleep(sec)
                else:
                    msg = 'Exit incorrect. %s' %str(e)
                    logger.info(msg)
                    write_message(msg, self.window)
                     
                    return None
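gzip_data is called whenever a response arrives with Content-Encoding: gzip, but its definition is not among these examples; a minimal Python 2 sketch, assuming it simply inflates the response body:

    import gzip
    from StringIO import StringIO

    def gzip_data(self, data):
        # decompress a gzip-encoded HTTP response body
        return gzip.GzipFile(fileobj=StringIO(data)).read()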
Example #34
    def valid_cookie(self, html=''):
        html = str(html)
        
        if not html:
            url     = 'http://weibo.com/kaifulee'
            headers = self.get_headers(url)
            html    = self.get_content_head(url, headers=headers)
            
        if not html:
            msg = 'need relogin.'
            logger.info(msg)
            write_message(msg, self.window)
         
            self.clear_cookie(self.cookie_file)
            
            return False
        
        html = str(html)
        html = html.replace('"', "'")
        
        if 'sinaSSOController' in html:
            p = re.compile(r"location\.replace\('(.*?)'\)")
            
            try:
                login_url = p.search(html).group(1)
                headers   = self.get_headers(login_url)
                
                req = self.pack_request(url=login_url, headers=headers)
                html = self.urlopen_read(req)
                
                self.cj.save(self.cookie_file, True, True)
            except Exception as e:
                msg = 'relogin failed. %s' % str(e)
                logger.info(msg)
                write_message(msg, self.window)

                self.clear_cookie(self.cookie_file)
                
                return False
Example #35
    def _fetch_msg_comment(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)

        page_right = self._check_page_right(html)
        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message("_fetch trying: %s, sleep: %s seconds" % (tries, sec), self.window)
            time.sleep(sec)

            html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
            page_right = self._check_page_right(html)
            if page_right:
                return html, num_pages

            tries += 1

        if page_right:
            return html, num_pages

        # retries exhausted
        self.error = True
        return None, None
Example #36
 def _fetch_weibo(self, uid, page):
     html = self.fetcher.fetch_weibo(uid, page)
     
     page_right = self._check_page_right(html)
     tries = 0
     while not page_right and tries <= 10:
         time.sleep(10)
         self.fetcher.check_cookie()
         
         sec = (tries + 1) * 10
         write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window)
         time.sleep(sec)
         
         html = self.fetcher.fetch_weibo(uid, page)
         page_right = self._check_page_right(html)
         if page_right:
             return html
         
         tries += 1
     
     if page_right:
         return html
     
     # retries exhausted
     self.error = True
     return None
Example #37
    def check_user(self, uid):
        is_exist = False

        url = 'http://weibo.com/u/%s' % (uid)

        headers = self.get_headers(url)
        headers['Accept'] = '*/*'
        headers['Referer'] = 'http://weibo.com/'

        req = self.pack_request(url, headers)

        tries = 10
        for i in range(tries):
            try:
                self.n_connections += 1

                page = None
                with contextlib.closing(urllib2.urlopen(req)) as resp:
                    if resp.info().get('Content-Encoding') == 'gzip':
                        page = self.gzip_data(resp.read())
                    else:
                        page = resp.read()

                if "$CONFIG['islogin'] = '******'" in page or "$CONFIG['islogin']='******'" in page:
                    is_exist = not (u'错误提示 新浪微博' in page)

                    return is_exist
                else:
                    if not self.check_cookie():
                        msg = 'Error in check_user: login failed'
                        logger.info(msg)
                        write_message(msg, self.window)

                        return None

            except urllib2.HTTPError as e:
                if e.code == 302 and e.geturl() is not None:
                    is_exist = True
                else:
                    is_exist = False

                return is_exist
            except urllib2.URLError as e:
                if isinstance(e.reason, socket.timeout) and (i < tries - 1):
                    sec = (i + 1) * 5
                    msg = (
                        'Error in check_user:timeout. Retry: (%s-%s)-sleep %s seconds'
                        % (tries, i, sec))
                    write_message(msg, self.window)
                    time.sleep(sec)
                else:
                    msg = 'Error in check_user: retry timeout. %s' % str(e)
                    logger.info(msg)
                    write_message(msg, self.window)

                    return None
Example #38
    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages=""):
            msg = "Crawl user(%s)'s weibos-page: %s:%s" % (self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0

        msg = "Checking: whether user(%s) exists or not..." % self.uid
        write_message(msg, self.window)

        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:
            return

        if not is_exist:
            msg = "Not exist: %s." % self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)

        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)

        num_pages = _crawl(parser, self.uid, page=1)

        pages = range(2, num_pages + 1)
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)
        msg = "Crawl user(%s)'s weibos: total page=%s," " cost time=%s sec, connections=%s" % (
            self.uid,
            num_pages,
            cost_time,
            self.fetcher.n_connections,
        )
        logger.info(msg)
        write_message(msg, self.window)
Example #39
    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            html = self._fetch_weibo(uid, page)
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
            
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            
            return
        
        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
        
        start_time = time.time()
        
        parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
        
        num_pages = _crawl(parser, self.uid, page=1)

        pages = range(2, num_pages + 1)
        """
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            
            worker_manager.wait_all_complete()
        """
        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #40
 def crawl_msg_reposts(self):
     def _crawl(parser, msg_id, page, num_pages=''):
         msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
         write_message(msg, self.window)
     
         html, num_pages = self._fetch_msg_repost(msg_id, page)
         
         try:
             pq_doc = pq(html)
             parser.parse(pq_doc)
         except Exception:
             pass
         
         return num_pages
     
     msg = 'Checking: whether message exists or not...'
     write_message(msg, self.window)
     msg_id = self.fetcher.check_message(self.msg_url)
     
     if msg_id is None:
         msg = 'Not exist: %s.' %self.msg_url            
         logger.info(msg)
         write_message(msg, self.window)
         
         return
       
     self.msg_id = msg_id
     self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
     
     start_time = time.time()
     
     parser = ComRepostsParser(msg_id, self.storage)
     num_pages = _crawl(parser, self.msg_id, 1)
     pages = range(2, num_pages + 1)
     if len(pages) > 0:
         n_threads = 5
         
         worker_manager = WorkerManager(n_threads)
         
         for pg in pages:
             worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
         
         worker_manager.wait_all_complete()
         
     cost_time = int(time.time() - start_time)
     
     msg = ('Crawl message(%s)\'s reposts: total page=%s,'
            ' cost time=%s sec, connections=%s' 
            %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
     logger.info(msg)
     write_message(msg, self.window) 
Example #41
    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            return
        
        msg = 'Crawl user(%s)\'s profile' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        
        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
        
        start_time = time.time()

        url    = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        
        html   = self._fetch(url, query=settings.QUERY_INFO)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except Exception:
            pass
    
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s' 
               %(self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #42
    def check_new_weibos(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0

        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)

        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:
            return

        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)

        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s'
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)