Ejemplo n.º 1
0
    def mainpage_parse(self,response):
        #inspect_response(response,self)
        if response == None:
            yield self.start_requests()
            exit(1)

        sel = Selector(response)
        login_user = {}
        login_user['toCrawl_user_id']   =  response.meta['user_id']
        login_user['toCrawl_user_nick'] =  self.get_property(response,"onick")
        login_user['login_user_id']     =  self.get_property(response,"uid")
        login_user['page_id']           =  self.get_property(response,"page_id")
        login_user['domain']            =  self.get_property(response,"domain")

        # if uid's mainpage does note exist, pass to next user in queue
        if login_user['login_user_id'] is None or login_user['page_id'] is None or login_user['domain'] is None:
            log.msg(' user toCrawl id:%s does not exist! Please try next one' % login_user['toCrawl_user_id'],level=log.DEBUG)
            next_uid  =  self.forward_crawling_redis()
            if next_uid:
                self.redis_server.lpush(self.ids_problem_name,next_uid)
                self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1)
                trypage_url        =   QueryFactory.mainpage_query(next_uid)
                mainpage_request   =   Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid})
                yield mainpage_request
            else:
                log.msg(' Queue is empty, task to terminate.',level=log.INFO)
        else:
            print '\n',login_user,'\n'

            login_user_profile_url = QueryFactory.info_query(page_id=login_user['page_id'], domain=login_user['domain'])
            log.msg('  user toCrawl id: %s, user login id: %s' \
                    % (login_user['toCrawl_user_id'],login_user['login_user_id']), level=log.INFO)

            request = Request(url=login_user_profile_url,callback=self.user_info_parse,meta={'login_user':login_user})
            yield request
Ejemplo n.º 2
0
    def weibo_pages_num(self,response):
        #inspect_response(response,self)
        if response == None:
            yield self.start_requests()
            exit(1)

        # pares the current js response
        self.weibo_parse(response)

        login_user       =  response.meta['login_user']
        # load response in json form
        html_block_soup  =  self.json_load_response(response)

        # get the tag containing the max num of page
        page_list_tag    =  html_block_soup.find('div',{'action-type':'feed_list_page_morelist'})
        MAX_PAGE_NUM     =  50
        if page_list_tag:
            total_num_pages  =  int(re.search(r'\d+',page_list_tag.a.string).group(0))
            if total_num_pages > MAX_PAGE_NUM:
                total_num_pages   =   MAX_PAGE_NUM
        else:
            total_num_pages  =  1

        # warp weibo page urls to crawl
        weibo_page_urls       =  self.wrap_weibo_pages_urls(domain=login_user['domain'], page_id=login_user['page_id'], num_page=total_num_pages )

        print '\n\n Number of user weibos pages: ',total_num_pages,'\n\n'

        # test part weibo parser
        user_weibo_page_url   =  QueryFactory.weibo_js_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=2 )[0]

        # first request to get the total number of user weibos' pages
        #request = Request(url=user_weibo_page_url,callback=self.user_weibo_parse,meta={'login_user':login_user})
        #yield request


        # send requests contained in the weibo pages urls
        for page_url in weibo_page_urls:
            yield Request(url=page_url,callback=self.weibo_parse,meta={'login_user':login_user})

        # insert id crawled into redis
        self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1)
        self.redis_server.lpush(self.ids_crawled_name  ,login_user['toCrawl_user_id'])

        # TODO
        next_uid  =  self.forward_crawling_redis()

        if next_uid:
            trypage_url = QueryFactory.mainpage_query(next_uid)
            mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid})
            yield mainpage_request
        else:
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)
    def login_parse(self,response):
        """Handle the login response and request the first id to crawl.

        The JSON argument of the ``feedBackUrlCallBack(...)`` call found in
        ``response.body`` is decoded.  When it carries a non-empty
        ``userinfo`` entry the login is considered successful:
        ``self.logined`` is set and one id is popped from
        ``self.id_toCrawl_list`` and requested with ``self.mainpage_parse``
        as callback.  Otherwise the failure is reported through ``self.log``.
        A body that does not mention ``feedBackUrlCallBack`` is ignored.

        :param response: login response; only ``response.body`` is read.
        :returns: generator yielding at most one ``Request``.
        :raises AssertionError: when the logged-in display name is not
            contained in ``self.username``.
        """
        if response.body.find('feedBackUrlCallBack') == -1:
            return

        # The name can occur without a parsable "(...)" argument (e.g. the
        # argument spans several lines), so the match must be checked.
        payload = re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I)
        if payload is None:
            self.log('login failed: feedBackUrlCallBack payload not found')
            return
        try:
            data = json.loads(payload.group(1))
        except ValueError:
            self.log('login failed: feedBackUrlCallBack payload is not valid JSON')
            return

        # Truthiness also covers a JSON null, on which len() would raise.
        userinfo = data.get('userinfo', '')
        if not userinfo:
            self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
            return

        user_id = userinfo.get('uniqueid')
        screen_name = userinfo.get('displayname')
        log.msg('user login displayname: %s, user login id: %s' % (screen_name,user_id), level=log.INFO)
        # Explicit raise instead of ``assert`` so the check survives ``-O``;
        # the exception type is unchanged.
        if screen_name not in self.username:
            raise AssertionError('logged-in display name %r does not match the configured username' % (screen_name,))
        self.logined = True

        # get 1 id from the list toCrawl; an empty list means nothing to do
        if not self.id_toCrawl_list:
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)
            return
        id_toCrawl = self.id_toCrawl_list.pop()

        trypage_url = QueryFactory.mainpage_query(id_toCrawl)
        yield Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':id_toCrawl})
Ejemplo n.º 4
0
    def login_parse(self,response):
        """Handle the login response and request the first queued id.

        The JSON argument of the ``feedBackUrlCallBack(...)`` call found in
        ``response.body`` is decoded.  When it carries a non-empty
        ``userinfo`` entry the login is considered successful:
        ``self.logined`` is set, one id is popped from the right end of the
        redis list ``self.ids_toCrawl_name``, pushed onto
        ``self.ids_processing_name`` and requested with
        ``self.mainpage_parse`` as callback.  Otherwise the failure is
        reported through ``self.log``.  A body that does not mention
        ``feedBackUrlCallBack`` is ignored.

        :param response: login response; only ``response.body`` is read.
        :returns: generator yielding at most one ``Request``.
        """
        if response.body.find('feedBackUrlCallBack') == -1:
            return

        # The name can occur without a parsable "(...)" argument (e.g. the
        # argument spans several lines), so the match must be checked.
        payload = re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I)
        if payload is None:
            self.log('login failed: feedBackUrlCallBack payload not found')
            return
        try:
            data = json.loads(payload.group(1))
        except ValueError:
            self.log('login failed: feedBackUrlCallBack payload is not valid JSON')
            return

        # Truthiness also covers a JSON null, on which len() would raise.
        userinfo = data.get('userinfo', '')
        if not userinfo:
            self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
            return

        user_id = userinfo.get('uniqueid')
        screen_name = userinfo.get('displayname')
        log.msg('user login displayname: %s, user login id: %s' % (screen_name,user_id), level=log.INFO)
        #assert screen_name in self.username
        self.logined = True

        # load first user_id toCrawl from the Queue
        id_toCrawl = self.redis_server.rpop(self.ids_toCrawl_name)
        if not id_toCrawl:
            # Make the empty-queue case visible instead of ending silently.
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)
            return

        # push user_id toCrawl into the Processing Queue
        self.redis_server.lpush(self.ids_processing_name,id_toCrawl)

        trypage_url = QueryFactory.mainpage_query(id_toCrawl)
        yield Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':id_toCrawl})