Example #1
0
    def mainpage_parse(self,response):
        #inspect_response(response,self)
        if response == None:
            yield self.start_requests()
            exit(1)

        sel = Selector(response)
        login_user = {}
        login_user['toCrawl_user_id']   =  response.meta['user_id']
        login_user['toCrawl_user_nick'] =  self.get_property(response,"onick")
        login_user['login_user_id']     =  self.get_property(response,"uid")
        login_user['page_id']           =  self.get_property(response,"page_id")
        login_user['domain']            =  self.get_property(response,"domain")

        # if uid's mainpage does note exist, pass to next user in queue
        if login_user['login_user_id'] is None or login_user['page_id'] is None or login_user['domain'] is None:
            log.msg(' user toCrawl id:%s does not exist! Please try next one' % login_user['toCrawl_user_id'],level=log.DEBUG)
            next_uid  =  self.forward_crawling_redis()
            if next_uid:
                self.redis_server.lpush(self.ids_problem_name,next_uid)
                self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1)
                trypage_url        =   QueryFactory.mainpage_query(next_uid)
                mainpage_request   =   Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid})
                yield mainpage_request
            else:
                log.msg(' Queue is empty, task to terminate.',level=log.INFO)
        else:
            print '\n',login_user,'\n'

            login_user_profile_url = QueryFactory.info_query(page_id=login_user['page_id'], domain=login_user['domain'])
            log.msg('  user toCrawl id: %s, user login id: %s' \
                    % (login_user['toCrawl_user_id'],login_user['login_user_id']), level=log.INFO)

            request = Request(url=login_user_profile_url,callback=self.user_info_parse,meta={'login_user':login_user})
            yield request
Example #2
0
    def weibo_pages_num(self,response):
        #inspect_response(response,self)
        if response == None:
            yield self.start_requests()
            exit(1)

        # pares the current js response
        self.weibo_parse(response)

        login_user       =  response.meta['login_user']
        # load response in json form
        html_block_soup  =  self.json_load_response(response)

        # get the tag containing the max num of page
        page_list_tag    =  html_block_soup.find('div',{'action-type':'feed_list_page_morelist'})
        MAX_PAGE_NUM     =  50
        if page_list_tag:
            total_num_pages  =  int(re.search(r'\d+',page_list_tag.a.string).group(0))
            if total_num_pages > MAX_PAGE_NUM:
                total_num_pages   =   MAX_PAGE_NUM
        else:
            total_num_pages  =  1

        # warp weibo page urls to crawl
        weibo_page_urls       =  self.wrap_weibo_pages_urls(domain=login_user['domain'], page_id=login_user['page_id'], num_page=total_num_pages )

        print '\n\n Number of user weibos pages: ',total_num_pages,'\n\n'

        # test part weibo parser
        user_weibo_page_url   =  QueryFactory.weibo_js_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=2 )[0]

        # first request to get the total number of user weibos' pages
        #request = Request(url=user_weibo_page_url,callback=self.user_weibo_parse,meta={'login_user':login_user})
        #yield request


        # send requests contained in the weibo pages urls
        for page_url in weibo_page_urls:
            yield Request(url=page_url,callback=self.weibo_parse,meta={'login_user':login_user})

        # insert id crawled into redis
        self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1)
        self.redis_server.lpush(self.ids_crawled_name  ,login_user['toCrawl_user_id'])

        # TODO
        next_uid  =  self.forward_crawling_redis()

        if next_uid:
            trypage_url = QueryFactory.mainpage_query(next_uid)
            mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid})
            yield mainpage_request
        else:
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)
Example #3
0
    def wrap_weibo_pages_urls(self, domain,page_id, num_page):
        """Gather the weibo js query urls of pages 1..num_page into one set.

        :param domain: forwarded to ``QueryFactory.weibo_js_query``.
        :param page_id: forwarded to ``QueryFactory.weibo_js_query``.
        :param num_page: number of pages to cover; page numbers start at 1.
        :returns: set with every url returned for those pages (empty when
                  ``num_page`` is not positive).
        """
        collected_urls = set()

        for page_number in range(1, num_page + 1):
            urls_of_page = QueryFactory.weibo_js_query(domain=domain, page_id=page_id, page_num=page_number)
            collected_urls.update(urls_of_page)

        return collected_urls
    def user_weibo_pages_num(self,response):
        """Read the number of weibo pages of a user and request a weibo page.

        With a falsy ``response`` the requests of ``self.start_requests()`` are
        yielded.  Otherwise the page count is read from the first number found
        in the link of the ``W_pages_layer S-FIXED`` block (one page when the
        block is absent), the page urls are built, and a request for page 2 is
        yielded with ``self.user_weibo_parse`` as callback.

        :param response: response whose ``meta['login_user']`` dict provides
                         ``domain`` and ``page_id``.
        :returns: generator of ``Request`` objects.
        """
        #inspect_response(response,self)
        if not response:
            # Calling start_requests() without yielding its requests schedules
            # nothing, so hand them over one by one.
            for start_request in self.start_requests():
                yield start_request
            return

        # parse the current js response
        # NOTE(review): the return value is discarded; if user_weibo_parse is a
        # generator this call produces nothing - confirm.
        self.user_weibo_parse(response)

        login_user       =  response.meta['login_user']
        # load response in json form
        html_block_soup  =  self.json_load_response(response)

        # get the tag containing the max num of page
        page_list_tag    =  html_block_soup.find('div',{'class':'W_pages_layer S-FIXED'})
        if page_list_tag:
            total_num_pages  =  int(re.search(r'\d+',page_list_tag.a.string).group(0))
        else:
            total_num_pages  =  1

        # wrap weibo page urls to crawl; wrap_weibo_pages_urls requires the
        # domain argument, leaving it out raised a TypeError on every call.
        # NOTE(review): assumes login_user carries a 'domain' entry - confirm
        # against the callback that builds login_user.
        # NOTE(review): the urls are built but not scheduled yet; only the
        # test request below is sent.
        weibo_page_urls       =  self.wrap_weibo_pages_urls(domain=login_user['domain'], page_id=login_user['page_id'], num_page=total_num_pages )

        # test part weibo parser
        user_weibo_page_url   =  QueryFactory.weibo_page_num_query(page_id = login_user['page_id'], page_num=2 )

        # first request to get the total number of user weibos' pages
        yield Request(url=user_weibo_page_url,callback=self.user_weibo_parse,meta={'login_user':login_user})
Example #5
0
    def user_info_parse(self,response):
        user_profile_translation = {
                        u"性别"      :    'sex',
                        u"简介"      :    'description',
                        u"注册时间"  :    'signed_time',
                        u"所在地"    :    'location',
                        u"生日"      :    'birthday',

                        u"公司"      :    'company',

                        u"大学"      :    'university',

                        u"标签"      :    'personal_tags'
                       }

        if response == None:
            yield self.start_requests()
            exit(1)

        login_user = response.meta['login_user']
        user = UserProfileItem()
        # fulfill the user Item
        user['user_id']          =  login_user['toCrawl_user_id']
        user['screen_name']      =  login_user['toCrawl_user_nick']

        user_tags_dict           =  self.get_userinfo_by_html(response)

        for property_name in user_profile_translation:
            user[user_profile_translation[property_name]] = user_tags_dict.get(property_name,'')

        if user.get('signed_time'):
            user['signed_time']  =  datetime.strptime(user['signed_time'],'%Y-%m-%d')

        print '\n\n User Profile:\n'
        for user_item in dict(user).items():
            print '\t',user_item[0],' : ',user_item[1]

        print "\n\n"
        yield user

        # url to get total number of weibos' pages
        user_weibo_page_url     =  QueryFactory.weibo_page_num_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=1 )

        # first request to get the total number of user weibos' pages
        request = Request(url=user_weibo_page_url,callback=self.weibo_pages_num,meta={'login_user':login_user})
        yield request
    def mainpage_parse(self,response):
        if response:
            sel = Selector(response)
            login_user = {}
            login_user['toCrawl_user_id']   =  response.meta['user_id']
            login_user['login_user_id']     =  self.get_property(response,"uid")
            login_user['page_id']           =  self.get_property(response,"page_id")
            login_user['pid']               =  self.get_property(response,"pid")
            print '\n',login_user,'\n'

            login_user_profile_url = QueryFactory.info_query(page_id=login_user['page_id'], pid=login_user['pid'])
            log.msg('  user toCrawl id: %s, user login id: %s' \
                    % (login_user['toCrawl_user_id'],login_user['login_user_id']), level=log.INFO)

            request = Request(url=login_user_profile_url,callback=self.user_info_parse,meta={'login_user':login_user})
            yield request

        else:
            yield self.start_requests()
    def login_parse(self,response):
        """Check the login response and start crawling the first queued id.

        Nothing happens when the body does not contain ``feedBackUrlCallBack``.
        Otherwise the JSON argument of that call is decoded; without a
        non-empty ``userinfo`` entry the failure is logged through
        ``self.log``.  On success ``self.logined`` is set and the main page of
        an id popped from ``self.id_toCrawl_list`` is requested with
        ``self.mainpage_parse`` as callback.

        :param response: login response; only ``response.body`` is read.
        :returns: generator yielding at most one ``Request``.
        :raises AssertionError: when the display name is not in ``self.username``.
        """
        if response.body.find('feedBackUrlCallBack') == -1:
            return

        callback_match = re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I)
        if callback_match is None:
            # The marker is there but not in call form: report it instead of
            # failing with an AttributeError on the missing match.
            self.log('login failed: feedBackUrlCallBack payload not found in response')
            return

        data = json.loads(callback_match.group(1))
        # Truthiness also covers a JSON null, on which len() would raise.
        userinfo = data.get('userinfo')
        if not userinfo:
            self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
            return

        user_id = userinfo.get('uniqueid')
        screen_name = userinfo.get('displayname')
        log.msg('user login displayname: %s, user login id: %s' % (screen_name,user_id), level=log.INFO)
        assert screen_name in self.username
        self.logined = True

        # pop() on an empty list raises IndexError; stop cleanly instead.
        if not self.id_toCrawl_list:
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)
            return

        # get 1 id from the list toCrawl
        id_toCrawl = self.id_toCrawl_list.pop()

        trypage_url = QueryFactory.mainpage_query(id_toCrawl)
        yield Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':id_toCrawl})
Example #8
0
    def login_parse(self,response):
        """Check the login response and start crawling the first queued id.

        Nothing happens when the body does not contain ``feedBackUrlCallBack``.
        Otherwise the JSON argument of that call is decoded; without a
        non-empty ``userinfo`` entry the failure is logged through
        ``self.log``.  On success ``self.logined`` is set, an id is popped from
        the right end of the to-crawl list in redis, pushed onto the
        processing list, and its main page is requested with
        ``self.mainpage_parse`` as callback.

        :param response: login response; only ``response.body`` is read.
        :returns: generator yielding at most one ``Request``.
        """
        if response.body.find('feedBackUrlCallBack') == -1:
            return

        callback_match = re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I)
        if callback_match is None:
            # The marker is there but not in call form: report it instead of
            # failing with an AttributeError on the missing match.
            self.log('login failed: feedBackUrlCallBack payload not found in response')
            return

        data = json.loads(callback_match.group(1))
        # Truthiness also covers a JSON null, on which len() would raise.
        userinfo = data.get('userinfo')
        if not userinfo:
            self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
            return

        user_id = userinfo.get('uniqueid')
        screen_name = userinfo.get('displayname')
        log.msg('user login displayname: %s, user login id: %s' % (screen_name,user_id), level=log.INFO)
        self.logined = True

        # load first user_id toCrawl from the Queue
        id_toCrawl  = self.redis_server.rpop(self.ids_toCrawl_name)
        if not id_toCrawl:
            # Same message the other callbacks log when no id is left, so an
            # empty queue no longer ends the crawl silently.
            log.msg(' Queue is empty, task to terminate.',level=log.INFO)
            return

        # push  user_id toCrawl into the Processing Queue
        self.redis_server.lpush(self.ids_processing_name,id_toCrawl)

        trypage_url = QueryFactory.mainpage_query(id_toCrawl)
        yield Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':id_toCrawl})