Example #1
    def parse_total_page(self, response):
        analyzer = Analyzer()
        total_pq = analyzer.get_html(response.body,
                                     'script:contains("W_pages")')
        friendcircle_analyzer = keyword_info_analyzer()
        total_pages = friendcircle_analyzer.get_totalpages(
            total_pq)  # number of friend-circle weibo pages to crawl
        logger.info("the total_pages is: %d", total_pages)

        getweibopage = GetWeibopage()
        mainpage_url = response.meta['mainpage_url']
        user_id = response.meta['uid']
        is_search = response.meta['is_search']

        for page in range(1):  # TODO: change this to total_pages
            GetWeibopage.data['uid'] = user_id
            GetWeibopage.data['page'] = page + 1
            firstload_url = mainpage_url + getweibopage.get_firstloadurl()
            yield Request(url=firstload_url,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': user_id,
                              'is_search': is_search
                          },
                          callback=self.parse_load)

            secondload_url = mainpage_url + getweibopage.get_secondloadurl()
            #yield  Request(url=secondload_url,meta={'cookiejar':response.meta['cookiejar'],'uid':user_id,'is_search':is_search},callback=self.parse_load)

            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
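
The Analyzer and keyword_info_analyzer classes above are project-specific and not shown here. As a rough illustration only, a minimal sketch of what get_totalpages might do, assuming the 'W_pages' script block embeds pager links of the form page=N (an assumption, not the project's actual implementation):

import re

def get_totalpages_sketch(script_text):
    """Hypothetical stand-in for keyword_info_analyzer.get_totalpages:
    return the highest page number mentioned in a W_pages pager block,
    defaulting to 1 when no pager markup is present."""
    pages = [int(m) for m in re.findall(r'page=(\d+)', script_text)]
    return max(pages) if pages else 1
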
    def start_getweiboinfo(self, response):
        db = OracleStore()
        conn = db.get_connection()
        sql1 = '''select * from t_user_info'''
        cursor1 = db.select_operation(conn, sql1)

        sql2 = '''select count(*) from t_user_info'''
        cursor2 = db.select_operation(conn, sql2)
        count = cursor2.fetchone()

        if count[0]:
            for i in range(count[0]):
                for result in cursor1.fetchmany(1):
                    mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[0]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]}, callback=self.parse_load)

                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]}, callback=self.parse_load)

                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]}, callback=self.parse_load)
        else:
            yield None
        db.close_connection(conn, cursor1, cursor2)
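
These examples interpolate values into SQL strings with %; with untrusted input, DB-API parameter binding is safer. A hedged sketch of the same kind of lookup, using the stdlib sqlite3 module as a stand-in since OracleStore/MysqlStore are project-specific (placeholder syntax varies by driver: sqlite3 uses ?, cx_Oracle uses :1, MySQLdb uses %s):

import sqlite3

def is_uid_scraped(db_path, uid):
    """Sketch: parameterised lookup instead of '%s' string interpolation;
    t_user_follow mirrors the table name used in the examples above."""
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            "select count(*) from t_user_follow where userID = ?", (uid,))
        return cursor.fetchone()[0] > 0
    finally:
        conn.close()
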
    def start_getweiboinfo(self, response):
        # earlier keyword-driven DB variant, left commented out:
        #db = OracleStore()
        #conn = db.get_connection()
        #sql1 = '''select * from "t_user_keyword" where "keyword" = '%s' ''' % str(self.keyword)
        #cursor1 = db.select_operation(conn,sql1)
        #
        #sql2 = '''select count(*) from "t_user_keyword" where "keyword" = '%s' ''' % str(self.keyword)
        #cursor2 = db.select_operation(conn,sql2)
        #count = cursor2.fetchone()
        #
        #if count[0]:
        #    for i in range(1):   #(count[0]):
        #        for result in cursor1.fetchmany(1):
        #            if result[0]:
        mainpageurl = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
        GetWeibopage.data['uid'] = self.uid  #result[0]
        getweibopage = GetWeibopage()
        for page in range(WeiboSpider.page_num):
            GetWeibopage.data['page'] = page + 1
            firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
            yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_load)

            secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
            yield Request(url=secondloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_load)

            thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
            yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_load)
    def get_userinfo(self, response):
        db = OracleStore()
        conn = db.get_connection()
        sql1 = "select * from t_user_info where imagestate = 0"
        cursor1 = db.select_operation(conn, sql1)

        sql2 = "select count(*) from t_user_info where imagestate = 0"
        cursor2 = db.select_operation(conn, sql2)
        count = cursor2.fetchone()

        for i in range(count[0]):
            for result in cursor1.fetchmany(1):
                if result[0]:
                    mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[0]  #result[1]
                    getweibopage = GetWeibopage()
                    GetWeibopage.data['page'] = 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': result[0]
                                  },
                                  callback=self.get_userurl)
    def start_getweiboinfo(self, response):
        db = MysqlStore()
        conn = db.get_connection()
        sql1 = "select * from t_user_follow where contentstate = 0"
        cursor1 = db.select_operation(conn, sql1)

        sql2 = "select count(*) from t_user_follow where contentstate = 0"
        cursor2 = db.select_operation(conn, sql2)
        count = cursor2.fetchone()
        for i in range(10):  # (count[0]):  count[0] is the number of users still to be crawled
            for result in cursor1.fetchmany(1):
                if result[1]:
                    mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[1]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_firstload)

                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_secondload)

                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[1]}, callback=self.parse_thirdload)
    def start_getweiboinfo(self, response):
        mainpageurl = 'http://weibo.com/u/' + str(
            self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
        GetWeibopage.data['uid'] = self.uid
        getweibopage = GetWeibopage()
        for page in range(int(self.per_page_num), int(self.per_page_num) + 2):
            GetWeibopage.data['page'] = page
            firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
            yield Request(url=firstloadurl,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)

            secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
            yield Request(url=secondloadurl,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)

            thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
            yield Request(url=thirdloadurl,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)
Example #7
    def get_userinfo(self, response):
        mainpageurl = 'http://weibo.com/u/' + str(1227086635) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
        GetWeibopage.data['uid'] = 1227086635
        getweibopage = GetWeibopage()
        GetWeibopage.data['page'] = 1
        #firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
        yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar']}, callback=self.get_userurl)
    def get_relation(self, response):
        '''Issue the requests that fetch the user's follow and follower pages.'''
        getweibopage = GetWeibopage()
        for page in range(WeiboSpider.follow_page_num, 0, -1):
            GetWeibopage.relation_data['page'] = page
            follow_url = getinfo.get_follow_mainurl(self.uid) + getweibopage.get_relation_paramurl()
            yield Request(url=follow_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_follow)

        for page in range(WeiboSpider.follower_page_num, 0, -1):
            GetWeibopage.relation_data['page'] = page
            follower_url = getinfo.get_follower_mainurl(self.uid) + getweibopage.get_relation_paramurl()
            yield Request(url=follower_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_follower)
    def get_userinfo(self, response):
        #db = OracleStore(); conn = db.get_connection()
        for uid in self.uid_list:
            #sql = "select count(*) from (select userID from t_user_info where userID='%s' union select userID from t_publicuser_info where userID='%s')" % (uid,uid)
            #cursor = db.select_operation(conn,sql); count = cursor.fetchone()
            #if not count[0]:   # this uid has not been crawled yet
            print "scraping uid:", uid
            mainpageurl = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
            GetWeibopage.data['uid'] = uid
            getweibopage = GetWeibopage()
            GetWeibopage.data['page'] = 1
            firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
            yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': uid}, callback=self.get_userurl)
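
The profile URLs above are assembled by string concatenation; an equivalent, hedged sketch using the standard library's urlencode (the parameter names are taken from the URLs in these examples, build_mainpage_url itself is hypothetical):

try:
    from urllib import urlencode        # Python 2
except ImportError:
    from urllib.parse import urlencode  # Python 3

def build_mainpage_url(uid):
    """Build the profile URL used throughout these examples."""
    params = {'from': 'otherprofile', 'wvr': '3.6', 'loc': 'tagweibo'}
    return 'http://weibo.com/u/%s?%s' % (uid, urlencode(params))
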
Example #10
    def start_getweiboinfo(self, response):
        db = OracleStore()
        conn = db.get_connection()
        sql1 = '''select * from t_user_info'''
        cursor1 = db.select_operation(conn, sql1)

        sql2 = '''select count(*) from t_user_info'''
        cursor2 = db.select_operation(conn, sql2)
        count = cursor2.fetchone()

        if count[0]:
            for i in range(count[0]):
                for result in cursor1.fetchmany(1):
                    mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[0]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': result[0]},
                                      callback=self.parse_load)

                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': result[0]},
                                      callback=self.parse_load)

                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': result[0]},
                                      callback=self.parse_load)
        else:
            yield None
        db.close_connection(conn, cursor1, cursor2)
    def start_getweiboinfo(self, response):
        mainpageurl = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
        GetWeibopage.data['uid'] = self.uid
        getweibopage = GetWeibopage()
        for page in range(WeiboSpider.page_num):
            GetWeibopage.data['page'] = page + 1
            firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
            yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_load)

            secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
            yield Request(url=secondloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_load)

            thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
            yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': self.uid}, callback=self.parse_load)
Example #12
    def parse_userinfo(self, response):
        item = response.meta['item']
        #f = open('./text2.html','w')
        #f.write(response.body)
        analyzer = Analyzer()
        total_pq = analyzer.get_html(response.body, 'script:contains("PCD_text_b")')
        #userinfo_dict = analyzer.get_userinfo(total_pq)
        item['userinfo'] = analyzer.get_userinfo(total_pq)
        #uid = item['uid']
        mainpageurl = 'http://weibo.com/u/' + str(response.meta['uid']) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
        GetWeibopage.data['uid'] = response.meta['uid']
        getweibopage = GetWeibopage()
        GetWeibopage.data['page'] = WeiboSpider.page_num - 1
        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
        yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'item': item, 'uid': response.meta['uid'], 'followlist': response.meta['followlist']}, callback=self.parse_thirdload)
Example #13
    def start_getweiboinfo(self, response):
        db = MysqlStore()
        conn = db.get_connection()
        sql1 = "select * from t_user_follow where contentstate = 0"
        cursor1 = db.select_operation(conn, sql1)

        sql2 = "select count(*) from t_user_follow where contentstate = 0"
        cursor2 = db.select_operation(conn, sql2)
        count = cursor2.fetchone()
        for i in range(count[0]):  # count[0] is the number of users still to be crawled
            for result in cursor1.fetchmany(1):
                if result[1]:
                    mainpageurl = 'http://weibo.com/u/' + str(result[1]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[1]
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        yield Request(url=firstloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': result[1]},
                                      callback=self.parse_firstload)

                        secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
                        yield Request(url=secondloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': result[1]},
                                      callback=self.parse_secondload)

                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        yield Request(url=thirdloadurl,
                                      meta={'cookiejar': response.meta['cookiejar'],
                                            'uid': result[1]},
                                      callback=self.parse_thirdload)
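
A side note on the count-then-fetchmany(1) pattern used above: DB-API cursors are iterable in most drivers (sqlite3, MySQLdb, cx_Oracle), so the double loop can usually be collapsed. A sketch under that assumption:

def iter_pending_uids(cursor):
    """Yield the uid column (index 1, as in the examples above) row by row,
    without needing a separate count(*) query."""
    for row in cursor:  # equivalent to calling fetchmany(1) until exhausted
        if row[1]:
            yield row[1]
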
Example #14
    def parse_follow(self, response):
        #print 'source request url:', response.request.url
        item = WeibospiderItem()
        analyzer = Analyzer()
        total_pq = analyzer.get_followhtml(response.body)
        #item['followuidlist'] = analyzer.get_follow(total_pq)
        followlist = analyzer.get_follow(total_pq)
        #item['userinfo'] = {}
        oldflag, stopflag = getinfo.get_followflag(WeiboSpider.filename)

        p = re.compile('.*_page=(\d).*', re.S)
        current_page = p.search(response.request.url).group(1)  # page number of the current follow-list page

        if int(current_page) == 1:
            getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'False')
            print 'page is equal to 1'
        else:
            print 'page is NOT equal to 1'

        for follow_uid in followlist[:2]:
            print 'processing follow_uid:', follow_uid
            #item['uid'] = follow_uid
            if follow_uid != oldflag:  # skip uids that were already crawled, i.e. incremental crawling
                # crawl the weibo content on this uid's main page
                if stopflag == 'False':
                    getinfo.set_followflag(WeiboSpider.filename, followlist[0], 'True')
                    mainpageurl = 'http://weibo.com/u/' + str(follow_uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = follow_uid
                    getweibopage = GetWeibopage()
                    for page in range(WeiboSpider.page_num):
                        GetWeibopage.data['page'] = page + 1
                        # first load of the page
                        # second load of the page
                        # third load of the page
                        thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
                        if int(GetWeibopage.data['pagebar']) == 1 and page == WeiboSpider.page_num - 1:  # on the last load of the last page, fetch the user's basic profile info
                            print 'fetching user info, followlist:', followlist
                            yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'item': item, 'uid': follow_uid, 'followlist': followlist}, callback=self.get_userurl)
                            #continue
                        #yield Request(url=thirdloadurl, meta={'cookiejar': response.meta['cookiejar'], 'item': item, 'uid': follow_uid}, callback=self.parse_thirdload)

                        #firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                        #yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'item': item, 'uid': follow_uid}, callback=self.parse_firstload)
                else:
                    break
            else:
                break
    def parse_follower(self, response):
        item = WeibospiderItem()
        analyzer = Analyzer()
        getweibopage = GetWeibopage()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
        item['follow_uid_list'] = []
        yield item

        # fetch the follows and followers of second-level (follower) users
        if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
            db = OracleStore()
            conn = db.get_connection()

            for follower_uid in item['follower_uid_list']:
                # fetch the follows of this follower user
                sql1 = """select count(*) from t_user_follow where userID=%s""" % str(
                    follower_uid)
                cursor1 = db.select_operation(conn, sql1)
                count1 = cursor1.fetchone()
                follower_scraped = count1[0]
                cursor1.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    follow_url = 'http://weibo.com/%s/follow?page=1' % str(
                        follower_uid)
                    yield Request(url=follow_url,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follower_uid
                                  },
                                  dont_filter=True,
                                  callback=self.parse_based_follownum)
                else:
                    print 'follow_uid existed!', follower_uid
                    yield None

                # fetch the followers of this follower user
                sql2 = """select count(*) from t_user_follower where userID=%s""" % str(
                    follower_uid)
                cursor2 = db.select_operation(conn, sql2)
                count2 = cursor2.fetchone()
                follower_scraped = count2[0]
                cursor2.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    follower_url = 'http://weibo.com/%s/fans?page=1' % str(
                        follower_uid)
                    yield Request(url=follower_url,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follower_uid
                                  },
                                  dont_filter=True,
                                  callback=self.parse_based_followernum)
                else:
                    print 'follower_uid existed!', follower_uid
                    yield None

            conn.close()
    def parse_follower(self,response):
        item = WeibospiderItem()
        analyzer = Analyzer()
        getweibopage = GetWeibopage()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
        item['follow_uid_list'] = []    
        yield item

        if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
            db = OracleStore()
            conn = db.get_connection()

            for follower_uid in item['follower_uid_list']:
                # fetch the follows of this follower user
                sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid)
                cursor1 = db.select_operation(conn,sql1)
                count1 = cursor1.fetchone()
                follower_scraped = count1[0]
                cursor1.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    for page in range(WeiboSpider.follow_page_num,0,-1):
                        GetWeibopage.relation_data['page'] = page
                        follow_url = getinfo.get_follow_mainurl(follower_uid) + getweibopage.get_relation_paramurl()
                        yield Request(url=follow_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follow)
                else:
                    print 'follow_uid existed!',follower_uid
                    yield None

                # fetch the followers of this follower user
                sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid)
                cursor2 = db.select_operation(conn,sql2)
                count2 = cursor2.fetchone()
                follower_scraped = count2[0]
                cursor2.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    for page in range(WeiboSpider.follower_page_num,0,-1):
                        GetWeibopage.relation_data['page'] = page
                        follower_url = getinfo.get_follower_mainurl(follower_uid) + getweibopage.get_relation_paramurl()
                        yield Request(url=follower_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follower)
                else:
                    print 'follower_uid existed!',follower_uid
                    yield None

            conn.close()
Example #17
    def start_getweibo_info(self, response):
        db = MysqlStore()
        # select key users that have not been crawled yet and have is_delete = 0
        GetWeibopage.data['page'] = 1
        getweibopage = GetWeibopage()

        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            sql1 = "select user_id from cauc_warning_man_test a \
                    where a.is_search = 0 and a.is_delete = 0"
            cursor = db.select_operation(conn, sql1)
            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the unsearched user_id:%s", user_id)

                # set the is_search flag to 1
                sql2 = "update cauc_warning_man_test set is_search = 1 where user_id = '%s'" % user_id
                db.update_operation(conn, sql2)

                # get the total number of pages to crawl
                start_time = self.start_time
                end_time = get_current_time('hour')
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)

            logger.info("current timestamp:%d", int(time.time()))
            # pause between crawl rounds
            time.sleep(WeiboSpider.settings['WEIBOCONTENT_INTERVAL'])  # a 15-minute interval works here

            # select warning users that have already been crawled (is_search = 1) and have is_delete = 0
            sql3 = "select user_id from cauc_warning_man_test a \
                    where a.is_search = 1 and a.is_delete = 0"
            cursor = db.select_operation(conn, sql3)

            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the searched user_id:%s", user_id)

                start_time = get_time_by_interval(int(time.time()), 86400, 'hour')
                end_time = get_current_time('hour')  # the window between start and end is one day (86400 s)
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                #yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 1}, callback=self.parse_total_page)
            conn.close()
    def start_getweibo_info(self, response):
        db = MysqlStore()
        # select key users that have not been crawled yet and have is_delete = 0
        GetWeibopage.data['page'] = 1
        getweibopage = GetWeibopage()

        for round in range(1):  # number of passes over the database
            conn = db.get_connection()

            sql1 = "select user_id from cauc_black_man_test a \
                    where a.is_search = 0 and a.is_delete = 0"
            cursor = db.select_operation(conn, sql1)
            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the unsearched user_id:%s", user_id)

                # set the is_search flag to 1
                sql2 = "update cauc_black_man_test set is_search = 1 where user_id = '%s'" % user_id
                db.update_operation(conn, sql2)

                # get the total number of pages to crawl
                start_time = self.start_time
                end_time = get_current_time('hour')
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)

            logger.info("current timestamp:%d", int(time.time()))
            # pause between crawl rounds
            time.sleep(WeiboSpider.settings['FRIENDCIRCAL_INTERVAL'])  # a 15-minute interval works here

            # select key users that have already been crawled (is_search = 1) and have is_delete = 0
            sql3 = "select user_id from cauc_black_man_test a \
                    where a.is_search = 1 and a.is_delete = 0"
            cursor = db.select_operation(conn, sql3)

            for user_id in cursor.fetchall():
                user_id = user_id[0]
                logger.info("this is the searched user_id:%s", user_id)

                start_time = get_time_by_interval(int(time.time()), 86400, 'hour')
                end_time = get_current_time('hour')  # the window between start and end is one day (86400 s)
                mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
                GetWeibopage.data['uid'] = user_id
                thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
                #yield Request(url=thirdload_url, meta={'cookiejar': response.meta['cookiejar'], 'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 1}, callback=self.parse_total_page)
            conn.close()
    def get_userinfo(self,response):
        db = OracleStore()
        conn = db.get_connection()
        sql1 = "select * from t_user_info where imagestate = 0"
        cursor1 = db.select_operation(conn,sql1)

        sql2 = "select count(*) from t_user_info where imagestate = 0"
        cursor2 = db.select_operation(conn,sql2)
        count = cursor2.fetchone()

        for i in range(count[0]):
            for result in cursor1.fetchmany(1):
                if result[0]:
                    mainpageurl = 'http://weibo.com/u/' + str(result[0]) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                    GetWeibopage.data['uid'] = result[0]  #result[1]
                    getweibopage = GetWeibopage()
                    GetWeibopage.data['page'] = 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl, meta={'cookiejar': response.meta['cookiejar'], 'uid': result[0]}, callback=self.get_userurl)
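
get_current_time and get_time_by_interval are project helpers that are not shown here. A minimal sketch of plausible implementations, assuming they produce the timestamp strings Weibo's advanced search accepts; the formats below are assumptions, not the project's actual code:

import time

def get_current_time(precision):
    """Hypothetical stand-in: 'now' at day or hour precision."""
    fmt = '%Y-%m-%d' if precision == 'day' else '%Y-%m-%d-%H'
    return time.strftime(fmt, time.localtime())

def get_time_by_interval(timestamp, interval_seconds, precision):
    """Hypothetical stand-in: the moment interval_seconds before timestamp."""
    fmt = '%Y-%m-%d' if precision == 'day' else '%Y-%m-%d-%H'
    return time.strftime(fmt, time.localtime(timestamp - interval_seconds))
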
Example #20
    def get_userinfo(self, response):
        #db = OracleStore(); conn = db.get_connection()
        for uid in self.uid_list:
            #sql = "select count(*) from (select userID from t_user_info where userID='%s' union select userID from t_publicuser_info where userID='%s')" % (uid,uid)
            #cursor = db.select_operation(conn,sql); count = cursor.fetchone()
            #if not count[0]:   # this uid has not been crawled yet
            print "scraping uid:", uid
            mainpageurl = 'http://weibo.com/u/' + str(uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
            GetWeibopage.data['uid'] = uid
            getweibopage = GetWeibopage()
            GetWeibopage.data['page'] = 1
            firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
            yield Request(url=firstloadurl,
                          meta={'cookiejar': response.meta['cookiejar'],
                                'uid': uid},
                          callback=self.get_userurl)
    def start_getweiboinfo(self, response):
        # earlier keyword-driven DB variant, left commented out:
        #db = OracleStore()
        #conn = db.get_connection()
        #sql1 = '''select * from "t_user_keyword" where "keyword" = '%s' ''' % str(self.keyword)
        #cursor1 = db.select_operation(conn,sql1)
        #
        #sql2 = '''select count(*) from "t_user_keyword" where "keyword" = '%s' ''' % str(self.keyword)
        #cursor2 = db.select_operation(conn,sql2)
        #count = cursor2.fetchone()
        #
        #if count[0]:
        #    for i in range(1):   #(count[0]):
        #        for result in cursor1.fetchmany(1):
        #            if result[0]:
        mainpageurl = 'http://weibo.com/u/' + str(self.uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo&is_all=1&'
        GetWeibopage.data['uid'] = self.uid  #result[0]
        getweibopage = GetWeibopage()
        for page in range(WeiboSpider.page_num):
            GetWeibopage.data['page'] = page + 1
            firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
            yield Request(url=firstloadurl,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)

            secondloadurl = mainpageurl + getweibopage.get_secondloadurl()
            yield Request(url=secondloadurl,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)

            thirdloadurl = mainpageurl + getweibopage.get_thirdloadurl()
            yield Request(url=thirdloadurl,
                          meta={
                              'cookiejar': response.meta['cookiejar'],
                              'uid': self.uid
                          },
                          callback=self.parse_load)
Example #22
    def start_getweibo_info(self, response):
        db = MysqlStore()
        # select key users that have not been crawled yet and have is_delete = 0
        GetWeibopage.data['page'] = 1
        getweibopage = GetWeibopage()

        #for round in range(1):  # number of passes over the database
        conn = db.get_connection()
        sql1 = "select user_id from cauc_warning_man a \
                where a.is_search = 0 and a.is_delete = 0"
        cursor1 = db.select_operation(conn, sql1)
        for user_id in cursor1.fetchall():
            user_id = user_id[0]
            logger.info("this is the unsearched user_id:%s", user_id)

            # get the total number of pages to crawl
            start_time = self.start_time
            end_time = get_current_time('day')
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url, cookies=random.choice(COOKIES), meta={'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 0}, callback=self.parse_total_page)

        # select warning users that have already been crawled (is_search = 1) and have is_delete = 0
        sql2 = "select user_id from cauc_warning_man a \
                where a.is_search = 1 and a.is_delete = 0"
        cursor2 = db.select_operation(conn, sql2)

        for user_id in cursor2.fetchall():
            user_id = user_id[0]
            logger.info("this is the searched user_id:%s", user_id)

            #start_time = get_time_by_interval(int(time.time()), 86400, 'day'); end_time = get_current_time('day')  # window of one day (86400 s), i.e. the past day's content
            start_time = get_time_by_interval(int(time.time()), int(self.interval), 'day')
            end_time = get_current_time('day')  # window of x days, converted from the seconds held in self.interval
            mainpage_url = "http://weibo.com/" + str(user_id) + "?is_ori=1&is_forward=1&is_text=1&is_pic=1&key_word=&start_time=" + start_time + "&end_time=" + end_time + "&is_search=1&is_searchadv=1&"
            GetWeibopage.data['uid'] = user_id
            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
            yield Request(url=thirdload_url, cookies=random.choice(COOKIES), meta={'mainpage_url': mainpage_url, 'uid': user_id, 'is_search': 1}, callback=self.parse_total_page)

        # set the is_search flag to 1
        sql3 = "update cauc_warning_man set is_search = 1 where is_search = 0 and is_delete = 0"
        db.update_operation(conn, sql3)
        db.close_connection(conn)
    def parse_total_page(self, response):
        analyzer = Analyzer()
        total_pq = analyzer.get_html(response.body, 'script:contains("W_pages")')
        friendcircle_analyzer = keyword_info_analyzer()
        total_pages = friendcircle_analyzer.get_totalpages(total_pq)  # number of friend-circle weibo pages to crawl
        logger.info("the total_pages is: %d", total_pages)

        getweibopage = GetWeibopage()
        mainpage_url = response.meta['mainpage_url']
        user_id = response.meta['uid']
        is_search = response.meta['is_search']

        for page in range(1):  # TODO: change this to total_pages
            GetWeibopage.data['uid'] = user_id
            GetWeibopage.data['page'] = page + 1
            firstload_url = mainpage_url + getweibopage.get_firstloadurl()
            yield Request(url=firstload_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id, 'is_search': is_search}, callback=self.parse_load)

            secondload_url = mainpage_url + getweibopage.get_secondloadurl()
            #yield Request(url=secondload_url, meta={'cookiejar': response.meta['cookiejar'], 'uid': user_id, 'is_search': is_search}, callback=self.parse_load)

            thirdload_url = mainpage_url + getweibopage.get_thirdloadurl()
Example #24
    def get_follow(self, response):
        getweibopage = GetWeibopage()
        for page in range(WeiboSpider.follow_page_num, 0, -1):
            GetWeibopage.followdata['Pl_Official_RelationMyfollow__108_page'] = page
            follow_url = getinfo.get_url(WeiboSpider.start_uid) + getweibopage.get_followurl()
            yield Request(url=follow_url, meta={'cookiejar': response.meta['cookiejar']}, callback=self.parse_follow)
class WeiboSpider(CrawlSpider):
    name = 'userfollow'
    allowed_domains = ['weibo.com', 'sina.com.cn']
    settings = get_project_settings()
    start_username = settings['USER_NAME']
    start_password = settings['PASS_WORD']
    start_uid = settings['UID']
    page_num = settings['PAGE_NUM']
    follow_page_num = settings['FOLLOW_PAGE_NUM']
    follower_page_num = settings['FOLLOWER_PAGE_NUM']
    getweibopage = GetWeibopage()

    def __init__(self, uid=None):
        super(WeiboSpider, self).__init__()  # CrawlSpider's __init__ must run, e.g. to compile rules
        self.uid = uid

    def start_requests(self):
        username = WeiboSpider.start_username
        url = 'http://login.sina.com.cn/sso/prelogin.php?entry=sso&callback=sinaSSOController.preloginCallBack&su=%s&rsakt=mod&client=ssologin.js(v1.4.4)' % username
        return [Request(url=url, method='get', callback=self.post_requests)]

    def post_requests(self, response):
        serverdata = re.findall(
            '{"retcode":0,"servertime":(.*?),"pcid":.*?,"nonce":"(.*?)","pubkey":"(.*?)","rsakv":"(.*?)","exectime":.*}',
            response.body, re.I)[0]  # data from the GET request, used for the login POST
        #print '!!!!GET responsebody:',response.body
        #print '!!!!serverdata',serverdata[0]
        servertime = serverdata[0]
        nonce = serverdata[1]
        pubkey = serverdata[2]
        rsakv = serverdata[3]
        username = WeiboSpider.start_username
        password = WeiboSpider.start_password
        formdata = {
            'entry': 'weibo',
            'gateway': '1',
            'from': '',
            'ssosimplelogin': '******',
            'vsnf': '1',
            'vsnval': '',
            'su': getinfo.get_user(username),
            'service': 'miniblog',
            'servertime': servertime,
            'nonce': nonce,
            'pwencode': 'rsa2',
            'sp': getinfo.get_pwd(password, servertime, nonce, pubkey),
            'encoding': 'UTF-8',
            'prelt': '115',
            'rsakv': rsakv,
            'url':
            'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
            'returntype': 'META'
        }
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0 Chrome/20.0.1132.57 Safari/536.11'
        }
        return [
            FormRequest(
                url=
                'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.4)',
                formdata=formdata,
                headers=headers,
                callback=self.get_cookie)
        ]
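
getinfo.get_user and getinfo.get_pwd are project helpers not shown here. The Sina SSO v1.4.4 scheme they target is well documented: su is the base64-encoded, URL-quoted username, and sp is the hex of RSA-encrypting servertime, nonce and password with the server's public key. A sketch under those assumptions (hypothetical names, using the third-party rsa package):

import base64
import binascii
import rsa  # pip install rsa
try:
    from urllib import quote        # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3

def get_user_sketch(username):
    """su field: URL-quote the username, then base64-encode it."""
    return base64.b64encode(quote(username).encode('utf-8'))

def get_pwd_sketch(password, servertime, nonce, pubkey):
    """sp field: RSA-encrypt 'servertime\\tnonce\\npassword' with the
    server's modulus (hex string) and the fixed exponent 0x10001."""
    key = rsa.PublicKey(int(pubkey, 16), 65537)
    message = ('%s\t%s\n%s' % (servertime, nonce, password)).encode('utf-8')
    return binascii.b2a_hex(rsa.encrypt(message, key))
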

    def get_cookie(self, response):
        #print 'response:', response.body
        p = re.compile('location\.replace\(\'(.*)\'\)')
        try:
            login_url = p.search(response.body).group(1)
            #print login_url
            ret_res = re.search('retcode=0', login_url)
            if ret_res:
                print 'Login Success!!!!'
            else:
                print 'Login Fail!!!!'
        except AttributeError:  # p.search returned None: no redirect URL in the response
            print 'Login Error!!!!'
            return None

        request = response.request.replace(
            url=login_url,
            meta={'cookiejar': 1},
            method='get',
            callback=self.get_relation_pagenum
        )  # GET login_url to obtain the returned cookie; subsequent Requests carry this cookie
        return request
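
The 'cookiejar' meta key used throughout is Scrapy's built-in mechanism for keeping multiple cookie sessions in CookiesMiddleware: requests sharing a cookiejar value share one session, but the key is not sticky and must be re-sent on every chained request, as in this sketch:

from scrapy import Request

def follow_in_same_session(response, next_url, callback):
    """Chain a request inside the same cookie session by copying
    meta['cookiejar'] forward explicitly."""
    return Request(url=next_url,
                   meta={'cookiejar': response.meta['cookiejar']},
                   callback=callback)
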

    def get_relation_pagenum(self, response):
        follow_url = 'http://weibo.com/%s/follow?page=1' % str(self.uid)
        follower_url = 'http://weibo.com/%s/fans?page=1' % str(self.uid)
        yield Request(url=follow_url,
                      meta={
                          'cookiejar': 1,
                          'uid': self.uid
                      },
                      dont_filter=True,
                      callback=self.parse_based_follownum)
        yield Request(url=follower_url,
                      meta={
                          'cookiejar': 1,
                          'uid': self.uid
                      },
                      dont_filter=True,
                      callback=self.parse_based_followernum)

    def parse_based_follownum(self, response):
        item = WeibospiderItem()
        analyzer = Analyzer()
        total_follow_pq = analyzer.get_childfollowhtml(response.body)
        follow_page_num = analyzer.get_relation_pagenum(total_follow_pq)

        if follow_page_num != "" and int(follow_page_num) >= 5:
            for page in range(5, 0, -1):
                GetWeibopage.relation_data['page'] = page
                follow_url = getinfo.get_follow_mainurl(
                    response.meta['uid']
                ) + WeiboSpider.getweibopage.get_relation_paramurl()
                yield Request(url=follow_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': response.meta['uid']
                              },
                              callback=self.parse_follow)

        elif follow_page_num == "":
            follow_url = 'http://weibo.com/%s/follow?page=1' % response.meta[
                'uid']
            yield Request(url=follow_url,
                          meta={
                              'cookiejar': 1,
                              'uid': response.meta['uid']
                          },
                          callback=self.parse_follow)
        else:
            for page in range(int(follow_page_num), 0, -1):
                GetWeibopage.relation_data['page'] = page
                follow_url = getinfo.get_follow_mainurl(
                    response.meta['uid']
                ) + WeiboSpider.getweibopage.get_relation_paramurl()
                yield Request(url=follow_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': response.meta['uid']
                              },
                              callback=self.parse_follow)

    def parse_based_followernum(self, response):
        item = WeibospiderItem()
        analyzer = Analyzer()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        follower_page_num = analyzer.get_relation_pagenum(total_follower_pq)

        if follower_page_num != "" and int(follower_page_num) >= 5:
            for page in range(5, 0, -1):
                GetWeibopage.relation_data['page'] = page
                follower_url = getinfo.get_follower_mainurl(
                    response.meta['uid']
                ) + WeiboSpider.getweibopage.get_relation_paramurl()
                yield Request(url=follower_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': response.meta['uid']
                              },
                              callback=self.parse_follower)

        elif follower_page_num == "":
            follower_url = 'http://weibo.com/%s/fans?page=1' % response.meta[
                'uid']
            yield Request(url=follower_url,
                          meta={
                              'cookiejar': 1,
                              'uid': response.meta['uid']
                          },
                          callback=self.parse_follower)
            #yield None
        else:
            for page in range(int(follower_page_num), 0, -1):
                GetWeibopage.relation_data['page'] = page
                follower_url = getinfo.get_follower_mainurl(
                    response.meta['uid']
                ) + WeiboSpider.getweibopage.get_relation_paramurl()
                yield Request(url=follower_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': response.meta['uid']
                              },
                              callback=self.parse_follower)

    def parse_follow(self, response):
        item = WeibospiderItem()
        analyzer = Analyzer()
        total_follow_pq = analyzer.get_childfollowhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follow_uid_list'] = analyzer.get_childfollow(total_follow_pq)
        item['follower_uid_list'] = []
        yield item

        # fetch the follows and followers of second-level (followed) users
        if self.uid == response.meta['uid'] and len(item['follow_uid_list']):
            db = OracleStore()
            conn = db.get_connection()

            for follow_uid in item['follow_uid_list']:
                # fetch the follows of this followed user
                sql1 = """select count(*) from t_user_follow where userID=%s""" % str(
                    follow_uid)
                cursor1 = db.select_operation(conn, sql1)
                count1 = cursor1.fetchone()
                follow_scraped = count1[0]
                cursor1.close()
                if not follow_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    follow_url = 'http://weibo.com/%s/follow?page=1' % str(
                        follow_uid)
                    yield Request(url=follow_url,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follow_uid
                                  },
                                  dont_filter=True,
                                  callback=self.parse_based_follownum)
                else:
                    print 'follow_uid existed!', follow_uid
                    yield None

                # fetch the followers of this followed user
                sql2 = """select count(*) from t_user_follower where userID=%s""" % str(
                    follow_uid)
                cursor2 = db.select_operation(conn, sql2)
                count2 = cursor2.fetchone()
                follower_scraped = count2[0]
                cursor2.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    follower_url = 'http://weibo.com/%s/fans?page=1' % str(
                        follow_uid)
                    yield Request(url=follower_url,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follow_uid
                                  },
                                  dont_filter=True,
                                  callback=self.parse_based_followernum)
                else:
                    print 'follower_uid existed!', follow_uid
                    yield None

            conn.close()

    def parse_follower(self, response):
        item = WeibospiderItem()
        analyzer = Analyzer()
        getweibopage = GetWeibopage()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
        item['follow_uid_list'] = []
        yield item

        # fetch the follows and followers of second-level (follower) users
        if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
            db = OracleStore()
            conn = db.get_connection()

            for follower_uid in item['follower_uid_list']:
                # fetch the follows of this follower user
                sql1 = """select count(*) from t_user_follow where userID=%s""" % str(
                    follower_uid)
                cursor1 = db.select_operation(conn, sql1)
                count1 = cursor1.fetchone()
                follower_scraped = count1[0]
                cursor1.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    follow_url = 'http://weibo.com/%s/follow?page=1' % str(
                        follower_uid)
                    yield Request(url=follow_url,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follower_uid
                                  },
                                  dont_filter=True,
                                  callback=self.parse_based_follownum)
                else:
                    print 'follow_uid existed!', follower_uid
                    yield None

                # fetch the followers of this follower user
                sql2 = """select count(*) from t_user_follower where userID=%s""" % str(
                    follower_uid)
                cursor2 = db.select_operation(conn, sql2)
                count2 = cursor2.fetchone()
                follower_scraped = count2[0]
                cursor2.close()
                if not follower_scraped:  # scraped count is 0, i.e. this account has not been fetched yet
                    follower_url = 'http://weibo.com/%s/fans?page=1' % str(
                        follower_uid)
                    yield Request(url=follower_url,
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follower_uid
                                  },
                                  dont_filter=True,
                                  callback=self.parse_based_followernum)
                else:
                    print 'follower_uid existed!', follower_uid
                    yield None

            conn.close()