コード例 #1
0
 def open_spider(self, spider):
     """Cache the newest stored ``publishTimeStamp`` for the opening spider.

     For the weibo-content spiders, the timestamp of the newest
     ``t_user_weibocontent`` row of ``spider.uid`` is stored on
     ``WeibospiderPipeline.weibocontent_timestamp``.  For the ``keyuser``
     spider, the timestamp of the newest ``t_user_keyword`` row of
     ``spider.keyword`` is stored on ``WeibospiderPipeline.keyword_timestamp``.
     ``None`` is stored when the query returns no row; spiders with any
     other name leave both attributes untouched.

     :param spider: spider being opened; ``name`` is always read, ``uid`` or
         ``keyword`` only for the matching spider names.
     """

     def newest_field(sql, column):
         # Return field ``column`` of the first row produced by ``sql``, or
         # None when the result set is empty.
         db = OracleStore()
         conn = db.get_connection()
         cursors = []
         try:
             cursors.append(db.select_operation(conn, sql))
             row = cursors[0].fetchone()
             return row[column] if row else None
         finally:
             # Always release the connection (and the cursor, if one was
             # opened), even when the query or the fetch raises.
             db.close_connection(conn, *cursors)

     # NOTE(review): both statements are built by string interpolation;
     # switch to bind variables if select_operation can accept them.
     if spider.name in ('keyweibocontent', 'weibocontent_userinfo',
                        'weibocontent_danger',
                        'weibocontent_userinfo_intime'):
         sql = "select * from t_user_weibocontent where userID = '%s' order by publishTimeStamp desc" % str(
             spider.uid)
         # Field 6 of the row is the publishTimeStamp column.
         WeibospiderPipeline.weibocontent_timestamp = newest_field(sql, 6)
     if spider.name == 'keyuser':
         sql = "select * from t_user_keyword where keyword = '%s' order by publishTimeStamp desc" % str(
             spider.keyword)
         # Field 4 of the row is the publishTimeStamp column.
         WeibospiderPipeline.keyword_timestamp = newest_field(sql, 4)
コード例 #2
0
    def get_userinfo(self, response):
        """Yield a first-load request for every user row with ``imagestate`` 0.

        Reads ``t_user_info`` rows whose ``imagestate`` is 0 and, for each
        row whose first field is truthy, yields a ``Request`` for page 1 of
        that user's main page.  The request is handled by
        ``self.get_userurl`` and carries the uid in ``meta['uid']``.

        :param response: response that triggered this callback; its
            ``meta['cookiejar']`` is forwarded to every request.
        """
        db = OracleStore()
        conn = db.get_connection()
        cursors = []
        try:
            cursors.append(
                db.select_operation(
                    conn, "select * from t_user_info where imagestate = 0"))
            # Walk the result set until fetchone() signals exhaustion with
            # None; no separate count(*) query is needed to bound the loop.
            for result in iter(cursors[0].fetchone, None):
                uid = result[0]
                if not uid:
                    continue
                mainpageurl = 'http://weibo.com/u/' + str(
                    uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                # GetWeibopage.data is class-level state read by the URL
                # builder, so it must be set before get_firstloadurl().
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                GetWeibopage.data['page'] = 1
                firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                yield Request(url=firstloadurl,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': uid
                              },
                              callback=self.get_userurl)
        finally:
            # Runs when the generator is exhausted, closed or fails, so the
            # connection and cursor are always released.
            db.close_connection(conn, *cursors)
コード例 #3
0
 def closed(self, reason):
     """Set ``contentstate`` to 1 in ``t_spider_state`` and print a banner.

     :param reason: close reason supplied by the caller; not used here.
     """
     store = OracleStore()
     connection = store.get_connection()
     state_sql = '''update t_spider_state set contentstate = 1'''
     store.insert_operation(connection, state_sql)
     print('------keyweibocontent_spider closed------')
コード例 #4
0
    def start_getweiboinfo(self,response):
        """Yield the three load requests of every page for every stored user.

        For each row of ``t_user_info`` (first field used as the uid) and each
        page from 1 to ``WeiboSpider.page_num``, yields the first, second and
        third load ``Request`` in that order, all handled by
        ``self.parse_load``.  Nothing is yielded when the table has no rows.

        :param response: response that triggered this callback; its
            ``meta['cookiejar']`` is forwarded to every request.
        """
        db = OracleStore()
        conn = db.get_connection()
        cursors = []
        try:
            cursors.append(db.select_operation(conn, '''select * from t_user_info'''))
            # fetchone() returns None once the rows are exhausted, so the
            # result set bounds the loop without a separate count(*) query.
            for result in iter(cursors[0].fetchone, None):
                uid = result[0]
                mainpageurl = 'http://weibo.com/u/'+str(uid)+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                # GetWeibopage.data is class-level state read by the URL builders.
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                url_builders = (getweibopage.get_firstloadurl,
                                getweibopage.get_secondloadurl,
                                getweibopage.get_thirdloadurl)
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page+1
                    # Each builder is called right before its own request is
                    # yielded, keeping the original call/yield interleaving.
                    for build_url in url_builders:
                        yield Request(url=mainpageurl + build_url(),
                                      meta={'cookiejar':response.meta['cookiejar'],'uid':uid},
                                      callback=self.parse_load)
        finally:
            # Runs when the generator is exhausted, closed or fails, so the
            # connection and cursor are always released.
            db.close_connection(conn,*cursors)
コード例 #5
0
    def parse_follower(self, response):
        """Emit the follower list item, then expand second-level users.

        First yields a ``WeibospiderItem`` holding ``uid``, the parsed
        ``follower_uid_list`` and an empty ``follow_uid_list``.  When the
        response belongs to ``self.uid`` and the follower list is not empty,
        each follower is checked against ``t_user_follow`` and
        ``t_user_follower``; for every table with no row for that follower a
        ``Request`` for page 1 of the matching listing is yielded, otherwise
        an "existed" message is printed.

        :param response: follower page response; ``body`` is parsed and
            ``meta['uid']`` / ``meta['cookiejar']`` are read.
        """
        item = WeibospiderItem()
        analyzer = Analyzer()
        # NOTE(review): not used below; kept in case construction has side effects.
        getweibopage = GetWeibopage()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
        item['follow_uid_list'] = []
        yield item

        # Only the root user's followers are expanded to a second level.
        if self.uid != response.meta['uid'] or not item['follower_uid_list']:
            return

        # (count query, listing URL, callback, message when already stored)
        # NOTE(review): the uid is interpolated into the SQL text; switch to
        # bind variables if select_operation can accept them.
        relations = (
            ("""select count(*) from t_user_follow where userID=%s""",
             'http://weibo.com/%s/follow?page=1',
             self.parse_based_follownum,
             'follow_uid existed!'),
            ("""select count(*) from t_user_follower where userID=%s""",
             'http://weibo.com/%s/fans?page=1',
             self.parse_based_followernum,
             'follower_uid existed!'),
        )

        db = OracleStore()
        conn = db.get_connection()
        try:
            for follower_uid in item['follower_uid_list']:
                for count_sql, url_template, callback, existed_msg in relations:
                    cursor = db.select_operation(
                        conn, count_sql % str(follower_uid))
                    try:
                        already_scraped = cursor.fetchone()[0]
                    finally:
                        cursor.close()
                    if already_scraped:
                        print('%s %s' % (existed_msg, follower_uid))
                        continue
                    # A count of 0 means this account was never fetched.
                    yield Request(url=url_template % str(follower_uid),
                                  meta={
                                      'cookiejar': response.meta['cookiejar'],
                                      'uid': follower_uid
                                  },
                                  dont_filter=True,
                                  callback=callback)
        finally:
            # Runs when the generator is exhausted, closed or fails.
            conn.close()
コード例 #6
0
    def parse_load(self, response):
        """Parse one load of a weibo page into an item and follow-up requests.

        On the first load of the first page (``pre_page`` 0 and ``page`` 1 in
        the request URL), ``t_user_info`` is checked for ``self.uid``; when no
        row exists and the account is not a public account, a ``Request`` for
        the user-info page is yielded (handled by ``self.parse_userinfo``).
        Public accounts are not handled for now (the database would need a
        foreign key).

        Then a ``WeibospiderItem`` with the content, times, repost user and
        @-user nicknames is yielded, followed by one search ``Request`` per
        @-user nickname (handled by ``self.parse_atuser_uid``).

        :param response: load response; ``request.url``, ``body``,
            ``meta['uid']`` and ``meta['cookiejar']`` are read.
        """
        # NOTE(review): never used below; kept in case construction has side effects.
        user_info = userinfo.WeiboSpider()
        request_url = response.request.url
        # Capture whole numbers: a single \d would read page=10..19 as 1 and
        # wrongly treat those loads as the first page.
        match = re.search(r'&pre_page=(\d+).*&page=(\d+)', request_url)
        # A URL without both parameters is simply not a first load.
        if match and int(match.group(1)) == 0 and int(match.group(2)) == 1:
            db = OracleStore()
            conn = db.get_connection()
            cursors = []
            try:
                sql = "select count(*) from t_user_info where userID='%s'" % self.uid
                cursors.append(db.select_operation(conn, sql))
                count = cursors[0].fetchone()
                if not count[0]:  # uid not stored yet: fetch the basic info
                    analyzer = Analyzer()
                    total_pq = analyzer.get_html(
                        response.body, 'script:contains("PCD_person_info")')
                    user_property = analyzer.get_userproperty(total_pq)
                    # 'icon_verify_co_v' marks a public account; skipped.
                    if user_property != 'icon_verify_co_v':
                        userinfo_url = analyzer.get_userinfohref(total_pq)
                        yield Request(url=userinfo_url,
                                      meta={
                                          'cookiejar': response.meta['cookiejar'],
                                          'uid': response.meta['uid'],
                                          'user_property': user_property
                                      },
                                      callback=self.parse_userinfo)
            finally:
                # Release the connection even if a query or the parsing fails.
                db.close_connection(conn, *cursors)

        # Weibo content plus @-user information.
        item = WeibospiderItem()
        analyzer = Analyzer()
        friendcircle = FriendCircle()
        total_pq = analyzer.get_mainhtml(response.body)
        item['uid'] = response.meta['uid']
        item['content'] = analyzer.get_content(total_pq)
        item['time'], item['timestamp'] = analyzer.get_time(total_pq)
        atuser_info, item['repost_user'] = analyzer.get_atuser_repostuser(
            total_pq)
        atuser_list = friendcircle.atuser_parser(atuser_info)
        item['atuser_nickname_list'] = atuser_list
        yield item

        # An empty inner list just contributes no requests.
        for atuser_inlist in atuser_list:
            for atuser in atuser_inlist:
                uid_url = "http://s.weibo.com/user/" + quote(
                    quote(str(atuser))) + "&Refer=SUer_box"
                yield Request(url=uid_url,
                              meta={
                                  'cookiejar': response.meta['cookiejar'],
                                  'uid': self.uid,
                                  'atuser_nickname': atuser
                              },
                              callback=self.parse_atuser_uid)
コード例 #7
0
    def closed(self,reason):
        """Store the collected @-user uids, then set ``contentstate`` to 1.

        For every ``nickname -> uid`` entry of ``self.atuser_dict`` the rows
        of ``t_user_weibocontent_atuser`` with ``userID = self.uid`` and
        ``atuser = nickname`` get their ``atuserID`` set, each update being
        committed on its own.  Afterwards ``t_spider_state.contentstate`` is
        set to 1 and a closing banner is printed.

        :param reason: close reason supplied by the caller; not used here.
        """
        db = OracleStore();conn = db.get_connection()
        cur = conn.cursor()
        try:
            for atuser,atuser_id in self.atuser_dict.items():
                # The template needs three placeholders for its three
                # arguments; with only two the % operator raises TypeError.
                # NOTE(review): values are interpolated into the SQL text;
                # prefer bind variables if the driver in use supports them.
                sql= """update t_user_weibocontent_atuser set atuserID = %s where userID = %s and atuser = '%s'""" % (atuser_id,self.uid,atuser)
                cur.execute(sql)
                conn.commit()
        finally:
            cur.close()

        sql = '''update t_spider_state set contentstate = 1'''
        db.insert_operation(conn,sql)
        print('------weibocontent_info_spider closed------')
コード例 #8
0
    def closed(self, reason):
        """Write the resolved @-user uids back and set ``contentstate`` to 1.

        Each key of ``self.atuser_dict`` is an @-user nickname and its value
        the uid to store: rows of ``t_user_weibocontent_atuser`` matching
        ``userID = self.uid`` and ``atuser = <nickname>`` are updated and
        committed one entry at a time.  Then ``contentstate`` is set to 1 in
        ``t_spider_state`` and a closing banner is printed.

        :param reason: close reason supplied by the caller; not used here.
        """
        # Three placeholders for three arguments: with only two the %
        # operator raises TypeError on the first entry.
        # NOTE(review): values are interpolated into the SQL text; prefer
        # bind variables if the driver in use supports them.
        update_sql = """update t_user_weibocontent_atuser set atuserID = %s where userID = %s and atuser = '%s'"""

        db = OracleStore()
        conn = db.get_connection()
        cur = conn.cursor()
        try:
            for nickname in self.atuser_dict:
                cur.execute(update_sql % (self.atuser_dict.get(nickname),
                                          self.uid, nickname))
                conn.commit()
        finally:
            # The cursor is only needed for the per-nickname updates.
            cur.close()

        db.insert_operation(conn,
                            '''update t_spider_state set contentstate = 1''')
        print('------weibocontent_info_spider closed------')
コード例 #9
0
 def open_spider(self,spider):
     """Load the newest stored ``publishTimeStamp`` onto the pipeline class.

     Depending on ``spider.name`` one of two lookups runs:

     * weibo-content spiders: newest ``t_user_weibocontent`` row of
       ``spider.uid`` -> ``WeibospiderPipeline.weibocontent_timestamp``
     * ``keyuser``: newest ``t_user_keyword`` row of ``spider.keyword`` ->
       ``WeibospiderPipeline.keyword_timestamp``

     The attribute is set to ``None`` when the query returns no row.  Any
     other spider name performs no lookup.

     :param spider: spider being opened; ``name`` is always read, ``uid`` or
         ``keyword`` only for the matching spider names.
     """
     # Each entry: (sql, index of the publishTimeStamp field, attribute name).
     # NOTE(review): the SQL is built by string interpolation; switch to
     # bind variables if select_operation can accept them.
     lookups = []
     if spider.name in ('keyweibocontent','weibocontent_userinfo','weibocontent_danger','weibocontent_userinfo_intime'):
         lookups.append((
             "select * from t_user_weibocontent where userID = '%s' order by publishTimeStamp desc" % str(spider.uid),
             6,
             'weibocontent_timestamp'))
     if spider.name == 'keyuser':
         lookups.append((
             "select * from t_user_keyword where keyword = '%s' order by publishTimeStamp desc" % str(spider.keyword),
             4,
             'keyword_timestamp'))

     for sql, column, attribute in lookups:
         db = OracleStore()
         conn = db.get_connection()
         cursor = None
         try:
             cursor = db.select_operation(conn,sql)
             row = cursor.fetchone()
             # An empty result means nothing is stored yet for this key.
             setattr(WeibospiderPipeline, attribute,
                     row[column] if row else None)
         finally:
             # Release the connection even when the query or fetch raises;
             # the cursor only exists if select_operation succeeded.
             if cursor is None:
                 db.close_connection(conn)
             else:
                 db.close_connection(conn,cursor)
コード例 #10
0
    def start_getweiboinfo(self, response):
        """Schedule every page load for every user stored in ``t_user_info``.

        Each row's first field is used as the uid.  For pages 1 to
        ``WeiboSpider.page_num`` the first, second and third load URLs are
        requested in that order, with ``self.parse_load`` as callback.  An
        empty table produces no output.

        :param response: response that triggered this callback; its
            ``meta['cookiejar']`` is forwarded to every request.
        """
        db = OracleStore()
        conn = db.get_connection()
        cursor = None
        try:
            cursor = db.select_operation(conn, '''select * from t_user_info''')
            while True:
                # fetchone() returns None once the rows are exhausted, so no
                # separate count(*) query is needed to bound the loop.
                result = cursor.fetchone()
                if result is None:
                    break
                uid = result[0]
                mainpageurl = 'http://weibo.com/u/' + str(
                    uid) + '?from=otherprofile&wvr=3.6&loc=tagweibo'
                # Class-level state consumed by the get_*loadurl() builders.
                GetWeibopage.data['uid'] = uid
                getweibopage = GetWeibopage()
                loaders = (getweibopage.get_firstloadurl,
                           getweibopage.get_secondloadurl,
                           getweibopage.get_thirdloadurl)
                for page in range(WeiboSpider.page_num):
                    GetWeibopage.data['page'] = page + 1
                    for loader in loaders:
                        # The URL suffix is built just before its request is
                        # yielded, as in the unrolled form.
                        loadurl = mainpageurl + loader()
                        yield Request(url=loadurl,
                                      meta={
                                          'cookiejar':
                                          response.meta['cookiejar'],
                                          'uid': uid
                                      },
                                      callback=self.parse_load)
        finally:
            # Runs when the generator is exhausted, closed or fails; the
            # cursor only exists if select_operation succeeded.
            if cursor is None:
                db.close_connection(conn)
            else:
                db.close_connection(conn, cursor)
コード例 #11
0
    def parse_follower(self,response):
        """Emit the follower list item, then page through second-level users.

        First yields a ``WeibospiderItem`` holding ``uid``, the parsed
        ``follower_uid_list`` and an empty ``follow_uid_list``.  When the
        response belongs to ``self.uid`` and the list is not empty, each
        follower is looked up in ``t_user_follow`` and ``t_user_follower``.
        For a table with no row for that follower, one ``Request`` per page
        is yielded, counting down from ``WeiboSpider.follow_page_num`` /
        ``WeiboSpider.follower_page_num`` to 1; otherwise an "existed"
        message is printed.

        :param response: follower page response; ``body`` is parsed and
            ``meta['uid']`` / ``meta['cookiejar']`` are read.
        """
        item = WeibospiderItem()
        analyzer = Analyzer()
        getweibopage = GetWeibopage()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
        item['follow_uid_list'] = []
        yield item

        if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
            db = OracleStore()
            conn = db.get_connection()
            try:
                for follower_uid in item['follower_uid_list']:
                    # Users followed by this follower.
                    # NOTE(review): the uid is interpolated into the SQL text;
                    # switch to bind variables if select_operation allows it.
                    sql1 = """select count(*) from t_user_follow where userID=%s""" % str(follower_uid)
                    cursor1 = db.select_operation(conn,sql1)
                    try:
                        follow_scraped = cursor1.fetchone()[0]
                    finally:
                        cursor1.close()
                    if not follow_scraped:  # count 0: account never fetched
                        for page in range(WeiboSpider.follow_page_num,0,-1):
                            # Class-level state read by get_relation_paramurl().
                            GetWeibopage.relation_data['page'] = page
                            follow_url = getinfo.get_follow_mainurl(follower_uid) + getweibopage.get_relation_paramurl()
                            yield Request(url=follow_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follow)
                    else:
                        print('%s %s' % ('follow_uid existed!', follower_uid))

                    # Followers of this follower.
                    sql2 = """select count(*) from t_user_follower where userID=%s""" % str(follower_uid)
                    cursor2 = db.select_operation(conn,sql2)
                    try:
                        follower_scraped = cursor2.fetchone()[0]
                    finally:
                        cursor2.close()
                    if not follower_scraped:  # count 0: account never fetched
                        for page in range(WeiboSpider.follower_page_num,0,-1):
                            GetWeibopage.relation_data['page'] = page
                            follower_url = getinfo.get_follower_mainurl(follower_uid) + getweibopage.get_relation_paramurl()
                            yield Request(url=follower_url,meta={'cookiejar':response.meta['cookiejar'],'uid':follower_uid},callback=self.parse_follower)
                    else:
                        print('%s %s' % ('follower_uid existed!', follower_uid))
            finally:
                # Runs when the generator is exhausted, closed or fails.
                conn.close()
コード例 #12
0
    def get_userinfo(self,response):
        """Request page 1 of the main page of each user with ``imagestate`` 0.

        Every ``t_user_info`` row with ``imagestate = 0`` whose first field
        is truthy produces one ``Request`` (callback ``self.get_userurl``)
        that carries that field as ``meta['uid']``.

        :param response: response that triggered this callback; its
            ``meta['cookiejar']`` is forwarded to every request.
        """
        db = OracleStore()
        conn = db.get_connection()
        cursor = None
        try:
            cursor = db.select_operation(conn,"select * from t_user_info where imagestate = 0")
            while True:
                # fetchone() returns None once the rows are exhausted, so no
                # separate count(*) query is needed to bound the loop.
                result = cursor.fetchone()
                if result is None:
                    break
                if result[0]:
                    mainpageurl = 'http://weibo.com/u/'+str(result[0])+'?from=otherprofile&wvr=3.6&loc=tagweibo'
                    # Class-level state read by get_firstloadurl().
                    GetWeibopage.data['uid'] = result[0]
                    getweibopage = GetWeibopage()
                    GetWeibopage.data['page'] = 1
                    firstloadurl = mainpageurl + getweibopage.get_firstloadurl()
                    yield Request(url=firstloadurl,meta={'cookiejar':response.meta['cookiejar'],'uid':result[0]},callback=self.get_userurl)
        finally:
            # Runs when the generator is exhausted, closed or fails; the
            # cursor only exists if select_operation succeeded.
            if cursor is None:
                db.close_connection(conn)
            else:
                db.close_connection(conn,cursor)
コード例 #13
0
    def parse_follower(self,response):
        """Yield the follower item and the second-level follow/fans requests.

        The item carries ``uid``, the parsed ``follower_uid_list`` and an
        empty ``follow_uid_list``.  If the response is for ``self.uid`` and
        the list is not empty, each follower is checked in ``t_user_follow``
        and ``t_user_follower``.  A count of 0 yields a ``Request`` for page 1
        of the follower's follow / fans listing; a non-zero count prints an
        "existed" message.

        :param response: follower page response; ``body`` is parsed and
            ``meta['uid']`` / ``meta['cookiejar']`` are read.
        """
        item = WeibospiderItem()
        analyzer = Analyzer()
        # NOTE(review): not used below; kept in case construction has side effects.
        getweibopage = GetWeibopage()
        total_follower_pq = analyzer.get_followerhtml(response.body)
        item['uid'] = response.meta['uid']
        item['follower_uid_list'] = analyzer.get_follower(total_follower_pq)
        item['follow_uid_list'] = []
        yield item

        # Follows and followers of the second-level (follower) users.
        if self.uid == response.meta['uid'] and len(item['follower_uid_list']):
            db = OracleStore()
            conn = db.get_connection()

            def stored_rows(sql):
                # Return the count(*) value of ``sql``, always closing the cursor.
                cursor = db.select_operation(conn,sql)
                try:
                    return cursor.fetchone()[0]
                finally:
                    cursor.close()

            try:
                for follower_uid in item['follower_uid_list']:
                    request_meta = {'cookiejar':response.meta['cookiejar'],'uid':follower_uid}

                    # Users followed by this follower.
                    # NOTE(review): the uid is interpolated into the SQL text;
                    # switch to bind variables if select_operation allows it.
                    if stored_rows("""select count(*) from t_user_follow where userID=%s""" % str(follower_uid)):
                        print('%s %s' % ('follow_uid existed!', follower_uid))
                    else:
                        follow_url = 'http://weibo.com/%s/follow?page=1' % str(follower_uid)
                        yield Request(url=follow_url,meta=dict(request_meta),dont_filter=True,callback=self.parse_based_follownum)

                    # Followers of this follower.
                    if stored_rows("""select count(*) from t_user_follower where userID=%s""" % str(follower_uid)):
                        print('%s %s' % ('follower_uid existed!', follower_uid))
                    else:
                        follower_url = 'http://weibo.com/%s/fans?page=1' % str(follower_uid)
                        yield Request(url=follower_url,meta=dict(request_meta),dont_filter=True,callback=self.parse_based_followernum)
            finally:
                # Runs when the generator is exhausted, closed or fails.
                conn.close()
コード例 #14
0
    def parse_load(self,response):
        """Turn one page load into an item plus the follow-up requests.

        Step 1 runs only for the first load of the first page (``pre_page``
        0, ``page`` 1 in the request URL).  If ``t_user_info`` has no row for
        ``self.uid`` and the account is not a public account, the user-info
        page is requested (callback ``self.parse_userinfo``).  Public
        accounts are not handled for now (the database would need a foreign
        key).

        Step 2 yields a ``WeibospiderItem`` with content, times, repost user
        and @-user nicknames.

        Step 3 yields one search ``Request`` per @-user nickname (callback
        ``self.parse_atuser_uid``).

        :param response: load response; ``request.url``, ``body``,
            ``meta['uid']`` and ``meta['cookiejar']`` are read.
        """
        # NOTE(review): never used below; kept in case construction has side effects.
        user_info = userinfo.WeiboSpider()
        request_url = response.request.url
        # \d+ captures the whole number; a single \d would read page=10..19
        # as 1 and misclassify those loads as the first page.
        p = re.compile(r'&pre_page=(\d+).*&page=(\d+)')
        match = p.search(request_url)
        # A URL lacking either parameter is treated as "not the first load".
        is_first_load = (match is not None
                         and int(match.group(1)) == 0
                         and int(match.group(2)) == 1)
        if is_first_load:
            db = OracleStore()
            conn = db.get_connection()
            cursor = None
            try:
                sql = "select count(*) from t_user_info where userID='%s'" % self.uid
                cursor = db.select_operation(conn,sql)
                count = cursor.fetchone()
                if not count[0]:  # uid not stored yet: fetch the basic info
                    analyzer = Analyzer()
                    total_pq = analyzer.get_html(response.body,'script:contains("PCD_person_info")')
                    user_property = analyzer.get_userproperty(total_pq)
                    if user_property != 'icon_verify_co_v':  # not a public account
                        userinfo_url = analyzer.get_userinfohref(total_pq)
                        yield Request(url=userinfo_url,meta={'cookiejar':response.meta['cookiejar'],'uid':response.meta['uid'],'user_property':user_property},callback=self.parse_userinfo)
            finally:
                # Release the connection even if a query or the parsing
                # fails; the cursor only exists if select_operation succeeded.
                if cursor is None:
                    db.close_connection(conn)
                else:
                    db.close_connection(conn,cursor)

        # Weibo content plus @-user information.
        item = WeibospiderItem()
        analyzer = Analyzer()
        friendcircle = FriendCircle()
        total_pq = analyzer.get_mainhtml(response.body)
        item['uid'] = response.meta['uid']
        item['content'] = analyzer.get_content(total_pq)
        item['time'],item['timestamp'] = analyzer.get_time(total_pq)
        atuser_info,item['repost_user'] = analyzer.get_atuser_repostuser(total_pq)
        atuser_list = friendcircle.atuser_parser(atuser_info)
        item['atuser_nickname_list'] = atuser_list
        yield item

        # Empty inner lists simply produce no requests.
        for atuser_inlist in atuser_list:
            for atuser in atuser_inlist:
                uid_url = "http://s.weibo.com/user/"+quote(quote(str(atuser)))+"&Refer=SUer_box"
                yield Request(url=uid_url,meta={'cookiejar':response.meta['cookiejar'],'uid':self.uid,'atuser_nickname':atuser},callback=self.parse_atuser_uid)
コード例 #15
0
 def closed(self,reason):
     """Set ``userinfostate`` to 1 in ``t_spider_state`` and print a banner.

     :param reason: close reason supplied by the caller; not used here.
     """
     store = OracleStore()
     connection = store.get_connection()
     store.insert_operation(connection,
                            'update t_spider_state set userinfostate = 1')
     print('------userinfo_list_spider closed------')
コード例 #16
0
 def closed(self, reason):
     """Set ``searchstate`` to 1 in ``t_spider_state`` and print a banner.

     :param reason: close reason supplied by the caller; not used here.
     """
     store = OracleStore()
     state_sql = 'update t_spider_state set searchstate=1'
     store.insert_operation(store.get_connection(), state_sql)
     print('------keyuser_spider closed------')
コード例 #17
0
 def closed(self, reason):
     """Open a store connection, close it again and print a banner.

     :param reason: close reason supplied by the caller; not used here.
     """
     store = OracleStore()
     connection = store.get_connection()
     store.close_connection(connection)
     print('--------closed-------')
コード例 #18
0
 def closed(self, reason):
     """Obtain a connection from the store, release it, then print a banner.

     :param reason: close reason supplied by the caller; not used here.
     """
     store = OracleStore()
     store.close_connection(store.get_connection())
     print("--------closed-------")
コード例 #19
0
 def closed(self,reason):
     """Run the ``contentstate = 1`` update on ``t_spider_state``, then print.

     :param reason: close reason supplied by the caller; not used here.
     """
     store = OracleStore()
     connection = store.get_connection()
     store.insert_operation(
         connection, '''update t_spider_state set contentstate = 1''')
     print('------keyweibocontent_spider closed------')