예제 #1
0
    def start_requests(self):
        """Yield the initial tweet-page request for every queued user ID.

        Pops IDs from ``self.scrawl_ID`` until it is empty. Each popped ID is
        added to ``self.finish_ID`` and a request for
        ``http://weibo.cn/<ID>/profile?filter=1&page=1`` is yielded, carrying
        the stringified ID under ``meta["ID"]`` and using ``self.parse2`` as
        the callback.
        """
        # Only the tweet pages are requested here. The follows, fans and
        # profile-information requests were disabled (commented out) in this
        # spider, so the items and URLs that only fed them are not built.
        while self.scrawl_ID:
            user_id = self.scrawl_ID.pop()
            self.finish_ID.add(user_id)  # remember the ID as already crawled
            user_id = str(user_id)
            url_tweets = "http://weibo.cn/%s/profile?filter=1&page=1" % user_id
            yield Request(url=url_tweets,
                          meta={"ID": user_id},
                          callback=self.parse2)  # crawl the user's tweets
예제 #2
0
 def parse0(self, response):
     """Scrape the profile counters (personal information, part 1).

     Extracts the first ``div.tip2`` node found under ``body/div.u`` and
     searches its markup for the tweet, following and fan counters. Yields
     one ``InformationItem`` holding whichever counters were found plus
     ``_id`` taken from ``response.meta["ID"]``. Yields nothing when the
     node is missing.
     """
     # (item field, pattern) pairs. Each pattern captures the digits inside
     # the square brackets that follow the counter's label. Backslashes are
     # doubled so the literals contain no invalid escape sequences; the
     # resulting pattern text is unchanged.
     counters = (
         ("tweets_num", u'\u5fae\u535a\\[(\\d+)\\]'),  # number of tweets
         ("follows_num", u'\u5173\u6ce8\\[(\\d+)\\]'),  # number of follows
         ("fans_num", u'\u7c89\u4e1d\\[(\\d+)\\]'),  # number of fans
     )
     informationItems = InformationItem()
     selector = Selector(response)
     text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
     if not text0:
         return
     for field, pattern in counters:
         match = re.search(pattern, text0)
         if match:
             # group(1) is the captured number, not the whole match.
             informationItems[field] = int(match.group(1))
     informationItems["_id"] = response.meta["ID"]
     yield informationItems
예제 #3
0
 def parse0(self, response):
     """Scrape the profile counters, then request the user's info page.

     Extracts the first ``div.tip2`` node found under ``body/div.u`` and
     searches its markup for the tweet, following and fan counters. The
     counters that were found, plus ``_id`` from ``response.meta["ID"]``,
     are stored in an ``InformationItem`` that is handed to ``self.parse1``
     through the ``meta["item"]`` of a request for
     ``http://weibo.cn/<ID>/info``. Yields nothing when the node is missing.
     """
     # (item field, pattern) pairs. Each pattern captures the digits inside
     # the square brackets that follow the counter's label. Backslashes are
     # doubled so the literals contain no invalid escape sequences; the
     # resulting pattern text is unchanged.
     counters = (
         ("Num_Tweets", u'\u5fae\u535a\\[(\\d+)\\]'),  # number of tweets
         ("Num_Follows", u'\u5173\u6ce8\\[(\\d+)\\]'),  # number of follows
         ("Num_Fans", u'\u7c89\u4e1d\\[(\\d+)\\]'),  # number of fans
     )
     informationItems = InformationItem()
     selector = Selector(response)
     text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
     if not text0:
         return
     for field, pattern in counters:
         match = re.search(pattern, text0)
         if match:
             # group(1) is the captured number, not the whole match.
             informationItems[field] = int(match.group(1))
     informationItems["_id"] = response.meta["ID"]
     url_information1 = "http://weibo.cn/%s/info" % response.meta["ID"]
     yield Request(url=url_information1, meta={"item": informationItems}, callback=self.parse1)
예제 #4
0
 def parse0(self, response):
     """Scrape the profile counters, retrying when the body is empty.

     When the response body is empty, the originating request is yielded
     again with ``meta["change_proxy"]`` set to ``True``. Otherwise the
     first ``div.tip2`` node under ``body/div.u`` is searched for the tweet,
     following and fan counters, and one ``InformationItem`` is yielded
     with the counters that were found plus ``_id`` from
     ``response.meta["ID"]``. Yields nothing when the node is missing.
     """
     # Truthiness check rather than `== ""`: it also matches an empty bytes
     # body, which never compares equal to a str on Python 3.
     if not response.body:
         req = response.request
         # Presumably read by a proxy middleware to switch proxies -- confirm.
         req.meta["change_proxy"] = True
         # NOTE(review): the same request is re-yielded; unless dont_filter
         # is set elsewhere the duplicate filter may drop it -- confirm.
         yield req
         return

     # (item field, pattern) pairs. Each pattern captures the digits inside
     # the square brackets that follow the counter's label. Backslashes are
     # doubled so the literals contain no invalid escape sequences; the
     # resulting pattern text is unchanged.
     counters = (
         ("Num_Tweets", u'\u5fae\u535a\\[(\\d+)\\]'),  # number of tweets
         ("Num_Follows", u'\u5173\u6ce8\\[(\\d+)\\]'),  # number of follows
         ("Num_Fans", u'\u7c89\u4e1d\\[(\\d+)\\]'),  # number of fans
     )
     informationItems = InformationItem()
     selector = Selector(response)
     text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
     if not text0:
         return
     for field, pattern in counters:
         match = re.search(pattern, text0)
         if match:
             # With a single capture group, group(1) is the text matched by
             # the parenthesised part (the digits), not the whole match.
             informationItems[field] = int(match.group(1))
     informationItems["_id"] = response.meta["ID"]
     yield informationItems
예제 #5
0
파일: spiders.py 프로젝트: asevergreen/GCN
 def parse0(self, response):
     """Scrape the profile counters, then request the user's info page.

     Prints the response, extracts the first ``div.tip2`` node found under
     ``body/div.u`` and searches its markup for the tweet, following and
     fan counters. The counters that were found, plus ``_id`` from
     ``response.meta["ID"]``, are stored in an ``InformationItem`` that is
     handed to ``self.parse1`` through the ``meta["item"]`` of a request
     for ``https://weibo.cn/<ID>/info``. Yields nothing when the node is
     missing.
     """
     print("parse0: ", response)
     informationItems = InformationItem()
     selector = Selector(response)
     text0 = selector.xpath(
         'body/div[@class="u"]/div[@class="tip2"]').extract_first()
     if not text0:
         return

     # Each pattern captures the digits inside the square brackets that
     # follow the counter's label. Backslashes are doubled so the literals
     # contain no invalid escape sequences; the pattern text is unchanged.
     tweets_match = re.search(u'\u5fae\u535a\\[(\\d+)\\]', text0)  # number of tweets
     follows_match = re.search(u'\u5173\u6ce8\\[(\\d+)\\]', text0)  # number of follows
     fans_match = re.search(u'\u7c89\u4e1d\\[(\\d+)\\]', text0)  # number of fans

     if tweets_match:
         informationItems["Num_Tweets"] = int(tweets_match.group(1))
     # Debug output, emitted after the tweet counter has been stored.
     print(".....", informationItems)
     if follows_match:
         informationItems["Num_Follows"] = int(follows_match.group(1))
     if fans_match:
         informationItems["Num_Fans"] = int(fans_match.group(1))
     informationItems["_id"] = response.meta["ID"]
     # informationItems["LastCrawlTime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     url_information1 = "https://weibo.cn/%s/info" % response.meta["ID"]
     yield Request(url=url_information1,
                   meta={"item": informationItems},
                   callback=self.parse1)