# Assumed module-level imports for these methods: import re; from scrapy.http import Request; from scrapy.selector import Selector
def start_requests(self):
    while True:
        if len(self.scrawl_ID) > 0:
            ID = self.scrawl_ID.pop()
        else:
            break
        self.finish_ID.add(ID)  # add to the set of already-crawled IDs
        ID = str(ID)
        follows = []
        followsItems = FollowsItem()
        followsItems["_id"] = ID
        followsItems["follows"] = follows
        fans = []
        fansItems = FansItem()
        fansItems["_id"] = ID
        fansItems["fans"] = fans
        informationItems = InformationItem()
        informationItems["_id"] = ID
        url_follows = "http://weibo.cn/%s/follow" % ID
        url_fans = "http://weibo.cn/%s/fans" % ID
        url_tweets = "http://weibo.cn/%s/profile?filter=1&page=1" % ID
        url_information0 = "http://weibo.cn/attgroup/opening?uid=%s" % ID
        url_information1 = "http://weibo.cn/%s/info" % ID
        # yield Request(url=url_follows, meta={"item": followsItems, "result": follows}, callback=self.parse3)  # crawl followees
        # yield Request(url=url_information1, meta={"item": informationItems}, callback=self.parse1)
        # yield Request(url=url_information0, meta={"ID": ID}, callback=self.parse0)  # crawl profile information
        # yield Request(url=url_fans, meta={"item": fansItems, "result": fans}, callback=self.parse3)  # crawl fans
        yield Request(url=url_tweets, meta={"ID": ID}, callback=self.parse2)  # crawl tweets
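# start_requests assumes item classes defined elsewhere in the project (items.py). Below is a
# minimal sketch of what they might look like, with field names inferred from the code in this
# excerpt; the actual definitions in the original project may differ (note that the first parse0
# variant below uses tweets_num/follows_num/fans_num instead of the Num_* names).
import scrapy

class InformationItem(scrapy.Item):
    _id = scrapy.Field()          # user ID
    Num_Tweets = scrapy.Field()   # tweet count
    Num_Follows = scrapy.Field()  # followee count
    Num_Fans = scrapy.Field()     # fan count

class FollowsItem(scrapy.Item):
    _id = scrapy.Field()          # user ID
    follows = scrapy.Field()      # list of followee IDs

class FansItem(scrapy.Item):
    _id = scrapy.Field()          # user ID
    fans = scrapy.Field()         # list of fan IDs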
def parse0(self, response):
    """Scrape profile information, part 1."""
    informationItems = InformationItem()
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # tweet count ("微博[N]")
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # followee count ("关注[N]")
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # fan count ("粉丝[N]")
        if num_tweets:
            informationItems["tweets_num"] = int(num_tweets[0])
        if num_follows:
            informationItems["follows_num"] = int(num_follows[0])
        if num_fans:
            informationItems["fans_num"] = int(num_fans[0])
        informationItems["_id"] = response.meta["ID"]
        yield informationItems
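# A quick standalone check of the count extraction used in parse0. The escaped patterns
# u'\u5fae\u535a', u'\u5173\u6ce8' and u'\u7c89\u4e1d' are simply "微博", "关注" and "粉丝";
# because each pattern has a single capture group, re.findall returns only the digits inside
# the brackets, not the whole match. The sample string below is made up for illustration.
import re

sample = u'微博[1024] 关注[300] 粉丝[567]'
print(re.findall(u'微博\\[(\\d+)\\]', sample))   # ['1024']
print(re.findall(u'关注\\[(\\d+)\\]', sample))   # ['300']
print(re.findall(u'粉丝\\[(\\d+)\\]', sample))   # ['567']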
def parse0(self, response):
    """Scrape profile information, part 1."""
    informationItems = InformationItem()
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # tweet count
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # followee count
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # fan count
        if num_tweets:
            informationItems["Num_Tweets"] = int(num_tweets[0])
        if num_follows:
            informationItems["Num_Follows"] = int(num_follows[0])
        if num_fans:
            informationItems["Num_Fans"] = int(num_fans[0])
        informationItems["_id"] = response.meta["ID"]
        url_information1 = "http://weibo.cn/%s/info" % response.meta["ID"]
        yield Request(url=url_information1, meta={"item": informationItems}, callback=self.parse1)
def parse0(self, response):
    if not response.body:
        # Empty body: retry the same request and ask the proxy middleware to switch proxies.
        req = response.request
        req.meta["change_proxy"] = True
        yield req
    else:
        # Scrape profile information, part 1.
        informationItems = InformationItem()
        selector = Selector(response)
        text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
        if text0:
            # With a single capture group in the pattern, each element returned by re.findall
            # is the text matched by the group, not the whole match.
            num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # tweet count
            num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # followee count
            num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # fan count
            if num_tweets:
                informationItems["Num_Tweets"] = int(num_tweets[0])
            if num_follows:
                informationItems["Num_Follows"] = int(num_follows[0])
            if num_fans:
                informationItems["Num_Fans"] = int(num_fans[0])
            informationItems["_id"] = response.meta["ID"]
            url_information1 = "http://weibo.cn/%s/info" % response.meta["ID"]  # unused in this variant
            yield informationItems
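# The empty-body branch above sets request.meta["change_proxy"], which implies a custom
# downloader middleware that rotates proxies. That middleware is not part of this excerpt;
# the sketch below is only an assumption of how it could work (ChangeProxyMiddleware and
# PROXY_POOL are hypothetical names, and the proxy addresses are placeholders).
import random

PROXY_POOL = [
    "http://127.0.0.1:8001",   # placeholder proxy addresses
    "http://127.0.0.1:8002",
]

class ChangeProxyMiddleware(object):
    def process_request(self, request, spider):
        # Switch to a different proxy when a previous attempt came back empty.
        if request.meta.get("change_proxy"):
            request.meta["proxy"] = random.choice(PROXY_POOL)
            request.meta["change_proxy"] = False
        return None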
def parse0(self, response):
    """Scrape profile information, part 1."""
    print("parse0: ", response)
    informationItems = InformationItem()
    selector = Selector(response)
    text0 = selector.xpath(
        'body/div[@class="u"]/div[@class="tip2"]').extract_first()
    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # tweet count
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # followee count
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # fan count
        if num_tweets:
            informationItems["Num_Tweets"] = int(num_tweets[0])
            print(".....", informationItems)
        if num_follows:
            informationItems["Num_Follows"] = int(num_follows[0])
        if num_fans:
            informationItems["Num_Fans"] = int(num_fans[0])
        informationItems["_id"] = response.meta["ID"]
        # informationItems["LastCrawlTime"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        url_information1 = "https://weibo.cn/%s/info" % response.meta["ID"]
        yield Request(url=url_information1, meta={"item": informationItems}, callback=self.parse1)
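# parse0 forwards the partially filled InformationItem to parse1 through response.meta.
# parse1 itself is not included in this excerpt; the sketch below only illustrates the
# hand-off, assuming parse1 picks the item back up and completes it from the /info page
# before yielding it (the extraction logic is omitted and hypothetical).
def parse1(self, response):
    informationItems = response.meta["item"]   # item started in parse0
    # ... extract the remaining profile fields from weibo.cn/<ID>/info here ...
    yield informationItems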