def parse_information(self, response): """ 抓取个人信息 """ information_item = InformationItem() information_item['crawl_time'] = int(time.time()) selector = Selector(response) information_item['_id'] = re.findall('(\d+)/info', response.url)[0] text1 = ";".join( selector.xpath( 'body/div[@class="c"]//text()').extract()) # 获取标签里的所有text() name = re.findall('昵称;?[::]?(.*?);', text1) gender = re.findall('性别;?[::]?(.*?);', text1) place = re.findall('地区;?[::]?(.*?);', text1) briefIntroduction = re.findall('简介;?[::]?(.*?);', text1) birthday = re.findall('生日;?[::]?(.*?);', text1) if name and name[0]: information_item["nick_name"] = name[0].replace(u"\xa0", "") if gender and gender[0]: information_item["gender"] = gender[0].replace(u"\xa0", "") if place and place[0]: information_item["place"] = place[0].replace(u"\xa0", "") if briefIntroduction and briefIntroduction[0]: information_item["brief_introduction"] = briefIntroduction[ 0].replace(u"\xa0", "") if birthday and birthday[0]: information_item['birthday'] = birthday[0] # request_meta = response.meta # request_meta['item'] = information_item yield Request(self.base_url + '/u/{}'.format(information_item['_id']), callback=self.parse_further_information, meta={'item': information_item}, dont_filter=True, priority=1)
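# A minimal, standalone sketch of the extraction technique used above: all
# text() nodes of the profile <div class="c"> are joined with ";" so that each
# field can be matched as a "label[:：]value;" pair. The helper name and the
# sample string are illustrative only, not part of the spider.
import re

def _extract_profile_field(label, joined_text):
    """Return the value following `label` in the ';'-joined node text, or None."""
    found = re.findall(label + r';?[::]?(.*?);', joined_text)
    # \xa0 is the non-breaking space weibo.cn places after field labels
    return found[0].replace(u"\xa0", "") if found and found[0] else None

# Example (hypothetical page text):
#   sample = "昵称:张三;性别:男;地区:北京 海淀区;"
#   _extract_profile_field('昵称', sample)  -> '张三'
#   _extract_profile_field('地区', sample)  -> '北京 海淀区'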
def parse(self, response):
    selector = Selector(response)
    information_item = InformationItem()
    uid_from_url = re.findall(r'(\d+)/info', response.url)
    if uid_from_url:
        information_item['_id'] = uid_from_url[0]  # user id
    else:
        information_item['_id'] = "NA"
    information_item['page_url'] = response.url.replace(
        self.base_url, self.weibo_baseurl)
    information_item['page_raw'] = selector.extract()  # raw page content
    information_item['crawl_time_utc'] = dt.utcnow()
    yield information_item
    # request the user's tweets page
    if uid_from_url:
        yield Request(url=self.base_url + '/{}/profile?page=1'.format(information_item['_id']),
                      callback=self.parse_tweet,
                      meta={'user_id': information_item['_id']},
                      priority=1)
    else:
        yield Request(url=response.url + '?page=1',
                      callback=self.parse_tweet,
                      meta={'user_id': information_item['_id']},
                      priority=1)
def parse_information(self, response): """ :param response: :param from_twitter_id: 用来获取asker的爬取来自哪一个twitter,但是好像没有必要,因为数据都是当天爬取的 :return: """ """ 抓取个人信息 """ information_item = InformationItem() if response.meta.get('asker_from', None): information_item['asker_from_tweet'] = response.meta['asker_from'] information_item['crawl_time'] = int(time.time()) selector = Selector(response) information_item['_id'] = re.findall('(\d+)/info', response.url)[0] text1 = ";".join( selector.xpath( 'body/div[@class="c"]//text()').extract()) # 获取标签里的所有text() nick_name = re.findall('昵称;?[::]?(.*?);', text1) gender = re.findall('性别;?[::]?(.*?);', text1) place = re.findall('地区;?[::]?(.*?);', text1) brief_introduction = re.findall('简介;[::]?(.*?);', text1) birthday = re.findall('生日;?[::]?(.*?);', text1) sex_orientation = re.findall('性取向;?[::]?(.*?);', text1) sentiment = re.findall('感情状况;?[::]?(.*?);', text1) vip_level = re.findall('会员等级;?[::]?(.*?);', text1) authentication = re.findall('认证;?[::]?(.*?);', text1) if nick_name and nick_name[0]: information_item["nick_name"] = nick_name[0].replace(u"\xa0", "") if gender and gender[0]: information_item["gender"] = gender[0].replace(u"\xa0", "") if place and place[0]: place = place[0].replace(u"\xa0", "").split(" ") information_item["province"] = place[0] if len(place) > 1: information_item["city"] = place[1] if brief_introduction and brief_introduction[0]: information_item["brief_introduction"] = brief_introduction[ 0].replace(u"\xa0", "") if birthday and birthday[0]: information_item['birthday'] = birthday[0] if sex_orientation and sex_orientation[0]: if sex_orientation[0].replace(u"\xa0", "") == gender[0]: information_item["sex_orientation"] = "同性恋" else: information_item["sex_orientation"] = "异性恋" if sentiment and sentiment[0]: information_item["sentiment"] = sentiment[0].replace(u"\xa0", "") if vip_level and vip_level[0]: information_item["vip_level"] = vip_level[0].replace(u"\xa0", "") if authentication and authentication[0]: information_item["authentication"] = authentication[0].replace( u"\xa0", "") request_meta = response.meta request_meta['item'] = information_item yield Request(self.base_url + '/u/{}'.format(information_item['_id']), callback=self.parse_further_information, meta=request_meta, dont_filter=True, priority=4)
def parse_information(self, response): """ 抓取个人信息 """ informationItem = InformationItem() selector = Selector(response) ID = re.findall('(\d+)/info', response.url)[0] try: text1 = ";".join( selector.xpath('body/div[@class="c"]//text()').extract() ) # 获取标签里的所有text() nickname = re.findall('昵称;?[::]?(.*?);', text1) url = re.findall('互联网;?[::]?(.*?);', text1) informationItem["Num_Tweets"] = 0 informationItem["Num_Follows"] = 0 informationItem["Num_Fans"] = 0 informationItem["_id"] = ID if nickname and nickname[0]: informationItem["NickName"] = nickname[0].replace(u"\xa0", "") if url: informationItem["URL"] = url[0] try: urlothers = "https://weibo.cn/attgroup/opening?uid=%s" % ID new_ck = {} for ck in response.request.cookies: new_ck[ck['name']] = ck['value'] r = requests.get(urlothers, cookies=new_ck, timeout=5) if r.status_code == 200: selector = etree.HTML(r.content) texts = ";".join( selector.xpath('//body//div[@class="tip2"]/a//text()')) if texts: num_tweets = re.findall('微博\[(\d+)\]', texts) num_follows = re.findall('关注\[(\d+)\]', texts) num_fans = re.findall('粉丝\[(\d+)\]', texts) if num_tweets: informationItem["Num_Tweets"] = int(num_tweets[0]) if num_follows: informationItem["Num_Follows"] = int( num_follows[0]) if num_fans: informationItem["Num_Fans"] = int(num_fans[0]) except Exception as e: pass except Exception as e: pass else: yield informationItem if informationItem["Num_Tweets"] > 0: page_cnt = min((informationItem["Num_Tweets"] - 1) / 10, 2000) for i in range(0, int(page_cnt) + 1): yield Request( url="https://weibo.cn/%s/profile?filter=1&page=%s" % (ID, str(i)), callback=self.parse_tweets, dont_filter=True)
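# The count-scraping step above (and its duplicates later in this file) can be
# isolated into one helper. A minimal sketch under the same assumptions: the
# attgroup/opening page layout is unchanged and `cookies` is already a plain
# name->value dict. The helper name is illustrative, not part of the spiders.
import re
import requests
from lxml import etree

def _fetch_profile_counts(uid, cookies, timeout=5):
    """Return (num_tweets, num_follows, num_fans) for `uid`; zeros if missing."""
    r = requests.get("https://weibo.cn/attgroup/opening?uid=%s" % uid,
                     cookies=cookies, timeout=timeout)
    if r.status_code != 200:
        return 0, 0, 0
    texts = ";".join(
        etree.HTML(r.content).xpath('//body//div[@class="tip2"]/a//text()'))

    def first_int(pattern):
        found = re.findall(pattern, texts)
        return int(found[0]) if found else 0

    return (first_int(r'微博\[(\d+)\]'),    # tweets
            first_int(r'关注\[(\d+)\]'),    # follows
            first_int(r'粉丝\[(\d+)\]'))    # fans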
def parse(self, response): """ 抓取个人信息 """ information_item = InformationItem() information_item['crawl_time'] = int(time.time()) selector = Selector(response) information_item['_id'] = re.findall('(\d+)/info', response.url)[0] text1 = ";".join( selector.xpath( 'body/div[@class="c"]//text()').extract()) # 获取标签里的所有text() nick_name = re.findall('昵称;?[::]?(.*?);', text1) gender = re.findall('性别;?[::]?(.*?);', text1) place = re.findall('地区;?[::]?(.*?);', text1) briefIntroduction = re.findall('简介;?[::]?(.*?);', text1) birthday = re.findall('生日;?[::]?(.*?);', text1) sex_orientation = re.findall('性取向;?[::]?(.*?);', text1) sentiment = re.findall('感情状况;?[::]?(.*?);', text1) vip_level = re.findall('会员等级;?[::]?(.*?);', text1) authentication = re.findall('认证;?[::]?(.*?);', text1) labels = re.findall('标签;?[::]?(.*?)更多>>', text1) if nick_name and nick_name[0]: information_item["nick_name"] = nick_name[0].replace(u"\xa0", "") if gender and gender[0]: information_item["gender"] = gender[0].replace(u"\xa0", "") if place and place[0]: place = place[0].replace(u"\xa0", "").split(" ") information_item["province"] = place[0] if len(place) > 1: information_item["city"] = place[1] if briefIntroduction and briefIntroduction[0]: information_item["brief_introduction"] = briefIntroduction[ 0].replace(u"\xa0", "") if birthday and birthday[0]: information_item['birthday'] = birthday[0] if sex_orientation and sex_orientation[0]: if sex_orientation[0].replace(u"\xa0", "") == gender[0]: information_item["sex_orientation"] = "同性恋" else: information_item["sex_orientation"] = "异性恋" if sentiment and sentiment[0]: information_item["sentiment"] = sentiment[0].replace(u"\xa0", "") if vip_level and vip_level[0]: information_item["vip_level"] = vip_level[0].replace(u"\xa0", "") if authentication and authentication[0]: information_item["authentication"] = authentication[0].replace( u"\xa0", "") if labels and labels[0]: information_item["labels"] = labels[0].replace( u"\xa0", ",").replace(';', '').strip(',') request_meta = response.meta request_meta['item'] = information_item yield Request(self.base_url + '/u/{}'.format(information_item['_id']), callback=self.parse_further_information, meta=request_meta, dont_filter=True, priority=1)
def parse_information(self, response): """ 抓取评论个人信息 """ information_item = InformationItem() information_item['crawl_time'] = int(time.time()) selector = Selector(response) information_item['_id'] = re.findall('(\d+)/info', response.url)[0] text1 = ";".join( selector.xpath( 'body/div[@class="c"]//text()').extract()) # 获取标签里的所有text() nick_name = re.findall('昵称;?[::]?(.*?);', text1) gender = re.findall('性别;?[::]?(.*?);', text1) place = re.findall('地区;?[::]?(.*?);', text1) briefIntroduction = re.findall('简介;[::]?(.*?);', text1) birthday = re.findall('生日;?[::]?(.*?);', text1) sex_orientation = re.findall('性取向;?[::]?(.*?);', text1) sentiment = re.findall('感情状况;?[::]?(.*?);', text1) vip_level = re.findall('会员等级;?[::]?(.*?);', text1) authentication = re.findall('认证;?[::]?(.*?);', text1) if nick_name and nick_name[0]: information_item["nick_name"] = nick_name[0].replace(u"\xa0", "") if gender and gender[0]: information_item["gender"] = gender[0].replace(u"\xa0", "") if place and place[0]: place = place[0].replace(u"\xa0", "").split(" ") information_item["province"] = place[0] if len(place) > 1: information_item["city"] = place[1] if briefIntroduction and briefIntroduction[0]: information_item["brief_introduction"] = briefIntroduction[ 0].replace(u"\xa0", "") if birthday and birthday[0]: information_item['birthday'] = birthday[0] if sex_orientation and sex_orientation[0]: if sex_orientation[0].replace(u"\xa0", "") == gender[0]: information_item["sex_orientation"] = "同性恋" else: information_item["sex_orientation"] = "异性恋" if sentiment and sentiment[0]: information_item["sentiment"] = sentiment[0].replace(u"\xa0", "") if vip_level and vip_level[0]: information_item["vip_level"] = vip_level[0].replace(u"\xa0", "") if authentication and authentication[0]: information_item["authentication"] = authentication[0].replace( u"\xa0", "") information_item['mark'] = mark yield information_item
def parseInformation(self, response):
    """ Crawl a user's profile information (m.weibo.cn JSON API). """
    if len(response.body) > 50:
        print("###########################")
        print("Fetch information0 Success")
        print("###########################")
        informationItems = InformationItem()
        informations = json.loads(response.body)
        if informations.get("userInfo", ""):
            informationItems["_id"] = informations["userInfo"]["id"]
            informationItems["NickName"] = informations["userInfo"]["screen_name"]
            informationItems["Signature"] = informations["userInfo"]["description"]
            informationItems["Num_Tweets"] = informations["userInfo"]["statuses_count"]
            informationItems["Num_Follows"] = informations["userInfo"]["follow_count"]
            informationItems["Num_Fans"] = informations["userInfo"]["followers_count"]
            informationItems["Pic"] = informations["userInfo"]["avatar_hd"]
            informationItems["Gender"] = informations["userInfo"]["gender"]
            informationItems["Verified"] = informations["userInfo"]["verified_reason"]
            yield informationItems
        # Entry point for tweets
        tweets_container_id = informations["tabsInfo"]["tabs"][1]["containerid"]
        url_tweets = ("https://m.weibo.cn/api/container/getIndex"
                      "?type=uid&value=%s&containerid=%s"
                      % (response.meta["ID"], tweets_container_id))
        yield Request(url=url_tweets, meta={"ID": response.meta["ID"]},
                      callback=self.parseTweets, dont_filter=True)
        # Entry point for followees
        if informations.get("follow_scheme", ""):
            follow_scheme = informations["follow_scheme"]
            follow_container_id = re.findall(r"containerid=(.*)", follow_scheme)
            follow_container_id[0] = follow_container_id[0].replace(
                'followersrecomm', 'followers')
            url_follow = ("https://m.weibo.cn/api/container/getIndex?containerid="
                          + follow_container_id[0])
            yield Request(url=url_follow, meta={"ID": response.meta["ID"]},
                          callback=self.parseFollows, dont_filter=True)
        # Entry point for fans
        if informations.get("fans_scheme", ""):
            fans_scheme = informations["fans_scheme"]
            fans_container_id = re.findall(r"containerid=(.*)", fans_scheme)
            fans_container_id[0] = fans_container_id[0].replace(
                'fansrecomm', 'fans')
            url_fans = ("https://m.weibo.cn/api/container/getIndex?containerid="
                        + fans_container_id[0])
            yield Request(url=url_fans, meta={"ID": response.meta["ID"]},
                          callback=self.parseFans, dont_filter=True)
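# The followee/fan entry points above rewrite a "recommendation" containerid
# taken from the API's scheme deep link into the full-list containerid. A small
# sketch of that rewrite; the helper name is illustrative, not part of the spider.
import re

def _scheme_to_container_url(scheme, recomm_suffix, list_suffix):
    """Turn a scheme deep link into a getIndex API URL for the full list."""
    found = re.findall(r"containerid=(.*)", scheme)
    if not found:
        return None
    container_id = found[0].replace(recomm_suffix, list_suffix)
    return "https://m.weibo.cn/api/container/getIndex?containerid=" + container_id

# Usage, mirroring the two branches above:
#   _scheme_to_container_url(follow_scheme, 'followersrecomm', 'followers')
#   _scheme_to_container_url(fans_scheme, 'fansrecomm', 'fans')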
def parse_information(self, response): """ 抓取个人信息 """ informationItem = InformationItem() selector = Selector(response) ID = re.findall('(\d+)/info', response.url)[0] try: text1 = ";".join( selector.xpath('body/div[@class="c"]//text()').extract() ) # 获取标签里的所有text() nickname = re.findall('昵称[::]?(.*?);', text1) gender = re.findall('性别[::]?(.*?);', text1) place = re.findall('地区[::]?(.*?);', text1) briefIntroduction = re.findall('简介[::]?(.*?);', text1) birthday = re.findall('生日[::]?(.*?);', text1) sexOrientation = re.findall('性取向[::]?(.*?);', text1) sentiment = re.findall('感情状况[::]?(.*?);', text1) vipLevel = re.findall('会员等级[::]?(.*?);', text1) authentication = re.findall('认证[::]?(.*?);', text1) url = re.findall('互联网[::]?(.*?);', text1) informationItem["_id"] = ID if nickname and nickname[0]: informationItem["NickName"] = nickname[0].replace(u"\xa0", "") if gender and gender[0]: informationItem["Gender"] = gender[0].replace(u"\xa0", "") if place and place[0]: place = place[0].replace(u"\xa0", "").split(" ") informationItem["Province"] = place[0] if len(place) > 1: informationItem["City"] = place[1] if briefIntroduction and briefIntroduction[0]: informationItem["BriefIntroduction"] = briefIntroduction[ 0].replace(u"\xa0", "") if birthday and birthday[0]: try: birthday = datetime.datetime.strptime( birthday[0], "%Y-%m-%d") informationItem[ "Birthday"] = birthday - datetime.timedelta(hours=8) except Exception: informationItem['Birthday'] = birthday[0] # 有可能是星座,而非时间 if sexOrientation and sexOrientation[0]: if sexOrientation[0].replace(u"\xa0", "") == gender[0]: informationItem["SexOrientation"] = "同性恋" else: informationItem["SexOrientation"] = "异性恋" if sentiment and sentiment[0]: informationItem["Sentiment"] = sentiment[0].replace( u"\xa0", "") if vipLevel and vipLevel[0]: informationItem["VIPlevel"] = vipLevel[0].replace(u"\xa0", "") if authentication and authentication[0]: informationItem["Authentication"] = authentication[0].replace( u"\xa0", "") if url: informationItem["URL"] = url[0] try: urlothers = "https://weibo.cn/attgroup/opening?uid=%s" % ID new_ck = {} for ck in response.request.cookies: new_ck[ck['name']] = ck['value'] r = requests.get(urlothers, cookies=new_ck, timeout=5) if r.status_code == 200: selector = etree.HTML(r.content) texts = ";".join( selector.xpath('//body//div[@class="tip2"]/a//text()')) if texts: num_tweets = re.findall('微博\[(\d+)\]', texts) num_follows = re.findall('关注\[(\d+)\]', texts) num_fans = re.findall('粉丝\[(\d+)\]', texts) if num_tweets: informationItem["Num_Tweets"] = int(num_tweets[0]) if num_follows: informationItem["Num_Follows"] = int( num_follows[0]) if num_fans: informationItem["Num_Fans"] = int(num_fans[0]) except Exception as e: pass except Exception as e: pass else: yield informationItem if int(num_tweets[0]) < 5000: yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID, callback=self.parse_tweets, dont_filter=True) if int(num_follows[0]) < 500: yield Request(url="https://weibo.cn/%s/follow" % ID, callback=self.parse_relationship, dont_filter=True) if int(num_fans[0]) < 500: yield Request(url="https://weibo.cn/%s/fans" % ID, callback=self.parse_relationship, dont_filter=True)
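# The birthday field above holds either an ISO date or a zodiac sign, so the
# parse falls back to the raw string when strptime fails. A tiny standalone
# sketch of that fallback (the 8-hour shift converts Beijing time to UTC);
# the helper name is illustrative, not part of the spider.
import datetime

def _parse_birthday(raw):
    try:
        born = datetime.datetime.strptime(raw, "%Y-%m-%d")
        return born - datetime.timedelta(hours=8)  # CST (UTC+8) -> UTC
    except ValueError:
        return raw  # e.g. '狮子座' (a zodiac sign), stored verbatim

# _parse_birthday('1990-05-01') -> datetime.datetime(1990, 4, 30, 16, 0)
# _parse_birthday('狮子座')      -> '狮子座'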
def parse_information(self, response): """ functions: 1. catch basic informations 2. catch the number of tweets, follows, fans 3. request tweets as a corpus 4. request follows to make a loop for crawling all worthly user 5. request fans for analyzing the relationship """ informationItem = InformationItem() ID = re.findall('(\d+)/info', response.url)[0] try: # all infomation basicInfo = ';'.join( response.xpath('//div[@class="c"]/text()').extract()) # id informationItem["_id"] = ID # nickname if re.findall('昵称[::]?(.*?);', basicInfo): nickname = re.findall('昵称[::]?(.*?);', basicInfo) informationItem["NickName"] = nickname[0].replace(u"\xa0", "") # gender if re.findall('性别[::]?(.*?);', basicInfo): gender = re.findall('性别[::]?(.*?);', basicInfo) informationItem["Gender"] = gender[0].replace(u"\xa0", "") # place if re.findall('地区[::]?(.*?);', basicInfo): place = re.findall('地区[::]?(.*?);', basicInfo)[0].replace(u"\xa0", "").split(" ") informationItem["Province"] = place[0] if len(place) > 1: informationItem["City"] = place[1] # briefIntroduction if re.findall('简介[::]?(.*?);', basicInfo): briefIntroduction = re.findall('简介[::]?(.*?);', basicInfo) informationItem["BriefIntroduction"] = briefIntroduction[ 0].replace(u"\xa0", "") # birthday or Sign if re.findall('生日[::]?(.*?);', basicInfo): birthday = re.findall('生日[::]?(.*?);', basicInfo) try: birthday = datetime.datetime.strptime( birthday[0], "%Y-%m-%d") informationItem[ "Birthday"] = birthday - datetime.timedelta(hours=8) except Exception: # maybe zodiac informationItem['Birthday'] = birthday[0] # sexOrientaion if re.findall('性取向[::]?(.*?);', basicInfo): sexOrientation = re.findall('性取向[::]?(.*?);', basicInfo) if sexOrientation[0].replace(u"\xa0", "") == gender[0]: informationItem["SexOrientation"] = "同性恋" else: informationItem["SexOrientation"] = "异性恋" # sentiment if re.findall('感情状况[::]?(.*?);', basicInfo): sentiment = re.findall('感情状况[::]?(.*?);', basicInfo) informationItem["Sentiment"] = sentiment[0].replace( u"\xa0", "") # vipLevel if re.findall('会员等级[::]?(.*?);', basicInfo): vipLevel = re.findall('会员等级[::]?(.*?);', basicInfo) informationItem["VIPlevel"] = vipLevel[0].replace(u"\xa0", "") # authentication if re.findall('认证[::]?(.*?);', basicInfo): authentication = re.findall('认证[::]?(.*?);', basicInfo) informationItem["Authentication"] = authentication[0].replace( u"\xa0", "") # url if re.findall('互联网[::]?(.*?);', basicInfo): url = re.findall('互联网[::]?(.*?);', basicInfo) informationItem["URL"] = url[0] # get Tweets, Follows, Fans try: tweet_url = "https://weibo.cn/attgroup/opening?uid=%s" % ID r = requests.get(tweet_url, cookies=response.request.cookies, timeout=5) if r.status_code == 200: selector = etree.HTML(r.content) data = ";".join( selector.xpath('//body//div[@class="tip2"]/a//text()')) if data: num_tweets = re.findall('微博\[(\d+)\]', data) num_follows = re.findall('关注\[(\d+)\]', data) num_fans = re.findall('粉丝\[(\d+)\]', data) if num_tweets: informationItem["Num_Tweets"] = int(num_tweets[0]) if num_follows: informationItem["Num_Follows"] = int( num_follows[0]) if num_fans: informationItem["Num_Fans"] = int(num_fans[0]) except Exception as e: self.logger.info(e) pass except Exception as e: self.logger.info(e) pass else: print(informationItem) yield informationItem # filter worthless data if int(num_tweets[0]) < 5000: yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID, callback=self.parse_tweets, dont_filter=True) if int(num_follows[0]) < 500: yield Request(url="https://weibo.cn/%s/follow" % ID, 
callback=self.parse_relationship, dont_filter=True) if int(num_fans[0]) < 500: yield Request(url="https://weibo.cn/%s/fans" % ID, callback=self.parse_relationship, dont_filter=True)
def parse1(self, response):
    """ Crawl a user's profile information (part 2). """
    # Some fields may be missing from the page, but MySQL expects every column
    # when saving, so default them all to '' first. MongoDB does not have this
    # problem: it inserts the dict as-is, so per-field defaults are unnecessary.
    informationItems = InformationItem()
    informationItems['NickName'] = ''
    informationItems['Gender'] = ''
    informationItems['City'] = ''
    informationItems['URL'] = ''
    informationItems['Num_Fans'] = ''
    informationItems['Num_Follows'] = ''
    informationItems['Num_Tweets'] = ''
    informationItems['Province'] = ''
    informationItems['Signature'] = ''
    # informationItems = response.meta["item"]
    selector = Selector(response)
    ID = re.findall(r'weibo\.cn/(\d+)', response.url)[0]
    text1 = ";".join(
        selector.xpath('body/div[@class="c"]/text()').extract())
    print('text1:')
    print(text1)
    nickname = re.findall('昵称[:|:](.*?);', text1)          # nickname
    gender = re.findall('性别[:|:](.*?);', text1)            # gender
    place = re.findall('地区[:|:](.*?);', text1)             # region (province and city)
    signature = re.findall('简介[:|:](.*?);', text1)         # signature
    birthday = re.findall('生日[:|:](.*?);', text1)          # birthday
    sexorientation = re.findall('性取向[:|:](.*?);', text1)  # sexual orientation
    marriage = re.findall('感情状况[:|:](.*?);', text1)      # relationship status
    url = re.findall('互联网[:|:](.*?);', text1)             # homepage link
    print('nickname and gender:')
    print(nickname)
    print(gender)
    if nickname:
        informationItems["NickName"] = nickname[0]
    if gender:
        informationItems['Gender'] = gender[0]
    if place:
        place = place[0].split(' ')
        informationItems['Province'] = place[0]
        if len(place) > 1:
            informationItems['City'] = place[1]
    if signature:
        informationItems['Signature'] = signature[0]
    if birthday:
        try:
            birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
            # shift Beijing time (UTC+8) to UTC
            informationItems["Birthday"] = birthday - datetime.timedelta(hours=8)
        except Exception:
            pass
    if sexorientation and gender:
        if sexorientation[0] == gender[0]:
            informationItems["Sex_Orientation"] = "gay"
        else:
            informationItems["Sex_Orientation"] = "heterosexual"
    if marriage:
        informationItems['Marriage'] = marriage[0]
    if url:
        informationItems["URL"] = url[0]
    urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
    r = requests.get(urlothers, cookies=response.request.cookies)
    if r.status_code == 200:
        selector = etree.HTML(r.content)
        texts = ';'.join(selector.xpath('//div[@class="tip2"]/a/text()'))
        print('texts:')
        print(texts)
        if texts:
            num_tweets = re.findall(r'微博\[(\d+)\]', texts)    # number of tweets
            num_follows = re.findall(r'关注\[(\d+)\]', texts)   # number of follows
            num_fans = re.findall(r'粉丝\[(\d+)\]', texts)      # number of fans
            if num_tweets:
                informationItems['Num_Tweets'] = int(num_tweets[0])
            if num_follows:
                informationItems['Num_Follows'] = int(num_follows[0])
            if num_fans:
                informationItems['Num_Fans'] = int(num_fans[0])
    print('informationItems:')
    print(informationItems)
    yield informationItems
    contents = []
    tweets = TweetsItem()
    tweets['_id'] = ID
    tweets['Content'] = contents
    yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID,
                  meta={'item': tweets, 'contents': contents},
                  callback=self.parse_tweets)
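# parse1 defaults each MySQL-backed field with one assignment per line. A
# behavior-equivalent, more compact way to do the same defaulting is sketched
# below; the tuple and helper name are illustrative, with field names taken
# from the function above.
MYSQL_DEFAULT_FIELDS = ('NickName', 'Gender', 'City', 'URL', 'Num_Fans',
                        'Num_Follows', 'Num_Tweets', 'Province', 'Signature')

def _default_information_item():
    item = InformationItem()
    for field in MYSQL_DEFAULT_FIELDS:
        item[field] = ''  # MySQL pipeline expects every column to exist
    return item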
def parseInformation(self, response):
    """ Crawl a user's profile information (m.weibo.cn JSON API). """
    if len(response.body) > 50:
        print("###########################")
        print("Fetch information0 Success")
        print("###########################")
        ID = response.meta['ID']
        # self.db.Aims.delete_one({'ID': ID})
        self.db.Finished.insert_one({'ID': ID})
        informationItems = InformationItem()
        informations = json.loads(response.body)
        # print(informations)
        if informations.get("userInfo", ""):
            # print(informations["userInfo"])
            informationItems["_id"] = informations["userInfo"]["id"]
            informationItems["NickName"] = informations["userInfo"]["screen_name"]
            informationItems["Signature"] = informations["userInfo"]["description"]
            informationItems["Num_Tweets"] = informations["userInfo"]["statuses_count"]
            informationItems["Num_Follows"] = informations["userInfo"]["follow_count"]
            informationItems["Num_Fans"] = informations["userInfo"]["followers_count"]
            informationItems["User_Url"] = informations["userInfo"]["profile_url"]
            informationItems['Avatar'] = informations["userInfo"]["profile_image_url"]
            informationItems['LocalAvatar'] = ''
            informationItems['Cover'] = informations["userInfo"]['cover_image_phone']
            informationItems['LocalCover'] = ''
            informationItems['Used'] = False
            yield informationItems

            # # Entry point for tweets
            # tweets_container_id = informations["tabsInfo"]["tabs"][1]["containerid"]
            # url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s" % (
            #     response.meta["ID"], tweets_container_id)
            # yield Request(url=url_tweets,
            #               meta={"ID": response.meta["ID"], 'owner': informations["userInfo"]["screen_name"]},
            #               callback=self.parseTweets, dont_filter=True)

            # Entry point for original tweets: request the profile home page
            # once first, then the detailed-information container. The home
            # page is parsed to collect the IDs of original tweets.
            info_container_tabs = informations["tabsInfo"]["tabs"]
            info_raw_id = ''
            for tab in info_container_tabs:
                if tab['tab_type'] == "profile":
                    info_container_id = tab['containerid'] + '_' + '-' + '_INFO'
                    # print(info_container_id)
                    info_raw_id = tab['containerid']
                    home_url = ('https://m.weibo.cn/api/container/getIndex'
                                '?type=uid&value=%s&containerid=%s'
                                % (response.meta["ID"], info_raw_id))
                    yield Request(url=home_url,
                                  meta={"detail_id": info_raw_id,
                                        'ID': response.meta["ID"],
                                        'owner': informations["userInfo"]["screen_name"]},
                                  callback=self.parseHome,
                                  dont_filter=True)
                    url_details = ('https://m.weibo.cn/api/container/getIndex'
                                   '?type=uid&value=%s&containerid=%s'
                                   % (response.meta["ID"], info_container_id))
                    yield Request(url=url_details,
                                  meta={"detail_id": info_container_id,
                                        'ID': response.meta["ID"],
                                        'owner': informations["userInfo"]["screen_name"]},
                                  callback=self.parseDetails,
                                  dont_filter=True)
                    break

            # # Entry point for followees
            # if informations.get("follow_scheme", ""):
            #     follow_scheme = informations["follow_scheme"]
            #     follow_container_id = re.findall(r"containerid=(.*)", follow_scheme)
            #     follow_container_id[0] = follow_container_id[0].replace('followersrecomm', 'followers')
            #     url_follow = "https://m.weibo.cn/api/container/getIndex?containerid=" + follow_container_id[0]
            #     yield Request(url=url_follow, meta={"ID": response.meta["ID"]},
            #                   callback=self.parseFollows, dont_filter=True)
            # # Entry point for fans
            # if informations.get("fans_scheme", ""):
            #     fans_scheme = informations["fans_scheme"]
            #     fans_container_id = re.findall(r"containerid=(.*)", fans_scheme)
            #     fans_container_id[0] = fans_container_id[0].replace('fansrecomm', 'fans')
            #     url_fans = "https://m.weibo.cn/api/container/getIndex?containerid=" + fans_container_id[0]
            #     yield Request(url=url_fans, meta={"ID": response.meta["ID"]},
            #                   callback=self.parseFans, dont_filter=True)
    else:
        print("###########################")
        print("Fetch information0 Fail")
        print("###########################")
        return