Example 1
    def parse_information(self, response):
        """ Scrape the user's profile information. """
        information_item = InformationItem()
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        text1 = ";".join(
            selector.xpath(
                'body/div[@class="c"]//text()').extract())  # all text() inside the tags
        name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        brief_introduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        if name and name[0]:
            information_item["nick_name"] = name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            information_item["place"] = place[0].replace(u"\xa0", "")
        if brief_introduction and brief_introduction[0]:
            information_item["brief_introduction"] = brief_introduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        # request_meta = response.meta
        # request_meta['item'] = information_item
        yield Request(self.base_url + '/u/{}'.format(information_item['_id']),
                      callback=self.parse_further_information,
                      meta={'item': information_item},
                      dont_filter=True,
                      priority=1)
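The item travels to the next callback in the request's meta dict. None of these snippets show parse_further_information itself; a minimal sketch of what such a callback might look like, assuming it only recovers the partially filled item and yields it:

    def parse_further_information(self, response):
        # hypothetical sketch: recover the item attached above via meta={'item': ...}
        information_item = response.meta['item']
        # ... fill in additional fields from this page here ...
        yield information_item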
Example 2
    def parse(self, response):
        selector = Selector(response)

        information_item = InformationItem()
        uid_from_url = re.findall(r'(\d+)/info', response.url)
        if uid_from_url:
            information_item['_id'] = uid_from_url[0]  # get user id
        else:
            information_item['_id'] = "NA"

        information_item['page_url'] = response.url.replace(
            self.base_url, self.weibo_baseurl)
        information_item['page_raw'] = selector.extract()  # get raw page content
        information_item['crawl_time_utc'] = dt.utcnow()
        yield information_item

        # request tweets page
        if uid_from_url:
            yield Request(url=self.base_url +
                          '/{}/profile?page=1'.format(information_item['_id']),
                          callback=self.parse_tweet,
                          meta={'user_id': information_item['_id']},
                          priority=1)
        else:
            yield Request(url=response.url + '?page=1',
                          callback=self.parse_tweet,
                          meta={'user_id': information_item['_id']},
                          priority=1)
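The UID regex and its "NA" fallback are easy to sanity-check in isolation; both URLs below are made up:

    import re

    for url in ('https://weibo.cn/1234567890/info', 'https://weibo.cn/some_alias'):
        uid = re.findall(r'(\d+)/info', url)
        print(uid[0] if uid else 'NA')  # prints 1234567890, then NA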
Example 3
    def parse_information(self, response):
        """

        :param response:
        :param from_twitter_id: 用来获取asker的爬取来自哪一个twitter,但是好像没有必要,因为数据都是当天爬取的
        :return:
        """
        """ 抓取个人信息 """
        information_item = InformationItem()
        if response.meta.get('asker_from', None):
            information_item['asker_from_tweet'] = response.meta['asker_from']
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        text1 = ";".join(
            selector.xpath(
                'body/div[@class="c"]//text()').extract())  # all text() inside the tags
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        brief_introduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if brief_introduction and brief_introduction[0]:
            information_item["brief_introduction"] = brief_introduction[
                0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        if sex_orientation and sex_orientation[0] and gender:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"  # homosexual
            else:
                information_item["sex_orientation"] = "异性恋"  # heterosexual
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(
                u"\xa0", "")
        request_meta = response.meta
        request_meta['item'] = information_item
        yield Request(self.base_url + '/u/{}'.format(information_item['_id']),
                      callback=self.parse_further_information,
                      meta=request_meta,
                      dont_filter=True,
                      priority=4)
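Note that reusing response.meta as the outgoing meta forwards Scrapy's own bookkeeping keys (depth, download_slot, and so on) along with 'item' and 'asker_from'. A sketch of the more conservative alternative, passing only the keys the spider actually reads:

        yield Request(self.base_url + '/u/{}'.format(information_item['_id']),
                      callback=self.parse_further_information,
                      # pass only the keys this spider actually uses
                      meta={'item': information_item,
                            'asker_from': response.meta.get('asker_from')},
                      dont_filter=True,
                      priority=4)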
Example 4
    def parse_information(self, response):
        """ Scrape the user's profile information. """
        informationItem = InformationItem()
        selector = Selector(response)
        ID = re.findall(r'(\d+)/info', response.url)[0]
        try:
            text1 = ";".join(
                selector.xpath('body/div[@class="c"]//text()').extract()
            )  # all text() inside the tags
            nickname = re.findall('昵称;?[::]?(.*?);', text1)
            url = re.findall('互联网;?[::]?(.*?);', text1)

            informationItem["Num_Tweets"] = 0
            informationItem["Num_Follows"] = 0
            informationItem["Num_Fans"] = 0

            informationItem["_id"] = ID
            if nickname and nickname[0]:
                informationItem["NickName"] = nickname[0].replace(u"\xa0", "")
            if url:
                informationItem["URL"] = url[0]

            try:
                urlothers = "https://weibo.cn/attgroup/opening?uid=%s" % ID
                new_ck = {}
                for ck in response.request.cookies:
                    new_ck[ck['name']] = ck['value']
                r = requests.get(urlothers, cookies=new_ck, timeout=5)
                if r.status_code == 200:
                    selector = etree.HTML(r.content)
                    texts = ";".join(
                        selector.xpath('//body//div[@class="tip2"]/a//text()'))
                    if texts:
                        num_tweets = re.findall(r'微博\[(\d+)\]', texts)
                        num_follows = re.findall(r'关注\[(\d+)\]', texts)
                        num_fans = re.findall(r'粉丝\[(\d+)\]', texts)
                        if num_tweets:
                            informationItem["Num_Tweets"] = int(num_tweets[0])
                        if num_follows:
                            informationItem["Num_Follows"] = int(
                                num_follows[0])
                        if num_fans:
                            informationItem["Num_Fans"] = int(num_fans[0])
            except Exception:
                pass
        except Exception:
            pass
        else:
            yield informationItem
        if informationItem.get("Num_Tweets", 0) > 0:
            page_cnt = min((informationItem["Num_Tweets"] - 1) // 10, 2000)
            for i in range(0, page_cnt + 1):
                yield Request(
                    url="https://weibo.cn/%s/profile?filter=1&page=%s" %
                    (ID, str(i)),
                    callback=self.parse_tweets,
                    dont_filter=True)
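The pagination arithmetic: with 10 tweets per page, (Num_Tweets - 1) // 10 is the zero-based index of the last page, and the cap of 2000 bounds the crawl. A quick check with made-up counts:

    for n in (1, 10, 11, 25000):
        last = min((n - 1) // 10, 2000)
        print(n, '->', last + 1, 'page requests')  # 1, 1, 2, 2001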
Example 5
    def parse(self, response):
        """ Scrape the user's profile information. """
        information_item = InformationItem()
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        text1 = ";".join(
            selector.xpath(
                'body/div[@class="c"]//text()').extract())  # all text() inside the tags
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        brief_introduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        labels = re.findall('标签;?[::]?(.*?)更多>>', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if brief_introduction and brief_introduction[0]:
            information_item["brief_introduction"] = brief_introduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        if sex_orientation and sex_orientation[0] and gender:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"
            else:
                information_item["sex_orientation"] = "异性恋"
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = labels[0].replace(
                u"\xa0", ",").replace(';', '').strip(',')
        request_meta = response.meta
        request_meta['item'] = information_item
        yield Request(self.base_url + '/u/{}'.format(information_item['_id']),
                      callback=self.parse_further_information,
                      meta=request_meta,
                      dont_filter=True,
                      priority=1)
Example 6
    def parse_information(self, response):
        """ Scrape a commenter's profile information. """
        information_item = InformationItem()
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = re.findall(r'(\d+)/info', response.url)[0]
        text1 = ";".join(
            selector.xpath(
                'body/div[@class="c"]//text()').extract())  # all text() inside the tags
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        brief_introduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if brief_introduction and brief_introduction[0]:
            information_item["brief_introduction"] = brief_introduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        if sex_orientation and sex_orientation[0] and gender:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"
            else:
                information_item["sex_orientation"] = "异性恋"
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = authentication[0].replace(u"\xa0", "")
        # 'mark' is undefined in the original snippet; it presumably arrives
        # via the request meta, so read it from there (assumption).
        information_item['mark'] = response.meta.get('mark')
        yield information_item
Example 7
    def parseInformation(self, response):
        """ Scrape personal information (stage 1). """
        if len(response.body) > 50:
            print("###########################")
            print("Fetch information0 Success")
            print("###########################")

            informationItems = InformationItem()
            informations = json.loads(response.body)
            if informations.get("userInfo", ""):
                informationItems["_id"] = informations["userInfo"]["id"]
                informationItems["NickName"] = informations["userInfo"]["screen_name"]
                informationItems["Signature"] = informations["userInfo"]["description"]
                informationItems["Num_Tweets"] = informations["userInfo"]["statuses_count"]
                informationItems["Num_Follows"] = informations["userInfo"]["follow_count"]
                informationItems["Num_Fans"] = informations["userInfo"]["followers_count"]
                informationItems["Pic"] = informations["userInfo"]["avatar_hd"]
                informationItems["Gender"] = informations["userInfo"]["gender"]
                informationItems["Verified"] = informations["userInfo"]["verified_reason"]
                yield informationItems

            # tweets entry point
            tweets_container_id = informations["tabsInfo"]["tabs"][1]["containerid"]
            url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s" % (
                response.meta["ID"], tweets_container_id)
            yield Request(url=url_tweets, meta={"ID": response.meta["ID"]},
                          callback=self.parseTweets, dont_filter=True)

            # followees entry point
            if informations.get("follow_scheme", ""):
                follow_scheme = informations["follow_scheme"]
                follow_container_id = re.findall(r"containerid=(.*)", follow_scheme)
                follow_container_id[0] = follow_container_id[0].replace('followersrecomm', 'followers')
                url_follow = "https://m.weibo.cn/api/container/getIndex?containerid=" + follow_container_id[0]
                yield Request(url=url_follow, meta={"ID": response.meta["ID"]},
                              callback=self.parseFollows, dont_filter=True)

            # fans entry point
            if informations.get("fans_scheme", ""):
                fans_scheme = informations["fans_scheme"]
                fans_container_id = re.findall(r"containerid=(.*)", fans_scheme)
                fans_container_id[0] = fans_container_id[0].replace('fansrecomm', 'fans')
                url_fans = "https://m.weibo.cn/api/container/getIndex?containerid=" + fans_container_id[0]
                yield Request(url=url_fans, meta={"ID": response.meta["ID"]},
                              callback=self.parseFans, dont_filter=True)
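The greedy containerid=(.*) capture runs to the end of the scheme string, so it only yields a clean id when containerid happens to be the last query parameter. A hypothetical alternative that parses the scheme URL properly with the standard library:

    from urllib.parse import urlparse, parse_qs

    def container_id_from_scheme(scheme):
        # scheme looks like 'sinaweibo://cardlist?containerid=...&...' (illustrative)
        qs = parse_qs(urlparse(scheme).query)
        return qs.get('containerid', [''])[0]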
Example 8
    def parse_information(self, response):
        """ Scrape the user's profile information. """
        informationItem = InformationItem()
        selector = Selector(response)
        ID = re.findall(r'(\d+)/info', response.url)[0]
        num_tweets = num_follows = num_fans = []  # defaults so the checks below are safe
        try:
            text1 = ";".join(
                selector.xpath('body/div[@class="c"]//text()').extract()
            )  # all text() inside the tags
            nickname = re.findall('昵称[::]?(.*?);', text1)
            gender = re.findall('性别[::]?(.*?);', text1)
            place = re.findall('地区[::]?(.*?);', text1)
            briefIntroduction = re.findall('简介[::]?(.*?);', text1)
            birthday = re.findall('生日[::]?(.*?);', text1)
            sexOrientation = re.findall('性取向[::]?(.*?);', text1)
            sentiment = re.findall('感情状况[::]?(.*?);', text1)
            vipLevel = re.findall('会员等级[::]?(.*?);', text1)
            authentication = re.findall('认证[::]?(.*?);', text1)
            url = re.findall('互联网[::]?(.*?);', text1)

            informationItem["_id"] = ID
            if nickname and nickname[0]:
                informationItem["NickName"] = nickname[0].replace(u"\xa0", "")
            if gender and gender[0]:
                informationItem["Gender"] = gender[0].replace(u"\xa0", "")
            if place and place[0]:
                place = place[0].replace(u"\xa0", "").split(" ")
                informationItem["Province"] = place[0]
                if len(place) > 1:
                    informationItem["City"] = place[1]
            if briefIntroduction and briefIntroduction[0]:
                informationItem["BriefIntroduction"] = briefIntroduction[
                    0].replace(u"\xa0", "")
            if birthday and birthday[0]:
                try:
                    birthday = datetime.datetime.strptime(
                        birthday[0], "%Y-%m-%d")
                    informationItem[
                        "Birthday"] = birthday - datetime.timedelta(hours=8)
                except Exception:
                    informationItem['Birthday'] = birthday[0]  # may be a zodiac sign, not a date
            if sexOrientation and sexOrientation[0] and gender:
                if sexOrientation[0].replace(u"\xa0", "") == gender[0]:
                    informationItem["SexOrientation"] = "同性恋"
                else:
                    informationItem["SexOrientation"] = "异性恋"
            if sentiment and sentiment[0]:
                informationItem["Sentiment"] = sentiment[0].replace(
                    u"\xa0", "")
            if vipLevel and vipLevel[0]:
                informationItem["VIPlevel"] = vipLevel[0].replace(u"\xa0", "")
            if authentication and authentication[0]:
                informationItem["Authentication"] = authentication[0].replace(
                    u"\xa0", "")
            if url:
                informationItem["URL"] = url[0]

            try:
                urlothers = "https://weibo.cn/attgroup/opening?uid=%s" % ID
                new_ck = {}
                for ck in response.request.cookies:
                    new_ck[ck['name']] = ck['value']
                r = requests.get(urlothers, cookies=new_ck, timeout=5)
                if r.status_code == 200:
                    selector = etree.HTML(r.content)
                    texts = ";".join(
                        selector.xpath('//body//div[@class="tip2"]/a//text()'))
                    if texts:
                        num_tweets = re.findall(r'微博\[(\d+)\]', texts)
                        num_follows = re.findall(r'关注\[(\d+)\]', texts)
                        num_fans = re.findall(r'粉丝\[(\d+)\]', texts)
                        if num_tweets:
                            informationItem["Num_Tweets"] = int(num_tweets[0])
                        if num_follows:
                            informationItem["Num_Follows"] = int(
                                num_follows[0])
                        if num_fans:
                            informationItem["Num_Fans"] = int(num_fans[0])
            except Exception:
                pass
        except Exception:
            pass
        else:
            yield informationItem
        if num_tweets and int(num_tweets[0]) < 5000:
            yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" %
                          ID,
                          callback=self.parse_tweets,
                          dont_filter=True)
        if num_follows and int(num_follows[0]) < 500:
            yield Request(url="https://weibo.cn/%s/follow" % ID,
                          callback=self.parse_relationship,
                          dont_filter=True)
        if num_fans and int(num_fans[0]) < 500:
            yield Request(url="https://weibo.cn/%s/fans" % ID,
                          callback=self.parse_relationship,
                          dont_filter=True)
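The new_ck loop above assumes response.request.cookies is a list of {'name': ..., 'value': ...} dicts, i.e. the form in which cookies were originally attached to the Request. A dict comprehension does the same conversion in one line:

    new_ck = {ck['name']: ck['value'] for ck in response.request.cookies}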
Example 9
    def parse_information(self, response):
        """
        functions:
           1. catch basic informations
           2. catch the number of tweets, follows, fans
           3. request tweets as a corpus
           4. request follows to make a loop for crawling all worthly user
           5. request fans for analyzing the relationship 
        """
        informationItem = InformationItem()
        ID = re.findall(r'(\d+)/info', response.url)[0]
        num_tweets = num_follows = num_fans = []  # defaults so the filters below are safe
        try:
            # all information
            basicInfo = ';'.join(
                response.xpath('//div[@class="c"]/text()').extract())
            # id
            informationItem["_id"] = ID
            # nickname
            nickname = re.findall('昵称[::]?(.*?);', basicInfo)
            if nickname:
                informationItem["NickName"] = nickname[0].replace(u"\xa0", "")
            # gender
            gender = re.findall('性别[::]?(.*?);', basicInfo)
            if gender:
                informationItem["Gender"] = gender[0].replace(u"\xa0", "")
            # place
            place = re.findall('地区[::]?(.*?);', basicInfo)
            if place:
                place = place[0].replace(u"\xa0", "").split(" ")
                informationItem["Province"] = place[0]
                if len(place) > 1:
                    informationItem["City"] = place[1]
            # brief introduction
            briefIntroduction = re.findall('简介[::]?(.*?);', basicInfo)
            if briefIntroduction:
                informationItem["BriefIntroduction"] = briefIntroduction[0].replace(u"\xa0", "")
            # birthday, or possibly a zodiac sign
            birthday = re.findall('生日[::]?(.*?);', basicInfo)
            if birthday:
                try:
                    parsed = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
                    informationItem["Birthday"] = parsed - datetime.timedelta(hours=8)
                except Exception:
                    # may be a zodiac sign rather than a date
                    informationItem['Birthday'] = birthday[0]
            # sex orientation
            sexOrientation = re.findall('性取向[::]?(.*?);', basicInfo)
            if sexOrientation and gender:
                if sexOrientation[0].replace(u"\xa0", "") == gender[0]:
                    informationItem["SexOrientation"] = "同性恋"
                else:
                    informationItem["SexOrientation"] = "异性恋"
            # sentiment / relationship status
            sentiment = re.findall('感情状况[::]?(.*?);', basicInfo)
            if sentiment:
                informationItem["Sentiment"] = sentiment[0].replace(u"\xa0", "")
            # VIP level
            vipLevel = re.findall('会员等级[::]?(.*?);', basicInfo)
            if vipLevel:
                informationItem["VIPlevel"] = vipLevel[0].replace(u"\xa0", "")
            # authentication
            authentication = re.findall('认证[::]?(.*?);', basicInfo)
            if authentication:
                informationItem["Authentication"] = authentication[0].replace(u"\xa0", "")
            # url
            url = re.findall('互联网[::]?(.*?);', basicInfo)
            if url:
                informationItem["URL"] = url[0]

            # get Tweets, Follows, Fans
            try:
                tweet_url = "https://weibo.cn/attgroup/opening?uid=%s" % ID
                r = requests.get(tweet_url,
                                 cookies=response.request.cookies,
                                 timeout=5)
                if r.status_code == 200:
                    selector = etree.HTML(r.content)
                    data = ";".join(
                        selector.xpath('//body//div[@class="tip2"]/a//text()'))
                    if data:
                        num_tweets = re.findall(r'微博\[(\d+)\]', data)
                        num_follows = re.findall(r'关注\[(\d+)\]', data)
                        num_fans = re.findall(r'粉丝\[(\d+)\]', data)
                        if num_tweets:
                            informationItem["Num_Tweets"] = int(num_tweets[0])
                        if num_follows:
                            informationItem["Num_Follows"] = int(
                                num_follows[0])
                        if num_fans:
                            informationItem["Num_Fans"] = int(num_fans[0])
            except Exception as e:
                self.logger.info(e)
        except Exception as e:
            self.logger.info(e)
        else:
            print(informationItem)
            yield informationItem
        # filter out accounts that are not worth crawling further
        if num_tweets and int(num_tweets[0]) < 5000:
            yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" %
                          ID,
                          callback=self.parse_tweets,
                          dont_filter=True)
        if num_follows and int(num_follows[0]) < 500:
            yield Request(url="https://weibo.cn/%s/follow" % ID,
                          callback=self.parse_relationship,
                          dont_filter=True)
        if num_fans and int(num_fans[0]) < 500:
            yield Request(url="https://weibo.cn/%s/fans" % ID,
                          callback=self.parse_relationship,
                          dont_filter=True)
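Every one of these spiders repeats the same findall-per-label pattern against the joined profile text. A small hypothetical helper capturing the idea, exercised on a synthetic profile string:

    import re

    def extract_field(text, label):
        # value after '<label>:' or '<label>:', up to the next ';', else None
        match = re.findall('%s[::]?(.*?);' % label, text)
        return match[0].replace(u'\xa0', '') if match and match[0] else None

    sample = '昵称:test_user;性别:男;地区:北京 海淀区;'
    print(extract_field(sample, '昵称'))  # -> test_user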
Example 10
    def parse1(self, response):
        '''Scrape personal information (stage 2).'''
        # Some of these fields may be absent, and MySQL expects a value for every
        # column, so default them all to empty first so the insert cannot fail.
        # MongoDB doesn't have this problem: the dict is inserted as-is, so not
        # every field needs a value.
        informationItems = InformationItem()
        informationItems['NickName'] = ''
        informationItems['Gender'] = ''
        informationItems['City'] = ''
        informationItems['URL'] = ''
        informationItems['Num_Fans'] = ''
        informationItems['Num_Follows'] = ''
        informationItems['Num_Tweets'] = ''
        informationItems['Province'] = ''
        informationItems['Signature'] = ''
        #		informationItems = response.meta["item"]
        selector = Selector(response)
        ID = re.findall(r'weibo\.cn/(\d+)', response.url)[0]
        text1 = ";".join(
            selector.xpath('body/div[@class="c"]/text()').extract())
        print('text1 is:')
        print(text1)
        nickname = re.findall('昵称[::](.*?);', text1)  # nickname
        gender = re.findall('性别[::](.*?);', text1)  # gender
        place = re.findall('地区[::](.*?);', text1)  # region (province and city)
        signature = re.findall('简介[::](.*?);', text1)  # personal signature
        birthday = re.findall('生日[::](.*?);', text1)  # birthday
        sexorientation = re.findall('性取向[::](.*?);', text1)  # sexual orientation
        marriage = re.findall('感情状况[::](.*?);', text1)  # relationship status
        url = re.findall('互联网[::](.*?);', text1)  # homepage link
        print('nickname and gender are:')
        print(nickname)
        print(gender)
        if nickname:
            informationItems["NickName"] = nickname[0]
        if gender:
            informationItems['Gender'] = gender[0]
        if place:
            place = place[0].split(' ')
            informationItems['Province'] = place[0]
            if len(place) > 1:
                informationItems['City'] = place[1]
        if signature:
            informationItems['Signature'] = signature[0]
        if birthday:
            try:
                birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
                informationItems["Birthday"] = birthday - datetime.timedelta(
                    hours=8)
            except Exception:
                pass

        if sexorientation and gender:
            if sexorientation[0] == gender[0]:
                informationItems["Sex_Orientation"] = "homosexual"
            else:
                informationItems["Sex_Orientation"] = "heterosexual"
        if marriage:
            informationItems['Marriage'] = marriage[0]

        if url:
            informationItems["URL"] = url[0]

        urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
        r = requests.get(urlothers, cookies=response.request.cookies)
        if r.status_code == 200:
            selector = etree.HTML(r.content)
            texts = ';'.join(selector.xpath('//div[@class="tip2"]/a/text()'))
            print('texts is:')
            print(texts)
            if texts:
                num_tweets = re.findall(r'微博\[(\d+)\]', texts)  # tweet count
                num_follows = re.findall(r'关注\[(\d+)\]', texts)  # followee count
                num_fans = re.findall(r'粉丝\[(\d+)\]', texts)  # fan count
                if num_tweets:
                    informationItems['Num_Tweets'] = int(num_tweets[0])
                if num_follows:
                    informationItems['Num_Follows'] = int(num_follows[0])
                if num_fans:
                    informationItems['Num_Fans'] = int(num_fans[0])
        print('informationItems is:')
        print(informationItems)
        yield informationItems

        contents = []
        tweets = TweetsItem()
        tweets['_id'] = ID
        tweets['Content'] = contents

        yield Request(url="https://weibo.cn/%s/profile?filter=1&page=1" % ID,
                      meta={
                          'item': tweets,
                          'contents': contents
                      },
                      callback=self.parse_tweets)
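The timedelta(hours=8) subtraction in the birthday handling converts the date, implicitly in Beijing time (UTC+8), to UTC before storage. A quick check with a made-up date:

    import datetime

    bj = datetime.datetime.strptime('1990-06-15', '%Y-%m-%d')
    print(bj - datetime.timedelta(hours=8))  # 1990-06-14 16:00:00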
Example 11
    def parseInformation(self, response):
        """ Scrape personal information (stage 1). """
        if len(response.body) > 50:
            print("###########################")
            print("Fetch information0 Success")
            print("###########################")
            ID = response.meta['ID']
            # self.db.Aims.delete_one({'ID': ID})
            self.db.Finished.insert_one({'ID': ID})
            informationItems = InformationItem()
            informations = json.loads(response.body)
            # print informations
            if informations.get("userInfo", ""):
                # print informations["userInfo"]
                informationItems["_id"] = informations["userInfo"]["id"]
                informationItems["NickName"] = informations["userInfo"][
                    "screen_name"]
                informationItems["Signature"] = informations["userInfo"][
                    "description"]
                informationItems["Num_Tweets"] = informations["userInfo"][
                    "statuses_count"]
                informationItems["Num_Follows"] = informations["userInfo"][
                    "follow_count"]
                informationItems["Num_Fans"] = informations["userInfo"][
                    "followers_count"]
                informationItems["User_Url"] = informations["userInfo"][
                    "profile_url"]
                informationItems['Avatar'] = informations["userInfo"][
                    "profile_image_url"]
                informationItems['LocalAvatar'] = ''
                informationItems['Cover'] = informations["userInfo"][
                    'cover_image_phone']
                informationItems['LocalCover'] = ''
                informationItems['Used'] = False
                yield informationItems

            # # tweets entry point
            # tweets_container_id = informations["tabsInfo"]["tabs"][1]["containerid"]
            # url_tweets = "https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s" % (
            #     response.meta["ID"], tweets_container_id)
            # yield Request(url=url_tweets, meta={"ID": response.meta["ID"], 'owner': informations["userInfo"]["screen_name"]},
            #               callback=self.parseTweets, dont_filter=True)

            # detailed-info entry point
            info_container_tabs = informations["tabsInfo"]["tabs"]

            # original-tweets entry point: fetch the home page first to get
            # the IDs of the user's original tweets

            # info_container_id = ''
            info_raw_id = ''
            for tab in info_container_tabs:
                if tab['tab_type'] == "profile":
                    info_container_id = tab['containerid'] + '_-_INFO'
                    # print(info_container_id)
                    info_raw_id = tab['containerid']
                    home_url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s' % (
                        response.meta["ID"], info_raw_id)
                    yield Request(url=home_url,
                                  meta={"detail_id": info_raw_id,
                                        'ID': response.meta["ID"],
                                        'owner': informations["userInfo"]["screen_name"]},
                                  callback=self.parseHome,
                                  dont_filter=True)

                    url_details = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=%s&containerid=%s' % (
                        response.meta["ID"], info_container_id)
                    yield Request(url=url_details,
                                  meta={"detail_id": info_container_id,
                                        'ID': response.meta["ID"],
                                        'owner': informations["userInfo"]["screen_name"]},
                                  callback=self.parseDetails,
                                  dont_filter=True)
                    break

            # process the home page response to extract the IDs of original tweets

            # followees entry point
            # if informations.get("follow_scheme", ""):
            #     follow_scheme = informations["follow_scheme"]
            #     follow_container_id = re.findall(
            #         r"containerid=(.*)", follow_scheme)
            #     follow_container_id[0] = follow_container_id[0].replace(
            #         'followersrecomm', 'followers')
            #     url_follow = "https://m.weibo.cn/api/container/getIndex?containerid=" + \
            #         follow_container_id[0]
            #     yield Request(url=url_follow, meta={"ID": response.meta["ID"]}, callback=self.parseFollows, dont_filter=True)

            # fans entry point
            # if informations.get("fans_scheme", ""):
            #     fans_scheme = informations["fans_scheme"]
            #     fans_container_id = re.findall(
            #         r"containerid=(.*)", fans_scheme)
            #     fans_container_id[0] = fans_container_id[0].replace(
            #         'fansrecomm', 'fans')
            #     url_fans = "https://m.weibo.cn/api/container/getIndex?containerid=" + \
            #         fans_container_id[0]
            #     yield Request(url=url_fans, meta={"ID": response.meta["ID"]}, callback=self.parseFans, dont_filter=True)
        else:
            print("###########################")
            print("Fetch information0 Fail")
            print("###########################")
            return