Example #1
0
 def parse_information(self, response):
     """ Scrape the user's personal information. """
     # NOTE(review): this snippet stops inside the ``try`` block below; the
     # matching ``except`` and the rest of the parsing are not visible here.
     informationItem = InformationItem()
     selector = Selector(response)
     # The user id is the run of digits directly before "/info" in the URL.
     # Raises IndexError when the URL contains no such segment.
     ID = re.findall('(\d+)/info', response.url)[0]
     try:
         text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())  # collect every text() node inside the tag
         # Capture what follows the "昵称" (nickname) label up to the next ';'.
         # NOTE(review): the character class lists ':' twice -- possibly a
         # full-width colon was lost; confirm against the page markup.
         nickname = re.findall('昵称[::]?(.*?);', text1)
Example #2
0
    def parse_information(self, response):
        """Scrape a user's profile fields from a weibo.cn ``<uid>/info`` page.

        The user id is the run of digits before ``/info`` in ``response.url``.
        Labelled fields (nickname, gender, region, ...) are extracted with
        regexes from the ';'-joined text nodes under
        ``body/div[@class="c"]`` and stored on an ``InformationItem``.  A
        second request to ``/attgroup/opening`` supplies the tweet, follow
        and fan counters.

        Parsing is best effort: failures are logged and never propagated.

        Args:
            response: response for the ``/info`` page; its ``url`` and
                ``request.cookies`` are read.

        NOTE(review): the populated item is neither returned nor yielded in
        the code visible here -- confirm against the full file.
        """
        import logging  # function-scope import: the file's import block is not visible here

        log = logging.getLogger(__name__)
        informationItem = InformationItem()
        selector = Selector(response)

        uid_match = re.search(r'(\d+)/info', response.url)
        if uid_match is None:
            # Without an id there is nothing to key the item on.
            log.warning("No user id found in %s; skipping profile parse", response.url)
            return
        ID = uid_match.group(1)

        def capture(label, text):
            """Return the first capture following *label* in *text*, or None."""
            # NOTE(review): the character class lists ':' twice -- possibly a
            # full-width colon was lost; confirm against the page markup.
            found = re.findall(label + u'[::]?(.*?);', text)
            return found[0] if found else None

        def clean(value):
            """Strip non-breaking spaces from *value*."""
            return value.replace(u"\xa0", "")

        try:
            # Flatten every text node of the profile block into one string.
            text1 = ";".join(
                selector.xpath('body/div[@class="c"]//text()').extract()
            )
            nickname = capture(u'昵称', text1)
            gender = capture(u'性别', text1)
            place = capture(u'地区', text1)
            briefIntroduction = capture(u'简介', text1)
            birthday = capture(u'生日', text1)
            sexOrientation = capture(u'性取向', text1)
            sentiment = capture(u'感情状况', text1)
            vipLevel = capture(u'会员等级', text1)
            authentication = capture(u'认证', text1)
            url = capture(u'互联网', text1)

            informationItem["_id"] = ID
            if nickname:
                informationItem["NickName"] = clean(nickname)
            if gender:
                informationItem["Gender"] = clean(gender)
            if place:
                # Region is "<province> <city>"; the city part is optional.
                parts = clean(place).split(" ")
                informationItem["Province"] = parts[0]
                if len(parts) > 1:
                    informationItem["City"] = parts[1]
            if briefIntroduction:
                informationItem["BriefIntroduction"] = clean(briefIntroduction)
            if birthday:
                try:
                    born = datetime.datetime.strptime(birthday, "%Y-%m-%d")
                    # NOTE(review): presumably shifts UTC+8 midnight to UTC -- confirm.
                    informationItem["Birthday"] = born - datetime.timedelta(hours=8)
                except ValueError:
                    # The field may hold a zodiac sign instead of a date.
                    informationItem['Birthday'] = birthday
            if sexOrientation and gender:
                # Compare cleaned values on both sides.  Gender is required:
                # indexing a missing match used to raise and abort the
                # remaining fields.
                if clean(sexOrientation) == clean(gender):
                    informationItem["SexOrientation"] = "同性恋"
                else:
                    informationItem["SexOrientation"] = "异性恋"
            if sentiment:
                informationItem["Sentiment"] = clean(sentiment)
            if vipLevel:
                informationItem["VIPlevel"] = clean(vipLevel)
            if authentication:
                informationItem["Authentication"] = clean(authentication)
            if url is not None:
                # Stored as-is whenever the label matched, even if empty.
                informationItem["URL"] = url

            # Tweet / follow / fan counters live on a separate page.
            # NOTE(review): this is a blocking request inside the callback;
            # consider scheduling it through the crawler instead.
            try:
                urlothers = "http://weibo.cn/attgroup/opening?uid=%s" % ID
                r = requests.get(urlothers,
                                 cookies=response.request.cookies,
                                 timeout=5)
                if r.status_code == 200:
                    tree = etree.HTML(r.content)
                    texts = ";".join(
                        tree.xpath('//body//div[@class="tip2"]/a//text()'))
                    if texts:
                        num_tweets = re.findall(u'微博\\[(\\d+)\\]', texts)
                        num_follows = re.findall(u'关注\\[(\\d+)\\]', texts)
                        num_fans = re.findall(u'粉丝\\[(\\d+)\\]', texts)
                        if num_tweets:
                            informationItem["Num_Tweets"] = int(num_tweets[0])
                        if num_follows:
                            informationItem["Num_Follows"] = int(num_follows[0])
                        if num_fans:
                            informationItem["Num_Fans"] = int(num_fans[0])
            except Exception as e:
                # Counters are optional; keep what was parsed so far.
                log.warning("Could not fetch counters for uid %s: %s", ID, e)
        except Exception as e:
            log.warning("Failed to parse profile of uid %s: %s", ID, e)
Example #3
0
    def parse_information(self, response):
        """Scrape a user's profile, tags and counters from weibo.cn.

        The user id is the run of digits before ``/info`` in ``response.url``.
        Labelled fields (nickname, gender, region, ...) are extracted with
        regexes from the ';'-joined text nodes under
        ``body/div[@class="c"]`` and stored on an ``InformationItem``.  Two
        further requests fetch the user's tags and the tweet, follow and fan
        counters.

        Parsing is best effort: failures are logged through ``self.logger``
        and never propagated.

        Args:
            response: response for the ``/info`` page; its ``url`` and
                ``request.cookies`` are read.

        NOTE(review): the populated item is neither returned nor yielded in
        the code visible here -- confirm against the full file.
        """
        informationItem = InformationItem()
        selector = Selector(response)

        uid_match = re.search(r'(\d+)/info', response.url)
        if uid_match is None:
            # Without an id there is nothing to key the item on.
            self.logger.warning(
                "No user id found in %s; skipping profile parse", response.url)
            return
        ID = uid_match.group(1)

        def capture(label, text):
            """Return the first capture following *label* in *text*, or None."""
            # The optional ';' absorbs the separator inserted when the label
            # and its value sit in different text nodes.
            # NOTE(review): the character class lists ':' twice -- possibly a
            # full-width colon was lost; confirm against the page markup.
            found = re.findall(label + u';?[::]?(.*?);', text)
            return found[0] if found else None

        def clean(value):
            """Strip non-breaking spaces from *value*."""
            return value.replace(u"\xa0", "")

        try:
            # Flatten every text node of the profile block into one string.
            text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
            self.logger.debug("profile text: %s", text1)
            nickname = capture(u'昵称', text1)
            gender = capture(u'性别', text1)
            place = capture(u'地区', text1)
            briefIntroduction = capture(u'简介', text1)
            birthday = capture(u'生日', text1)
            sexOrientation = capture(u'性取向', text1)
            sentiment = capture(u'感情状况', text1)
            vipLevel = capture(u'会员等级', text1)
            authentication = capture(u'认证', text1)
            url = capture(u'互联网', text1)
            self.logger.debug(
                "nickname=%r gender=%r url=%r", nickname, gender, url)

            informationItem["_id"] = ID
            if nickname:
                informationItem["NickName"] = clean(nickname)
            if gender:
                informationItem["Gender"] = clean(gender)
            if place:
                # Region is "<province> <city>"; the city part is optional.
                parts = clean(place).split(" ")
                informationItem["Province"] = parts[0]
                if len(parts) > 1:
                    informationItem["City"] = parts[1]
            if briefIntroduction:
                informationItem["BriefIntroduction"] = clean(briefIntroduction)
            if birthday:
                try:
                    born = datetime.datetime.strptime(birthday, "%Y-%m-%d")
                    # NOTE(review): why subtract 8 hours?  Presumably shifts
                    # UTC+8 midnight to UTC -- confirm.
                    informationItem["Birthday"] = born - datetime.timedelta(hours=8)
                except ValueError:
                    # The field may hold a zodiac sign instead of a date.
                    informationItem['Birthday'] = birthday
            if sexOrientation and gender:
                # Compare cleaned values on both sides.  Gender is required:
                # indexing a missing match used to raise and abort the
                # remaining fields.
                if clean(sexOrientation) == clean(gender):
                    informationItem["SexOrientation"] = "同性恋"
                else:
                    informationItem["SexOrientation"] = "异性恋"
            if sentiment:
                informationItem["Sentiment"] = clean(sentiment)
            if vipLevel:
                informationItem["VIPlevel"] = clean(vipLevel)
            if authentication:
                informationItem["Authentication"] = clean(authentication)
            if url is not None:
                # Stored as-is whenever the label matched, even if empty.
                informationItem["URL"] = url

            # Fetch the user's tags (optional).
            # NOTE(review): the ``st`` query value is hard-coded; it looks
            # like a session token -- confirm it stays valid.
            # NOTE(review): both requests below block inside the callback;
            # consider scheduling them through the crawler instead.
            try:
                tagurl = "https://weibo.cn/account/privacy/tags/?uid=%s&st=f856d8" % ID
                self.logger.debug("fetching tags from %s", tagurl)
                r = requests.get(tagurl, cookies=response.request.cookies, timeout=10)
                if r.status_code == 200:
                    tree = etree.HTML(r.content)
                    texts = ";".join(tree.xpath('//body//div[@class="c"]/a//text()'))
                    self.logger.debug("tag page link text: %s", texts)
                    if texts:
                        # Tags are the link texts between the "资料" and
                        # "皮肤" links.
                        tags = re.findall(u'资料;?(.*?);皮肤', texts)
                        if tags:
                            informationItem["TAGS"] = tags
            except Exception as e:
                self.logger.warning("Could not fetch tags for uid %s: %s", ID, e)

            # Tweet / follow / fan counters live on a separate page.
            try:
                urlothers = "https://weibo.cn/attgroup/opening?uid=%s" % ID
                r = requests.get(urlothers, cookies=response.request.cookies, timeout=5)
                if r.status_code == 200:
                    tree = etree.HTML(r.content)
                    # Log the URL, not the parsed tree: concatenating a str
                    # with the tree raised TypeError, so the counters below
                    # were never reached.
                    self.logger.debug("parsed counters page %s", urlothers)
                    texts = ";".join(tree.xpath('//body//div[@class="tip2"]/a//text()'))
                    if texts:
                        num_tweets = re.findall(u'微博\\[(\\d+)\\]', texts)
                        num_follows = re.findall(u'关注\\[(\\d+)\\]', texts)
                        num_fans = re.findall(u'粉丝\\[(\\d+)\\]', texts)
                        if num_tweets:
                            informationItem["Num_Tweets"] = int(num_tweets[0])
                        if num_follows:
                            informationItem["Num_Follows"] = int(num_follows[0])
                        if num_fans:
                            informationItem["Num_Fans"] = int(num_fans[0])
            except Exception as e:
                # Counters are optional; keep what was parsed so far.
                self.logger.warning("Could not fetch counters for uid %s: %s", ID, e)
        except Exception as e:
            self.logger.warning("Failed to parse profile of uid %s: %s", ID, e)
Example #4
0
 def parse_detail(self,response):
     """Print the text of the eighth ``<script>`` child of ``<html>``.

     Args:
         response: the page response wrapped in a ``Selector``.
     """
     selector = Selector(response)
     # NOTE(review): ``item`` is created but never used in the visible code --
     # confirm whether the rest of this callback was cut off.
     item = InformationItem()
     # Positional lookup: breaks if the page's script order changes.
     text1 = selector.xpath("/html/script[8]/text()").extract()
     # Parenthesised form prints the same on Python 2 and is valid on Python 3.
     print(text1)