Example #1
0
 def _parse_user(self, response):
     """Parse a weibo.com profile page into a populated ``UserItem``.

     The page embeds its panes as ``FM.view({...})`` JSONP payloads inside
     ``<script>`` tags; each pane is located by its ``ns``/``domid`` prefix,
     converted back to HTML with ``JsonUtil.jsonp_to_html`` and queried with
     XPath.  Returns the ``UserItem`` on success, or ``None`` when any
     extraction step fails (the failure is logged, not raised).
     """
     _html = None  # pre-bound so the except-branch log can never NameError
     try:
         user_item = UserItem()
         _html = response.text
         # --- profile header pane: nickname / gender / vip / verified ---
         _json = response.xpath(
             '''/html/script[starts-with(text(),'FM.view({"ns":"pl.header.preloginHead.index",'''
             '''"domid":"Pl_Official_Headerv6') or starts-with(text(),'FM.view({"ns":"pl.header.head.index",'''
             '''"domid":"Pl_Official_Headerv6')]/text()''')[0].extract()
         _html = JsonUtil.jsonp_to_html(_json)
         _html_ele = HtmlResponse(url=response.url,
                                  encoding='utf-8',
                                  body=_html)
         user_item['nickname'] = _html_ele.xpath(
             './descendant::h1[@class="username"]/text()')[0].extract()
         gender_class = \
             _html_ele.xpath('./descendant::i[@class="W_icon icon_pf_female" or @class="W_icon icon_pf_male"]/'
                             '@class')[0].extract()
         # the icon class ends in "_female" / "_male"; keep only the suffix
         user_item['gender'] = gender_class[gender_class.rindex('_') + 1:]
         # VIP level 6 uses the style "W_icon icon_member6"
         user_item['is_vip'] = len(
             _html_ele.xpath(
                 './descendant::a[@href="http://vip.weibo.com/personal?'
                 'from=main"]/em[not(contains(@class,"icon_member_dis"))]').
             extract()) > 0
         user_item['verified'] = len(
             _html_ele.xpath(
                 './descendant::div[@class="pf_photo"]/a').extract()) > 0
         # NOTE(review): the "and 2" predicate is always true next to the
         # class test — possibly a leftover position filter; confirm.
         user_item['introduction'] = _html_ele.xpath(
             './descendant::div[@class="pf_intro" and 2]/text()'
         )[0].extract().strip()
         # --- user-info pane: level text such as "Lv.12" ---
         _json = response.xpath(
             '''/html/script[starts-with(text(),'FM.view({"ns":"pl.content.homeFeed.index",'''
             '''"domid":"Pl_Core_UserInfo')]/text()''').extract()[0]
         _html = JsonUtil.jsonp_to_html(_json)
         _html_ele = HtmlResponse(url=response.url,
                                  encoding='utf-8',
                                  body=_html)
         level_text = _html_ele.xpath(
             './descendant::a/span/text()')[0].extract()
         # numeric part after the dot, e.g. "Lv.12" -> 12
         user_item['level'] = int(level_text[level_text.index('.') + 1:])
         # --- tri-column stats pane: concern / fans / weibo counts ---
         _json = response.xpath(
             '''/html/script[starts-with(text(),'FM.view({"ns":"","domid":'''
             '''"Pl_Core_T8CustomTriColumn')]''').extract()[0]
         _html = JsonUtil.jsonp_to_html(_json)
         _html_ele = HtmlResponse(url=response.url,
                                  encoding='utf-8',
                                  body=_html)
         nums = _html_ele.xpath(
             './descendant::td/descendant::strong/text()').extract()
         user_item['concern_num'] = int(nums[0])
         user_item['fans_num'] = int(nums[1])
         user_item['weibo_num'] = int(nums[2])
         # strip the query string, if any, from the profile URL
         user_item['home_url'] = response.url[:response.url.
                                              index('?') if response.url.
                                              count('?') else None]
         Spider.log(self, 'user_item: %s' % user_item, level=logging.INFO)
         return user_item
     except Exception:
         # Narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
         # propagate; log the URL plus the last HTML fragment we were parsing.
         Spider.log(self, "%s\n%s" % (response.url, _html), logging.ERROR)
         traceback.print_exc()
Example #2
0
 def _parse_div_list_v2(self, div_list_v2):
     """Yield a ``WeiboItem`` for each video-style card in *div_list_v2*.

     Each card carries its text fields under ``div.list_des`` and its video
     link/metadata under ``div.vid``.
     """
     for i in div_list_v2:
         weibo_item = WeiboItem()
         weibo_item['mid'] = i.xpath('./@mid')[0].extract()
         weibo_item['nickname'] = \
             i.xpath('./div[@class="list_des"]/div[@class="subinfo_box clearfix"]/a[2]/span/text()')[0].extract()
         date_str = \
             i.xpath(
                 './div[@class="list_des"]/div[@class="subinfo_box clearfix"]/span[@class="subinfo S_txt2"]/'
                 'text()')[0].extract()
         weibo_item['date'] = self.__process_datestr(date_str)
         # every text node under the first child of list_des, joined into one body
         content_div = i.xpath(
             './div[@class="list_des"]/*[1]/*/descendant-or-self::text()'
         ).extract()
         weibo_item['content'] = ''.join(content_div)
         # @href is presumably protocol-relative ("//..."), hence the prefix
         weibo_item['source_url'] = 'http:' + i.xpath(
             './div[@class="vid"]/@href')[0].extract()
         weibo_item['image_urls'] = None
         # slice the percent-encoded video URL out of the action-data blob,
         # between 'video_src=' (10 == len('video_src=')) and '&cover_img='
         action_data = i.xpath(
             './div[@class="vid"]/@action-data')[0].extract()
         video_src = action_data[action_data.index('video_src=') +
                                 10:action_data.index('&cover_img=')]
         weibo_item['video_url'] = parse.unquote(video_src)
         # NOTE(review): unlike _parse_div_list_b, `[0].extract()` makes
         # `nums` a single string, so nums[-1]/-2/-3 index CHARACTERS —
         # any count above 9 would be read one digit at a time; confirm
         # this matches the page markup.
         nums = i.xpath(
             './div[@class="list_des"]/div[@class="subinfo_box clearfix subinfo_box_btm"]/span[@class="subinfo_rgt '
             'S_txt2"]/em[2]/text()')[0].extract()
         weibo_item['forwarding_num'] = int(nums[-1])
         weibo_item['comment_num'] = int(nums[-2])
         weibo_item['praise_num'] = int(nums[-3])
         Spider.log(self, weibo_item)
         yield weibo_item
Example #3
0
 def _parse_div_list_b(self, div_list_b):
     """Yield a ``WeiboItem`` for every picture-style card in *div_list_b*.

     Text fields come from the card's ``list_des`` pane, image URLs from
     its first child div; such cards never carry a video URL.
     """
     for card in div_list_b:
         item = WeiboItem()
         item['mid'] = card.xpath('./@mid')[0].extract()
         item['nickname'] = card.xpath(
             './div[@class="list_des"]/div[@class="subinfo_box clearfix"]/a[2]/span/text()'
         )[0].extract()
         # raw date text is normalised by the spider's helper
         raw_date = card.xpath(
             './div[@class="list_des"]/div[@class="subinfo_box clearfix"]/span[@class="subinfo S_txt2"]/text()'
         )[0].extract()
         item['date'] = self.__process_datestr(raw_date)
         # join every text node under the first child of list_des
         item['content'] = ''.join(card.xpath(
             './div[@class="list_des"]/*[1]/*/descendant-or-self::text()'
         ).extract())
         item['source_url'] = 'http:' + card.xpath('./@href')[0].extract()
         item['image_urls'] = card.xpath('./div[1]/img/@src').extract()
         item['video_url'] = None
         # the last three <em> texts hold the forward/comment/praise counts
         counts = card.xpath(
             './div[@class="list_des"]/div[@class="subinfo_box clearfix"]/span[@class="subinfo_rgt S_txt2"]/em[2]/text()'
         ).extract()
         item['forwarding_num'] = int(counts[-1])
         item['comment_num'] = int(counts[-2])
         item['praise_num'] = int(counts[-3])
         Spider.log(self, item)
         yield item
Example #4
0
    def process_request(self, request: Request, spider: Spider) -> None:
        """Route every request through the local Privoxy proxy, renewing the
        Tor circuit once ``max_count`` requests have gone out.

        Raises a plain ``Exception`` when a new exit IP cannot be obtained.
        """
        rotation_due = self.items_scraped >= self.max_count
        if rotation_due:
            spider.log('Changing Tor IP...')
            self.items_scraped = 0
            new_ip = self.tc.renew_ip()
            if new_ip:
                spider.log(f'New Tor IP: {new_ip}')
            else:
                raise Exception('FatalError: Failed to find a new IP')

        # Privoxy's default listen address; it forwards traffic into Tor.
        request.meta['proxy'] = 'http://127.0.0.1:8118'
        self.items_scraped += 1
Example #5
0
    def process_request(self, request: Request, spider: Spider):
        """Attach the current user-agent header to *request*, rotating to the
        next agent after a randomised number of uses."""
        if self.items_scraped >= self.limit_usage:
            # capture the outgoing agent/budget before they are replaced
            retiring, used_for = self.user_agent, self.limit_usage
            spider.log(f'Changing user-agent "{retiring}" after {used_for} requests')

            self.items_scraped = 0
            self.limit_usage = random.randint(self.min_usage, self.max_usage)
            self.user_agent = next(self.user_agents)

            spider.log(
                f'User-agent changed to "{self.user_agent}". A new user-agent '
                f'will be chosen after {self.limit_usage} requests'
            )

        request.headers['user-agent'] = self.user_agent
        self.items_scraped += 1
Example #6
0
    def wrapper(self, item: Dict, spider: Spider) -> Any:  # type: ignore
        """Run the wrapped ``process_item`` only when this pipeline class is
        enabled for *spider*; otherwise pass *item* through untouched.

        A spider opts in by listing pipeline classes in a ``pipelines`` set
        attribute (an optional ``pipelines_extra`` set is merged in as well).
        """
        enabled = set()
        # Collect the spider's opt-in sets; silently ignore attributes that
        # are missing or of any other type, as the original did.
        for attr in ("pipelines", "pipelines_extra"):
            declared = getattr(spider, attr, None)
            if isinstance(declared, set):
                enabled |= declared

        if self.__class__ not in enabled:
            # Pipeline not enabled for this spider: hand the item onward as-is.
            return item

        spider.log("Executing %s pipeline step" % self.__class__.__name__,
                   level=logging.INFO)
        return process_item_method(self, item, spider)