Example #1
0
    def parse(self, response):
        """Yield an ``ArticlesItem`` for every link matched on the page.

        The CSS selector is read from ``self.site["robot_url_tag"]``. For each
        matched fragment the text and ``href`` of its ``<a>`` element become
        the item's ``title`` and ``url``; ``status`` is always set to ``0``.
        Hrefs that do not start with ``http`` are prefixed with
        ``http://<self.host>/``.

        Side effect: ``self.host`` is set to
        ``get_domain_root(response.url)``.

        An error is logged and nothing is yielded when the selector matches
        nothing. Fragments whose anchor has no ``href`` are skipped.
        """
        self.host = get_domain_root(response.url)

        fragments = response.css(self.site["robot_url_tag"]).extract()

        # ``extract()`` returns a list, so a selector that matches nothing
        # shows up as an empty list rather than ``None``.
        if not fragments:
            self.log("articles中的robot_url_tag选择器写的不正确", level=logging.ERROR)
            return

        for fragment in fragments:
            # Parse the fragment once and query it twice.
            link = Selector(text=fragment)
            title = link.css('a::text').extract_first()
            url = link.css('a::attr(href)').extract_first()

            # ``extract_first()`` yields ``None`` when there is no href;
            # such a fragment cannot be turned into a usable item.
            if url is None:
                continue

            if not url.startswith('http'):
                # Strip leading slashes so a root-relative href does not
                # become "http://host//path".
                url = "http://" + self.host + "/" + url.lstrip('/')

            # Build the item for the current link.
            item = ArticlesItem()
            item['title'] = title
            item['url'] = url
            item['status'] = 0

            yield item

        self.log("articles的url:%s采集完毕" % response.url, level=logging.INFO)
Example #2
0
    def contents(self, response):
        """Yield a ``scrapy.Request`` to ``self.details`` for every matched link.

        The CSS selector is read from ``self.contents_conf["robot_url_tag"]``.
        For each matched fragment the target URL is taken from the anchor's
        ``onclick`` argument when one is present, otherwise from its ``href``.
        URLs that do not start with ``http`` are resolved against the
        response URL.

        Each request carries a ``meta`` dict with:
          * ``article_name`` - copied from ``self.article_name``
          * ``content_name`` - the anchor text
          * ``index``        - position of the fragment in the matched list
            (fragments skipped for lack of a URL leave a gap)

        An error is logged and nothing is yielded when the selector matches
        nothing.
        """
        # Fetch the matched page fragments.
        fragments = response.css(self.contents_conf["robot_url_tag"]).extract()

        # ``extract()`` returns a list, so "no match" is an empty list.
        if not fragments:
            self.log("contents中的robot_url_tag选择器写的不正确", level=logging.ERROR)
            return

        for c_index, fragment in enumerate(fragments):
            # Parse the fragment once and reuse the selector.
            link = Selector(text=fragment)
            title = link.css('a::text').extract_first()
            url = link.css('a::attr(href)').extract_first()
            onclick_url = link.css('a::attr(onclick)').re(
                r'\w\(\'(.*)\'\)')

            # For JS-driven navigation use the argument passed to the JS
            # call; otherwise fall back to the href attribute.
            if onclick_url:
                url = onclick_url[0]

            # Neither an onclick target nor an href: nothing to request.
            if url is None:
                continue

            # Absolute URLs are used as-is; anything else is resolved against
            # the page URL (plain string concatenation breaks on root-relative
            # hrefs and on page URLs that do not end with "/").
            if not url.startswith('http'):
                url = response.urljoin(url)

            content = {
                'article_name': self.article_name,
                'content_name': title,
                'index': c_index,
            }

            yield scrapy.Request(url, callback=self.details, meta=content)

        self.log("contents的url:%s采集完毕" % response.url, level=logging.INFO)
Example #3
0
 def __extract_days_to_go(self, html):
     """Return the remaining-time value extracted from ``html``.

     Two values are read via ``self.TIME_TYPE_XPATH`` and
     ``self.TIME_LEFT_XPATH``. When the type text starts with ``'hours'``
     the amount is divided by ``self.HOURS_IN_DAY`` and returned as a
     string; otherwise the extracted amount is returned unchanged.

     Returns the raw amount (possibly ``None``) when either XPath matches
     nothing, instead of raising ``AttributeError``/``TypeError``.
     """
     # Parse the document once and run both queries against it.
     selector = Selector(text=html)
     time_type = selector.xpath(self.TIME_TYPE_XPATH).get()
     time_left = selector.xpath(self.TIME_LEFT_XPATH).get()

     # ``get()`` yields ``None`` on no match; without a unit or an amount
     # there is nothing to convert.
     if time_type is None or time_left is None:
         return time_left

     if time_type.startswith('hours'):
         return str(float(time_left) / self.HOURS_IN_DAY)
     return time_left