# NOTE: these methods assume the usual Scrapy spider imports; the module
# path of ArticlesItem is an assumption based on a typical project layout.
import logging

import scrapy
from scrapy import Selector

from ..items import ArticlesItem


def parse(self, response):
    self.host = get_domain_root(response.url)
    # Extract the raw <a> snippets for article links using the configured selector.
    datas = response.css(self.site["robot_url_tag"]).extract()
    if not datas:  # .extract() returns a list, so test for emptiness, not None
        self.log("the robot_url_tag selector in articles is wrong", level=logging.ERROR)
        return
    for data in datas:
        title = Selector(text=data).css('a::text').extract_first()
        url = Selector(text=data).css('a::attr(href)').extract_first()
        if url is None:  # skip anchors without an href
            continue
        if not url.startswith('http'):
            url = "http://" + self.host + "/" + url.lstrip('/')
        # Save the current entry.
        item = ArticlesItem()
        item['title'] = title
        item['url'] = url
        item['status'] = 0
        yield item
    else:
        # for-else: runs once the loop finishes without a break
        self.log("articles url %s collected completely" % response.url, level=logging.INFO)
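
# parse() relies on a get_domain_root() helper that is not defined in this
# section. A minimal sketch of what it might look like, assuming it returns
# the bare host (e.g. "example.com") from a full URL:
from urllib.parse import urlparse


def get_domain_root(url):
    """Return the host part of a URL, e.g. 'example.com' from 'http://example.com/a/b'."""
    return urlparse(url).netloc
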
def contents(self, response):
    # Extract the chapter links from the table-of-contents page.
    datas = response.css(self.contents_conf["robot_url_tag"]).extract()
    if not datas:  # .extract() returns a list, so test for emptiness, not None
        self.log("the robot_url_tag selector in contents is wrong", level=logging.ERROR)
        return
    for c_index, data in enumerate(datas):
        title = Selector(text=data).css('a::text').extract_first()
        url = Selector(text=data).css('a::attr(href)').extract_first()
        onclick_url = Selector(text=data).css('a::attr(onclick)').re(r"\w\('(.*)'\)")
        # If the link navigates via JavaScript, use the parameter passed to the
        # JS function; otherwise fall back to the href attribute.
        url = onclick_url[0] if onclick_url else url
        # urljoin leaves full http:// URLs untouched and resolves relative
        # paths against the page URL, unlike naive string concatenation.
        url = response.urljoin(url)
        content = {
            'article_name': self.article_name,
            'content_name': title,
            'index': c_index,
        }
        yield scrapy.Request(url, callback=self.details, meta=content)
    else:
        # for-else: runs once the loop finishes without a break
        self.log("contents url %s collected completely" % response.url, level=logging.INFO)
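
# contents() schedules requests with callback=self.details, which is not
# defined in this section. A hypothetical sketch of how such a callback could
# read the metadata attached above (the body is an assumption, only the field
# names come from the code):
def details(self, response):
    # Scrapy exposes the meta dict passed to Request on response.meta.
    article_name = response.meta['article_name']
    content_name = response.meta['content_name']
    index = response.meta['index']
    self.log("fetched chapter %d (%s) of %s" % (index, content_name, article_name),
             level=logging.INFO)
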
def __extract_days_to_go(self, html):
    # TIME_TYPE_XPATH / TIME_LEFT_XPATH / HOURS_IN_DAY are class-level constants.
    time_type = Selector(text=html).xpath(self.TIME_TYPE_XPATH).get()
    time_left = Selector(text=html).xpath(self.TIME_LEFT_XPATH).get()
    # Normalise an hour count into fractional days; guard against a missing
    # time_type, since .get() returns None when the XPath matches nothing.
    if time_type and time_type.startswith('hours'):
        return str(float(time_left) / self.HOURS_IN_DAY)
    return time_left
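
# __extract_days_to_go() depends on three class attributes not shown in this
# section. Purely illustrative placeholders (the real XPath expressions depend
# on the scraped page; only HOURS_IN_DAY = 24 follows from the logic above):
TIME_TYPE_XPATH = '//span[@class="time-type"]/text()'  # assumed XPath
TIME_LEFT_XPATH = '//span[@class="time-left"]/text()'  # assumed XPath
HOURS_IN_DAY = 24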