def parse_sub_cinema(self, response):
    """
    complete the cinema item forwarded through ``response.meta['cinema']``

    The item's ``names`` is overwritten with the standardized text of the
    sub page heading, ``parse_seat_number_list`` is called with this
    response and the item, and the item is then yielded.
    """
    item = response.meta['cinema']
    # a sub cinema uses its own name instead of the forwarded one
    heading = response.xpath('//div[@id="more-anchor-01"]/h4/text()')
    item['names'] = [standardize_cinema_name(heading.extract_first())]
    self.parse_seat_number_list(response, item)
    yield item
def parse_county(self, response):
    """
    yield one cinema request for every accepted cinema link of a county

    Each node matched by ``self.cinema_xpath`` supplies a name (its text)
    and a link (its ``href``). Names rejected by ``self.is_cinema_crawl``
    are skipped; for the rest a request handled by ``self.parse_cinema``
    is yielded, carrying ``county_name`` and ``cinema_name`` in its meta.
    """
    for link in response.xpath(self.cinema_xpath):
        raw_name = link.xpath('./text()').extract_first()
        name = standardize_cinema_name(raw_name)
        if self.is_cinema_crawl(name):
            href = link.xpath('./@href').extract_first()
            # resolve against the current page before the spider-specific fix
            target = self.adjust_cinema_url(response.urljoin(href))
            cinema_request = scrapy.Request(
                target, callback=self.parse_cinema)
            # NOTE(review): confirm the parse_cinema callback reads these
            # exact meta keys
            cinema_request.meta['county_name'] = response.meta['county_name']
            cinema_request.meta['cinema_name'] = name
            yield cinema_request
def parse_cinema(self, response):
    """
    build a cinema item from a cinema page

    The item is filled from the page heading, ``response.meta`` (``county``
    and ``site``) and fixed values. When the page links to sub pages, one
    request per sub page is yielded (handled by ``self.parse_sub_cinema``),
    each with its own deep copy of the item. Otherwise
    ``parse_seat_number_list`` is called on this response and the item
    itself is yielded.
    """
    heading_xpath = ('//h1[@class="c-page_heading is-lv-01"]'
                     '/span/text()')
    heading = response.xpath(heading_xpath).extract_first()

    item = CinemaItem()
    item['names'] = [standardize_cinema_name(heading)]
    item['screens'] = {}
    item['county'] = response.meta['county']
    item['company'] = 'TOHO'
    item['source'] = self.name
    item['site'] = response.meta['site']

    # some cinemas have detail pages and need to be forwarded to them
    sub_page_xpath = '//section[@class="about"]//a[@class="link bold"]/@href'
    sub_page_links = response.xpath(sub_page_xpath).extract()

    if not sub_page_links:
        # no detail pages: this page is parsed directly
        self.parse_seat_number_list(response, item)
        yield item
        return

    for href in sub_page_links:
        sub_request = scrapy.Request(response.urljoin(href),
                                     callback=self.parse_sub_cinema)
        # every sub page gets an independent copy of the item
        sub_request.meta['cinema'] = copy.deepcopy(item)
        yield sub_request
def replace_cinema_name(self, cinema_name):
    """
    standardize ``cinema_name`` and hand it to ``self.replace_value``
    for the ``'cinema_name'`` field
    """
    standardized_name = standardize_cinema_name(cinema_name)
    self.replace_value('cinema_name', standardized_name)
def add_cinema_name(self, cinema_name):
    """
    standardize ``cinema_name`` and hand it to ``self.add_value``
    for the ``'cinema_name'`` field
    """
    standardized_name = standardize_cinema_name(cinema_name)
    self.add_value('cinema_name', standardized_name)