def parse(self, response):
    """Crawl the theater list page and yield one request per target cinema."""
    for link in response.xpath('//div[@class="theater_info"]//li/a'):
        link_url = link.xpath('./@href').extract_first()
        name = link.xpath('./text()').extract_first()
        if not name:
            # partner theater element is different
            name = ''.join(link.xpath('.//text()').extract())
        else:
            # only regular theaters carry relative urls that need joining
            link_url = response.urljoin(link_url)
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        loader.add_cinema_site(link_url, name)
        loader.add_value('source', self.name)
        if not self.is_cinema_crawl([name]):
            continue
        req = scrapy.Request(link_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse(self, response):
    """Crawl the theater list (footer links) and yield per-cinema requests."""
    # partner cinema is not included
    links = response.xpath('//footer/p[position()>=2 and position() <=3]//a')
    for link in links:
        link_url = link.xpath('./@href').extract_first()
        name = link.xpath('./text()').extract_first()
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        loader.add_cinema_site(link_url, name)
        loader.add_value('source', self.name)
        if not self.is_cinema_crawl([name]):
            continue
        # english slug is the last path segment, query string stripped
        name_en = link_url.split('/')[-1].split('?')[0]
        req = scrapy.Request(link_url, callback=self.parse_main_page)
        req.meta["data_proto"] = loader.load_item()
        req.meta["cinema_name_en"] = name_en
        # each cinema needs an independent cookie session
        req.meta["dont_merge_cookies"] = True
        yield req
def parse(self, response):
    """Crawl the theater list and yield one schedule request per cinema."""
    for theater in response.xpath('//div[@class="theater-list__inner"]'):
        # forum site have multiple cinema on one site, so we need to
        # specify cinema name on schedule page
        name = theater.xpath('./h4/text()').extract_first()
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        if not self.is_cinema_crawl([name]):
            continue
        cinema_url = theater.xpath('./p/a/@href').extract_first()
        loader.add_cinema_site(response.urljoin(cinema_url), name)
        loader.add_value('source', self.name)
        schedule_url = self.generate_cinema_schedule_url(cinema_url, self.date)
        req = scrapy.Request(schedule_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse(self, response):
    """Crawl the theater list and yield one schedule request per cinema."""
    for entry in response.xpath('//section[@class="rcol searchTheater"]//li'):
        # skip area header rows
        if entry.xpath('./@class').extract_first() == "area":
            continue
        cinema_url = entry.xpath('./a/@href').extract_first()
        icon = entry.xpath('./img/@src').extract_first()
        name = entry.xpath('./a/img/@alt').extract_first()
        # brand prefix is only encoded in the icon image name
        if icon is not None:
            if "icon_uc_ss.gif" in icon:
                name = "ユナイテッド・シネマ" + name
            elif "icon_cpx_ss.gif" in icon:
                name = "シネプレックス" + name
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        loader.add_cinema_site(response.urljoin(cinema_url), name)
        loader.add_value('source', self.name)
        if not self.is_cinema_crawl([name]):
            continue
        name_en = cinema_url.split('/')[-2]
        schedule_url = self.generate_cinema_schedule_url(name_en, self.date)
        req = scrapy.Request(schedule_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse_cinema(self, response):
    """Parse a cinema schedule page and yield every showing result."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, response.meta["data_proto"])
    results = []
    for movie in response.xpath('//div[@class="wrapFilm"]'):
        self.parse_movie(response, movie, loader, results)
    # parse_movie may append None placeholders; yield only real results
    yield from (r for r in results if r)
def parse_shechedule(self, response):
    """Parse a schedule page and yield every showing result.

    NOTE: method name typo ("shechedule") is kept — it is referenced as a
    request callback elsewhere, so renaming would break callers.
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, response.meta["data_proto"])
    results = []
    for movie in response.xpath('//div[@class="scheduleBox"]'):
        self.parse_movie(response, movie, loader, results)
    yield from (r for r in results if r)
def parse_cinema(self, response):
    """Parse a cinema timetable page and yield every showing result."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, response.meta["data_proto"])
    results = []
    for movie in response.xpath('//div[@id="timetable"]/article'):
        self.parse_movie(response, movie, loader, results)
    yield from (r for r in results if r)
def parse_cinema(self, response):
    """Parse a cinema schedule page and yield every showing result."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, response.meta["data_proto"])
    results = []
    sections = response.xpath('//section[@data-accordion-group="movie"]')
    for movie in sections:
        self.parse_movie(response, movie, loader, results)
    yield from (r for r in results if r)
def parse_cinema(self, response):
    """
    cinema home page

    we have to pass this page to get independent cookie for each cinema
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, response.meta["data_proto"])
    results = []
    titles = response.xpath('//div[@class="cinemaTitle elp"]')
    sections = response.xpath('//div[@class="theaterListWrap"]')
    # title and schedule-section nodes are parallel lists; pair them up
    for movie_pair in zip(titles, sections):
        self.parse_movie(response, movie_pair, loader, results)
    yield from (r for r in results if r)
def parse_sub_cinema(self, response, sub_cinema, showing_url_parameter,
                     result_list):
    """Collect showings for one sub-cinema entry into result_list."""
    code = sub_cinema['code']
    showing_url_parameter['site_cd'] = code
    loader = ShowingLoader(response=response)
    loader.add_cinema_name(sub_cinema['name'])
    loader.add_cinema_site(
        TohoUtil.generate_cinema_homepage_url(code), sub_cinema['name'])
    loader.add_value('source', self.name)
    for movie in sub_cinema['list']:
        self.parse_movie(response, movie, showing_url_parameter,
                         loader, result_list)
def parse_cinema(self, response):
    """Parse a JSON schedule response and yield every showing result."""
    try:
        schedule_data = json.loads(response.text)
    except json.JSONDecodeError:
        return
    if not schedule_data:
        return
    if 'data' not in schedule_data or 'movie' not in schedule_data['data']:
        return
    loader = ShowingLoader(response=response)
    loader.add_value(None, response.meta["data_proto"])
    results = []
    movies = schedule_data['data']['movie']
    # a single movie is serialized as a dict rather than a one-item list
    if isinstance(movies, dict):
        movies = [movies]
    for movie in movies:
        self.parse_movie(response, movie, loader, results)
    yield from (r for r in results if r)
def parse(self, response):
    """Crawl the theater list and yield one schedule request per cinema."""
    links = response.xpath('//div[@class="LNbowlingList LNshopList"]//a')
    for link in links:
        county = link.xpath('./text()').extract_first()
        name = county + "コロナシネマワールド"
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        if not self.is_cinema_crawl([name]):
            continue
        cinema_url = link.xpath('./@href').extract_first()
        loader.add_cinema_site(cinema_url, name)
        loader.add_value('source', self.name)
        name_en = cinema_url.split('/')[-2]
        schedule_url = self.generate_cinema_schedule_url(name_en, self.date)
        req = scrapy.Request(schedule_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse_screen(self, response, curr_screen, data_proto, result_list):
    """Parse one screen's table row and its showings into result_list."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_screen_name(
        curr_screen.xpath('./tr/td[1]/text()').extract_first())
    for showing in curr_screen.xpath('./tr/td[2]/a'):
        self.parse_showing(response, showing, loader, result_list)
def parse(self, response):
    """Crawl the theater list and yield one schedule request per cinema."""
    for link in response.xpath('//section[@id="theatres"]//a'):
        cinema_url = link.xpath('./@href').extract_first()
        name = link.xpath('./text()').extract_first()
        # every theater except ムービル carries the brand prefix
        if name != "ムービル":
            name = "109シネマズ" + name
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        loader.add_cinema_site(response.urljoin(cinema_url), name)
        loader.add_value('source', self.name)
        if not self.is_cinema_crawl([name]):
            continue
        name_en = cinema_url.split('/')[-2]
        schedule_url = self.generate_cinema_schedule_url(name_en, self.date)
        req = scrapy.Request(schedule_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse(self, response):
    """Crawl the theater list and yield one JSON-schedule request per cinema."""
    for entry in response.xpath('//li[@class="clearfix"]'):
        name = entry.xpath(
            './p[@class="theaterName"]/a/text()').extract_first()
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        if not self.is_cinema_crawl([name]):
            continue
        cinema_url = entry.xpath(
            './p[@class="theaterName"]/a/@href').extract_first()
        loader.add_cinema_site(response.urljoin(cinema_url), name)
        loader.add_value('source', self.name)
        name_en = cinema_url.split('/')[-1]
        json_url = self.generate_cinema_schedule_url(name_en, self.date)
        req = scrapy.Request(json_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse(self, response):
    """Crawl the theater list and yield one request per cinema."""
    links = response.xpath('//div[contains(@class,"area")]//dd//a')
    for link in links:
        # forum site have multiple cinema on one site, so we need to
        # specify cinema name on schedule page
        city = link.xpath('./text()').extract_first()
        name = "イオンシネマ" + city
        loader = ShowingLoader(response=response)
        loader.add_cinema_name(name)
        name = loader.get_output_value('cinema_name')
        if not self.is_cinema_crawl([name]):
            continue
        cinema_url = response.urljoin(link.xpath('./@href').extract_first())
        loader.add_cinema_site(cinema_url, name)
        loader.add_value('source', self.name)
        req = scrapy.Request(cinema_url, callback=self.parse_cinema)
        req.meta["data_proto"] = loader.load_item()
        yield req
def parse_screen(self, response, curr_screen, data_proto, result_list):
    """Parse one screen section and its showings into result_list."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    # screen name text may be split across nested nodes
    name_parts = curr_screen.xpath(
        './li[@class="theatre"]/a//text()').extract()
    loader.add_screen_name(''.join(name_parts))
    # first <li> is the screen header; the rest are showings
    for showing in curr_screen.xpath('./li')[1:]:
        self.parse_showing(response, showing, loader, result_list)
def parse_screen(self, response, curr_screen, data_proto, result_list):
    """Parse one screen section and its showings into result_list."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    alt_text = curr_screen.xpath('./p/a/img/@alt').extract_first()
    # normalize to "screen<number>" using the digits in the alt text
    loader.add_screen_name('screen' + re.findall(r'\d+', alt_text)[0])
    for showing in curr_screen.xpath('./ol/li'):
        self.parse_showing(response, showing, loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(
        title=curr_movie.xpath('./h3/span/a[1]/text()').extract_first())
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    for screen in curr_movie.xpath('./ul/li'):
        self.parse_screen(response, screen, loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(
        title=curr_movie.xpath('./h2/text()').extract_first(),
        title_en=curr_movie.xpath('./h2/span/text()').extract_first())
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    for showing in curr_movie.xpath('.//ul[@class="timetable"]/li'):
        self.parse_showing(response, showing, loader, result_list)
def parse_screen(self, response, curr_screen, showing_url_parameter,
                 data_proto, result_list):
    """Parse one screen's JSON entry and its showings into result_list."""
    showing_url_parameter['theater_cd'] = curr_screen['theaterCd']
    showing_url_parameter['screen_cd'] = curr_screen['code']
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_screen_name(curr_screen['ename'])
    for showing in curr_screen['list']:
        # filter empty showing
        if not showing['unsoldSeatInfo']:
            continue
        self.parse_showing(response, showing, showing_url_parameter,
                           loader, result_list)
def parse_screen(self, response, curr_screen, data_proto, result_list):
    """Parse one screen's JSON entry and its showings into result_list."""
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_screen_name(curr_screen['name'])
    showings = curr_screen['time']
    # a single showing is serialized as a dict rather than a one-item list
    if isinstance(showings, dict):
        showings = [showings]
    for showing in showings:
        self.parse_showing(response, showing, loader, result_list)
def parse_movie(self, response, curr_movie, showing_url_parameter,
                data_proto, result_list):
    """
    parse movie showing data

    movie may have different versions
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(title=curr_movie['name'], title_en=curr_movie['ename'])
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    showing_url_parameter['movie_cd'] = curr_movie['code']
    for screen in curr_movie['list']:
        self.parse_screen(response, screen, showing_url_parameter,
                          loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data
    """
    # title may be a link or plain heading text
    title = (curr_movie.xpath('./h4/a/text()').extract_first()
             or curr_movie.xpath('./h4/text()').extract_first())
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(title=title)
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    for showing in curr_movie.xpath('.//table//tr'):
        self.parse_showing(response, showing, loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(title=curr_movie.xpath(
        './/div[@class="MovieTitle1"]//a/text()').extract_first())
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    # showing cells are identified by their mouseover handler
    showings = curr_movie.xpath('.//td[contains(@onmouseover,"eventover")]')
    for showing in showings:
        self.parse_showing(response, showing, loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data

    curr_movie is a tuple
    """
    title_section, detail_section = curr_movie
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(title=''.join(title_section.xpath('./text()').extract()))
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    for screen in detail_section.xpath('.//table'):
        self.parse_screen(response, screen, loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data
    """
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(title=curr_movie['name'])
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    screens = curr_movie['screen']
    # a single screen is serialized as a dict rather than a one-item list
    if isinstance(screens, dict):
        screens = [screens]
    for screen in screens:
        self.parse_screen(response, screen, loader, result_list)
def parse_movie(self, response, curr_movie, data_proto, result_list):
    """
    parse movie showing data
    """
    # japanese title sits just before the <span> holding the english title;
    # when that node is absent, fall back to the paragraph's own text
    title = curr_movie.xpath(
        './div[1]/p[1]/span/preceding::*[1]/text()').extract_first()
    if not title:
        title = curr_movie.xpath('./div[1]/p[1]/text()').extract_first()
    title_en = curr_movie.xpath('./div[1]/p[1]/span/text()').extract_first()
    loader = ShowingLoader(response=response)
    loader.add_value(None, data_proto.load_item())
    loader.add_title(title=title, title_en=title_en)
    if not self.is_movie_crawl(loader.get_title_list()):
        return
    for showing in curr_movie.xpath('./div[2]/div'):
        self.parse_showing(response, showing, loader, result_list)
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing cell into a showing item, booking item, or request.

    Appends to result_list either a showing item (when booking data is not
    crawled), a booking item (free-seat / sold-out / not-sold showings), or
    a scrapy.Request that continues the booking crawl for normal showings.
    """
    def parse_time(time_str):
        """Return (hour, minute) parsed from a 'HH:MM'-style string."""
        # BUGFIX: previously normalized the closed-over `start_time` instead
        # of the parameter, so parsing `end_time` returned the start time.
        time_str = unicodedata.normalize('NFKC', time_str)
        time = time_str.split(":")
        return (int(time[0]), int(time[1]))

    # showing section passed in may be unusable and need to be filtered
    time_section = curr_showing.xpath('./div[@class="time"]')
    if not time_section:
        return
    showing_data_proto = ShowingLoader(response=response)
    showing_data_proto.add_value(None, data_proto.load_item())
    start_time = time_section.xpath('./span/span/text()').extract_first()
    start_hour, start_minute = parse_time(start_time)
    showing_data_proto.add_value(
        'start_time', self.get_time_from_text(start_hour, start_minute))
    end_time = time_section.xpath('./span/text()').extract_first()
    end_hour, end_minute = parse_time(end_time)
    showing_data_proto.add_value(
        'end_time', self.get_time_from_text(end_hour, end_minute))
    screen_name = curr_showing.xpath('./div[2]/a/text()').extract_first()
    showing_data_proto.add_screen_name(screen_name)
    # when site ordering is stopped stop crawling
    site_status = curr_showing.xpath('./a/span[2]/text()').extract_first()
    if site_status == '予約停止中':
        return
    # handle free order seat type showings
    seat_type = curr_showing.xpath(
        './div[@class="icon"]//img/@alt').extract_first()
    showing_data_proto.add_value('seat_type',
                                 AeonUtil.standardize_seat_type(seat_type))
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    book_status = curr_showing.xpath('./a/span/text()').extract_first()
    booking_data_proto.add_book_status(book_status, util=AeonUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    seat_type = showing_data_proto.get_output_value('seat_type')
    if (seat_type == 'FreeSeat' or book_status in ['SoldOut', 'NotSold']):
        # sold out or not sold: book count is known without a further request
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (total_seat_count
                           if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, generate request to showing page
        showing_request = self.generate_agreement_request(
            response=response, curr_showing=curr_showing)
        # go to schedule page again to generate independent cookie
        # for each showing
        schedule_url = response.meta['schedule_url']
        request = scrapy.Request(schedule_url,
                                 dont_filter=True,
                                 callback=self.parse_new_cookie)
        request.meta["data_proto"] = booking_data_proto.load_item()
        request.meta["showing_request"] = showing_request
        (performance_id, _, _) = self.extract_showing_parameters(curr_showing)
        # separate cookiejar per performance keeps sessions independent
        request.meta["cookiejar"] = performance_id
        result_list.append(request)
def parse_showing(self, response, curr_showing, data_proto, result_list):
    """Parse one showing cell into a showing item, booking item, or request.

    Appends to result_list either a showing item (when booking data is not
    crawled), a booking item (sold-out / not-sold showings), or a
    scrapy.Request that crawls the order page for normal showings.
    """
    showing_data_proto = ShowingLoader(response=response)
    showing_data_proto.add_value(None, data_proto.load_item())
    # strip the trailing character from the time text before parsing
    start_time = curr_showing.xpath(
        './div/text()').extract_first()[:-1]
    start_hour, start_minute = self.parse_time(start_time)
    showing_data_proto.add_value('start_time', self.get_time_from_text(
        start_hour, start_minute))
    # end time not displayed in schedule page
    showing_data_proto.add_value('seat_type', 'NormalSeat')
    # query screen number from database
    showing_data_proto.add_total_seat_count()
    # check whether need to continue crawl booking data or stop now
    if not self.crawl_booking_data:
        result_list.append(showing_data_proto.load_item())
        return
    booking_data_proto = init_show_booking_loader(response=response)
    booking_data_proto.add_value('showing', showing_data_proto.load_item())
    # book status is encoded in the div's css class
    book_status = curr_showing.xpath('./div/@class').extract_first()
    booking_data_proto.add_book_status(book_status, util=KinezoUtil)
    book_status = booking_data_proto.get_output_value('book_status')
    if book_status in ['SoldOut', 'NotSold']:
        # sold out or not sold: book count is known without a further request
        total_seat_count = showing_data_proto.get_output_value(
            'total_seat_count')
        book_seat_count = (
            total_seat_count if book_status == 'SoldOut' else 0)
        booking_data_proto.add_value('book_seat_count', book_seat_count)
        booking_data_proto.add_time_data()
        result_list.append(booking_data_proto.load_item())
        return
    else:
        # normal, need to crawl book number on order page
        url = curr_showing.xpath('./@href').extract_first()
        url = response.urljoin(url)
        request = scrapy.Request(url,
                                 callback=self.parse_normal_showing)
        request.meta["data_proto"] = booking_data_proto.load_item()
        result_list.append(request)