def get_movie_data(self, response):
    """Yield a single movie dict scraped from the page's ``h2`` and ``p`` text.

    The cleaned paragraph text is split on ``|`` and must produce exactly
    three parts, taken in the order release date, director, cast.

    Args:
        response: object exposing ``.css(...).extract()``.

    Yields:
        dict with ``title``, ``release_date``, ``cast`` and ``director``.

    Raises:
        ValueError: if the paragraph text does not split into exactly three
            ``|``-separated parts.
    """
    details = clean(response.css('p ::text').extract(), True)
    release_date, director, cast = details.split('|')

    yield {
        'title': clean(response.css('h2 ::text').extract(), True),
        'release_date': clean(release_date),
        'cast': clean(cast.replace('starring:', '')),
        # Bug fix: this value used to be derived from ``cast``, so the
        # unpacked ``director`` part was never used.
        'director': clean(director.replace('dir.:', '')),
    }
def parse(self, response, **kwargs):
    """Assemble a movie dict from a film detail page.

    Title and synopsis come from fixed selectors; the remaining fields are
    looked up by heading through ``self.get_detail``. Additional credits are
    the cleaned text nodes joined with ``': '``.
    """
    def joined_text(css):
        # Second positional argument of ``clean`` is passed as True, exactly
        # as for the other single-string fields.
        return clean(response.css(css).extract(), True)

    # (output key, heading text) pairs, in output order.
    heading_fields = (
        ('release_date', "Year"),
        ('duration', "Duration"),
        ('genres', "Genre"),
        ('directors', "Director"),
        ('producers', "Producers"),
        ('cast', "Cast"),
    )

    movie = {
        'title': joined_text('.row h1 ::text'),
        'project_notes': joined_text('.filmdetailSynopsis ::text'),
    }
    for field, heading in heading_fields:
        movie[field] = self.get_detail(response, heading)

    credit_nodes = clean(response.css(
        '.filmdetailHeading:contains("Additional Credits") + div ::text'
    ).extract())
    movie['additional_credits'] = ': '.join(credit_nodes)
    return movie
def parse_start_url(self, response, **kwargs):
    """Yield one movie dict per listing block that contains "GENRE".

    Each block's text is flattened once and every known label is looked up
    in it via ``self.get_info``. Output keys are the lower-cased label, or
    its alias from ``key_map``.
    """
    key_map = {
        'LOGLINE': 'project_notes',
        'DIRECTOR': 'director',
        'DIRECTED BY': 'director',
    }
    labels = (
        'WRITERS', 'DIRECTOR', 'STARRING', 'PRODUCERS', 'GENRE',
        'LOGLINE', 'STATUS', 'EXECUTIVE PRODUCER', 'DIRECTED BY',
    )

    for movie_s in response.css('._1Z_nJ[data-testid]:contains("GENRE")'):
        movie_raw_text = clean(movie_s.css(' ::text').extract(), True)
        movie = {'title': clean(movie_s.css('h5 ::text').extract(), True)}

        for key in labels:
            field = (key_map.get(key) or key).lower()
            value = self.get_info(key, movie_raw_text)
            # Several labels share one output field ('DIRECTOR' and
            # 'DIRECTED BY' -> 'director'). Previously a later, absent label
            # overwrote a value already found for an earlier one; now an
            # empty result only fills a field that has no entry yet.
            # Assumes get_info returns a falsy value when the label is
            # missing - TODO confirm against `clean`.
            if value or field not in movie:
                movie[field] = value

        yield movie
def parse_start_url(self, response, **kwargs):
    """Yield movie dicts from the listing table, consuming rows in pairs.

    Odd rows (1-based) supply title, schedule, updated and status cells and
    start a new movie dict. The following even row supplies bracketed
    sections, which are extracted by regex, added to that dict, and the dict
    is then yielded.

    Raises:
        ValueError: if an odd row yields fewer than five cleaned cells
            (the unpacking needs title, schedule, updated and two skipped
            cells).
    """
    # Output field -> regex for its bracketed section. Raw strings keep the
    # pattern text identical while avoiding invalid-escape warnings for
    # ``\[``, ``\?`` and ``\s`` in ordinary string literals.
    section_patterns = (
        ('studios', r"\[who's behind it\?\] . (.+?)\["),
        ('genres', r"\[related genres] . (.+?)\["),
        ('cast', r"\[who's in it\?] . (.+?)\["),
        ('directors', r"\[who's making it\?] . (.+?)\["),
        ('project_notes', r"\[what's it about\?]\s+(.+?)$"),
    )

    rows = response.css(
        'table[bgcolor="black"]+table:contains("series") tr[valign="top"]'
    )
    for idx, tr_sel in enumerate(rows, start=1):
        raw_movie_data = clean(tr_sel.css('td ::text').extract())

        if idx % 2:
            # Header row: start a new movie; it is completed by the next row.
            title, start_wrap_schedule, updated, _, _, *status = raw_movie_data
            movie = {
                'title': title,
                'start_wrap_schedule': start_wrap_schedule,
                'updated': updated,
                'status': ' '.join(status),
            }
            continue

        # Detail row: enumeration starts at 1, so ``movie`` was always bound
        # by the preceding odd row.
        detail_text = ' '.join(raw_movie_data)
        for field, pattern in section_patterns:
            movie[field] = clean(' '.join(re.findall(pattern, detail_text)))
        yield movie
def extract_release_date(self, response):
    """Return the first cleaned text of the div right after the "Release" div.

    Returns None when the cleaned selection is empty.
    """
    selector = '.contentSection div:contains("Release")+div::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return None
    return texts[0]
def extract_production_companies(self, response):
    """Return the comma-separated production companies as a cleaned list.

    Only the first cleaned text node after the "Production company" div is
    used; an empty list is returned when nothing is selected.
    """
    selector = '.contentSection div:contains("Production company")+div::text'
    texts = clean(response.css(selector).extract())
    if texts:
        return clean(texts[0].split(','))
    return []
def extract_cast(self, response):
    """Return the comma-separated cast names as a cleaned list.

    Only the first cleaned text node after the "Cast" div is split; an empty
    list is returned when nothing is selected.
    """
    selector = '.contentSection div:contains("Cast")+div::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    names = texts[0].split(',')
    return clean(names)
def extract_production_companies(self, response):
    """Return production companies / studios from the content list.

    The first cleaned text node of the matching ``li`` is split on commas
    and cleaned again; an empty list is returned when nothing is selected.
    """
    selector = (
        '[aria-label="Content"] '
        'li:contains("Production Company / Studio:")::text'
    )
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    return clean(texts[0].split(','))
def extract_directors(self, response):
    """Return the comma-separated directors as a cleaned list.

    Uses the first cleaned text node of the div right after the "Director"
    div; returns an empty list when nothing is selected.
    """
    selector = '.contentSection div:contains("Director")+div::text'
    texts = clean(response.css(selector).extract())
    if texts:
        first = texts[0]
        return clean(first.split(','))
    return []
def extract_locations(self, response):
    """Return the comma-separated filming locations as a cleaned list.

    Uses the first cleaned text node of the div right after the "Locations"
    div; returns an empty list when nothing is selected.
    """
    selector = '.contentSection div:contains("Locations")+div::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    return clean(texts[0].split(','))
def get_detail(self, response, heading):
    """Return the cleaned text of the ``p`` right after a detail heading.

    Args:
        response: object exposing ``.css(...).extract()``.
        heading: text the ``.filmdetailHeading`` element must contain.
    """
    selector = f'.filmdetailHeading:contains("{heading}") + p ::text'
    text_nodes = response.css(selector).extract()
    return clean(text_nodes, True)
def extract_project_type(self, response):
    """Return the first cleaned text of the "Project Type:" list item.

    Returns None when the cleaned selection is empty.
    """
    selector = '[aria-label="Content"] li:contains("Project Type:")::text'
    texts = clean(response.css(selector).extract())
    if texts:
        return texts[0]
    return None
def extract_writers(self, response):
    """Return the comma-separated writers as a cleaned list.

    Uses the first cleaned text node of the "Writer(s):" list item; returns
    an empty list when nothing is selected.
    """
    selector = '[aria-label="Content"] li:contains("Writer(s):")::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    return clean(texts[0].split(','))
def extract_start_wrap_schedule(self, response):
    """Return the first cleaned text of the "Wrap Date:" list item.

    Returns None when the cleaned selection is empty.
    """
    selector = '[aria-label="Content"] li:contains("Wrap Date:")::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return None
    return texts[0]
def extract_photography_start_date(self, response):
    """Return the first cleaned text of the "Start Date:" list item.

    Returns None when the cleaned selection is empty.
    """
    selector = '[aria-label="Content"] li:contains("Start Date:")::text'
    texts = clean(response.css(selector).extract())
    if texts:
        return texts[0]
    return None
def extract_locations(self, response):
    """Return the comma-separated locations as a cleaned list.

    Two alternative selectors are combined into one CSS selector list: the
    "Location:" list item and the "Job Location:" paragraph. Only the first
    cleaned text node is split; an empty list is returned when nothing is
    selected.
    """
    alternatives = (
        '[aria-label="Content"] li:contains("Location:")::text',
        '[aria-label="Content"] p:contains("Job Location:")::text',
    )
    selector = ','.join(alternatives)
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    return clean(texts[0].split(','))
def extract_plot(self, response):
    """Return the synopsis as a one-element list, or an empty list.

    Only the first cleaned text node of the "Synopsis:" paragraph is kept.
    """
    selector = '[aria-label="Content"] p:contains("Synopsis:")::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    return [texts[0]]
def extract_cast(self, response):
    """Return the cleaned ``strong`` texts from lists after "currently casting".

    The selector targets ``ul`` siblings that follow the paragraph containing
    "currently casting"; whatever ``clean`` returns is passed straight back.
    """
    selector = (
        '[aria-label="Content"] '
        'p:contains("currently casting")~ul strong::text'
    )
    role_texts = response.css(selector).extract()
    return clean(role_texts)
def extract_genres(self, response):
    """Return the comma-separated genres as a cleaned list.

    Uses the first cleaned text node of the div right after the "Genre" div;
    returns an empty list when nothing is selected.
    """
    selector = '.contentSection div:contains("Genre")+div::text'
    texts = clean(response.css(selector).extract())
    if texts:
        return clean(texts[0].split(','))
    return []
def extract_directors(self, response):
    """Return the comma-separated directors from the content list.

    Uses the first cleaned text node of the list item containing "Director";
    returns an empty list when nothing is selected.
    """
    selector = '[aria-label="Content"] li:contains("Director")::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return []
    first = texts[0]
    return clean(first.split(','))
def extract_plot(self, response):
    """Return the storyline as a one-element list, or an empty list.

    Only the first cleaned text node of the div right after the "Storyline"
    div is kept.
    """
    selector = '.contentSection div:contains("Storyline")+div::text'
    texts = clean(response.css(selector).extract())
    if texts:
        return [texts[0]]
    return []
def extract_title(self, response):
    """Return the first cleaned text of the ``h1`` directly after ``nav``.

    The result of ``clean`` is indexed without a guard, so an empty result
    raises (IndexError if ``clean`` returns a list - TODO confirm).
    """
    heading_texts = clean(response.css('nav+h1::text').extract())
    return heading_texts[0]
def extract_start_wrap_schedule(self, response):
    """Return the first cleaned text of the div after the "Filming dates" div.

    Returns None when the cleaned selection is empty.
    """
    selector = '.contentSection div:contains("Filming dates")+div::text'
    texts = clean(response.css(selector).extract())
    if not texts:
        return None
    return texts[0]
def extract_title(self, response):
    """Return the first cleaned text of the article header's ``h1``.

    The result of ``clean`` is indexed without a guard, so an empty result
    raises (IndexError if ``clean`` returns a list - TODO confirm).
    """
    heading_texts = clean(response.css('article header h1::text').extract())
    return heading_texts[0]
def extract_id(self, response):
    """Return the last cleaned ``/``-separated segment of the response URL."""
    segments = response.url.split('/')
    cleaned_segments = clean(segments)
    return cleaned_segments[-1]
def extract_producers(self, response):
    """Return the comma-separated producers as a cleaned list.

    Uses the first cleaned text node of the "Producer(s):" list item; returns
    an empty list when nothing is selected.
    """
    selector = '[aria-label="Content"] li:contains("Producer(s):")::text'
    texts = clean(response.css(selector).extract())
    if texts:
        return clean(texts[0].split(','))
    return []
def get_info(self, key, text):
    """Return the cleaned value that follows ``key:`` in ``text``.

    The value is matched non-greedily up to the next run of upper-case
    letters followed by a colon, or up to the end of the text.

    Args:
        key: label to look for; it is matched literally.
        text: flattened text to search.

    Returns:
        ``clean([value], True)`` for the first occurrence of the label, or
        ``clean([], True)`` when the label is not present.
    """
    # Escape the label so any regex metacharacters in it are matched as
    # plain text instead of being interpreted as pattern syntax.
    # NOTE(review): the terminator ``[A-Z]+:`` only covers the last word of a
    # multi-word label, so a value followed by e.g. "EXECUTIVE PRODUCER:"
    # keeps the trailing "EXECUTIVE" - confirm whether that matters.
    match = re.search(f'{re.escape(key)}:(.+?)($|[A-Z]+:)', text)
    captured = [match.group(1)] if match else []
    return clean(captured, True)
def extract_id(self, response):
    """Return the last cleaned piece of the URL after splitting on ``/film/``.

    The URL is passed through ``url_query_cleaner`` before it is split.
    """
    bare_url = url_query_cleaner(response.url)
    pieces = clean(bare_url.split('/film/'))
    return pieces[-1]