def parse(self, data=None):
    """Parse movie name. Populates name, year, quality and proper_count attributes.

    The title is normalised, split on spaces, and scanned for "cut" words
    (year, group name, cutoff words, propers). Everything from the earliest
    cut word - or the earliest quality token, if that comes first - is
    dropped, and what remains is stored as the name.

    :param data: Title to parse. When ``None``, ``self.data`` is used.
    """
    # Reset before parsing, so the parser can be reused.
    self.reset()

    if data is None:
        data = self.data

    # Move anything in leading brackets to the end
    data = re.sub(r'^\[(.*?)\](.*)', r'\2 \1', data)

    for char in '[]()_,.':
        data = data.replace(char, ' ')

    # if there are no spaces, treat dashes as the separators instead
    if ' ' not in data:
        data = data.replace('-', ' ')

    # remove unwanted words (imax, ..)
    # NOTE(review): the return value is discarded; if remove_words returns the
    # cleaned text rather than working through side effects, this call does
    # not change `data` - confirm against its definition.
    self.remove_words(data, self.remove)

    data = self.strip_spaces(data)

    # split to parts
    parts = data.split(' ')
    cut_part = 256  # stays 256 when no cut word is found
    all_caps = True
    for part_pos, part in enumerate(parts):
        # Don't let the first word be cutoff word
        if part_pos < 1:
            continue
        cut = False

        # check for year
        num = str_to_int(part)
        if num is not None and 1930 < num < 2050:
            self.year = num
            cut = True

        # Don't consider all caps words cut words if the whole title has been all caps
        if not part.isupper():
            all_caps = False

        # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
        if len(part) > 3 and part.isupper() and part.isalpha() and not all_caps:
            cut = True

        lowered = part.lower()
        # check for cutoff words
        if lowered in self.cutoffs:
            cut = True

        # check for propers
        if lowered in self.propers:
            self.proper_count += 1
            cut = True

        # update cut position
        # Compare the current position, not parts.index(part): index() returns
        # the first occurrence, so a cut word repeating an earlier token (e.g.
        # the exempt first word) used to push an existing cut forward.
        if cut and part_pos < cut_part:
            cut_part = part_pos

    if cut_part != 256:
        log.debug('parts: %s, cut is: %s', parts, parts[cut_part])

    # calculate cut position from cut_part
    abs_cut = len(' '.join(parts[:cut_part]))

    log.debug('after parts check, cut data would be: `%s` abs_cut: %i', data[:abs_cut], abs_cut)

    # parse quality
    quality = qualities.Quality(data)
    if quality:
        self.quality = quality
        # remaining string is same as data but quality information removed
        # find out position where there is first difference, this is earliest
        # quality bit, anything after that has no relevance to the movie name
        dp = diff_pos(data, quality.clean_text)
        if dp is not None:
            log.debug('quality start: %s', dp)
            if dp < abs_cut:
                log.debug('quality cut is even shorter')
                abs_cut = dp

    # make cut
    data = data[:abs_cut].strip()
    log.debug('data cut to `%s` - this will be the name', data)

    # save results
    self.name = data
def parse(self, imdb_id):
    """Download the title page for ``imdb_id`` and populate this parser from it.

    Sets ``imdb_id``, ``url`` and ``name``; fills ``year``, ``mpaa_rating``,
    ``photo``, ``original_name``, ``votes``, ``score``, ``directors``,
    ``languages``, ``plot_outline``, ``genres`` and ``actors`` when the
    matching elements are present on the page.

    :param imdb_id: Value handed to ``extract_id`` to obtain the IMDb id.
    :raises PluginError: When the ``title-overview`` div or the name heading
        cannot be found on the page.
    """
    self.imdb_id = extract_id(imdb_id)
    page_url = make_url(self.imdb_id)
    self.url = page_url
    response = requests.get(page_url)
    soup = get_soup(response.text)

    overview = soup.find('div', attrs={'class': 'title-overview'})
    if not overview:
        raise PluginError('IMDB parser needs updating, imdb format changed. Please report on Github.')

    # --- title-overview section ---
    heading = overview.find('h1', attrs={'itemprop': 'name'})
    if not heading:
        log.error('Possible IMDB parser needs updating, Please report on Github.')
        raise PluginError('Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url))
    self.name = heading.contents[0].strip()

    year_span = overview.find('span', attrs={'id': 'titleYear'})
    if year_span:
        year_match = re.search(r'([0-9]{4})', year_span.text)
        if year_match:
            self.year = int(year_match.group(1))
    if not self.year:
        log.debug('No year found for %s' % self.imdb_id)

    rating_tag = overview.find(itemprop='contentRating')
    if not rating_tag:
        log.debug('No rating found for %s' % self.imdb_id)
    else:
        self.mpaa_rating = rating_tag['content']

    image_tag = overview.find(itemprop='image')
    if not image_tag:
        log.debug('No photo found for %s' % self.imdb_id)
    else:
        self.photo = image_tag['src']

    original_tag = overview.find(attrs={'class': 'originalTitle'})
    if not original_tag:
        log.debug('No original title found for %s' % self.imdb_id)
    else:
        raw_title = original_tag.contents[0].strip()
        self.original_name = raw_title.strip('"')

    votes_tag = overview.find(itemprop='ratingCount')
    if not votes_tag:
        log.debug('No votes found for %s' % self.imdb_id)
    else:
        self.votes = str_to_int(votes_tag.text)

    score_tag = overview.find(itemprop='ratingValue')
    if not score_tag:
        log.debug('No score found for %s' % self.imdb_id)
    else:
        self.score = float(score_tag.text)

    # director(s): keyed by the id extracted from each link
    for person_link in overview.select('[itemprop="director"] > a'):
        person_id = extract_id(person_link['href'])
        person_name = person_link.text
        # a tag came back instead of a name
        if isinstance(person_name, Tag):
            person_name = None
        self.directors[person_id] = person_name

    # --- details section: languages ---
    details = soup.find('div', attrs={'id': 'titleDetails'})
    if details:
        language_links = details.find_all('a', href=re.compile('/language/'))
        for language_link in language_links:
            language = language_link.text.strip().lower()
            if language in self.languages:
                continue
            self.languages.append(language)

    # --- storyline section: plot outline and genres ---
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        paragraph = storyline.find('p')
        if not paragraph:
            log.debug('No storyline found for %s' % self.imdb_id)
        else:
            # Drop the "Written By" part before reading the text.
            if paragraph.em:
                paragraph.em.replace_with('')
            self.plot_outline = paragraph.text.strip()
        genre_names = []
        for genre_link in storyline.select('[itemprop="genre"] > a'):
            genre_names.append(genre_link.text.strip().lower())
        self.genres = genre_names

    # --- cast section ---
    cast_section = soup.find('div', attrs={'id': 'titleCast'})
    if cast_section:
        for person_link in cast_section.select('[itemprop="actor"] > a'):
            person_id = extract_id(person_link['href'])
            person_name = person_link.text.strip()
            # a tag came back instead of a name
            if isinstance(person_name, Tag):
                person_name = None
            self.actors[person_id] = person_name
def parse(self, imdb_id):
    """Download the title page for ``imdb_id`` and populate this parser from it.

    Sets ``imdb_id`` and ``url``; fills ``photo``, ``mpaa_rating``, ``name``,
    ``original_name``, ``votes``, ``score``, ``genres``, ``languages``,
    ``year``, ``actors``, ``directors`` and ``plot_outline`` when the matching
    elements are found. Missing elements are logged, not raised.

    :param imdb_id: Value handed to ``extract_id`` to obtain the IMDb id.
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.text)

    # get photo
    tag_photo = soup.find('td', attrs={'id': 'img_primary'})
    if tag_photo:
        tag_img = tag_photo.find('img')
        if tag_img:
            self.photo = tag_img.get('src')
            log.debug('Detected photo: %s', self.photo)

    # get rating. contentRating <span> in infobar.
    tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
    if tag_infobar_div:
        tag_mpaa_rating = tag_infobar_div.find('span', attrs={'itemprop': 'contentRating'})
        if tag_mpaa_rating:
            rating_classes = tag_mpaa_rating.get('class')
            if not rating_classes or not rating_classes[0].startswith('us_'):
                log.warning('Could not determine mpaa rating for %s', url)
            else:
                rating_class = rating_classes[0]
                if rating_class == 'us_not_rated':
                    self.mpaa_rating = 'NR'
                else:
                    # Slice the prefix off: lstrip('us_') strips any leading run
                    # of the characters u/s/_ and would turn e.g. 'us_unrated'
                    # into 'nrated'.
                    self.mpaa_rating = rating_class[len('us_'):].replace('_', '-').upper()
                log.debug('Detected mpaa rating: %s', self.mpaa_rating)
        else:
            log.debug('Unable to match signature of mpaa rating for %s - '
                      'could be a TV episode, or plugin needs update?', url)
    else:
        # We should match the infobar, it's an integral part of the IMDB page.
        log.warning('Unable to get infodiv class for %s - plugin needs update?', url)

    # get name (a page without <h1> is reported like a missing name, not a crash)
    tag_name = soup.find('h1')
    if tag_name:
        tag_name = tag_name.find('span', attrs={'itemprop': 'name'})
    if tag_name:
        self.name = tag_name.text
        log.debug('Detected name: %s', self.name)
    else:
        log.warning('Unable to get name for %s - plugin needs update?', url)

    tag_original_title_i = soup.find('i', text=re.compile(r'original title'))
    if tag_original_title_i:
        span = tag_original_title_i.parent
        # drop the "original title" label so only the title text remains
        tag_original_title_i.decompose()
        self.original_name = span.text.strip()
        log.debug('Detected original name: %s', self.original_name)
    else:
        # if title is already in original language, it doesn't have the tag
        log.debug('Unable to get original title for %s - it probably does not exists', url)

    # detect if movie is eligible for ratings
    rating_ineligible = soup.find('div', attrs={'class': 'rating-ineligible'})
    if rating_ineligible:
        log.debug('movie is not eligible for ratings')
    else:
        # get votes
        tag_votes = soup.find(itemprop='ratingCount')
        if tag_votes:
            self.votes = str_to_int(tag_votes.string) or 0
            log.debug('Detected votes: %s', self.votes)
        else:
            log.warning('Unable to get votes for %s - plugin needs update?', url)

        # get score - find the ratingValue item that contains a numerical value
        span_score = soup.find(itemprop='ratingValue', text=re.compile(r'[\d\.]+'))
        if span_score:
            try:
                self.score = float(span_score.string)
            except (ValueError, TypeError):
                log.debug('tag_score %r is not valid float', span_score.string)
            log.debug('Detected score: %s', self.score)
        else:
            log.warning('Unable to get score for %s - plugin needs update?', url)

    # get genres
    genres = soup.find('div', itemprop='genre')
    if genres:
        for link in genres.find_all('a'):
            self.genres.append(link.text.strip().lower())
    else:
        log.warning('Unable to find genres section for %s - plugin needs update?', url)

    # get languages
    few_words = re.compile(r'(?x) \( [^()]* \b few \b')
    for link in soup.find_all('a', href=re.compile('/language/.*')):
        # skip non-primary languages "(a few words)", etc.
        trailing = link.next_sibling
        if trailing is None or isinstance(trailing, Tag):
            # no text follows the link, so there is no qualifier to inspect
            trailing = ''
        if few_words.search(trailing):
            continue
        # normalise before the membership test so duplicates are detected
        lang = link.text.strip().lower()
        if lang not in self.languages:
            self.languages.append(lang)

    # get year
    tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
    if tag_year:
        self.year = int(tag_year.text)
        log.debug('Detected year: %s', self.year)
    else:
        # fall back to the "(YYYY)" found in the page title
        tag_title = soup.head.title if soup.head else None
        if tag_title:
            m = re.search(r'(\d{4})\)', tag_title.string or '')
            if m:
                self.year = int(m.group(1))
                log.debug('Detected year: %s', self.year)
            else:
                log.warning('Unable to get year for %s (regexp mismatch) - plugin needs update?', url)
        else:
            log.warning('Unable to get year for %s (missing title) - plugin needs update?', url)

    # get main cast
    tag_cast = soup.find('table', 'cast_list')
    if tag_cast:
        for actor in tag_cast.find_all('a', href=re.compile('/name/nm')):
            actor_id = extract_id(actor['href'])
            actor_name = actor.text.strip()
            # tag instead of name
            if isinstance(actor_name, Tag):
                actor_name = None
            self.actors[actor_id] = actor_name

    # get director(s)
    h4_director = soup.find('h4', text=re.compile('Director'))
    if h4_director:
        for director in h4_director.parent.find_all('a', href=re.compile('/name/nm')):
            director_id = extract_id(director['href'])
            director_name = director.text
            # tag instead of name
            if isinstance(director_name, Tag):
                director_name = None
            self.directors[director_id] = director_name

    log.debug('Detected genres: %s', self.genres)
    log.debug('Detected languages: %s', self.languages)
    log.debug('Detected director(s): %s', ', '.join(self.directors))
    log.debug('Detected actors: %s', ', '.join(self.actors))

    # get plot
    h2_plot = soup.find('h2', text='Storyline')
    if h2_plot:
        p_plot = h2_plot.find_next('p')
        first_node = p_plot.next if p_plot else None
        plot_text = first_node.string if first_node is not None else None
        if plot_text:
            self.plot_outline = plot_text.strip()
            log.debug('Detected plot outline: %s', self.plot_outline)
        else:
            log.debug('Plot does not have p-tag')
    else:
        log.debug('Failed to find plot')
def parse(self, imdb_id):
    """Download the title page for ``imdb_id`` and populate this parser from it.

    Sets ``imdb_id``, ``url`` and ``name``; fills ``year``, ``mpaa_rating``,
    ``photo``, ``original_name``, ``votes``, ``score``, ``meta_score``,
    ``directors``, ``writers``, ``languages``, ``plot_outline``, ``genres``
    and ``actors`` when the matching elements are present on the page.

    :param imdb_id: Value handed to ``extract_id`` to obtain the IMDb id.
    :raises PluginError: When the ``title-overview`` div or the name heading
        cannot be found on the page.
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.text)

    title_overview = soup.find('div', attrs={'class': 'title-overview'})
    if not title_overview:
        raise PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        )

    # Parse stuff from the title-overview section
    name_elem = title_overview.find('h1', attrs={'itemprop': 'name'})
    if name_elem:
        self.name = name_elem.contents[0].strip()
    else:
        log.error('Possible IMDB parser needs updating, Please report on Github.')
        raise PluginError('Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url))

    year = title_overview.find('span', attrs={'id': 'titleYear'})
    if year:
        m = re.search(r'([0-9]{4})', year.text)
        if m:
            self.year = int(m.group(1))

    if not self.year:
        log.debug('No year found for %s', self.imdb_id)

    mpaa_rating_elem = title_overview.find(itemprop='contentRating')
    if mpaa_rating_elem:
        self.mpaa_rating = mpaa_rating_elem['content']
    else:
        log.debug('No rating found for %s', self.imdb_id)

    photo_elem = title_overview.find(itemprop='image')
    if photo_elem:
        self.photo = photo_elem['src']
    else:
        log.debug('No photo found for %s', self.imdb_id)

    original_name_elem = title_overview.find(attrs={'class': 'originalTitle'})
    if original_name_elem:
        self.original_name = original_name_elem.contents[0].strip().strip('"')
    else:
        log.debug('No original title found for %s', self.imdb_id)

    votes_elem = title_overview.find(itemprop='ratingCount')
    if votes_elem:
        self.votes = str_to_int(votes_elem.text)
    else:
        log.debug('No votes found for %s', self.imdb_id)

    score_elem = title_overview.find(itemprop='ratingValue')
    if score_elem:
        self.score = float(score_elem.text)
    else:
        log.debug('No score found for %s', self.imdb_id)

    meta_score_elem = title_overview.find(attrs={'class': 'metacriticScore'})
    if meta_score_elem:
        self.meta_score = str_to_int(meta_score_elem.text)
    else:
        log.debug('No Metacritic score found for %s', self.imdb_id)

    # get director(s)
    for director in title_overview.select('[itemprop="director"] > a'):
        director_id = extract_id(director['href'])
        director_name = director.text
        # tag instead of name
        if isinstance(director_name, Tag):
            director_name = None
        self.directors[director_id] = director_name

    # get writer(s)
    for writer in title_overview.select('[itemprop="creator"] > a'):
        writer_id = extract_id(writer['href'])
        writer_name = writer.text
        # tag instead of name
        if isinstance(writer_name, Tag):
            writer_name = None
        self.writers[writer_id] = writer_name

    # Details section
    title_details = soup.find('div', attrs={'id': 'titleDetails'})
    if title_details:
        # get languages
        # Raw string: '\?' in a plain literal is an invalid escape sequence
        # (the compiled pattern is unchanged).
        language_href = re.compile(r'^/search/title\?title_type=feature&primary_language=')
        for link in title_details.find_all('a', href=language_href):
            lang = link.text.strip().lower()
            if lang not in self.languages:
                self.languages.append(lang)

    # Storyline section
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        plot_elem = storyline.find('p')
        if plot_elem:
            # Remove the "Written By" part.
            if plot_elem.em:
                plot_elem.em.replace_with('')
            self.plot_outline = plot_elem.text.strip()
        else:
            log.debug('No storyline found for %s', self.imdb_id)

        self.genres = [
            i.text.strip().lower() for i in storyline.select('[itemprop="genre"] > a')
        ]

    # Cast section
    cast = soup.find('div', attrs={'id': 'titleCast'})
    if cast:
        for actor in cast.select('[itemprop="actor"] > a'):
            actor_id = extract_id(actor['href'])
            actor_name = actor.text.strip()
            # tag instead of name
            if isinstance(actor_name, Tag):
                actor_name = None
            self.actors[actor_id] = actor_name
def parse(self, data=None):
    """Parse movie name. Populates name, year, quality and proper_count attributes.

    The title is normalised, split on spaces, and scanned for "cut" words
    (year, group name, cutoff words, propers). Everything from the cut
    position - or the earliest quality token, if that comes first - is
    dropped, and what remains is stored as the name. ``year_pos`` is also
    updated whenever a year is recognised.

    :param data: Title to parse. When ``None``, ``self.data`` is used.
    """
    # Reset before parsing, so the parser can be reused.
    self.reset()

    if data is None:
        data = self.data

    # Move anything in leading brackets to the end
    data = re.sub(r'^\[(.*?)\](.*)', r'\2 \1', data)

    for char in '[]()_,.':
        data = data.replace(char, ' ')

    # if there are no spaces, treat dashes as the separators instead
    if ' ' not in data:
        data = data.replace('-', ' ')

    # remove unwanted words (imax, ..)
    # NOTE(review): the return value is discarded; if remove_words returns the
    # cleaned text rather than working through side effects, this call does
    # not change `data` - confirm against its definition.
    self.remove_words(data, self.remove)

    data = self.strip_spaces(data)

    # split to parts
    parts = data.split(' ')
    cut_part = 256  # stays 256 when no cut word is found
    all_caps = True
    current_year = datetime.now().year  # loop-invariant upper bound for years
    for part_pos, part in enumerate(parts):
        # Don't let the first word be cutoff word
        if part_pos < 1:
            continue
        cut = False

        # check for year
        num = str_to_int(part)
        if num is not None and 1930 < num <= current_year:
            if self.year_pos == cut_part:
                # Looks like a year, but we already set the cutpoint to a year, let's move it forward
                cut_part = part_pos
            self.year = num
            self.year_pos = part_pos
            cut = True

        # Don't consider all caps words cut words if the whole title has been all caps
        if not part.isupper():
            all_caps = False

        # if length > 3 and whole word in uppers, consider as cut word (most likely a group name)
        if len(part) > 3 and part.isupper() and part.isalpha() and not all_caps:
            cut = True

        lowered = part.lower()
        # check for cutoff words
        if lowered in self.cutoffs:
            cut = True

        # check for propers
        if lowered in self.propers:
            # 'real' and 'final' are too common in movie titles, only cut if it comes after year
            if lowered not in ['real', 'final'] or self.year:
                self.proper_count += 1
                cut = True

        # update cut position
        # Compare the current position, not parts.index(part): index() returns
        # the first occurrence, so a cut word repeating an earlier token (e.g.
        # the exempt first word) used to push an existing cut forward.
        if cut and part_pos < cut_part:
            cut_part = part_pos

    if cut_part != 256:
        log.debug('parts: %s, cut is: %s', parts, parts[cut_part])

    # calculate cut position from cut_part
    abs_cut = len(' '.join(parts[:cut_part]))

    log.debug('after parts check, cut data would be: `%s` abs_cut: %i', data[:abs_cut], abs_cut)

    # parse quality
    quality = qualities.Quality(data)
    if quality:
        self.quality = quality
        # remaining string is same as data but quality information removed
        # find out position where there is first difference, this is earliest
        # quality bit, anything after that has no relevance to the movie name
        dp = diff_pos(data, quality.clean_text)
        if dp is not None:
            log.debug('quality start: %s', dp)
            if dp < abs_cut:
                log.debug('quality cut is even shorter')
                abs_cut = dp

    # make cut
    data = data[:abs_cut].strip()
    log.debug('data cut to `%s` - this will be the name', data)

    # save results
    self.name = data
def parse(self, imdb_id, soup=None):
    """Populate this parser from the title page of ``imdb_id``.

    Reads most fields from the page's ``application/ld+json`` script and the
    rest from the HTML, trying the older element ids/classes first and the
    newer ones as a fallback.

    :param imdb_id: Value handed to ``extract_id`` to obtain the IMDb id.
    :param soup: Already parsed page; when falsy the page is downloaded.
    :raises plugin.PluginError: When the title container, the ld+json data or
        the name cannot be found.
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url

    if not soup:
        page = requests.get(url)
        soup = get_soup(page.text)

    title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})
    if not title_wrapper:
        # New layout, transitional
        title_wrapper = soup.find(
            'div', {"class": re.compile("^TitleBlock__TitleContainer.?")}
        )
    if not title_wrapper:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        )

    # A missing or unparsable ld+json script is reported as the same
    # PluginError as an empty one, instead of an AttributeError/ValueError.
    data_elem = soup.find('script', {'type': 'application/ld+json'})
    try:
        data = json.loads(data_elem.text) if data_elem else None
    except ValueError as e:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        ) from e
    if not data:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        )

    # Parse stuff from the title-overview section
    # .get() so that a missing key reaches the explicit error branch below
    name_elem = data.get('name')
    if name_elem:
        self.name = name_elem.strip()
    else:
        logger.error('Possible IMDB parser needs updating, Please report on Github.')
        raise plugin.PluginError(
            'Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url)
        )

    year = soup.find('span', attrs={'id': 'titleYear'})
    if not year:
        # Test new layout
        year = title_wrapper.find(
            'span', {"class": re.compile("^TitleBlockMetaData__ListItemText.?")}
        )
    if year:
        m = re.search(r'([0-9]{4})', year.text)
        if m:
            self.year = int(m.group(1))

    if not self.year:
        logger.debug('No year found for {}', self.imdb_id)

    mpaa_rating_elem = data.get('contentRating')
    if mpaa_rating_elem:
        self.mpaa_rating = mpaa_rating_elem
    else:
        logger.debug('No rating found for {}', self.imdb_id)

    photo_elem = data.get('image')
    if photo_elem:
        self.photo = photo_elem
    else:
        logger.debug('No photo found for {}', self.imdb_id)

    strip_pre_text = False
    original_name_elem = title_wrapper.find('div', {'class': 'originalTitle'})
    if not original_name_elem:
        # Test new layout
        strip_pre_text = True
        original_name_elem = title_wrapper.find(
            'div', {"class": re.compile("^OriginalTitle.?")}
        )

    if original_name_elem:
        # when an original title exists, the displayed name comes from the <h1>
        self.name = title_wrapper.find('h1').contents[0].strip()
        self.original_name = original_name_elem.contents[0].strip().strip('"')
        if strip_pre_text:
            # Keep only what follows the leading "<text>: " part.
            striped_text = re.search(r"([^\:]*)\:? (.*)", self.original_name)
            # re.search returns None when the text contains no space at all;
            # leave the original name untouched in that case.
            if striped_text:
                self.original_name = striped_text.group(2)
    else:
        logger.debug('No original title found for {}', self.imdb_id)

    votes_elem = data.get('aggregateRating', {}).get('ratingCount')
    if votes_elem:
        self.votes = votes_elem if isinstance(votes_elem, int) else str_to_int(votes_elem)
    else:
        logger.debug('No votes found for {}', self.imdb_id)

    score_elem = data.get('aggregateRating', {}).get('ratingValue')
    if score_elem:
        self.score = float(score_elem)
    else:
        logger.debug('No score found for {}', self.imdb_id)

    meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
    if not meta_score_elem:
        # Test new layout
        meta_score_elem = soup.find('span', attrs={'class': 'score-meta'})
    if meta_score_elem:
        self.meta_score = str_to_int(meta_score_elem.text)
    else:
        logger.debug('No Metacritic score found for {}', self.imdb_id)

    # get director(s)
    directors = data.get('director', [])
    if not isinstance(directors, list):
        directors = [directors]

    for director in directors:
        # only entries typed as Person are kept
        if director.get('@type') != 'Person':
            continue
        director_id = extract_id(director['url'])
        director_name = director['name']
        self.directors[director_id] = director_name

    # get writer(s)
    writers = data.get('creator', [])
    if not isinstance(writers, list):
        writers = [writers]

    for writer in writers:
        # only entries typed as Person are kept
        if writer.get('@type') != 'Person':
            continue
        writer_id = extract_id(writer['url'])
        writer_name = writer['name']
        self.writers[writer_id] = writer_name

    # Details section
    title_details = soup.find('div', attrs={'id': 'titleDetails'})
    if not title_details:
        # Test new layout
        title_details = soup.find('div', attrs={'data-testid': 'title-details-section'})

    if title_details:
        # get languages
        language_href = re.compile(r'^/search/title\?title_type=feature&primary_language=')
        for link in title_details.find_all('a', href=language_href):
            lang = link.text.strip().lower()
            if lang not in self.languages:
                self.languages.append(lang)

    # Storyline section
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        plot_elem = storyline.find('p')
        if plot_elem:
            # Remove the "Written By" part.
            if plot_elem.em:
                plot_elem.em.replace_with('')
            self.plot_outline = plot_elem.text.strip()
        else:
            logger.debug('No storyline found for {}', self.imdb_id)

        # a storyline without any <h4> simply has no keywords
        keyword_heading = storyline.find('h4')
        keyword_elem = keyword_heading.parent if keyword_heading else None
        if keyword_elem:
            # The last "a" tag is a link to the full list
            self.plot_keywords = [
                keyword_link.text.strip() for keyword_link in keyword_elem.find_all("a")[:-1]
            ]
    else:
        # Test new layout
        storyline = soup.find('div', attrs={'data-testid': 'storyline-plot-summary'})
        if storyline:
            self.plot_outline = storyline.text

        keyword_elem = soup.find('div', attrs={'data-testid': 'storyline-plot-keywords'})
        if keyword_elem:
            # as in the old layout, the last "a" tag is left out
            self.plot_keywords = [
                keyword_link.text.strip() for keyword_link in keyword_elem.find_all("a")[:-1]
            ]

    genres = data.get('genre', [])
    if not isinstance(genres, list):
        genres = [genres]

    self.genres = [g.strip().lower() for g in genres]

    # Cast section
    cast = soup.find('table', attrs={'class': 'cast_list'})
    if cast:
        for actor in cast.select('tr > td:nth-of-type(2) > a'):
            actor_id = extract_id(actor['href'])
            actor_name = actor.text.strip()
            # tag instead of name
            if isinstance(actor_name, Tag):
                actor_name = None
            self.actors[actor_id] = actor_name
    else:
        # Test new layout
        cast = soup.find_all('a', attrs={'data-testid': 'title-cast-item__actor'})
        if cast:
            for actor in cast:
                actor_id = extract_id(actor['href'])
                actor_name = actor.text.strip()
                # tag instead of name
                if isinstance(actor_name, Tag):
                    actor_name = None
                self.actors[actor_id] = actor_name
def parse(self, imdb_id):
    """Download the title page for ``imdb_id`` and populate this parser from it.

    Sets ``imdb_id`` and ``url``; fills ``photo``, ``mpaa_rating``, ``name``,
    ``original_name``, ``votes``, ``score``, ``genres``, ``languages``,
    ``year``, ``actors``, ``directors`` and ``plot_outline`` when the matching
    elements are found. Missing elements are logged, not raised.

    :param imdb_id: Value handed to ``extract_id`` to obtain the IMDb id.
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.text)

    # get photo
    tag_photo = soup.find('td', attrs={'id': 'img_primary'})
    if tag_photo:
        tag_img = tag_photo.find('img')
        if tag_img:
            self.photo = tag_img.get('src')
            log.debug('Detected photo: %s', self.photo)

    # get rating. contentRating <span> in infobar.
    tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
    if tag_infobar_div:
        tag_mpaa_rating = tag_infobar_div.find('span', attrs={'itemprop': 'contentRating'})
        if tag_mpaa_rating:
            rating_classes = tag_mpaa_rating.get('class')
            if not rating_classes or not rating_classes[0].startswith('us_'):
                log.warning('Could not determine mpaa rating for %s', url)
            else:
                rating_class = rating_classes[0]
                if rating_class == 'us_not_rated':
                    self.mpaa_rating = 'NR'
                else:
                    # Slice the prefix off: lstrip('us_') strips any leading run
                    # of the characters u/s/_ and would turn e.g. 'us_unrated'
                    # into 'nrated'.
                    self.mpaa_rating = rating_class[len('us_'):].replace('_', '-').upper()
                log.debug('Detected mpaa rating: %s', self.mpaa_rating)
        else:
            log.debug('Unable to match signature of mpaa rating for %s - '
                      'could be a TV episode, or plugin needs update?', url)
    else:
        # We should match the infobar, it's an integral part of the IMDB page.
        log.warning('Unable to get infodiv class for %s - plugin needs update?', url)

    # get name
    tag_name = soup.find('h1')
    if tag_name:
        tag_name = tag_name.find('span', attrs={'itemprop': 'name'})
    if tag_name:
        self.name = tag_name.text
        log.debug('Detected name: %s', self.name)
    else:
        log.warning('Unable to get name for %s - plugin needs update?', url)

    tag_original_title_i = soup.find('i', text=re.compile(r'original title'))
    if tag_original_title_i:
        span = tag_original_title_i.parent
        # drop the "original title" label so only the title text remains
        tag_original_title_i.decompose()
        self.original_name = span.text.strip().strip('"')
        log.debug('Detected original name: %s', self.original_name)
    else:
        # if title is already in original language, it doesn't have the tag
        log.debug('Unable to get original title for %s - it probably does not exists', url)

    star_box = soup.find('div', attrs={'class': 'star-box giga-star'})
    if star_box:
        # detect if movie is eligible for ratings
        rating_ineligible = star_box.find('div', attrs={'class': 'rating-ineligible'})
        if rating_ineligible:
            log.debug('movie is not eligible for ratings')
        else:
            # get votes
            tag_votes = star_box.find(itemprop='ratingCount')
            if tag_votes:
                self.votes = str_to_int(tag_votes.string) or 0
                log.debug('Detected votes: %s', self.votes)
            else:
                log.warning('Unable to get votes for %s - plugin needs update?', url)

            # get score - find the ratingValue item that contains a numerical value
            span_score = star_box.find(itemprop='ratingValue', text=re.compile(r'[\d\.]+'))
            if span_score:
                try:
                    self.score = float(span_score.string)
                except (ValueError, TypeError):
                    log.debug('tag_score %r is not valid float', span_score.string)
                log.debug('Detected score: %s', self.score)
            else:
                log.warning('Unable to get score for %s - plugin needs update?', url)
    else:
        log.warning('Unable to find score/vote section for %s - plugin needs update?', url)

    # get genres
    genres = soup.find('div', itemprop='genre')
    if genres:
        for link in genres.find_all('a'):
            self.genres.append(link.text.strip().lower())
    else:
        log.warning('Unable to find genres section for %s - plugin needs update?', url)

    # get languages
    few_words = re.compile(r'(?x) \( [^()]* \b few \b')
    for link in soup.find_all('a', href=re.compile('/language/.*')):
        # skip non-primary languages "(a few words)", etc.
        trailing = link.next_sibling
        if trailing is None or isinstance(trailing, Tag):
            # no text follows the link, so there is no qualifier to inspect
            trailing = ''
        if few_words.search(trailing):
            continue
        # normalise before the membership test so duplicates are detected
        lang = link.text.strip().lower()
        if lang not in self.languages:
            self.languages.append(lang)

    # get year
    tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
    if tag_year:
        self.year = int(tag_year.text)
        log.debug('Detected year: %s', self.year)
    else:
        # fall back to the "(YYYY)" found in the page title
        tag_title = soup.head.title if soup.head else None
        if tag_title:
            m = re.search(r'(\d{4})\)', tag_title.string or '')
            if m:
                self.year = int(m.group(1))
                log.debug('Detected year: %s', self.year)
            else:
                log.warning('Unable to get year for %s (regexp mismatch) - plugin needs update?', url)
        else:
            log.warning('Unable to get year for %s (missing title) - plugin needs update?', url)

    # get main cast
    tag_cast = soup.find('table', 'cast_list')
    if tag_cast:
        for actor in tag_cast.find_all('a', href=re.compile('/name/nm')):
            actor_id = extract_id(actor['href'])
            actor_name = actor.text.strip()
            # tag instead of name
            if isinstance(actor_name, Tag):
                actor_name = None
            self.actors[actor_id] = actor_name

    # get director(s)
    h4_director = soup.find('h4', text=re.compile('Director'))
    if h4_director:
        for director in h4_director.parent.find_all('a', href=re.compile('/name/nm')):
            director_id = extract_id(director['href'])
            director_name = director.text
            # tag instead of name
            if isinstance(director_name, Tag):
                director_name = None
            self.directors[director_id] = director_name

    log.debug('Detected genres: %s', self.genres)
    log.debug('Detected languages: %s', self.languages)
    log.debug('Detected director(s): %s', ', '.join(self.directors))
    log.debug('Detected actors: %s', ', '.join(self.actors))

    # get plot
    h2_plot = soup.find('h2', text='Storyline')
    if h2_plot:
        p_plot = h2_plot.find_next('p')
        first_node = p_plot.next if p_plot else None
        plot_text = first_node.string if first_node is not None else None
        if plot_text:
            self.plot_outline = plot_text.strip()
            log.debug('Detected plot outline: %s', self.plot_outline)
        else:
            log.debug('Plot does not have p-tag')
    else:
        log.debug('Failed to find plot')
def parse(self, imdb_id):
    """Download the title page for ``imdb_id`` and populate this parser from it.

    Sets ``imdb_id`` and ``url``; fills ``year``, ``name``, ``mpaa_rating``,
    ``photo``, ``original_name``, ``votes``, ``score``, ``directors``,
    ``languages``, ``plot_outline``, ``genres`` and ``actors`` when the
    matching elements are present. When the ``title-overview`` div or the name
    heading is missing, an error is logged and parsing stops without raising.

    :param imdb_id: Value handed to ``extract_id`` to obtain the IMDb id.
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.text)

    title_overview = soup.find('div', attrs={'class': 'title-overview'})
    if not title_overview:
        log.error('IMDB parser needs updating, imdb format changed.')
        return

    # Parse the year from the page title, no good places in the body (in current format)
    # A page without <title> yields no year instead of an AttributeError.
    page_title = soup.title.text if soup.title else ''
    year_match = re.search(r'\((\d{4})\) - IMDb', page_title)
    if year_match:
        self.year = int(year_match.group(1))

    # Parse stuff from the title-overview section
    name_elem = title_overview.find('h1', itemprop='name')
    if not name_elem:
        # same soft failure as a missing overview, instead of an AttributeError
        log.error('IMDB parser needs updating, imdb format changed.')
        return
    self.name = name_elem.text

    mpaa_rating_elem = title_overview.find(itemprop='contentRating')
    if mpaa_rating_elem:
        self.mpaa_rating = mpaa_rating_elem['content']
    else:
        log.debug('No rating found for %s', self.imdb_id)

    photo_elem = title_overview.find(itemprop='image')
    if photo_elem:
        self.photo = photo_elem['src']
    else:
        log.debug('No photo found for %s', self.imdb_id)

    original_name_elem = title_overview.find(attrs={'class': 'originalTitle'})
    if original_name_elem:
        # first direct text node only; text inside child tags is ignored
        self.original_name = original_name_elem.find(text=True, recursive=False)
    else:
        log.debug('No original title found for %s', self.imdb_id)

    votes_elem = title_overview.find(itemprop='ratingCount')
    if votes_elem:
        self.votes = str_to_int(votes_elem.text)
    else:
        log.debug('No votes found for %s', self.imdb_id)

    score_elem = title_overview.find(itemprop='ratingValue')
    if score_elem:
        self.score = float(score_elem.text)
    else:
        log.debug('No score found for %s', self.imdb_id)

    # get director(s)
    for director in title_overview.select('[itemprop="director"] > a'):
        director_id = extract_id(director['href'])
        director_name = director.text
        # tag instead of name
        if isinstance(director_name, Tag):
            director_name = None
        self.directors[director_id] = director_name

    # Details section
    title_details = soup.find('div', attrs={'id': 'titleDetails'})
    if title_details:
        # get languages
        for link in title_details.find_all('a', href=re.compile('/language/')):
            lang = link.text.strip().lower()
            if lang not in self.languages:
                self.languages.append(lang)

    # Storyline section
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        plot_elem = storyline.find('p')
        # first direct text node of the paragraph; None when the paragraph is
        # missing or has no direct text
        plot_text = plot_elem.find(text=True, recursive=False) if plot_elem else None
        if plot_text is not None:
            self.plot_outline = plot_text.strip()
        else:
            log.debug('No storyline found for %s', self.imdb_id)
        self.genres = [i.text.strip().lower() for i in storyline.select('[itemprop="genre"] > a')]

    # Cast section
    cast = soup.find('div', attrs={'id': 'titleCast'})
    if cast:
        for actor in cast.select('[itemprop="actor"] > a'):
            actor_id = extract_id(actor['href'])
            actor_name = actor.text.strip()
            # tag instead of name
            if isinstance(actor_name, Tag):
                actor_name = None
            self.actors[actor_id] = actor_name
def search(self, name):
    """Search IMDb titles for ``name`` and return the scored candidates.

    When the search request is redirected straight to a title page, that
    page is parsed and returned as the single result with ``match`` 1.0.
    Otherwise each row of the ``findList`` table (up to ``self.max_results``)
    becomes a dict with ``name``, ``imdb_id``, ``url``, ``thumbnail``,
    ``match`` and, when present, ``year``. ``match`` is the best
    ``difflib`` ratio of the title or one of its akas (akas scaled by
    ``self.aka_weight``), boosted by position using ``self.first_weight``.

    :param name: title to search for
    :return: list of movie dicts sorted by descending ``match``; empty when
        the results table is missing
    """
    logger.debug('Searching: {}', name)
    url = 'https://www.imdb.com/find'
    # This may include Shorts and TV series in the results
    params = {'q': name, 's': 'tt'}

    logger.debug('Search query: {}', repr(url))
    page = requests.get(url, params=params)
    actual_url = page.url

    movies = []
    soup = get_soup(page.text)

    # in case we got redirected to movie page (perfect match)
    re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
    if re_m:
        actual_url = re_m.group(0)
        imdb_id = extract_id(actual_url)
        movie_parse = ImdbParser()
        movie_parse.parse(imdb_id, soup=soup)
        logger.debug('Perfect hit. Search got redirected to {}', actual_url)
        movies.append(
            {
                'match': 1.0,
                'name': movie_parse.name,
                'imdb_id': imdb_id,
                'url': make_url(imdb_id),
                'year': movie_parse.year,
            }
        )
        return movies

    section_table = soup.find('table', 'findList')
    if not section_table:
        logger.debug('results table not found')
        # Always hand back a list so callers can iterate the result.
        return movies

    rows = section_table.find_all('tr')
    if not rows:
        logger.debug('Titles section does not have links')

    def is_junk(char):
        # Spaces are ignored by the sequence matcher.
        return char == ' '

    wanted = name.title()
    for count, row in enumerate(rows):
        # Title search gives a lot of results, only check the first ones
        if count > self.max_results:
            break

        result_text = row.find('td', 'result_text')
        if result_text is None:
            logger.debug('skipping row without result text')
            continue

        movie = {}
        additional = re.findall(r'\((.*?)\)', result_text.text)
        if additional:
            if re.match(r'^\d{4}$', additional[-1]):
                movie['year'] = str_to_int(additional[-1])
            elif len(additional) > 1:
                movie['year'] = str_to_int(additional[-2])
                if additional[-1] not in ['TV Movie', 'Video']:
                    logger.debug('skipping {}', result_text.text)
                    continue

        link = result_text.find_next('a')
        if link is None:
            logger.debug('skipping {}', result_text.text)
            continue

        # A row without a poster gets a None thumbnail instead of crashing.
        photo_cell = row.find('td', 'primary_photo')
        photo_link = photo_cell.find('a') if photo_cell else None
        photo_img = photo_link.find('img') if photo_link else None
        movie['thumbnail'] = photo_img.get('src') if photo_img else None

        movie['name'] = link.text
        movie['imdb_id'] = extract_id(link.get('href'))
        movie['url'] = make_url(movie['imdb_id'])
        logger.debug('processing name: {} url: {}', movie['name'], movie['url'])

        # calc & set best matching ratio
        ratio = difflib.SequenceMatcher(is_junk, movie['name'].title(), wanted).ratio()

        # check if some of the akas have better ratio
        for aka_tag in link.parent.find_all('i'):
            next_node = aka_tag.next
            aka = next_node.string if next_node is not None else None
            match = re.search(r'".*"', aka) if aka else None
            if not match:
                logger.debug('aka `{}` is invalid', aka)
                continue
            aka = match.group(0).replace('"', '')
            logger.trace('processing aka {}', aka)
            aka_ratio = difflib.SequenceMatcher(is_junk, aka.title(), wanted).ratio()
            # Compare the weighted value so an aka can only raise the ratio,
            # never lower it.
            weighted_ratio = aka_ratio * self.aka_weight
            if weighted_ratio > ratio:
                ratio = weighted_ratio
                logger.debug(
                    '- aka `{}` matches better to `{}` ratio {} (weighted to {})',
                    aka,
                    name,
                    aka_ratio,
                    ratio,
                )

        # prioritize items by position
        position_ratio = (self.first_weight - 1) / (count + 1) + 1
        logger.debug(
            '- prioritizing based on position {} `{}`: {}', count, movie['url'], position_ratio
        )
        ratio *= position_ratio

        # store ratio
        movie['match'] = ratio
        movies.append(movie)

    movies.sort(key=lambda x: x['match'], reverse=True)
    return movies
def parse(self, imdb_id):
    """Download the IMDb title page for ``imdb_id`` and fill in this object.

    ``imdb_id`` is passed through ``extract_id`` and stored, together with
    the page URL, before the page is fetched. Name, year, MPAA rating,
    photo, original name, votes, score, directors, languages, plot outline,
    genres and actors are then read from the page. An element that cannot
    be found is logged at debug level and its attribute is left untouched.
    If the ``title-overview`` section is missing an error is logged and
    nothing beyond ``imdb_id`` and ``url`` is set.

    :param imdb_id: value handed to ``extract_id`` to obtain the IMDb id
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.text)

    title_overview = soup.find('div', attrs={'class': 'title-overview'})
    if not title_overview:
        log.error('IMDB parser needs updating, imdb format changed.')
        return

    # Parse stuff from the title-overview section
    name_elem = title_overview.find('h1', itemprop='name')
    # The name is the heading's own text node; the year sits in a nested link.
    name_text = name_elem.find(text=True, recursive=False) if name_elem else None
    if name_text is not None:
        self.name = name_text.strip()
    else:
        log.debug('No name found for %s', self.imdb_id)

    year_elem = name_elem.find('a') if name_elem else None
    if year_elem:
        try:
            self.year = int(year_elem.text)
        except ValueError:
            # The link text is not a plain number.
            log.debug('No year found for %s', self.imdb_id)
    else:
        log.debug('No year found for %s', self.imdb_id)

    mpaa_rating_elem = title_overview.find(itemprop='contentRating')
    if mpaa_rating_elem:
        self.mpaa_rating = mpaa_rating_elem['content']
    else:
        log.debug('No rating found for %s', self.imdb_id)

    photo_elem = title_overview.find(itemprop='image')
    if photo_elem:
        self.photo = photo_elem['src']
    else:
        log.debug('No photo found for %s', self.imdb_id)

    original_name_elem = title_overview.find(attrs={'class': 'originalTitle'})
    if original_name_elem:
        self.original_name = original_name_elem.find(text=True, recursive=False)
    else:
        log.debug('No original title found for %s', self.imdb_id)

    votes_elem = title_overview.find(itemprop='ratingCount')
    if votes_elem:
        self.votes = str_to_int(votes_elem.text)
    else:
        log.debug('No votes found for %s', self.imdb_id)

    score_elem = title_overview.find(itemprop='ratingValue')
    if score_elem:
        self.score = float(score_elem.text)
    else:
        log.debug('No score found for %s', self.imdb_id)

    # get director(s)
    # ``.text`` is always a string, so no Tag check is needed.
    for director in title_overview.select('[itemprop="director"] > a'):
        self.directors[extract_id(director['href'])] = director.text

    # Details section
    title_details = soup.find('div', attrs={'id': 'titleDetails'})
    if title_details:
        # get languages
        for link in title_details.find_all('a', href=re.compile('/language/')):
            lang = link.text.strip().lower()
            if lang not in self.languages:
                self.languages.append(lang)

    # Storyline section
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        plot_elem = storyline.find('p')
        # The paragraph may exist without a direct text node.
        plot_text = plot_elem.find(text=True, recursive=False) if plot_elem else None
        if plot_text is not None:
            self.plot_outline = plot_text.strip()
        else:
            log.debug('No storyline found for %s', self.imdb_id)
        self.genres = [
            genre.text.strip().lower()
            for genre in storyline.select('[itemprop="genre"] > a')
        ]

    # Cast section
    cast = soup.find('div', attrs={'id': 'titleCast'})
    if cast:
        for actor in cast.select('[itemprop="actor"] > a'):
            self.actors[extract_id(actor['href'])] = actor.text.strip()
def parse(self, imdb_id, soup=None):
    """Fill in this object from an IMDb title page.

    Most fields come from the page's ``application/ld+json`` metadata
    block; year, original title, Metacritic score, languages, plot and
    cast are scraped from the HTML. Optional elements that are missing are
    logged at debug level and skipped.

    :param imdb_id: value handed to ``extract_id`` to obtain the IMDb id
    :param soup: already parsed page; when falsy the page is downloaded
    :raises PluginError: when the metadata block or the title name is missing
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    if not soup:
        page = requests.get(url)
        soup = get_soup(page.text)

    title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})

    # A missing <script> must end in PluginError, not AttributeError.
    ld_json = soup.find('script', {'type': 'application/ld+json'})
    data = json.loads(ld_json.text) if ld_json else None
    if not data:
        raise PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        )

    # Parse stuff from the title-overview section
    name_elem = data.get('name')
    if name_elem:
        self.name = name_elem.strip()
    else:
        log.error('Possible IMDB parser needs updating, Please report on Github.')
        raise PluginError(
            'Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url)
        )

    year = soup.find('span', attrs={'id': 'titleYear'})
    if year:
        m = re.search(r'([0-9]{4})', year.text)
        if m:
            self.year = int(m.group(1))

    if not self.year:
        log.debug('No year found for %s', self.imdb_id)

    mpaa_rating_elem = data.get('contentRating')
    if mpaa_rating_elem:
        self.mpaa_rating = mpaa_rating_elem
    else:
        log.debug('No rating found for %s', self.imdb_id)

    photo_elem = data.get('image')
    if photo_elem:
        self.photo = photo_elem
    else:
        log.debug('No photo found for %s', self.imdb_id)

    original_name_elem = (
        title_wrapper.find('div', {'class': 'originalTitle'}) if title_wrapper else None
    )
    if original_name_elem:
        # When an original title is shown, the heading holds the displayed name.
        heading = title_wrapper.find('h1')
        if heading and heading.contents:
            self.name = heading.contents[0].strip()
        self.original_name = original_name_elem.contents[0].strip().strip('"')
    else:
        log.debug('No original title found for %s', self.imdb_id)

    # ``or {}`` also covers an explicit JSON null.
    rating_data = data.get('aggregateRating') or {}

    votes_elem = rating_data.get('ratingCount')
    if votes_elem:
        self.votes = votes_elem if isinstance(votes_elem, int) else str_to_int(votes_elem)
    else:
        log.debug('No votes found for %s', self.imdb_id)

    score_elem = rating_data.get('ratingValue')
    if score_elem:
        self.score = float(score_elem)
    else:
        log.debug('No score found for %s', self.imdb_id)

    meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
    if meta_score_elem:
        self.meta_score = str_to_int(meta_score_elem.text)
    else:
        log.debug('No Metacritic score found for %s', self.imdb_id)

    # get director(s) and writer(s); each may be a single object or a list
    for key, people_by_id in (('director', self.directors), ('creator', self.writers)):
        people = data.get(key) or []
        if not isinstance(people, list):
            people = [people]
        for person in people:
            if person['@type'] != 'Person':
                continue
            people_by_id[extract_id(person['url'])] = person['name']

    # Details section
    title_details = soup.find('div', attrs={'id': 'titleDetails'})
    if title_details:
        # get languages
        language_href = re.compile(r'^/search/title\?title_type=feature&primary_language=')
        for link in title_details.find_all('a', href=language_href):
            lang = link.text.strip().lower()
            if lang not in self.languages:
                self.languages.append(lang)

    # Storyline section
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        plot_elem = storyline.find('p')
        if plot_elem:
            # Remove the "Written By" part.
            if plot_elem.em:
                plot_elem.em.replace_with('')
            self.plot_outline = plot_elem.text.strip()
        else:
            log.debug('No storyline found for %s', self.imdb_id)

    genres = data.get('genre') or []
    if not isinstance(genres, list):
        genres = [genres]
    self.genres = [g.strip().lower() for g in genres]

    # Cast section
    cast = soup.find('table', attrs={'class': 'cast_list'})
    if cast:
        for actor in cast.select('tr > td:nth-of-type(2) > a'):
            self.actors[extract_id(actor['href'])] = actor.text.strip()
def parse(self, imdb_id):
    """Download the IMDb title page for ``imdb_id`` and fill in this object.

    Reads photo, MPAA rating, name, votes, score, genres, languages, year,
    main cast, directors and plot outline from the page. Anything that
    cannot be located is logged and the attribute is left untouched.

    :param imdb_id: value handed to ``extract_id`` to obtain the IMDb id
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.content)

    # get photo
    tag_photo = soup.find('div', attrs={'class': 'photo'})
    if tag_photo:
        tag_img = tag_photo.find('img')
        if tag_img:
            self.photo = tag_img.get('src')
            log.debug('Detected photo: %s', self.photo)

    # get rating. Always the first absmiddle.
    tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
    if tag_infobar_div:
        tag_mpaa_rating = tag_infobar_div.find('img', attrs={'class': 'absmiddle'})
        if tag_mpaa_rating:
            rating_alt = tag_mpaa_rating.get('alt')
            if rating_alt is None or rating_alt != tag_mpaa_rating.get('title'):
                # If we've found something of class absmiddle in the infobar,
                # it should be mpaa_rating, since that's the only one in there.
                log.warning(
                    "MPAA rating alt and title don't match for URL %s - plugin needs an update?",
                    url,
                )
            else:
                self.mpaa_rating = rating_alt
                log.debug('Detected mpaa rating: %s', self.mpaa_rating)
        else:
            log.debug(
                'Unable to match signature of mpaa rating for %s - '
                'could be a TV episode, or plugin needs update?',
                url,
            )
    else:
        # We should match the infobar, it's an integral part of the IMDB page.
        log.warning('Unable to get infodiv class for %s - plugin needs update?', url)

    # get name
    tag_name = soup.find('h1')
    if tag_name:
        if tag_name.next:
            # Handle a page not found in IMDB. tag_name.string is
            # "<br/> Page Not Found" and there is no next tag. Thus, None.
            if tag_name.next.string is not None:
                self.name = tag_name.next.string.strip()
                log.debug('Detected name: %s', self.name)
    else:
        log.warning('Unable to get name for %s - plugin needs update?', url)

    # detect if movie is eligible for ratings
    rating_ineligible = soup.find('div', attrs={'class': 'rating-ineligible'})
    if rating_ineligible:
        log.debug('movie is not eligible for ratings')
    else:
        # get votes
        tag_votes = soup.find(itemprop='ratingCount')
        if tag_votes:
            self.votes = str_to_int(tag_votes.string) or 0
            log.debug('Detected votes: %s', self.votes)
        else:
            log.warning('Unable to get votes for %s - plugin needs update?', url)

        # get score
        span_score = soup.find(itemprop='ratingValue')
        if span_score:
            try:
                self.score = float(span_score.string)
            except (TypeError, ValueError):
                # TypeError covers ``.string`` being None.
                log.debug(
                    'tag_score %s is not valid float',
                    span_score.contents[0] if span_score.contents else None,
                )
            log.debug('Detected score: %s', self.score)
        else:
            log.warning('Unable to get score for %s - plugin needs update?', url)

    # get genres
    for link in soup.find_all('a', attrs={'itemprop': 'genre'}):
        self.genres.append(unicode(link.contents[0].lower()))

    # get languages
    for link in soup.find_all('a', attrs={'itemprop': 'inLanguage'}):
        # skip non-primary languages "(a few words)", etc.
        if re.search(r'(?x) \( [^()]* \b few \b', unicode(link.next_sibling)):
            continue
        lang = unicode(link.contents[0].lower())
        if lang not in self.languages:
            self.languages.append(lang.strip())

    # get year
    tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
    page_title = soup.head.title if soup.head else None
    if tag_year:
        self.year = int(tag_year.contents[0])
        log.debug('Detected year: %s', self.year)
    elif page_title:
        m = re.search(r'(\d{4})\)', unicode(page_title.string))
        if m:
            self.year = int(m.group(1))
            log.debug('Detected year: %s', self.year)
        else:
            log.warning(
                'Unable to get year for %s (regexp mismatch) - plugin needs update?', url
            )
    else:
        log.warning('Unable to get year for %s (missing title) - plugin needs update?', url)

    # get main cast
    name_href = re.compile('/name/nm')
    tag_cast = soup.find('table', 'cast_list')
    if tag_cast:
        for actor in tag_cast.find_all('a', href=name_href):
            self.actors[extract_id(actor['href'])] = unicode(actor.contents[0])

    # get director(s)
    h4_director = soup.find('h4', text=re.compile('Director'))
    if h4_director:
        for director in h4_director.parent.parent.find_all('a', href=name_href):
            self.directors[extract_id(director['href'])] = unicode(director.contents[0])

    log.debug('Detected genres: %s', self.genres)
    log.debug('Detected languages: %s', self.languages)
    log.debug('Detected director(s): %s', ', '.join(self.directors))
    log.debug('Detected actors: %s', ', '.join(self.actors))

    # get plot
    h2_plot = soup.find('h2', text='Storyline')
    if h2_plot:
        p_plot = h2_plot.find_next('p')
        if p_plot:
            first_node = p_plot.next
            plot_text = first_node.string if first_node is not None else None
            if plot_text is not None:
                self.plot_outline = plot_text.strip()
                log.debug('Detected plot outline: %s', self.plot_outline)
            else:
                log.debug('Plot p-tag does not have text')
        else:
            log.debug('Plot does not have p-tag')
    else:
        log.debug('Failed to find plot')
def parse(self, imdb_id, soup=None):
    """Fill in this object from the JSON embedded in an IMDb title page.

    Two ``<script>`` payloads are read: the ``application/ld+json``
    metadata and the ``application/json`` page props
    (``props.pageProps.aboveTheFoldData`` / ``mainColumnData``). Optional
    fields that are absent are logged at debug level and skipped.

    :param imdb_id: value handed to ``extract_id`` to obtain the IMDb id
    :param soup: already parsed page; when falsy the page is downloaded
    :raises plugin.PluginError: when either payload, ``aboveTheFoldData``,
        ``mainColumnData`` or the title text is missing
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    if not soup:
        page = requests.get(url)
        soup = get_soup(page.text)

    def load_script_json(script_type):
        # Return the decoded JSON of the first matching <script>, or None
        # when the tag or its text is missing.
        script = soup.find('script', {'type': script_type})
        if script is None or not script.string:
            return None
        return json.loads(script.string)

    data = load_script_json('application/ld+json')
    if not data:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        )

    props_data = load_script_json('application/json')
    page_props = ((props_data or {}).get('props') or {}).get('pageProps')
    if not page_props:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb props_data format changed. Please report on Github.'
        )

    above_the_fold_data = page_props.get('aboveTheFoldData')
    if not above_the_fold_data:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb above_the_fold_data format changed. Please report on Github.'
        )

    title = above_the_fold_data.get('titleText')
    if title:
        self.name = title.get('text')
    if not self.name:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb above_the_fold_data format changed for title. Please report on Github.'
        )

    original_name = above_the_fold_data.get('originalTitleText')
    if original_name:
        self.original_name = original_name.get('text')
    if not self.original_name:
        logger.debug('No original title found for {}', self.imdb_id)

    # NOTE: We cannot use the get default approach here .(get(x, {}))
    # as the data returned in imdb has all fields with null values if they do not exist.
    if above_the_fold_data.get('releaseYear'):
        self.year = above_the_fold_data['releaseYear'].get('year')
    if not self.year:
        logger.debug('No year found for {}', self.imdb_id)

    self.mpaa_rating = data.get('contentRating')
    if not self.mpaa_rating:
        logger.debug('No rating found for {}', self.imdb_id)

    self.photo = data.get('image')
    if not self.photo:
        logger.debug('No photo found for {}', self.imdb_id)

    rating_data = data.get('aggregateRating')
    if rating_data:
        rating_count = rating_data.get('ratingCount')
        if rating_count:
            self.votes = (
                rating_count if isinstance(rating_count, int) else str_to_int(rating_count)
            )
        else:
            logger.debug('No votes found for {}', self.imdb_id)

        score = rating_data.get('ratingValue')
        if score:
            self.score = float(score)
        else:
            logger.debug('No score found for {}', self.imdb_id)

    meta_critic = above_the_fold_data.get('metacritic')
    if meta_critic:
        meta_score = meta_critic.get('metascore')
        if meta_score:
            self.meta_score = meta_score.get('score')
    if not self.meta_score:
        logger.debug('No Metacritic score found for {}', self.imdb_id)

    # get director(s) and writer(s); each may be a single object or a list
    for key, people_by_id in (('director', self.directors), ('creator', self.writers)):
        people = data.get(key) or []
        if not isinstance(people, list):
            people = [people]
        for person in people:
            if person['@type'] != 'Person':
                continue
            people_by_id[extract_id(person['url'])] = person['name']

    # Details section
    main_column_data = page_props.get('mainColumnData')
    if not main_column_data:
        raise plugin.PluginError(
            'IMDB parser needs updating, imdb main_column_data format changed. Please report on Github.'
        )

    spoken = (main_column_data.get('spokenLanguages') or {}).get('spokenLanguages') or []
    for language in spoken:
        self.languages.append(language['text'].lower())

    # Storyline section
    # NOTE: We cannot use the get default approach here .(get(x, {}))
    # as the data returned in imdb has all fields with null values if they do not exist.
    summaries = main_column_data.get('summaries') or {}
    summary_edges = summaries.get('edges') or []
    if summary_edges:
        edge_node = (summary_edges[0] or {}).get('node') or {}
        plot_text = edge_node.get('plotText') or {}
        plaid_html = plot_text.get('plaidHtml')
        if plaid_html:
            # Strip out html
            self.plot_outline = get_soup(plaid_html).text
    if not self.plot_outline:
        logger.debug('No storyline found for {}', self.imdb_id)

    storyline_keywords = main_column_data.get('storylineKeywords') or {}
    for keyword_node in storyline_keywords.get('edges') or []:
        keyword_text = (keyword_node.get('node') or {}).get('text')
        if keyword_text:
            self.plot_keywords.append(keyword_text.lower())

    # ``genres`` can be null at either level; fall back to an empty list.
    genres = (above_the_fold_data.get('genres') or {}).get('genres') or []
    self.genres = [g['text'].lower() for g in genres]

    def add_actor(name_node):
        # Record one cast member when both id and display name are present.
        actor_id = name_node.get('id')
        actor_name = (name_node.get('nameText') or {}).get('text')
        if actor_id and actor_name:
            self.actors[actor_id] = actor_name

    # Cast section
    cast_data = main_column_data.get('cast') or {}
    for cast_node in cast_data.get('edges') or []:
        add_actor((cast_node.get('node') or {}).get('name') or {})

    principal_cast_data = main_column_data.get('principalCast') or []
    if principal_cast_data:
        for cast_node in principal_cast_data[0].get('credits') or []:
            add_actor(cast_node.get('name') or {})
def parse(self, imdb_id, soup=None):
    """Fill in this object from an IMDb title page.

    Most fields come from the page's ``application/ld+json`` metadata
    block; year, original title, Metacritic score, languages, plot and
    cast are scraped from the HTML. Optional elements that are missing are
    logged at debug level and skipped.

    :param imdb_id: value handed to ``extract_id`` to obtain the IMDb id
    :param soup: already parsed page; when falsy the page is downloaded
    :raises PluginError: when the metadata block or the title name is missing
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    if not soup:
        page = requests.get(url)
        soup = get_soup(page.text)

    title_wrapper = soup.find('div', attrs={'class': 'title_wrapper'})

    # A missing <script> must end in PluginError, not AttributeError.
    ld_json = soup.find('script', {'type': 'application/ld+json'})
    data = json.loads(ld_json.text) if ld_json else None
    if not data:
        raise PluginError(
            'IMDB parser needs updating, imdb format changed. Please report on Github.'
        )

    # Parse stuff from the title-overview section
    name_elem = data.get('name')
    if name_elem:
        self.name = name_elem.strip()
    else:
        log.error('Possible IMDB parser needs updating, Please report on Github.')
        raise PluginError(
            'Unable to set imdb_name for %s from %s' % (self.imdb_id, self.url)
        )

    year = soup.find('span', attrs={'id': 'titleYear'})
    if year:
        m = re.search(r'([0-9]{4})', year.text)
        if m:
            self.year = int(m.group(1))

    if not self.year:
        log.debug('No year found for %s', self.imdb_id)

    mpaa_rating_elem = data.get('contentRating')
    if mpaa_rating_elem:
        self.mpaa_rating = mpaa_rating_elem
    else:
        log.debug('No rating found for %s', self.imdb_id)

    photo_elem = data.get('image')
    if photo_elem:
        self.photo = photo_elem
    else:
        log.debug('No photo found for %s', self.imdb_id)

    original_name_elem = (
        title_wrapper.find('div', {'class': 'originalTitle'}) if title_wrapper else None
    )
    if original_name_elem:
        # When an original title is shown, the heading holds the displayed name.
        heading = title_wrapper.find('h1')
        if heading and heading.contents:
            self.name = heading.contents[0].strip()
        self.original_name = original_name_elem.contents[0].strip().strip('"')
    else:
        log.debug('No original title found for %s', self.imdb_id)

    # ``or {}`` also covers an explicit JSON null.
    rating_data = data.get('aggregateRating') or {}

    votes_elem = rating_data.get('ratingCount')
    if votes_elem:
        self.votes = votes_elem if isinstance(votes_elem, int) else str_to_int(votes_elem)
    else:
        log.debug('No votes found for %s', self.imdb_id)

    score_elem = rating_data.get('ratingValue')
    if score_elem:
        self.score = float(score_elem)
    else:
        log.debug('No score found for %s', self.imdb_id)

    meta_score_elem = soup.find(attrs={'class': 'metacriticScore'})
    if meta_score_elem:
        self.meta_score = str_to_int(meta_score_elem.text)
    else:
        log.debug('No Metacritic score found for %s', self.imdb_id)

    # get director(s) and writer(s); each may be a single object or a list
    for key, people_by_id in (('director', self.directors), ('creator', self.writers)):
        people = data.get(key) or []
        if not isinstance(people, list):
            people = [people]
        for person in people:
            if person['@type'] != 'Person':
                continue
            people_by_id[extract_id(person['url'])] = person['name']

    # Details section
    title_details = soup.find('div', attrs={'id': 'titleDetails'})
    if title_details:
        # get languages
        language_href = re.compile(r'^/search/title\?title_type=feature&primary_language=')
        for link in title_details.find_all('a', href=language_href):
            lang = link.text.strip().lower()
            if lang not in self.languages:
                self.languages.append(lang)

    # Storyline section
    storyline = soup.find('div', attrs={'id': 'titleStoryLine'})
    if storyline:
        plot_elem = storyline.find('p')
        if plot_elem:
            # Remove the "Written By" part.
            if plot_elem.em:
                plot_elem.em.replace_with('')
            self.plot_outline = plot_elem.text.strip()
        else:
            log.debug('No storyline found for %s', self.imdb_id)

    genres = data.get('genre') or []
    if not isinstance(genres, list):
        genres = [genres]
    self.genres = [g.strip().lower() for g in genres]

    # Cast section
    cast = soup.find('table', attrs={'class': 'cast_list'})
    if cast:
        for actor in cast.select('tr > td:nth-of-type(2) > a'):
            self.actors[extract_id(actor['href'])] = actor.text.strip()
def search(self, name):
    """Search IMDb titles for ``name`` and return the scored candidates.

    When the search request is redirected straight to a title page, that
    page is parsed and returned as the single result with ``match`` 1.0.
    Otherwise each row of the ``findList`` table (up to ``self.max_results``)
    becomes a dict with ``name``, ``imdb_id``, ``url``, ``thumbnail``,
    ``match`` and, when present, ``year``. ``match`` is the best
    ``difflib`` ratio of the title or one of its akas (akas scaled by
    ``self.aka_weight``), boosted by position using ``self.first_weight``.

    :param name: title to search for
    :return: list of movie dicts sorted by descending ``match``; empty when
        the results table is missing
    """
    log.debug('Searching: %s', name)
    url = u'https://www.imdb.com/find'
    # This may include Shorts and TV series in the results
    params = {'q': name, 's': 'tt'}

    log.debug('Search query: %s', repr(url))
    page = requests.get(url, params=params)
    actual_url = page.url

    movies = []
    soup = get_soup(page.text)

    # in case we got redirected to movie page (perfect match)
    re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
    if re_m:
        actual_url = re_m.group(0)
        imdb_id = extract_id(actual_url)
        movie_parse = ImdbParser()
        movie_parse.parse(imdb_id, soup=soup)
        log.debug('Perfect hit. Search got redirected to %s', actual_url)
        movies.append(
            {
                'match': 1.0,
                'name': movie_parse.name,
                'imdb_id': imdb_id,
                'url': make_url(imdb_id),
                'year': movie_parse.year,
            }
        )
        return movies

    section_table = soup.find('table', 'findList')
    if not section_table:
        log.debug('results table not found')
        # Always hand back a list so callers can iterate the result.
        return movies

    rows = section_table.find_all('tr')
    if not rows:
        log.debug('Titles section does not have links')

    def is_junk(char):
        # Spaces are ignored by the sequence matcher.
        return char == ' '

    wanted = name.title()
    for count, row in enumerate(rows):
        # Title search gives a lot of results, only check the first ones
        if count > self.max_results:
            break

        result_text = row.find('td', 'result_text')
        if result_text is None:
            log.debug('skipping row without result text')
            continue

        movie = {}
        additional = re.findall(r'\((.*?)\)', result_text.text)
        if additional:
            if re.match(r'^\d{4}$', additional[-1]):
                movie['year'] = str_to_int(additional[-1])
            elif len(additional) > 1:
                movie['year'] = str_to_int(additional[-2])
                if additional[-1] not in ['TV Movie', 'Video']:
                    log.debug('skipping %s', result_text.text)
                    continue

        link = result_text.find_next('a')
        if link is None:
            log.debug('skipping %s', result_text.text)
            continue

        # A row without a poster gets a None thumbnail instead of crashing.
        photo_cell = row.find('td', 'primary_photo')
        photo_link = photo_cell.find('a') if photo_cell else None
        photo_img = photo_link.find('img') if photo_link else None
        movie['thumbnail'] = photo_img.get('src') if photo_img else None

        movie['name'] = link.text
        movie['imdb_id'] = extract_id(link.get('href'))
        movie['url'] = make_url(movie['imdb_id'])
        log.debug('processing name: %s url: %s', movie['name'], movie['url'])

        # calc & set best matching ratio
        ratio = difflib.SequenceMatcher(is_junk, movie['name'].title(), wanted).ratio()

        # check if some of the akas have better ratio
        for aka_tag in link.parent.find_all('i'):
            next_node = aka_tag.next
            aka = next_node.string if next_node is not None else None
            match = re.search(r'".*"', aka) if aka else None
            if not match:
                log.debug('aka `%s` is invalid', aka)
                continue
            aka = match.group(0).replace('"', '')
            log.trace('processing aka %s', aka)
            aka_ratio = difflib.SequenceMatcher(is_junk, aka.title(), wanted).ratio()
            # Compare the weighted value so an aka can only raise the ratio,
            # never lower it.
            weighted_ratio = aka_ratio * self.aka_weight
            if weighted_ratio > ratio:
                ratio = weighted_ratio
                log.debug(
                    '- aka `%s` matches better to `%s` ratio %s (weighted to %s)',
                    aka,
                    name,
                    aka_ratio,
                    ratio,
                )

        # prioritize items by position
        position_ratio = (self.first_weight - 1) / (count + 1) + 1
        log.debug(
            '- prioritizing based on position %s `%s`: %s', count, movie['url'], position_ratio
        )
        ratio *= position_ratio

        # store ratio
        movie['match'] = ratio
        movies.append(movie)

    movies.sort(key=lambda x: x['match'], reverse=True)
    return movies
def parse(self, imdb_id):
    """Download the IMDb title page for ``imdb_id`` and fill in this object.

    Reads photo, MPAA rating, name, votes, score, genres, languages, year,
    main cast, directors and plot outline from the page. Anything that
    cannot be located is logged and the attribute is left untouched.

    :param imdb_id: value handed to ``extract_id`` to obtain the IMDb id
    """
    self.imdb_id = extract_id(imdb_id)
    url = make_url(self.imdb_id)
    self.url = url
    page = requests.get(url)
    soup = get_soup(page.content)

    # get photo
    tag_photo = soup.find('div', attrs={'class': 'photo'})
    if tag_photo:
        tag_img = tag_photo.find('img')
        if tag_img:
            self.photo = tag_img.get('src')
            log.debug('Detected photo: %s', self.photo)

    # get rating. Always the first absmiddle.
    tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
    if tag_infobar_div:
        tag_mpaa_rating = tag_infobar_div.find('img', attrs={'class': 'absmiddle'})
        if tag_mpaa_rating:
            rating_alt = tag_mpaa_rating.get('alt')
            if rating_alt is None or rating_alt != tag_mpaa_rating.get('title'):
                # If we've found something of class absmiddle in the infobar,
                # it should be mpaa_rating, since that's the only one in there.
                log.warning(
                    "MPAA rating alt and title don't match for URL %s - plugin needs an update?",
                    url,
                )
            else:
                self.mpaa_rating = rating_alt
                log.debug('Detected mpaa rating: %s', self.mpaa_rating)
        else:
            log.debug(
                'Unable to match signature of mpaa rating for %s - '
                'could be a TV episode, or plugin needs update?',
                url,
            )
    else:
        # We should match the infobar, it's an integral part of the IMDB page.
        log.warning('Unable to get infodiv class for %s - plugin needs update?', url)

    # get name
    tag_name = soup.find('h1')
    if tag_name:
        if tag_name.next:
            # Handle a page not found in IMDB. tag_name.string is
            # "<br/> Page Not Found" and there is no next tag. Thus, None.
            if tag_name.next.string is not None:
                self.name = tag_name.next.string.strip()
                log.debug('Detected name: %s', self.name)
    else:
        log.warning('Unable to get name for %s - plugin needs update?', url)

    # detect if movie is eligible for ratings
    rating_ineligible = soup.find('div', attrs={'class': 'rating-ineligible'})
    if rating_ineligible:
        log.debug('movie is not eligible for ratings')
    else:
        # get votes
        tag_votes = soup.find(itemprop='ratingCount')
        if tag_votes:
            self.votes = str_to_int(tag_votes.string) or 0
            log.debug('Detected votes: %s', self.votes)
        else:
            log.warning('Unable to get votes for %s - plugin needs update?', url)

        # get score
        span_score = soup.find(itemprop='ratingValue')
        if span_score:
            try:
                self.score = float(span_score.string)
            except (TypeError, ValueError):
                # TypeError covers ``.string`` being None.
                log.debug(
                    'tag_score %s is not valid float',
                    span_score.contents[0] if span_score.contents else None,
                )
            log.debug('Detected score: %s', self.score)
        else:
            log.warning('Unable to get score for %s - plugin needs update?', url)

    # get genres
    for link in soup.find_all('a', attrs={'itemprop': 'genre'}):
        self.genres.append(unicode(link.contents[0].lower()))

    # get languages
    for link in soup.find_all('a', attrs={'itemprop': 'inLanguage'}):
        # skip non-primary languages "(a few words)", etc.
        if re.search(r'(?x) \( [^()]* \b few \b', unicode(link.next_sibling)):
            continue
        lang = unicode(link.contents[0].lower())
        if lang not in self.languages:
            self.languages.append(lang.strip())

    # get year
    tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
    page_title = soup.head.title if soup.head else None
    if tag_year:
        self.year = int(tag_year.contents[0])
        log.debug('Detected year: %s', self.year)
    elif page_title:
        m = re.search(r'(\d{4})\)', unicode(page_title.string))
        if m:
            self.year = int(m.group(1))
            log.debug('Detected year: %s', self.year)
        else:
            log.warning(
                'Unable to get year for %s (regexp mismatch) - plugin needs update?', url
            )
    else:
        log.warning('Unable to get year for %s (missing title) - plugin needs update?', url)

    # get main cast
    name_href = re.compile('/name/nm')
    tag_cast = soup.find('table', 'cast_list')
    if tag_cast:
        for actor in tag_cast.find_all('a', href=name_href):
            self.actors[extract_id(actor['href'])] = unicode(actor.contents[0])

    # get director(s)
    h4_director = soup.find('h4', text=re.compile('Director'))
    if h4_director:
        for director in h4_director.parent.parent.find_all('a', href=name_href):
            self.directors[extract_id(director['href'])] = unicode(director.contents[0])

    log.debug('Detected genres: %s', self.genres)
    log.debug('Detected languages: %s', self.languages)
    log.debug('Detected director(s): %s', ', '.join(self.directors))
    log.debug('Detected actors: %s', ', '.join(self.actors))

    # get plot
    h2_plot = soup.find('h2', text='Storyline')
    if h2_plot:
        p_plot = h2_plot.find_next('p')
        if p_plot:
            first_node = p_plot.next
            plot_text = first_node.string if first_node is not None else None
            if plot_text is not None:
                self.plot_outline = plot_text.strip()
                log.debug('Detected plot outline: %s', self.plot_outline)
            else:
                log.debug('Plot p-tag does not have text')
        else:
            log.debug('Plot does not have p-tag')
    else:
        log.debug('Failed to find plot')