def __init__(self, title_id):
    """Load the critic-reviews page of a title and extract its data.

    :param title_id: identifier substituted into
        ``imdb_uris["criticreviews"]``.

    Sets ``title_id``, ``ratings_uri``, ``title``, ``title_url``, ``year``
    and ``critic_reviews_df``. Every lookup is routed through ``catch``
    (defined outside this block) with the fallback kind 'None' --
    presumably the value used when the lookup raises; confirm there.
    """
    self.title_id = title_id
    self.ratings_uri = imdb_uris["criticreviews"] % self.title_id
    response = get(self.ratings_uri)
    soup = BeautifulSoup(response.text, 'lxml')

    # Page tags the attributes below are read from.
    def _review_rows():
        return soup.select('tr[itemprop="reviews"]')

    def _heading():
        return soup.select_one('h3[itemprop="name"]')

    critic_tag = catch('None', _review_rows)
    movie_tag = catch('None', _heading)

    # Movie title, its link and the first number found in the '.nobr' tag.
    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        # The href's leading character is dropped before joining to base_uri.
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Critic review demographics built from the review rows.
    self.critic_reviews_df = catch('None', lambda: critic_df(critic_tag))
def __init__(self, title_id):
    """Load the ratings page of a title and extract rating statistics.

    :param title_id: identifier substituted into ``imdb_uris["ratings"]``.

    Sets ``title``, ``title_url``, ``year``, the three rating frames,
    ``rating_math``, ``votes`` and ``rating``. Lookups go through
    ``catch`` (defined outside this block) with a fallback kind
    ('None' or 'dict') -- presumably what is stored when the lookup
    raises; confirm against its definition.
    """
    self.title_id = title_id
    self.ratings_uri = imdb_uris["ratings"] % self.title_id
    soup = BeautifulSoup(get(self.ratings_uri).text, 'lxml')

    # Table that follows the div whose text matches the demographics marker.
    def _demographics_table():
        marker = soup.find('div', string=tag_search['rating_demo'])
        return marker.findNext('table')

    rating_demo_tag = catch('None', _demographics_table)

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))

    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Rating demographics frames.
    self.rating_df = catch('None', lambda: rating_df(rating_demo_tag))
    self.rating_demo_df = catch(
        'None', lambda: rating_demo_df(rating_demo_tag))
    self.rating_demo_us_df = catch(
        'None', lambda: rating_demo_region_df(rating_demo_tag))

    # Mean is the 4th word and median the last word of the 'allText' div
    # that follows the table preceding the demographics table.
    def _mean_and_median():
        summary = rating_demo_tag.findPrevious('table').findNextSibling(
            'div', class_="allText")
        words = summary.get_text().split()
        return {'Arithmetic Mean': words[3], 'Median': words[-1]}

    self.rating_math = catch('dict', _mean_and_median)

    # Vote count and rating are read from the first '.allText' element.
    def _vote_count():
        return digits(soup.select_one('.allText').contents[0])

    def _rating_value():
        return float(soup.select_one('.allText').contents[2].split()[2])

    self.votes = catch('None', _vote_count)
    self.rating = catch('None', _rating_value)
def __init__(self, title_id):
    """Load the plot-summary page of a title and extract plot text.

    :param title_id: identifier substituted into
        ``imdb_uris["plotsummary"]``.

    Sets ``title``, ``title_url``, ``year``, ``plot``, ``summaries`` and
    ``imdb_plot_metadata``. Lookups go through ``catch`` (defined outside
    this block) with a fallback kind -- presumably what is stored when the
    lookup raises; confirm against its definition.
    """
    self.title_id = title_id
    self.plot_uri = imdb_uris["plotsummary"] % self.title_id
    response = get(self.plot_uri)
    soup = BeautifulSoup(response.text, 'lxml')

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))

    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Plot: text of the <ul> after '#synopsis', with apostrophes removed.
    def _synopsis():
        listing = soup.select_one('#synopsis').findNext('ul')
        return unicode(listing.get_text()).replace("\'", "")

    self.plot = catch('None', _synopsis)

    # Summaries: whitespace-normalised text of each <li> after '#summaries'.
    def _summary_items():
        return soup.select_one('#summaries').findNext('ul').select('li')

    block = catch('None', _summary_items)

    def _summary_texts():
        return [unicode(' '.join(tag.text.split())) for tag in block]

    self.summaries = catch('None', _summary_texts)

    # Dictionary view of everything collected above.
    def _metadata():
        return {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Plot URI": self.plot_uri,
            "Plot": self.plot,
            "Summaries": self.summaries,
        }

    self.imdb_plot_metadata = catch('dict', _metadata)
def __init__(self, text: str) -> None:
    """Search for a character by name and let the user pick results.

    The search URL is built from ``search_imdb_id["character"]`` and
    *text* with all whitespace removed. Up to 20 results are printed
    with a 1-based serial number, then the user is prompted for one or
    more serial numbers.

    :param text: the name to search for.

    Sets ``entered_text``, ``url``, ``character_name`` and
    ``character_id``. With exactly one selection the two attributes are
    scalars; otherwise they are lists (possibly empty). A serial number
    of 0 yields an empty string in that position.

    Any exception while listing or selecting is printed and the process
    exits via ``sys.exit(0)``.
    """
    self.entered_text = text
    self.url = search_imdb_id["character"] % ''.join(
        self.entered_text.split())
    soup = BeautifulSoup(get(self.url).text, 'lxml')

    suggestions, names = [], []
    results = soup.select('.result_text')
    try:
        # Slicing caps the listing at 20 and is a no-op for shorter lists.
        # enumerate gives each row its own serial number; list.index would
        # return the first *equal* tag and rescan the list for every row.
        for serial, item in enumerate(results[:20], start=1):
            print('%s: %s' % (serial, unicode(item.get_text())))
            suggestions.append(item.a['href'][7:16])
            names.append(item.a.text.strip())

        input_name = re.findall(r"[\w']+", input('Enter serial number\t'))
        picks = [int(load) for load in input_name]
        imdbids = [
            suggestions[pick - 1] if pick != 0 else '' for pick in picks
        ]
        names = [names[pick - 1] if pick != 0 else '' for pick in picks]

        if len(imdbids) == 1:
            self.character_name = names[0]
            self.character_id = imdbids[0]
        else:
            self.character_name = names
            self.character_id = imdbids
    except Exception as es:
        print("{0} :".format(type(es)), es)
        sys.exit(0)
def __init__(self, title_id: str) -> None:
    """Load the taglines page of a title and extract its taglines.

    :param title_id: identifier substituted into ``imdb_uris["taglines"]``.

    Sets ``title``, ``title_url``, ``year``, ``taglines`` and
    ``imdb_taglines_metadata``. Lookups go through ``catch`` (defined
    outside this block) with a fallback kind ('None', 'list', 'dict') --
    presumably what is stored when the lookup raises; confirm against
    its definition.
    """
    self.title_id = title_id
    self.taglines_uri = imdb_uris["taglines"] % self.title_id
    soup = BeautifulSoup(get(self.taglines_uri).text, 'lxml')

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    self.year = catch(
        'None',
        lambda: int(re.findall(
            r"\d+", unicode(movie_tag.select_one('.nobr').get_text()))[0]))

    # Tagline entries: '.soda' elements inside '#taglines_content'.
    taglines_tag = catch(
        'None', lambda: soup.select_one('#taglines_content').select('.soda'))
    self.taglines = catch(
        'list',
        lambda: [unicode(tagline.get_text()) for tagline in taglines_tag])

    # Dictionary view of everything collected above.
    self.imdb_taglines_metadata = catch(
        'dict',
        lambda: {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Taglines URI": self.taglines_uri,
            "Taglines": self.taglines,
        })
def __init__(self, title_id):
    """Load the keywords page of a title and extract its plot keywords.

    :param title_id: identifier substituted into ``imdb_uris["keywords"]``.

    Sets ``title``, ``title_url``, ``year``, ``plot_keywords`` and
    ``imdb_plot_Keywords_metadata``. Lookups go through ``catch``
    (defined outside this block) with a fallback kind -- presumably what
    is stored when the lookup raises; confirm against its definition.
    """
    self.title_id = title_id
    self.plot_keywords_url = imdb_uris["keywords"] % self.title_id
    response = get(self.plot_keywords_url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))

    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Keywords are stored in the 'data-item-keyword' attribute of td.soda.
    block = catch('None', lambda: soup.select('td.soda'))

    def _keywords():
        return [cell['data-item-keyword'] for cell in block]

    self.plot_keywords = catch('list', _keywords)

    # Dictionary view of everything collected above.
    def _metadata():
        return {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Plot Keywords URL": self.plot_keywords_url,
            "Plot Keywords": self.plot_keywords,
        }

    self.imdb_plot_Keywords_metadata = catch('dict', _metadata)
def __init__(self, title_id):
    """Load the release-info page of a title and extract release data.

    :param title_id: identifier substituted into
        ``imdb_uris["releaseinfo"]``.

    Sets ``title``, ``title_url``, ``year``, ``releases_df`` and the four
    ``released_*`` lists, ``release_date_in_india``, ``also_known_as_df``
    and its two lists, and ``imdb_release_info_metadata``.

    Single lookups go through ``catch`` (defined outside this block) with
    a fallback kind -- presumably what is stored when the lookup raises;
    confirm against its definition. The two frames are set to ``None``
    when building them raises any ``Exception``.
    """
    self.title_id = title_id
    self.release_info_url = imdb_uris["releaseinfo"] % self.title_id
    soup = BeautifulSoup(get(self.release_info_url).text, 'lxml')

    # <h4> headings; used to locate the release and AKA tables.
    table_tag = catch('None', lambda: soup.select('h4'))

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    self.year = catch(
        'None',
        lambda: int(re.findall(
            r"\d+", unicode(movie_tag.select_one('.nobr').get_text()))[0]))

    # Rows of the table that follows the 'release' heading.
    releases = catch(
        'None',
        lambda: table_tag[index_finder(table_tag, 'release')]
        .findNext('table').select('tr'))

    # Release info frame, one row per table row. The lambdas are handed to
    # catch inside the same iteration, so they read the current `tag`.
    try:
        self.releases_df = pd.DataFrame(
            columns=['Country', 'URI', 'Date', 'Location'])
        for tag in releases:
            self.releases_df.loc[len(self.releases_df)] = [
                catch(
                    'None',
                    lambda: unicode(tag.select_one(
                        'td.release-date-item__country-name'
                    ).a.get_text())),
                catch(
                    'None',
                    lambda: "%s%s" % (base_uri, unicode(tag.select_one(
                        'td.release-date-item__country-name'
                    ).a['href'][1:]))),
                catch(
                    'None',
                    lambda: unicode(tag.select_one(
                        'td.release-date-item__date').get_text())),
                catch(
                    'None',
                    lambda: unicode(tag.select_one(
                        'td.release-date-item__attributes').get_text())),
            ]
        self.releases_df = dataframe_data(self.releases_df)
    except Exception:
        # Best effort: e.g. `releases` is not iterable when the lookup
        # above failed. KeyboardInterrupt/SystemExit are not swallowed.
        self.releases_df = None

    # Column lists derived from the release frame.
    self.released_country_names = catch(
        'list', lambda: self.releases_df.Country.tolist())
    self.released_country_uri = catch(
        'list', lambda: self.releases_df.URI.tolist())
    self.released_dates = catch(
        'list', lambda: self.releases_df.Date.tolist())
    self.released_locations = catch(
        'list', lambda: self.releases_df.Location.tolist())

    # Release date in India: second cell of the row located by
    # india_index_finder.
    self.release_date_in_india = catch(
        'None',
        lambda: unicode(
            releases[india_index_finder(releases, 'india')]
            .select_one('td').findNext('td').get_text()))

    # "Also known as" frame, one row per table row.
    try:
        aka = table_tag[index_finder(
            table_tag, 'also known as')].findNext('table').select('tr')
        self.also_known_as_df = pd.DataFrame(columns=['Country', 'Title'])
        for tag in aka:
            self.also_known_as_df.loc[len(self.also_known_as_df)] = [
                catch(
                    'None',
                    lambda: unicode(
                        tag.select_one('td.aka-item__name').get_text())),
                catch(
                    'None',
                    lambda: unicode(
                        tag.select_one('td.aka-item__title').get_text())),
            ]
        self.also_known_as_df = dataframe_data(self.also_known_as_df)
    except Exception:
        self.also_known_as_df = None

    # Column lists derived from the AKA frame.
    self.also_known_as_country_names = catch(
        'list', lambda: self.also_known_as_df.Country.tolist())
    self.also_known_as_titles = catch(
        'list', lambda: self.also_known_as_df.Title.tolist())

    # Dictionary view of everything collected above.
    self.imdb_release_info_metadata = catch(
        'dict',
        lambda: {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Release Info URL": self.release_info_url,
            "India Release Date": self.release_date_in_india,
            "Release Dates": {
                "Country": self.released_country_names,
                "URI": self.released_country_uri,
                "Date": self.released_dates,
                "Location": self.released_locations,
            },
            "Also Known As (AKA)": {
                "Country": self.also_known_as_country_names,
                "Title": self.also_known_as_titles,
            },
        })
def __init__(self, title_id):
    """Load the technical-specifications page of a title.

    :param title_id: identifier substituted into ``imdb_uris["technical"]``.

    Sets ``title``, ``title_url``, ``year``, ``runtime``, the sound-mix
    and color frames with their name/URI lists, the text-valued specs
    (aspect ratio, camera, laboratory, negative format, cinematographic
    process, printed film format, film length) and
    ``imdb_technical_spec_metadata``.

    Lookups go through ``catch`` (defined outside this block) with a
    fallback kind -- presumably what is stored when the lookup raises;
    confirm against its definition.
    """
    self.title_id = title_id
    self.technical_spec_url = imdb_uris["technical"] % self.title_id
    response = get(self.technical_spec_url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Page tags the attributes below are read from.
    technical_spec_tag = catch(
        'None', lambda: soup.select('td[class="label"]'))
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))

    def _value_cell(label):
        # <td> that follows the label cell located by index_finder.
        position = index_finder(technical_spec_tag, label)
        return technical_spec_tag[position].findNext('td')

    def _linked_spec(search_key):
        # Anchors of a spec -> frame plus its Name and URI columns.
        anchors = catch(
            'None',
            lambda: _value_cell(tag_search[search_key]).select('a'))
        frame = catch('None', lambda: technical_specs(anchors))
        names = catch('list', lambda: frame.Name.tolist())
        uris = catch('list', lambda: frame.URI.tolist())
        return frame, names, uris

    def _text_spec(label):
        # Stripped strings of a spec cell, each whitespace-normalised.
        pieces = catch(
            'list', lambda: list(_value_cell(label).stripped_strings))
        return catch(
            'list',
            lambda: [unicode(' '.join(piece.split())) for piece in pieces])

    # Movie title, its link and the first number found in the '.nobr' tag.
    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Runtime text.
    self.runtime = catch(
        'None',
        lambda: unicode(_value_cell(tag_search['runtime']).text))

    # Sound mix and color: frame, names, URIs.
    (self.sound_mix_df,
     self.sound_mix_name,
     self.sound_mix_uri) = _linked_spec('sound mix')
    (self.color_df,
     self.color_name,
     self.color_uri) = _linked_spec('color')

    # Text-valued specs.
    self.aspect_ratio = _text_spec('aspect ratio')
    self.camera = _text_spec('camera')
    self.laboratory = _text_spec('laboratory')

    # NOTE(review): the fallback kind is 'list' although the value is a
    # single string -- kept as in the original; confirm the intent.
    self.negative_format = catch(
        'list',
        lambda: unicode(' '.join(
            _value_cell('negative format').get_text().split())))

    self.cinematographic_process = _text_spec('cinematographic process')
    self.printed_film_format = _text_spec('printed film format')
    self.film_length = _text_spec('film length')

    # Dictionary view of everything collected above.
    def _metadata():
        return {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Technical Spec URL": self.technical_spec_url,
            "Runtime": self.runtime,
            "Sound Mix": {
                "Name": self.sound_mix_name,
                "URI": self.sound_mix_uri,
            },
            "Color": {"Name": self.color_name, "URI": self.color_uri},
            "Aspect Ratio": self.aspect_ratio,
            "Camera": self.camera,
            "Laboratory": self.laboratory,
            "Negative Film Format": self.negative_format,
            "Cinematography Process": self.cinematographic_process,
            "Printed Film Format": self.printed_film_format,
            "Film Length": self.film_length,
        }

    self.imdb_technical_spec_metadata = catch('dict', _metadata)
def __init__(self, title_id):
    """Load the external-sites page of a title and extract its links.

    :param title_id: identifier substituted into
        ``imdb_uris["externalsites"]``.

    Sets ``title``, ``title_url``, ``year``; a frame plus name and URL
    lists for the official, miscellaneous, photographs and video sections;
    and ``imdb_external_sites_metadata``.

    Lookups go through ``catch`` (defined outside this block) with a
    fallback kind -- presumably what is stored when the lookup raises;
    confirm against its definition.
    """
    self.title_id = title_id
    self.external_sites_url = imdb_uris["externalsites"] % self.title_id
    response = get(self.external_sites_url)
    soup = BeautifulSoup(response.text, 'lxml')

    def _site_group(section_key):
        # Frame for one section plus its Name and URI columns as lists.
        frame = catch(
            'None', lambda: external_site(tag_search[section_key], soup))
        names = catch('list', lambda: frame.Name.tolist())
        urls = catch('list', lambda: frame.URI.tolist())
        return frame, names, urls

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))

    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Official sites.
    (self.official_sites_df,
     self.official_sites_names,
     self.official_sites_urls) = _site_group('official')

    # Miscellaneous sites.
    (self.miscellaneous_sites_df,
     self.miscellaneous_sites_names,
     self.miscellaneous_sites_urls) = _site_group('miscellaneous')

    # Photograph sites.
    (self.photographs_sites_df,
     self.photographs_sites_names,
     self.photographs_sites_urls) = _site_group('photo')

    # Video clip and trailer sites.
    (self.videos_clips_and_trailers_sites_df,
     self.videos_clips_and_trailers_sites_names,
     self.videos_clips_and_trailers_sites_urls) = _site_group('videos')

    # Dictionary view of everything collected above. "Movie URL" holds
    # the external-sites page URL, as in the original.
    def _metadata():
        return {
            "Movie Title": self.title,
            "Movie URL": self.external_sites_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Official Sites": {
                "Name": self.official_sites_names,
                "URL": self.official_sites_urls,
            },
            "Miscellaneous Sites": {
                "Name": self.miscellaneous_sites_names,
                "URL": self.miscellaneous_sites_urls,
            },
            "Photographs": {
                "Name": self.photographs_sites_names,
                "URL": self.photographs_sites_urls,
            },
            "Video Clips and Trailers": {
                "Name": self.videos_clips_and_trailers_sites_names,
                "URL": self.videos_clips_and_trailers_sites_urls,
            },
        }

    self.imdb_external_sites_metadata = catch('dict', _metadata)
def __init__(self):
    """Prompt for a country and load its upcoming releases.

    The links inside ``#sidebar`` of the ``imdb_uris['calendar']`` page
    are printed with 1-based serial numbers and the user is prompted for
    one or more of them.

    Sets ``country_name`` and ``country_code`` (scalars for exactly one
    selection, lists otherwise), ``region_url`` and
    ``upcoming_releases_df`` (``None`` when building the frame raises any
    ``Exception``).

    Any exception while listing or selecting is printed and the process
    exits via ``sys.exit(0)``.
    """
    soup = BeautifulSoup(get(imdb_uris['calendar']).text, 'lxml')
    countries, country_code = [], []
    country = soup.select_one('#sidebar').select('a')
    try:
        # enumerate gives each link its own serial number; list.index
        # would return the first *equal* tag and rescan the list per row.
        for serial, item in enumerate(country, start=1):
            print('%s : %s' % (serial, item.text.strip()))
            countries.append(item.text)
            country_code.append(item['href'][17:19].lower())

        input_name = re.findall(r"[\w']+", input('Enter serial number\t'))
        picks = [int(load) for load in input_name]
        countries = [
            countries[pick - 1] if pick != 0 else '' for pick in picks
        ]
        country_code = [
            country_code[pick - 1] if pick != 0 else '' for pick in picks
        ]

        if len(country_code) == 1:
            self.country_name = countries[0]
            self.country_code = country_code[0]
        else:
            self.country_name = countries
            self.country_code = country_code
    except Exception as es:
        print("{0} :".format(type(es)), es)
        sys.exit(0)

    # Upcoming releases for the selected region.
    # NOTE(review): when several (or no) serial numbers were entered,
    # country_code is a list and is formatted into the URL as such --
    # confirm whether multi-selection is meant to be supported here.
    self.region_url = imdb_uris['region'] % self.country_code
    region_soup = BeautifulSoup(get(self.region_url).text, 'lxml')

    try:
        release_dates = region_soup.select_one('#pagecontent').select('h4')
        self.upcoming_releases_df = pd.DataFrame(
            columns=['Release Date', 'Movie Title', 'ID', 'URI', 'Year'])
        for item in release_dates:
            # Each <h4> date heading is followed by a <ul> of movies.
            listing = item.findNext('ul')
            movies = listing.select('a')
            years = listing.select('li')
            # The lambdas are handed to catch inside the same iteration,
            # so they read the current `item`, `movie` and `entry`.
            for movie, entry in zip(movies, years):
                self.upcoming_releases_df.loc[
                    len(self.upcoming_releases_df)] = [
                    catch('None', lambda: unicode(item.get_text())),
                    catch('None', lambda: unicode(movie.get_text())),
                    catch('None', lambda: unicode(movie['href'][7:16])),
                    catch(
                        'None',
                        lambda: "%s%s" % (
                            base_uri, unicode(movie['href'][1:]))),
                    catch(
                        'None',
                        lambda: int(re.findall(
                            r"\d+", unicode(entry.contents[2]))[-1])),
                ]
        self.upcoming_releases_df = dataframe_data(
            self.upcoming_releases_df)
    except Exception:
        # Best effort; KeyboardInterrupt/SystemExit are not swallowed.
        self.upcoming_releases_df = None
def __init__(self, title_id):
    """Load the company-credits page of a title and extract companies.

    :param title_id: identifier substituted into
        ``imdb_uris["companycredits"]``.

    Sets ``title``, ``title_url``, ``year``; a frame plus name, ID and
    URI lists for production companies, distributors, special effects
    and other companies; and ``imdb_company_metadata``.

    Lookups go through ``catch`` (defined outside this block) with a
    fallback kind -- presumably what is stored when the lookup raises;
    confirm against its definition.
    """
    self.title_id = title_id
    self.company_uri = imdb_uris["companycredits"] % self.title_id
    response = get(self.company_uri)
    soup = BeautifulSoup(response.text, 'lxml')

    def _company_group(section_key):
        # Frame for one section plus its Name, ID and URI columns.
        frame = catch(
            'None', lambda: company_data(tag_search[section_key], soup))
        names = catch('list', lambda: frame.Name.tolist())
        ids = catch('list', lambda: frame.ID.tolist())
        uris = catch('list', lambda: frame.URI.tolist())
        return frame, names, ids, uris

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))

    def _title_text():
        return unicode(movie_tag.a.get_text())

    def _title_link():
        return unicode('%s%s' % (base_uri, movie_tag.a['href'][1:]))

    def _release_year():
        nobr_text = unicode(movie_tag.select_one('.nobr').get_text())
        return int(re.findall(r"\d+", nobr_text)[0])

    self.title = catch('None', _title_text)
    self.title_url = catch('None', _title_link)
    self.year = catch('None', _release_year)

    # Production companies.
    (self.production_company_df,
     self.production_company_name,
     self.production_company_id,
     self.production_company_uri) = _company_group('production')

    # Distributors.
    (self.distributors_df,
     self.distributors_name,
     self.distributors_id,
     self.distributors_uri) = _company_group('distributor')

    # Special effects companies.
    (self.specials_effects_df,
     self.specials_effects_name,
     self.specials_effects_id,
     self.specials_effects_uri) = _company_group('special_effects')

    # Other companies.
    (self.other_companies_df,
     self.other_companies_name,
     self.other_companies_id,
     self.other_companies_uri) = _company_group('other')

    # Dictionary view of everything collected above.
    def _metadata():
        return {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Company URL": self.company_uri,
            "Distributors": {
                "Name": self.distributors_name,
                "ID": self.distributors_id,
                "URI": self.distributors_uri,
            },
            "Other Companies": {
                "Name": self.other_companies_name,
                "ID": self.other_companies_id,
                "URI": self.other_companies_uri,
            },
            "Production Company": {
                "Name": self.production_company_name,
                "ID": self.production_company_id,
                "URI": self.production_company_uri,
            },
            "Special Effects": {
                "Name": self.specials_effects_name,
                "ID": self.specials_effects_id,
                "URI": self.specials_effects_uri,
            },
        }

    self.imdb_company_metadata = catch('dict', _metadata)
def __init__(self, title_id: str) -> None:
    """Load the main page of a title and extract its headline data.

    :param title_id: identifier substituted into ``imdb_uris["title"]``.

    Sets ``title``, ``genre``/``genre_links``, ``rating``, ``votes``,
    ``metascore``/``metascore_uri``, user and critic review counts and
    URIs, the three ``popularity_*`` values, ``storyline``, the four
    box-office strings, ``movie_poster_uri``, ``movie_release_year`` and
    its link, and ``imdb_movie_metadata``.

    Lookups go through ``catch`` (defined outside this block) with a
    fallback kind ('None', 'list', 'dict') -- presumably what is stored
    when the lookup raises; confirm against its definition.
    """
    self.title_id = title_id
    self.movie_uri = imdb_uris["title"] % self.title_id
    soup = BeautifulSoup(get(self.movie_uri).text, 'lxml')

    # Page tags the attributes below are read from.
    tags = {
        'movie': catch('None', lambda: soup.select_one('h1[class=""]')),
        'genre': catch(
            'None',
            lambda: soup.find('h4', string='Genres:').findNextSiblings('a')),
        'rating': catch(
            'None', lambda: soup.select_one('span[itemprop="ratingValue"]')),
        'votes': catch(
            'None', lambda: soup.select_one('span[itemprop="ratingCount"]')),
        'metascore': catch(
            'None', lambda: soup.select_one('.metacriticScore')),
        'summary': catch('None', lambda: soup.select_one('.summary_text')),
        'budget': catch('None', lambda: soup.find('h4', string='Budget:')),
        'opening_weekend_usa': catch(
            'None', lambda: soup.find('h4', string='Opening Weekend USA:')),
        'gross_usa': catch(
            'None', lambda: soup.find('h4', string='Gross USA:')),
        'cumulative_worldwide_gross': catch(
            'None',
            lambda: soup.find('h4', string='Cumulative Worldwide Gross:')),
        'poster': catch('None', lambda: soup.select_one('.poster')),
        'year': catch(
            'None', lambda: soup.select_one('span[id="titleYear"]')),
        'review': catch(
            'None',
            lambda: soup.select_one(
                'div[class="titleReviewBarItem titleReviewbarItemBorder"]')),
        'popularity': catch(
            'None',
            lambda: soup.find('div', string=tag_search['popularity'])),
    }

    # Movie name: first child of the heading tag.
    self.title = catch('None', lambda: unicode(tags['movie'].contents[0]))

    # Genre names and links.
    self.genre = catch(
        'list',
        lambda: [unicode(genre.get_text()) for genre in tags['genre']])
    self.genre_links = catch(
        'list',
        lambda: [
            unicode('%s%s' % (base_uri, genre['href'][1:]))
            for genre in tags['genre']
        ])

    # Rating and vote count.
    self.rating = catch('None', lambda: unicode(tags['rating'].get_text()))
    self.votes = catch('None', lambda: digits(tags['votes'].get_text()))

    # Metascore and the link of its parent anchor.
    self.metascore = catch(
        'None', lambda: unicode(tags['metascore'].get_text()))
    self.metascore_uri = catch(
        'None',
        lambda: unicode("%s/%s" % (
            self.movie_uri, tags['metascore'].findParent('a')['href'])))

    # User review count and link.
    self.user_review_count = catch(
        'None',
        lambda: digits(
            tags['review'].select_one('a[href="reviews"]').get_text()))
    self.user_review_uri = catch(
        'None',
        lambda: "%s/%s" % (
            self.movie_uri,
            tags['review'].select_one('a[href="reviews"]')['href']))

    # Critic review count and link.
    self.critic_review_count = catch(
        'None',
        lambda: digits(
            tags['review'].select_one(
                'a[href="externalreviews"]').get_text()))
    self.critic_review_uri = catch(
        'None',
        lambda: "%s/%s" % (
            self.movie_uri,
            tags['review'].select_one('a[href="externalreviews"]')['href']))

    # Popularity: value, CSS class of the trend span, and trend amount.
    self.popularity_initial = catch(
        'None',
        lambda: digits(tags['popularity'].findNext('span').contents[0]))
    self.popularity_name = catch(
        'None',
        lambda: tags['popularity'].findNext(
            'span', class_='titleOverviewSprite'
        ).findNext('span')['class'][0])
    self.popularity_value = catch(
        'None',
        lambda: digits(tags['popularity'].findNext(
            'span', class_='titleOverviewSprite'
        ).findNext('span').get_text()))

    # Storyline text.
    self.storyline = catch(
        'None', lambda: unicode(tags['summary'].get_text()))

    # Box-office figures: the node right after each <h4> label.
    self.budget = catch('None', lambda: unicode(tags['budget'].nextSibling))
    self.opening_weekend_usa = catch(
        'None', lambda: unicode(tags['opening_weekend_usa'].nextSibling))
    self.gross_usa = catch(
        'None', lambda: unicode(tags['gross_usa'].nextSibling))
    self.cumulative_worldwide_gross = catch(
        'None',
        lambda: unicode(tags['cumulative_worldwide_gross'].nextSibling))

    # Poster link.
    self.movie_poster_uri = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, tags['poster'].a['href'][1:])))

    # Release year and the link built from it.
    self.movie_release_year = catch(
        'None', lambda: unicode(tags['year'].a.get_text()))
    self.movie_release_year_link = catch(
        'None',
        lambda: unicode(imdb_uris['year'] % self.movie_release_year))

    # Dictionary view of everything collected above.
    self.imdb_movie_metadata = catch(
        'dict',
        lambda: {
            "Movie Name": self.title,
            "Movie URI": self.movie_uri,
            "Title ID": self.title_id,
            "Rating": self.rating,
            "IMDb Votes": self.votes,
            "Genre": self.genre,
            "Genre Links": self.genre_links,
            "Year": self.movie_release_year,
            "Year Links": self.movie_release_year_link,
            "Metascore": self.metascore,
            "Metascore Link": self.metascore_uri,
            "User Review Count": self.user_review_count,
            "User Review Link": self.user_review_uri,
            "Critic Review Count": self.critic_review_count,
            "Critic Review Link": self.critic_review_uri,
            "Popularity Initial": self.popularity_initial,
            "Popularity Name": self.popularity_name,
            "Popularity value": self.popularity_value,
            "Movie Poster URI": self.movie_poster_uri,
            "Budget": self.budget,
            "Opening Weekend USA": self.opening_weekend_usa,
            "Cumulative Worldwide Gross": self.cumulative_worldwide_gross,
            "Gross USA": self.gross_usa,
            "Storyline": self.storyline,
        })
def __init__(self, title_id: str, remove_spoiler: bool = False):
    """Collect the user reviews of a title and score their sentiment.

    The reviews page is first fetched with ``get`` to read the title and
    the review count, then opened in a headless Chrome session whose
    "load more" button is clicked ``review count // 25`` times before
    the fully expanded page is parsed.

    :param title_id: identifier substituted into the reviews URL.
    :param remove_spoiler: ``False`` selects ``imdb_uris['reviews']``;
        any other value selects ``imdb_uris['spoiler_reviews']``.

    Sets ``title``, ``title_url``, ``year``, ``total_user_reviews``,
    ``user_reviews_df``, ``user_reviews`` and ``final_sentiment_scores``
    (mean neu/neg/pos/compound over all reviews, or ``None`` when there
    are no reviews).

    Lookups go through ``catch`` (defined outside this block) with a
    fallback kind -- presumably what is stored when the lookup raises;
    confirm against its definition.
    """
    self.title_id = title_id
    if remove_spoiler is False:
        self.user_reviews_url = imdb_uris['reviews'] % self.title_id
    else:
        self.user_reviews_url = imdb_uris['spoiler_reviews'] % self.title_id

    # Static fetch: enough for the title block and the review count.
    soup = BeautifulSoup(get(self.user_reviews_url).text, 'lxml')

    # Movie title, its link and the first number found in the '.nobr' tag.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    self.year = catch(
        'None',
        lambda: int(re.findall(
            r"\d+", unicode(movie_tag.select_one('.nobr').get_text()))[0]))

    # One "load more" click per 25 reviews.
    reviews_count = catch(
        'None', lambda: digits(soup.select_one('div.header').span.text))
    maxclicks = catch('None', lambda: int(reviews_count) // 25)
    if not isinstance(maxclicks, int):
        # The count could not be read: parse only the first page instead
        # of failing on a comparison with a non-integer.
        maxclicks = 0

    options = Options()
    options.add_argument("--headless")
    browser = webdriver.Chrome(options=options)
    try:
        wait = WebDriverWait(browser, 100)
        browser.get(self.user_reviews_url)
        for clicks in range(1, maxclicks + 1):
            wait.until(
                ec.visibility_of_element_located(
                    (By.CLASS_NAME, "ipl-load-more__button"))).click()
            sys.stdout.write(
                "\r%s - clicks has made for scrolling out of - %s\r" %
                (str(clicks), str(maxclicks)))
            sys.stdout.flush()
            time.sleep(1)
        soup = BeautifulSoup(browser.page_source, 'lxml')
    finally:
        # Always release the browser, even when a wait or click raises.
        browser.quit()

    container = soup.select('.review-container')
    self.total_user_reviews = len(container)

    analyser = SentimentIntensityAnalyzer()
    neu_sum, neg_sum, compound_sum, pos_sum, count = [0] * 5
    self.user_reviews_df = catch(
        'None', lambda: review_df(analyser, container))
    self.user_reviews = catch(
        'list', lambda: self.user_reviews_df.User_Reviews.tolist())

    for review in self.user_reviews:
        count += 1
        score = analyser.polarity_scores(review)
        neu_sum += score['neu']
        neg_sum += score['neg']
        pos_sum += score['pos']
        # Previously never accumulated, so "compound" was always 0.
        compound_sum += score['compound']

    # Defined even when there are no reviews, so reading the attribute
    # never raises AttributeError.
    self.final_sentiment_scores = None
    if count:
        self.final_sentiment_scores = catch(
            'None',
            lambda: {
                "neu": round(neu_sum / count, 3),
                "neg": round(neg_sum / count, 3),
                "pos": round(pos_sum / count, 3),
                "compound": round(compound_sum / count, 3),
            })
def __init__(self, title_id):
    """Scrape the parental guide page of a title.

    :param title_id: identifier interpolated into the parental guide URL
        template.

    Attributes set: title header fields (``title``, ``title_url``,
    ``year``), MPAA fields, the certificates DataFrame with its ``Name`` and
    ``URI`` lists, one severity status and one review list per advisory
    category, three spoiler review lists, and the aggregate
    ``imdb_parental_guide_metadata`` dict.
    """
    # NOTE(review): ``catch`` is defined elsewhere; it presumably returns a
    # fallback of the named kind when the callable raises - confirm.
    self.title_id = title_id
    self.parental_guide_url = imdb_uris["parentalguide"] % self.title_id
    soup = BeautifulSoup(get(self.parental_guide_url).text, 'lxml')

    def _advisory(section_key, severity_key):
        # Return (severity status, reviews) for one advisory category.
        advisory = catch(
            'None', lambda: soup.select_one(tag_search[section_key]))
        severity = catch(
            'None', lambda: advisory.select_one(tag_search[severity_key]))
        status = catch('dict', lambda: adivsory_satus(severity))
        reviews = catch('list', lambda: advisory_reviews(advisory))
        return status, reviews

    def _spoiler_reviews(section_selector):
        # Return the reviews of one section inside the spoilers container.
        advisory = catch(
            'None', lambda: soup.select_one(tag_search['advisory-spoilers']).
            select_one(section_selector))
        return catch('list', lambda: advisory_reviews(advisory))

    # Movie title, link and release year, if available.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    self.year = catch(
        'None',
        lambda: int(
            re.findall(
                r"\d+",
                unicode(movie_tag.select_one('.nobr').get_text()))[0]))

    # MPAA rating name and description, if available.
    mpaa = catch(
        'None', lambda: soup.select_one(tag_search['certificates']).
        select_one(tag_search['mpaa']))
    mpaa_tag = catch(
        'None', lambda: mpaa.select_one('td[class="ipl-zebra-list__label"]'))
    self.mpaa_name = catch('None', lambda: unicode(mpaa_tag.get_text()))
    self.mpaa_description = catch(
        'None', lambda: unicode(mpaa_tag.findNext('td').get_text()))

    # Certificate DataFrame, if available; any failure leaves it as None.
    try:
        certificates = catch(
            'None', lambda: soup.select_one(tag_search['certificates']).
            select_one(tag_search['certifications']).find(
                'td', string='Certification').findNextSibling('td').select(
                    'li.ipl-inline-list__item'))
        rows = []
        for tag in certificates:
            rows.append([
                catch('None', lambda: unicode(tag.a.get_text())),
                catch(
                    'None',
                    lambda: unicode("%s%s" % (base_uri, tag.a['href'][1:])))
            ])
        # Build the frame once instead of growing it row by row.
        self.certificates_df = dataframe_data(
            pd.DataFrame(rows, columns=['Name', 'URI']))
    except Exception:
        self.certificates_df = None

    self.certificates_name = catch(
        'list', lambda: self.certificates_df.Name.tolist())
    self.certificates_uri = catch(
        'list', lambda: self.certificates_df.URI.tolist())

    # Advisory categories: severity status and reviews for each.
    (self.adivsory_nudity_severity_status,
     self.advisory_nudity_reviews) = _advisory('advisory-nudity', 'nudity')
    # The misspelled attribute above is kept for existing callers; this
    # alias matches the spelling used by the other four categories.
    self.advisory_nudity_severity_status = (
        self.adivsory_nudity_severity_status)

    (self.advisory_violence_severity_status,
     self.advisory_violence_reviews) = _advisory('advisory-violence',
                                                 'violence')
    (self.advisory_profanity_severity_status,
     self.advisory_profanity_reviews) = _advisory('advisory-profanity',
                                                  'profanity')
    (self.advisory_alcohol_severity_status,
     self.advisory_alcohol_reviews) = _advisory('advisory-alcohol',
                                                'alcohol')
    (self.advisory_frightening_severity_status,
     self.advisory_frightening_reviews) = _advisory('advisory-frightening',
                                                    'frightening')

    # Spoiler sections.
    self.spoiler_violence_reviews = _spoiler_reviews(
        'section[id="advisory-spoiler-violence"]')
    # NOTE(review): this previously selected the "advisory-spoiler-profanity"
    # section, which contradicts the attribute name; confirm the alcohol
    # section id against the page markup.
    self.spoiler_alcohol_reviews = _spoiler_reviews(
        'section[id="advisory-spoiler-alcohol"]')
    self.spoiler_frightening_reviews = _spoiler_reviews(
        'section[id="advisory-spoiler-frightening"]')

    # Aggregate dict built from the attributes above.
    self.imdb_parental_guide_metadata = catch(
        'dict', lambda: {
            "Movie Name": self.title,
            "Movie URI": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Parental Guide URL": self.parental_guide_url,
            "MPAA Name": self.mpaa_name,
            "MPAA Description": self.mpaa_description,
            "Certificate": self.certificates_name,
            "Certificate URI": self.certificates_uri,
            "Sex & Nudity": {
                "Nudity Severity": self.adivsory_nudity_severity_status,
                "Nudity Review": self.advisory_nudity_reviews
            },
            "Alcohol & Smoking": {
                "Alcohol Severity": self.advisory_alcohol_severity_status,
                "Alcohol Review": self.advisory_alcohol_reviews
            },
            "Violence": {
                "Violence Severity": self.advisory_violence_severity_status,
                "Violence Review": self.advisory_violence_reviews
            },
            "Frighten": {
                "Frighten Severity":
                self.advisory_frightening_severity_status,
                "Frighten Review": self.advisory_frightening_reviews
            },
            "Profanity": {
                "Profanity Severity":
                self.advisory_profanity_severity_status,
                "Profanity Review": self.advisory_profanity_reviews
            },
            "Spoiler Violence": self.spoiler_violence_reviews,
            "Spoiler Alcohol": self.spoiler_alcohol_reviews,
            "Spoiler Frighten": self.spoiler_frightening_reviews
        })
def __init__(self, title_id: str) -> None:
    """Scrape the full cast and crew page of a title.

    :param title_id: identifier interpolated into the full credits URL
        template.

    For each credit category a DataFrame attribute (``*_df``) is built by
    one of the ``cast_credit`` / ``cast_non_credit`` / ``full_cast``
    helpers, and its columns are exposed as list attributes. Everything is
    finally gathered in ``imdb_full_cast_and_crew_metadata``.
    """
    # NOTE(review): ``catch`` is defined elsewhere; it presumably returns a
    # fallback of the named kind when the callable raises - confirm.
    self.title_id = title_id
    self.full_cast_and_crew_uri = imdb_uris["fullcredits"] % self.title_id
    soup = BeautifulSoup(get(self.full_cast_and_crew_uri).text, 'lxml')

    def _column(frame, name):
        # Return one DataFrame column as a list via attribute access, so a
        # missing frame or column fails the same way the direct access did.
        return catch('list', lambda: getattr(frame, name).tolist())

    # All <h4> tags; passed to the credit helpers together with a
    # ``tag_search`` entry for the wanted category.
    table_tag = catch('None', lambda: soup.select('h4'))

    # Movie title, link and release year, if available.
    movie_tag = catch('None', lambda: soup.select_one('h3[itemprop="name"]'))
    self.title = catch('None', lambda: unicode(movie_tag.a.get_text()))
    self.title_url = catch(
        'None',
        lambda: unicode('%s%s' % (base_uri, movie_tag.a['href'][1:])))
    self.year = catch(
        'None',
        lambda: int(
            re.findall(
                r"\d+",
                unicode(movie_tag.select_one('.nobr').get_text()))[0]))

    # Writing credits.
    self.writers_df = catch(
        'None', lambda: cast_credit(tag_search['writer'], table_tag))
    self.writers_name = _column(self.writers_df, 'Name')
    self.writers_id = _column(self.writers_df, 'ID')
    self.writers_uri = _column(self.writers_df, 'URI')
    self.writers_credit = _column(self.writers_df, 'Credit')

    # Directed by.
    self.directors_df = catch(
        'None', lambda: cast_non_credit(tag_search['director'], table_tag))
    self.directors_name = _column(self.directors_df, 'Name')
    self.directors_id = _column(self.directors_df, 'ID')
    self.directors_uri = _column(self.directors_df, 'URI')

    # Produced by.
    self.producers_df = catch(
        'None', lambda: cast_non_credit(tag_search['producer'], table_tag))
    self.producers_name = _column(self.producers_df, 'Name')
    self.producers_id = _column(self.producers_df, 'ID')
    self.producers_credit = _column(self.producers_df, 'Credit')
    self.producers_uri = _column(self.producers_df, 'URI')

    # Cast.
    self.cast_df = catch('None',
                         lambda: full_cast(tag_search['cast'], table_tag))
    self.cast_name = _column(self.cast_df, 'Name')
    self.cast_id = _column(self.cast_df, 'Name_ID')
    self.cast_uri = _column(self.cast_df, 'Name_URI')
    self.cast_image_uri = _column(self.cast_df, 'Image')
    self.cast_character = _column(self.cast_df, 'Character_Name')
    self.cast_character_id = _column(self.cast_df, 'Character_ID')
    self.cast_character_uri = _column(self.cast_df, 'Character_URI')

    # Music by.
    self.music_df = catch(
        'None', lambda: cast_non_credit(tag_search['music'], table_tag))
    self.music_name = _column(self.music_df, 'Name')
    self.music_id = _column(self.music_df, 'ID')
    self.music_uri = _column(self.music_df, 'URI')

    # Cinematography by.
    self.cinematography_df = catch(
        'None',
        lambda: cast_non_credit(tag_search['cinematography'], table_tag))
    self.cinematography_name = _column(self.cinematography_df, 'Name')
    self.cinematography_id = _column(self.cinematography_df, 'ID')
    self.cinematography_uri = _column(self.cinematography_df, 'URI')

    # Production design by.
    self.production_designer_df = catch(
        'None',
        lambda: cast_non_credit(tag_search['production design'], table_tag))
    self.production_designer_name = _column(self.production_designer_df,
                                            'Name')
    self.production_designer_id = _column(self.production_designer_df, 'ID')
    self.production_designer_uri = _column(self.production_designer_df,
                                           'URI')

    # Film editing by.
    self.film_editing_df = catch(
        'None', lambda: cast_credit(tag_search['film editing'], table_tag))
    self.film_editing_name = _column(self.film_editing_df, 'Name')
    self.film_editing_id = _column(self.film_editing_df, 'ID')
    self.film_editing_credit = _column(self.film_editing_df, 'Credit')
    self.film_editing_uri = _column(self.film_editing_df, 'URI')

    # Casting by.
    self.casting_df = catch(
        'None', lambda: cast_credit(tag_search['casting'], table_tag))
    self.casting_name = _column(self.casting_df, 'Name')
    self.casting_id = _column(self.casting_df, 'ID')
    self.casting_credit = _column(self.casting_df, 'Credit')
    self.casting_uri = _column(self.casting_df, 'URI')

    # Aggregate dict built from the attributes above.
    # NOTE(review): the keys "Characte ID", "Characte URI",
    # "Production Desing" and "Flim Editing" are misspelled, but they are
    # part of the emitted data and are left unchanged for existing callers.
    self.imdb_full_cast_and_crew_metadata = catch(
        'dict', lambda: {
            "Movie Name": self.title,
            "Movie URL": self.title_url,
            "Title ID": self.title_id,
            "Year": self.year,
            "Movie Full Cast and Crew URI": self.full_cast_and_crew_uri,
            "Director": {
                "Name": self.directors_name,
                "ID": self.directors_id,
                "URI": self.directors_uri
            },
            "Writer": {
                "Name": self.writers_name,
                "Credit": self.writers_credit,
                "ID": self.writers_id,
                "URI": self.writers_uri
            },
            "Cast": {
                "Name": self.cast_name,
                "Name ID": self.cast_id,
                "Name URI": self.cast_uri,
                "Image": self.cast_image_uri,
                "Character Name": self.cast_character,
                "Characte ID": self.cast_character_id,
                "Characte URI": self.cast_character_uri
            },
            "Producer": {
                "Name": self.producers_name,
                "Credit": self.producers_credit,
                "ID": self.producers_id,
                "URI": self.producers_uri
            },
            "Music": {
                "Name": self.music_name,
                "ID": self.music_id,
                "URI": self.music_uri
            },
            "Cinematography": {
                "Name": self.cinematography_name,
                "ID": self.cinematography_id,
                "URI": self.cinematography_uri
            },
            "Production Desing": {
                "Name": self.production_designer_name,
                "ID": self.production_designer_id,
                "URI": self.production_designer_uri
            },
            "Flim Editing": {
                "Name": self.film_editing_name,
                "Credit": self.film_editing_credit,
                "ID": self.film_editing_id,
                "URI": self.film_editing_uri
            },
            "Casting": {
                "Name": self.casting_name,
                "Credit": self.casting_credit,
                "ID": self.casting_id,
                "URI": self.casting_uri
            }
        })