def download_imdb_reviews(rating_range, directory_path, imdb_id, max_results=1000):
    """Download every review of *imdb_id* whose rating falls in *rating_range*.

    rating_range: container of acceptable ratings (membership-tested).
    directory_path: destination passed through to download_imdb_review.
    max_results: upper bound on reviews fetched from the API.
    """
    client = Imdb()
    fetched = client.get_title_reviews(imdb_id, max_results=max_results)
    for review in fetched:
        if review.rating in rating_range:
            download_imdb_review(directory_path, review)
class CommonMetadataIMDB(object):
    """Thin wrapper around imdbpie's Imdb client."""

    def __init__(self, cache=True, cache_dir=None):
        """Open a connection to IMDb.

        cache: enable imdbpie's response cache.
            BUG FIX: the original tested `cache is not None`, so passing
            cache=False still produced a caching client; test truthiness
            instead (cache=None keeps its old uncached meaning).
        cache_dir: optional directory for the response cache.
        """
        if cache:
            if cache_dir is not None:
                self.imdb = Imdb(cache=True, cache_dir=cache_dir)
            else:
                self.imdb = Imdb(cache=True)
        else:
            self.imdb = Imdb()

    def com_imdb_title_search(self, media_title):
        """Search IMDb for titles matching *media_title*."""
        return self.imdb.search_for_title(media_title)

    def com_imdb_id_search(self, media_id):
        """Fetch title info by its ttID."""
        return self.imdb.get_title_by_id(media_id)

    def com_imdb_person_by_id(self, person_id):
        """Fetch person info by id."""
        return self.imdb.get_person_by_id(person_id)

    def com_imdb_person_images_by_id(self, person_id):
        """Fetch person images by id."""
        return self.imdb.get_person_images(person_id)

    def com_imdb_title_review_by_id(self, media_id):
        """Fetch the reviews for a title."""
        return self.imdb.get_title_reviews(media_id)
from imdbpie import Imdb

imdb = Imdb()
imdb = Imdb(anonymize=True)  # to proxy requests

# Top 20 reviews of The Dark Knight, best-rated first.
reviews = imdb.get_title_reviews("tt0468569", max_results=100)
reviews.sort(key=lambda rev: rev.rating, reverse=True)
reviews_s = reviews[:20]
print(reviews_s)
print(1)
from imdbpie import Imdb import json imdb = Imdb(anonymize=True, cache=True) for i in xrange(0, 9999999): cnt = 0 movie_id = "tt" + str(i).zfill(7) with open('reviews.json', 'w') as fp: if imdb.title_exists(movie_id): cnt += 1 title = imdb.get_title_by_id(movie_id) print movie_id, title.title reviews = imdb.get_title_reviews(movie_id) # if reviews == None: # reviews = [] # else: # reviews = [review.__dict__ for review in reviews] json.dump([title, reviews], fp, indent=2, default=lambda o: o.__dict__) if cnt > 10: break
reviews_df.columns = ['tconst'] pure_list_of_tconst = list(reviews_df.tconst) list_of_reviews = [] #print i.text #print i.username #print i.date #print i.rating #print i.summary #print i.status #print i.user_location #print i.user_score #print i.user_score_count #print for x in pure_list_of_tconst: reviews = imdb.get_title_reviews(x, max_results=5) sub_list = '' for rev in reviews: sub_list + reviews[0].text list_of_reviews.append(reviews[0].text) print print list_of_reviews reviews_df['reviews'] = list_of_reviews print reviews_df reviews_df.to_csv('./reviews_df.csv')
rating.append(top_mov[i]['rating']) title.append(top_mov[i]['title']) id.append(top_mov[i]['tconst']) votes.append(top_mov[i]['num_votes']) prod_year.append(top_mov[i]['year']) #print rating reviews={} reviewScore={} num = 15 for item in id[201:250]: reviews[item] = [] reviewScore[item] = [] for j in range(num): review=imdb.get_title_reviews(item, max_results = num)[j].text print review reviews[item].append(review) response= alchemyapi.sentiment("html",review) if 'score' in response['docSentiment']: reviewScore[item].append(response["docSentiment"]['score']) reviews[item].append(response["docSentiment"]['score']) print reviewScore #print review reviewsOutput=open("movie reviews_201-250)",'w') reviewsOutput.write(str(reviews)) reviewScoreOutput=open("movie review score_201-250",'w') reviewScoreOutput.write(str(reviewScore)) print len(id)
imdb = Imdb() imdbid_to_movie = {} for movie_id in movies_ids: imdbid_to_movie[movie_id] = imdb.get_title_by_id(movie_id) print "Movies to download:\n" for movie_id in movies_ids: movie = imdbid_to_movie[movie_id] print "%s (%s):" % (movie.title, movie.imdb_id) print "\tYear:", movie.year print "\tTagline:", movie.tagline print "\tRating:", movie.rating print "\tGenres:", ", ".join(movie.genres) print "\tDirectors:", ", ".join([person.name for person in movie.directors_summary]) print "\n" reviews_list = [] for movie_id in movies_ids: print "Downloading reviews for:", imdbid_to_movie[movie_id].title raw_reviews = imdb.get_title_reviews(movie_id, max_results=10000) reviews_list += [{"username": r.username, "text": r.text, "rating": r.rating} for r in raw_reviews] import json with open(output, 'w') as outfile: json.dump(reviews_list, outfile)
rating.append(top_mov[i]['rating']) title.append(top_mov[i]['title']) id.append(top_mov[i]['tconst']) votes.append(top_mov[i]['num_votes']) prod_year.append(top_mov[i]['year']) #print rating reviews = {} reviewScore = {} num = 15 for item in id[201:250]: reviews[item] = [] reviewScore[item] = [] for j in range(num): review = imdb.get_title_reviews(item, max_results=num)[j].text print review reviews[item].append(review) response = alchemyapi.sentiment("html", review) if 'score' in response['docSentiment']: reviewScore[item].append(response["docSentiment"]['score']) reviews[item].append(response["docSentiment"]['score']) print reviewScore #print review reviewsOutput = open("movie reviews_201-250)", 'w') reviewsOutput.write(str(reviews)) reviewScoreOutput = open("movie review score_201-250", 'w') reviewScoreOutput.write(str(reviewScore)) print len(id)
class ImdbExtractor(object):
    """Joins MovieLens movie records with IMDb plots, genres and reviews and
    writes them to a '::'-separated text file, one movie per line."""

    def __init__(self, data_path=None):
        """data_path: MovieLens data directory; output goes to data_path + '.out'
        (or 'data/movies_data' when no path is given)."""
        super(ImdbExtractor, self).__init__()
        self.search_api = IMDBPy()               # imdbpy: title search
        self.info_api = IMDBPie(anonymize=True)  # imdbpie: plots/reviews by tt-id
        self.movie_lens = MovieLens(data_path)
        # self.data_path = "data/movies_data"
        self.data_path = data_path + ".out" if data_path \
            else "data/movies_data"
        self.errors = []  # NOTE(review): never appended to in this class

    def retrieve_objects(self):
        """Write one line per MovieLens movie:
        id::name::plots::genres::reviews, or id::ERROR when no IMDb match.

        Newlines/carriage returns inside the joined text are flattened to
        spaces so each record stays on one line.
        """
        movies = self.movie_lens.movies()
        # Line-buffered (buffering=1) UTF-8 output so progress survives a crash.
        with open(self.data_path, "w", 1, encoding="utf-8") as file:
            for movie in movies:
                print("\n")
                print(movie.id)
                print(movie.data["name"])
                # Retry the IMDb search until it stops raising network errors.
                while True:
                    try:
                        m = self.find_movie(movie.data["name"])
                    except IMDbDataAccessError as e:
                        print("========== CONNECTION ERROR ==========")
                        print(e)
                        sleep(5)
                    else:
                        break
                data = str(movie.id)
                if m:
                    plots, genres = self.movie_info(m.movieID)
                    reviews = self.movie_reviews(m.movieID)
                    if plots or genres or reviews:
                        movie.data["genres"].extend(genres)
                        data += u'::' + movie.data["name"]
                        data += u'::' + u' '.join(filter(None, plots))
                        data += u'::' + u' '.join(filter(None, movie.data["genres"]))
                        data += u'::' + u' '.join(filter(None, reviews))
                        data = data.replace('\r', ' ').replace('\n', ' ')
                    else:
                        data += u"::ERROR"
                else:
                    data += u"::ERROR"
                file.write(data + u"\n")

    def movie_reviews(self, movie_id):
        """Return up to 20 'summary text' strings for the bare-numeric
        movie_id; [] when the lookup raises ValueError."""
        try:
            reviews = self.info_api.get_title_reviews("tt" + movie_id, max_results=20)
        except ValueError as e:
            return []
        reviews_arr = []
        if reviews:
            for r in reviews:
                review = r.summary if r.summary else ""
                # NOTE(review): precedence -- this appends (" " + r.text) only
                # when r.text is truthy; the ternary binds before +=.
                review += " " + r.text if r.text else ""
                reviews_arr.append(review)
        return reviews_arr

    def movie_info(self, movie_id):
        """Return (plots, genres) lists for the bare-numeric movie_id;
        ([], []) when the lookup raises ValueError."""
        try:
            movie = self.info_api.get_title_by_id("tt" + movie_id)
        except ValueError as e:
            return [], []
        plots = movie.plots if movie.plots else []
        genres = movie.genres if movie.genres else []
        return plots, genres

    def find_movie(self, name):
        """Search imdbpy for `name` (format 'Title (Original) (Year)') and
        return the first hit from the matching year whose title matches;
        None when nothing suitable is found."""
        movies = self.search_api.search_movie(name)
        if not movies:
            # Drop a parenthesised non-digit part (e.g. the original-language
            # title) and retry. NOTE(review): pattern is not a raw string.
            name = re.sub("\((\D*)\)", "", name)
            print("---------- SEARCHING AGAIN: ----------")
            print(name)
            movies = self.search_api.search_movie(name)
        print(movies)
        if not movies:
            print("########## NO MOVIE FOUND ##########")
            return None

        def sanitize_name(_str):
            # Lowercase and strip all ASCII punctuation for loose comparison.
            new_str = _str.strip().lower()
            for char in string.punctuation:
                new_str = new_str.replace(char, "")
            return new_str

        name_split = name.split("(")
        title = sanitize_name(name_split[0])
        # Year is the trailing "(YYYY)" segment.
        year = int(name_split[-1][:-1].strip())
        movie = None
        for i in movies:
            if "year" in i.keys() and int(i["year"]) == year:
                movie = i
                break
        if not movie:
            print("########## NO MOVIE FROM SAME YEAR ##########")
            return None
        self.search_api.update(movie)
        # Prefer an English/USA aka title for the comparison when available.
        eng_title = ""
        if "akas" in movie.keys():
            print("tem akas")
            for aka in movie["akas"]:
                aka_split = aka.split("::")
                if len(aka_split) > 1 \
                        and (aka_split[1].find("(English title)") != -1
                             or aka_split[1].find("USA") != -1):
                    eng_title = aka_split[0].strip().lower()
                    break
        imdb_title = sanitize_name(movie["title"])
        # NOTE(review): original_title is lowercased but NOT run through
        # sanitize_name, unlike the values it is compared against.
        original_title = name_split[1].strip()[:-1].lower()
        print("imdb title: " + imdb_title)
        print("english title: " + eng_title)
        print("year: " + str(movie["year"]))
        if imdb_title == title or eng_title == title \
                or (len(name_split) == 3
                    and imdb_title == original_title):
            return movie
        else:
            print("########## FOUND DIFFERENT MOVIE ##########")
            print(movie["title"] + " (" + str(movie["year"]) + ")")
            return None
# NOTE(review): these set_value calls reference `num`, `length_mins` and
# `genres`, which are defined above this chunk (likely inside a per-movie
# loop) -- confirm context. DataFrame.set_value is deprecated (removed in
# pandas 1.0); `.at[...]` is the modern equivalent.
top100.set_value(num,'length',length_mins)
top100.set_value(num,'genre',genres)

# Add a column that we will use to merge this DF with the reviews DF below
top100['movie_num'] = range(100)

# Convert release_year to a number (unparseable values become NaN).
top100.release_year = pd.to_numeric(top100.release_year, errors='coerce')

'''Part 2: Wrangle the Text Data'''

# Create an Imdb object from the imdbpie package
imdb = Imdb()

# Build a list of [movie_num, rating, text] triples for every review of
# every movie in the top 100.
movie_reviews = []
for num,movie in top100.iterrows():
    reviews = imdb.get_title_reviews(movie.id,max_results=10000)
    for review in reviews:
        this_review = []
        this_review.append(num)
        this_review.append(review.rating)
        this_review.append(review.text)
        movie_reviews.append(this_review)

# Convert our list of reviews to a DataFrame
reviews_df = pd.DataFrame(movie_reviews,columns=['movie_num','review_score', 'text'])

# Drop all reviews that didn't give a score (only has review text)
reviews_df.dropna(axis=0,how='any',inplace=True)

# This function removes all characters that aren't a letter, number, or space
top_100_clean = pd.merge(top_100, df) top_100_clean top_100_clean.to_csv('top_100.csv', encoding = 'utf-8', index=False) ######################################### # part 2 - reviews and such - unable to do scraping, geting reviews from the omdb api # reading in product of above code from .csv for time and such # formatting shit for reviews movie_list = pd.read_csv('top_100.csv') movie_list = movie_list['tconst'] review_list = [] for i in movie_list: x = imdb.get_title_reviews(i, max_results=10000) review_list.append(x) print i master_reviews = [] counter = 0 for i in review_list: imdb_value = movie_list[counter] for j in i: text = j.text score = j.rating master_reviews.append({'imdb_value':imdb_value, 'text':text, 'user_movie_score':score}) counter +=1 len(master_reviews) master_reviews = pd.DataFrame(master_reviews)
class ImdbClient: def __init__(self): self.imdbpy = IMDb() self.imdb = Imdb(exclude_episodes=False) self.imdb = Imdb(anonymize=True) # to proxy requests self.db = api.TVDB('B43FF87DE395DF56') def readFromMongo(self, show, limit): # Connect to mongo client = MongoClient() # access movie stream db movies = client['movieratings_stream'] # colletion of tweets tweets = movies['tweets'] tweet_text = [] counter = 0 # iterate through cursor that takes the 'limit' most recent tweets with hashtag 'show' for tweet in tweets.find({'show_title': show}): # .sort('created_at', pymongo.DESCENDING): if counter < limit: tweet_text.append(tweet.get("tweet_text")) counter += 1 else: break return tweet_text def getTitle(self, show_title): m = self.imdbpy.get_movie('0389564') # The 4400. m['kind'] # kind is 'tv series'. self.imdbpy.update(m, 'episodes') # retrieves episodes information. m['episodes'] # a dictionary with the format: # {#season_number: { # #episode_number: Movie object, # #episode_number: Movie object, # ... # }, # ... # } # season_number always starts with 1, episode_number # depends on the series' numbering schema: some series # have a 'episode 0', while others starts counting from 1. m['episodes'][1][1] # <Movie id:0502803[http] title:_"The 4400" Pilot (2004)_> e = m['episodes'][1][2] # second episode of the first season. e['kind'] # kind is 'episode'. e['season'], e['episode'] # return 1, 2. e['episode of'] # <Movie id:0389564[http] title:_"4400, The" (2004)_> # XXX: beware that e['episode of'] and m _are not_ the # same object, while both represents the same series. # This is to avoid circular references; the # e['episode of'] object only contains basics # information (title, movieID, year, ....) i.update(e) # retrieve normal information about this episode (cast, ...) 
e['title'] # 'The New and Improved Carl Morrissey' e['series title'] # 'The 4400' e['long imdb episode title'] # '"The 4400" The New and Improved Carl Morrissey (2004)' # print(show_title) # sleep(3) # title_list = list(self.imdb.search_for_title(show_title)) # print(list(self.imdb.search_for_title("Days Gone Bye The Walking Dead"))) # print(title_list) # sleep(3) # index = 0 # show_id = None # while show_id is None: # print ("title_list", title_list[index][u'title']) # print ("show title", show_title) # result = title_list[index][u'title'].lower() # query = show_title.lower() # if result in query: # print title_list # show_id = title_list[index][u'imdb_id'] # # endless loop # index += 1 # return show_id def searchShow(self, tvshow): print tvshow title_id = self.getTitle(tvshow) # if tvshow is not self.tvshow: print title_id print tvshow # print('title: ', title_id) reviews = self.imdb.get_title_reviews(title_id, max_results=sys.maxint) title = self.imdb.get_title_by_id(title_id) print title_id print tvshow # print("title: " + str(title.data)) # print len(reviews) return reviews def getCurrentImdbRating(self, tvshow): tvshowid = self.getTitle(tvshow) title = self.imdb.get_title_by_id(tvshowid) return float(title.rating) def get_all_episode_names(self, tvshow): result = self.db.search(tvshow, 'en') show = result[0] res = [] for x in range(1, len(show)): season = show[x] for y in range(1, len(season) + 1): if season[y].EpisodeName is not None: res.append(season[y].EpisodeName) return res def get_specific_episode_names(self, tvshow, season): result = self.db.search(tvshow, 'en') show = result[0] res = [] season = show[1] for x in range(1, len(season) + 1): if season[x].EpisodeName is not None: print season[x].EpisodeName res.append(season[x].EpisodeName) return res def get_all_episodes(self, episodelist, tvshow): for episode in episodelist: currEpisode = episode + " " + tvshow reviews = [] reviews.append(searchshow(currEpisode)) #call searchshow for each # get list of 
all episode names given a tv show # create review list, for each episode name, call searchshow append # call method that trains
def main(title):
    """Gather mentions/reviews of *title* from Twitter, IMDb and the NYTimes
    movie-review API, then print them sorted by date.

    Python 2 code (list.sort(cmp=...) with the module-level _cmprev
    comparator). Reads TWITTER_* and NY_TIMES_API_KEY from the environment.
    Each collected entry is a dict with author/summary/text/date/source keys.
    """
    reviews = []
    # --- Twitter ----------------------------------------------------------
    ts = TwitterSearch(
        consumer_key=os.environ.get('TWITTER_CONSUMER_KEY'),
        consumer_secret=os.environ.get('TWITTER_CONSUMER_SECRET'),
        access_token=os.environ.get('TWITTER_ACCESS_TOKEN'),
        access_token_secret=os.environ.get('TWITTER_TOKEN_SECRET'))
    try:
        ts.connect()
        tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
        # Search for tweets tagged '#<title>Movie'.
        tso.setKeywords([
            '#' + title + 'Movie'
        ])
        tso.setLanguage('en')  # English-language tweets only
        tso.setIncludeEntities(
            False)  # and don't give us all those entity information
        # add tweets to reviews list
        results = ts.getSearchResults(tso)
    except TwitterSearchException as e:
        # take care of all those ugly errors if there are some
        logging.exception(str(e))
        ts.cleanUp()
    else:
        # Keep at most the first 10 tweets.
        for offset in range(results.getSize()):
            if offset > 9:
                break
            tweet = results.getTweetByIndex(offset)
            reviews.append({
                'author': tweet.getUserName(),
                'summary': tweet.getText(),
                'text': tweet.getText(),
                'date': parser.parse(tweet.getCreatedDate(), ignoretz=True),
                'source': 'Twitter'
            })
    finally:
        ts.disconnect()

    # --- IMDb -------------------------------------------------------------
    imdb = Imdb()
    try:
        # IndexError here means the title search returned no hits.
        response = imdb.search_for_title(title)[0]
        title_id = response['imdb_id']
        response = imdb.get_title_reviews(title_id, max_results=10)
    except IndexError as e:
        logging.exception(str(e))
    else:
        for review in response:
            reviews.append({
                'author': review.username,
                'summary': review.summary,
                'text': review.text,
                'date': parser.parse(review.date, ignoretz=True),
                'source': 'IMDB'
            })

    # --- NYTimes ----------------------------------------------------------
    url = "https://api.nytimes.com/svc/movies/v2/reviews/search.json"
    data = {'query': title, 'api-key': os.environ.get('NY_TIMES_API_KEY')}
    # NOTE(review): `data` is the second positional argument of requests.get,
    # i.e. it is sent as query params -- confirm that was intended.
    response = requests.get(url, data)
    count = 0
    for review in response.json()['results']:
        if count > 9:  # cap at 10 NYT reviews
            break
        reviews.append({
            'author': review['byline'],
            'summary': review['headline'],
            'text': review['summary_short'],
            'date': parser.parse(review['date_updated'], ignoretz=True),
            'source': 'NYTimes'
        })
        count += 1

    # Sort reviews by date
    reviews.sort(cmp=_cmprev)

    # Print reviews
    for review in reviews:
        print('(%s) @%s: %s [Source: %s]' %
              (review['date'].strftime('%Y-%m-%d'), review['author'],
               review['summary'], review['source']))
''' #Load data from csvs. top100_new = pd.read_csv('top100_summary.csv') top250_new = pd.read_csv('top250_summary.csv') top100_new.drop('Unnamed: 0', axis = 1, inplace = True) top250_new.drop('Unnamed: 0', axis = 1, inplace = True) #Pulling all reviews from IMDB with API review_list = [] for title in top100_new.tconst: review = imdb.get_title_reviews(title, max_results = 6000) for i in range(len(review)): review_list.append((title, review[i].rating, review[i].summary, review[i].text)) review_list[0] #Checking length of reviews_list len(review_list) #makes sense #Putting in DataFrame. reviews = pd.DataFrame(review_list, columns = ['title', 'rating', 'header', 'text']) reviews.info() #makes sense. Not all reviews have ratings filled out.
from imdbpie import Imdb

# Anonymized client: requests are proxied.
imdb = Imdb(anonymize=True)

# Take the imdb id of the first search hit for "matrix".
search_hits = imdb.search_for_title("matrix")
_first = search_hits[0]['imdb_id']
print(_first)

reviews = imdb.get_title_reviews(_first, max_results=1000)
print(reviews)
class ImdbClient:
    """Aggregates IMDb (imdbpy + imdbpie), TVDB and MongoDB lookups for
    TV-show review collection (Python 2 code: sys.maxint, print statements)."""

    def __init__(self):
        self.imdbpy = IMDb()
        # NOTE(review): this first client is immediately overwritten by the
        # next line, so exclude_episodes=False never takes effect -- confirm
        # which configuration was intended.
        self.imdb = Imdb(exclude_episodes=False)
        self.imdb = Imdb(anonymize=True)  # to proxy requests
        self.db = api.TVDB('B43FF87DE395DF56')

    def get_tweets_from_mongo(self, show, limit):
        """Return up to `limit` tweet texts stored for `show` in MongoDB."""
        # Connect to mongo
        client = MongoClient()
        # access movRie stream db
        movies = client['movieratings_stream']
        # colletion of tweets
        tweets = movies['tweets']
        tweet_text = []
        counter = 0
        # iterate through cursor that takes the 'limit' most recent tweets
        # with hashtag 'show'
        for tweet in tweets.find({'show_title': show}):  # .sort('created_at', pymongo.DESCENDING):
            if counter < limit:
                tweet_text.append(tweet.get("tweet_text"))
                counter += 1
            else:
                break
        return tweet_text

    def get_show_id(self, show_title):
        """Return the imdb_id of the first search hit whose title fuzzily
        matches `show_title` (fuzz ratio >= 90), or None."""
        title_list = list(self.imdb.search_for_title(show_title))
        index = 0
        show_id = None
        while index < len(title_list) and show_id is None:
            if title_list[index] is not None:
                result = title_list[index][u'title'].lower()
                query = show_title.lower()
                # if result in query:
                if fuzz.ratio(result, query) >= 90:
                    # print title_list
                    show_id = title_list[index][u'imdb_id']
            index += 1
        return show_id

    # TODO: get rid of usage of this
    def searchShow(self, tvshow):
        """Fetch all reviews for `tvshow`; [] when no id could be resolved."""
        title_id = self.get_show_id(tvshow)
        print(title_id)
        reviews = []
        print(tvshow)
        if title_id is not None and title_id != '':
            # sys.maxint effectively means "no cap" (Python 2 only).
            reviews = self.imdb.get_title_reviews(title_id,
                                                  max_results=sys.maxint)
            print reviews
        else:
            print("Invalid show id")
        return reviews

    def fetch_reviews(self, episode_id):
        """Fetch all reviews for a known episode/title id."""
        reviews = self.imdb.get_title_reviews(episode_id,
                                              max_results=sys.maxint)
        return reviews

    def getCurrentImdbRating(self, tvshow):
        """Return the show's current IMDb rating as a float."""
        tvshowid = self.get_show_id(tvshow)
        title = self.imdb.get_title_by_id(tvshowid)
        return float(title.rating)

    # dont use this, use example from
    # http://imdbpy.sourceforge.net/docs/README.series.txt
    def get_all_episode_names(self, tvshow):
        """Return every non-empty episode name across all seasons (TVDB)."""
        result = self.db.search(tvshow, 'en')
        show = result[0]
        res = []
        for x in range(1, len(show)):
            season = show[x]
            for y in range(1, len(season) + 1):
                if season[y].EpisodeName is not None and season[y].EpisodeName != '':
                    res.append(season[y].EpisodeName)
        return res

    def get_show(self, show_id):
        """Fetch an imdbpy show object (with episodes) for a 'tt...' id."""
        # imdbpy expects the numeric id, hence stripping the 't' characters.
        show = self.imdbpy.get_movie(show_id.replace('t', ''))
        self.imdbpy.update(show, 'episodes')
        print("show_show(" + show_id + "): " + str(show))
        return show

    # episode names for a specific season of tvshow
    def get_specific_episode_names(self, tvshow, season):
        """Return episode names for season 1 of `tvshow`.

        NOTE(review): the `season` parameter is immediately shadowed by
        `show[1]`, so this always reads season 1 -- confirm intended.
        """
        result = self.db.search(tvshow, 'en')
        show = result[0]
        res = []
        season = show[1]
        for x in range(1, len(season) + 1):
            if season[x].EpisodeName is not None:
                print season[x].EpisodeName
                res.append(season[x].EpisodeName)
        return res

    def get_all_episode_reviews(self, episodelist, tvshow):
        """Collect reviews for every '<episode> <tvshow>' query."""
        reviews = []
        for episode in episodelist:
            curEpisode = episode + " " + tvshow
            reviews.append(self.searchShow(curEpisode))  # call searchshow for each
        print("Episodes:\n" + str(reviews))
        return reviews
from imdbpie import Imdb
import json

imdb = Imdb(anonymize=True)  # to proxy requests

# Titanic (tt0120338) reviews, labelled POSITIVE (>7) / NEGATIVE (<5) for a
# balanced sentiment dataset. NOTE(review): reviews with rating=None would
# fail the `> 7` comparison on Python 3 -- confirm the target interpreter.
reviews = imdb.get_title_reviews("tt0120338", max_results=2500)
positive_reviews = [x for x in reviews if x.rating > 7]
negative_reviews = [x for x in reviews if x.rating < 5]

# BUG FIX: the original indexed 0..549 unconditionally and raised IndexError
# whenever either class had fewer than 550 reviews; clamp to the smaller
# class size while keeping the balanced-classes intent.
n_pairs = min(550, len(positive_reviews), len(negative_reviews))

classified_reviews = []
for i in range(n_pairs):
    classified_reviews.append({
        'text': positive_reviews[i].text,
        'class': 'POSITIVE'
    })
    classified_reviews.append({
        'text': negative_reviews[i].text,
        'class': 'NEGATIVE'
    })

with open('result.json', 'w') as fp:
    json.dump(classified_reviews, fp)