Exemple #1
0
def download_imdb_reviews(rating_range,
                          directory_path,
                          imdb_id,
                          max_results=1000):
    """Download every review of *imdb_id* whose rating is in *rating_range*.

    At most *max_results* reviews are inspected; each matching one is
    handed to ``download_imdb_review`` together with *directory_path*.
    """
    client = Imdb()
    fetched = client.get_title_reviews(imdb_id, max_results=max_results)
    matching = (r for r in fetched if r.rating in rating_range)
    for review in matching:
        download_imdb_review(directory_path, review)
class CommonMetadataIMDB(object):
    """
    Class for interfacing with imdb
    """

    def __init__(self, cache=True, cache_dir=None):
        """Open the connection to IMDb.

        cache: enable imdbpie's response cache when truthy.  (Bug fix:
        the original tested ``cache is not None``, so passing
        ``cache=False`` still turned caching on; a truthiness test
        honours False as documented by the boolean default.)
        cache_dir: optional directory for the cached responses.
        """
        if cache:
            if cache_dir is not None:
                self.imdb = Imdb(cache=True, cache_dir=cache_dir)
            else:
                self.imdb = Imdb(cache=True)
        else:
            self.imdb = Imdb()

    def com_imdb_title_search(self, media_title):
        """
        # fetch info from title
        """
        return self.imdb.search_for_title(media_title)

    def com_imdb_id_search(self, media_id):
        """
        # fetch info by ttid
        """
        return self.imdb.get_title_by_id(media_id)

    def com_imdb_person_by_id(self, person_id):
        """
        # fetch person info by id
        """
        return self.imdb.get_person_by_id(person_id)

    def com_imdb_person_images_by_id(self, person_id):
        """
        # fetch person images by id
        """
        return self.imdb.get_person_images(person_id)

    def com_imdb_title_review_by_id(self, media_id):
        """
        # fetch the title review
        """
        return self.imdb.get_title_reviews(media_id)
Exemple #3
0
from imdbpie import Imdb

# One client is enough; anonymize=True routes requests through a proxy.
# (The original created a throw-away Imdb() first and immediately
# replaced it on the next line — the dead assignment is removed.)
imdb = Imdb(anonymize=True)  # to proxy requests

reviews = imdb.get_title_reviews("tt0468569", max_results=100)
# Keep the 20 best-rated reviews, highest first.
reviews_s = sorted(reviews, key=lambda x: x.rating, reverse=True)[:20]
print(reviews_s)
print(1)
Exemple #4
0
from imdbpie import Imdb
import json

imdb = Imdb(anonymize=True, cache=True)

for i in xrange(0, 9999999):
    cnt = 0
    movie_id = "tt" + str(i).zfill(7)
    with open('reviews.json', 'w') as fp:
        if imdb.title_exists(movie_id):
            cnt += 1
            title = imdb.get_title_by_id(movie_id)
            print movie_id, title.title
            reviews = imdb.get_title_reviews(movie_id)
            # if reviews == None:
            # 	reviews = []
            # else:
            # 	reviews = [review.__dict__ for review in reviews]
            json.dump([title, reviews],
                      fp,
                      indent=2,
                      default=lambda o: o.__dict__)
            if cnt > 10:
                break
reviews_df.columns = ['tconst']
pure_list_of_tconst = list(reviews_df.tconst)

list_of_reviews = []

#print i.text
#print i.username
#print i.date
#print i.rating
#print i.summary
#print i.status
#print i.user_location
#print i.user_score
#print i.user_score_count
#print

for x in pure_list_of_tconst:
    reviews = imdb.get_title_reviews(x, max_results=5)
    sub_list = ''
    for rev in reviews:
        sub_list + reviews[0].text
    list_of_reviews.append(reviews[0].text)

print
print list_of_reviews

reviews_df['reviews'] = list_of_reviews
print reviews_df

reviews_df.to_csv('./reviews_df.csv')
    rating.append(top_mov[i]['rating'])
    title.append(top_mov[i]['title'])
    id.append(top_mov[i]['tconst'])
    votes.append(top_mov[i]['num_votes'])
    prod_year.append(top_mov[i]['year'])

#print rating

reviews={}
reviewScore={}
num = 15
for item in id[201:250]:
    reviews[item] = []
    reviewScore[item] = []
    for j in range(num):
        review=imdb.get_title_reviews(item, max_results = num)[j].text
        print review
        reviews[item].append(review)
        response= alchemyapi.sentiment("html",review)
        if 'score' in response['docSentiment']:
            reviewScore[item].append(response["docSentiment"]['score'])
            reviews[item].append(response["docSentiment"]['score'])

print reviewScore
#print review
reviewsOutput=open("movie reviews_201-250)",'w')
reviewsOutput.write(str(reviews))

reviewScoreOutput=open("movie review score_201-250",'w')
reviewScoreOutput.write(str(reviewScore))
print len(id)
Exemple #7
0
imdb = Imdb()

imdbid_to_movie = {}
for movie_id in movies_ids:
    imdbid_to_movie[movie_id] = imdb.get_title_by_id(movie_id)

print "Movies to download:\n"

for movie_id in movies_ids:
    movie = imdbid_to_movie[movie_id]
    print "%s (%s):" % (movie.title, movie.imdb_id)
    print "\tYear:", movie.year
    print "\tTagline:", movie.tagline
    print "\tRating:", movie.rating
    print "\tGenres:", ", ".join(movie.genres)
    print "\tDirectors:", ", ".join([person.name for person in movie.directors_summary])
    print "\n"


reviews_list = []
for movie_id in movies_ids:
    print "Downloading reviews for:", imdbid_to_movie[movie_id].title
    raw_reviews = imdb.get_title_reviews(movie_id, max_results=10000)
    reviews_list += [{"username": r.username, "text": r.text, "rating": r.rating} for r in raw_reviews]


import json
with open(output, 'w') as outfile:
    json.dump(reviews_list, outfile)
Exemple #8
0
    rating.append(top_mov[i]['rating'])
    title.append(top_mov[i]['title'])
    id.append(top_mov[i]['tconst'])
    votes.append(top_mov[i]['num_votes'])
    prod_year.append(top_mov[i]['year'])

#print rating

reviews = {}
reviewScore = {}
num = 15
for item in id[201:250]:
    reviews[item] = []
    reviewScore[item] = []
    for j in range(num):
        review = imdb.get_title_reviews(item, max_results=num)[j].text
        print review
        reviews[item].append(review)
        response = alchemyapi.sentiment("html", review)
        if 'score' in response['docSentiment']:
            reviewScore[item].append(response["docSentiment"]['score'])
            reviews[item].append(response["docSentiment"]['score'])

print reviewScore
#print review
reviewsOutput = open("movie reviews_201-250)", 'w')
reviewsOutput.write(str(reviews))

reviewScoreOutput = open("movie review score_201-250", 'w')
reviewScoreOutput.write(str(reviewScore))
print len(id)
Exemple #9
0
class ImdbExtractor(object):
    """Joins MovieLens movies with IMDb plots, genres and reviews and
    writes one '::'-separated record per movie to ``self.data_path``."""

    def __init__(self, data_path=None):
        super(ImdbExtractor, self).__init__()
        self.search_api = IMDBPy()               # imdbpy: title search
        self.info_api = IMDBPie(anonymize=True)  # imdbpie: plots/reviews
        self.movie_lens = MovieLens(data_path)
        # Write next to the input file, or to the default location.
        self.data_path = data_path + ".out" if data_path \
            else "data/movies_data"
        self.errors = []

    def retrieve_objects(self):
        """Write '<id>::<name>::<plots>::<genres>::<reviews>' per movie;
        movies that cannot be resolved get '<id>::ERROR'."""
        movies = self.movie_lens.movies()
        # Line-buffered (bufsize=1) so progress survives a crash.
        with open(self.data_path, "w", 1, encoding="utf-8") as file:
            for movie in movies:
                print("\n")
                print(movie.id)
                print(movie.data["name"])
                # Retry forever on transient IMDb connection errors.
                while True:
                    try:
                        m = self.find_movie(movie.data["name"])
                    except IMDbDataAccessError as e:
                        print("========== CONNECTION ERROR ==========")
                        print(e)
                        sleep(5)
                    else:
                        break

                data = str(movie.id)
                if m:
                    plots, genres = self.movie_info(m.movieID)
                    reviews = self.movie_reviews(m.movieID)
                    if plots or genres or reviews:
                        movie.data["genres"].extend(genres)
                        data += u'::' + movie.data["name"]
                        data += u'::' + u' '.join(filter(None, plots))
                        data += u'::' + u' '.join(filter(None,
                                                         movie.data["genres"]))
                        data += u'::' + u' '.join(filter(None, reviews))
                        # Embedded newlines would break the
                        # one-record-per-line output format.
                        data = data.replace('\r', ' ').replace('\n', ' ')
                    else:
                        data += u"::ERROR"
                else:
                    data += u"::ERROR"
                file.write(data + u"\n")

    def movie_reviews(self, movie_id):
        """Return up to 20 'summary text' strings for title ``tt<movie_id>``;
        [] when the lookup fails or the title has no reviews."""
        try:
            reviews = self.info_api.get_title_reviews("tt" + movie_id,
                                                      max_results=20)
        except ValueError:
            return []

        reviews_arr = []
        if reviews:
            for r in reviews:
                review = r.summary if r.summary else ""
                review += " " + r.text if r.text else ""
                reviews_arr.append(review)
        return reviews_arr

    def movie_info(self, movie_id):
        """Return (plots, genres) lists for title ``tt<movie_id>``; two
        empty lists on lookup failure or missing fields."""
        try:
            movie = self.info_api.get_title_by_id("tt" + movie_id)
        except ValueError:
            return [], []
        plots = movie.plots if movie.plots else []
        genres = movie.genres if movie.genres else []
        return plots, genres

    def find_movie(self, name):
        """Resolve a MovieLens-style 'Title (Year)' string to an imdbpy
        movie object, or None when no confident match is found.

        Matches on sanitized title, an English/USA aka, or (for names of
        the form 'Title (Original) (Year)') the original title, and
        requires the release year to agree.
        """
        movies = self.search_api.search_movie(name)
        if not movies:
            # Strip a non-numeric parenthesised part (e.g. an aka) and
            # retry.  Raw-string literal added; the pattern is unchanged.
            name = re.sub(r"\((\D*)\)", "", name)
            print("---------- SEARCHING AGAIN: ----------")
            print(name)
            movies = self.search_api.search_movie(name)
            print(movies)
            if not movies:
                print("########## NO MOVIE FOUND ##########")
                return None

        def sanitize_name(_str):
            # Lower-case and drop all punctuation for loose comparison.
            new_str = _str.strip().lower()
            for char in string.punctuation:
                new_str = new_str.replace(char, "")
            return new_str

        # assumes the name ends with '(Year)' — TODO confirm for all
        # MovieLens entries; int() raises otherwise, as in the original.
        name_split = name.split("(")
        title = sanitize_name(name_split[0])
        year = int(name_split[-1][:-1].strip())

        # First search hit from the same year wins.
        movie = None
        for i in movies:
            if "year" in i.keys() and int(i["year"]) == year:
                movie = i
                break
        if not movie:
            print("########## NO MOVIE FROM SAME YEAR ##########")
            return None

        self.search_api.update(movie)

        # Look for an English/USA alternate title among the akas.
        eng_title = ""
        if "akas" in movie.keys():
            print("tem akas")
            for aka in movie["akas"]:
                aka_split = aka.split("::")
                if len(aka_split) > 1                                   \
                        and (aka_split[1].find("(English title)") != -1 \
                             or aka_split[1].find("USA") != -1):
                    eng_title = aka_split[0].strip().lower()
                    break

        imdb_title = sanitize_name(movie["title"])
        original_title = name_split[1].strip()[:-1].lower()
        print("imdb title: " + imdb_title)
        print("english title: " + eng_title)
        print("year: " + str(movie["year"]))
        if imdb_title == title or eng_title == title                    \
                or (len(name_split) == 3                                \
                    and imdb_title == original_title):
            return movie
        else:
            print("########## FOUND DIFFERENT MOVIE ##########")
            print(movie["title"] + " (" + str(movie["year"]) + ")")
            return None
Exemple #10
0
    top100.set_value(num,'length',length_mins)
    top100.set_value(num,'genre',genres)

# Column used later to merge this frame with the reviews frame.
top100['movie_num'] = range(100)

# Coerce release_year to numeric (unparseable values become NaN).
top100.release_year = pd.to_numeric(top100.release_year, errors='coerce')

'''Part 2: Wrangle the Text Data'''
# imdbpie client.
imdb = Imdb()
# Collect one [movie_num, rating, text] row per review of every movie.
movie_reviews = []
for num, movie in top100.iterrows():
    for review in imdb.get_title_reviews(movie.id, max_results=10000):
        movie_reviews.append([num, review.rating, review.text])

# Rows -> DataFrame.
reviews_df = pd.DataFrame(movie_reviews,
                          columns=['movie_num', 'review_score', 'text'])

# Keep only reviews that actually carry a score.
reviews_df.dropna(axis=0, how='any', inplace=True)

# This function removes all characters that aren't a letter, number, or space
Exemple #11
0
top_100_clean = pd.merge(top_100, df)
top_100_clean
top_100_clean.to_csv('top_100.csv', encoding = 'utf-8', index=False)

#########################################
# part 2 - reviews and such - unable to do scraping, geting reviews from the omdb api
# reading in product of above code from .csv for time and such

# formatting shit for reviews
movie_list = pd.read_csv('top_100.csv')
movie_list = movie_list['tconst']

review_list = []
for i in movie_list:
    x = imdb.get_title_reviews(i, max_results=10000)
    review_list.append(x)
    print i


master_reviews = []
counter = 0
for i in review_list:
    imdb_value = movie_list[counter]
    for j in i:
        text = j.text
        score = j.rating
        master_reviews.append({'imdb_value':imdb_value, 'text':text, 'user_movie_score':score})
    counter +=1
len(master_reviews)
master_reviews = pd.DataFrame(master_reviews)
Exemple #12
0
class ImdbClient:
    def __init__(self):
        self.imdbpy = IMDb()
        self.imdb = Imdb(exclude_episodes=False)
        self.imdb = Imdb(anonymize=True)  # to proxy requests
        self.db = api.TVDB('B43FF87DE395DF56')

    def readFromMongo(self, show, limit):
        # Connect to mongo
        client = MongoClient()

        # access movie stream db
        movies = client['movieratings_stream']

        # colletion of tweets
        tweets = movies['tweets']

        tweet_text = []
        counter = 0

        # iterate through cursor that takes the 'limit' most recent tweets with hashtag 'show'
        for tweet in tweets.find({'show_title': show}):  # .sort('created_at', pymongo.DESCENDING):
            if counter < limit:
                tweet_text.append(tweet.get("tweet_text"))
                counter += 1
            else:
                break
        return tweet_text

    def getTitle(self, show_title):
        m = self.imdbpy.get_movie('0389564')  # The 4400.
        m['kind']    # kind is 'tv series'.
        self.imdbpy.update(m, 'episodes')   # retrieves episodes information.

        m['episodes']    # a dictionary with the format:
                       #    {#season_number: {
                       #                      #episode_number: Movie object,
                       #                      #episode_number: Movie object,
                       #                      ...
                       #                     },
                       #     ...
                       #    }
                       # season_number always starts with 1, episode_number
                       # depends on the series' numbering schema: some series
                       # have a 'episode 0', while others starts counting from 1.

        m['episodes'][1][1] # <Movie id:0502803[http] title:_"The 4400" Pilot (2004)_>

        e = m['episodes'][1][2]  # second episode of the first season.
        e['kind']    # kind is 'episode'.
        e['season'], e['episode']   # return 1, 2.
        e['episode of']  # <Movie id:0389564[http] title:_"4400, The" (2004)_>
                       # XXX: beware that e['episode of'] and m _are not_ the
                       #      same object, while both represents the same series.
                       #      This is to avoid circular references; the
                       #      e['episode of'] object only contains basics
                       #      information (title, movieID, year, ....)
        i.update(e)  # retrieve normal information about this episode (cast, ...)

        e['title']  # 'The New and Improved Carl Morrissey'
        e['series title']  # 'The 4400'
        e['long imdb episode title']  # '"The 4400" The New and Improved Carl Morrissey (2004)'


        # print(show_title)
        # sleep(3)
        # title_list = list(self.imdb.search_for_title(show_title))
        # print(list(self.imdb.search_for_title("Days Gone Bye The Walking Dead")))
        # print(title_list)
        # sleep(3)
        # index = 0
        # show_id = None

        # while show_id is None:
        #     print ("title_list", title_list[index][u'title'])
        #     print ("show title", show_title)
        #     result = title_list[index][u'title'].lower()
        #     query = show_title.lower()
        #     if result in query:
        #         print title_list
        #         show_id = title_list[index][u'imdb_id']
        #         # endless loop
        #     index += 1
        # return show_id

    def searchShow(self, tvshow):
        print tvshow
        title_id = self.getTitle(tvshow)
#        if tvshow is not self.tvshow:
        print title_id
        print tvshow
       # print('title: ', title_id)
        reviews = self.imdb.get_title_reviews(title_id, max_results=sys.maxint)
        title = self.imdb.get_title_by_id(title_id)
        print title_id
        print tvshow
       # print("title: " + str(title.data))
       # print len(reviews)
        return reviews

    def getCurrentImdbRating(self, tvshow):
        tvshowid = self.getTitle(tvshow)
        title = self.imdb.get_title_by_id(tvshowid)
        return float(title.rating)

    def get_all_episode_names(self, tvshow):
        result = self.db.search(tvshow, 'en')
        show = result[0]
        res = []
        for x in range(1, len(show)):
            season = show[x]
            for y in range(1, len(season) + 1):
                if season[y].EpisodeName is not None:
                    res.append(season[y].EpisodeName)
        return res

    def get_specific_episode_names(self, tvshow, season):
        result = self.db.search(tvshow, 'en')
        show = result[0]
        res = []
        season = show[1]
        for x in range(1, len(season) + 1):
            if season[x].EpisodeName is not None:
                print season[x].EpisodeName
                res.append(season[x].EpisodeName)
        return res

    def get_all_episodes(self, episodelist, tvshow):
        for episode in episodelist:
            currEpisode = episode + " " + tvshow
            reviews = []
            reviews.append(searchshow(currEpisode))
        #call searchshow for each

# get list of all episode names given a tv show
# create review list, for each episode name, call searchshow append
# call method that trains
def main(title):
    """Collect recent commentary about *title* from Twitter, IMDb and the
    NY Times movie-review API, then print it sorted by date.

    Expects TWITTER_* and NY_TIMES_API_KEY environment variables and the
    module-level comparator ``_cmprev`` (Python 2 ``cmp``-style).
    Each collected review is a dict with keys: author, summary, text,
    date (tz-naive datetime) and source.
    """
    reviews = []

    # Search tweets
    ts = TwitterSearch(
        consumer_key=os.environ.get('TWITTER_CONSUMER_KEY'),
        consumer_secret=os.environ.get('TWITTER_CONSUMER_SECRET'),
        access_token=os.environ.get('TWITTER_ACCESS_TOKEN'),
        access_token_secret=os.environ.get('TWITTER_TOKEN_SECRET'))
    try:
        ts.connect()

        tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.setKeywords([
            '#' + title + 'Movie'
        ])  # let's define all words we would like to have a look for
        tso.setLanguage('en')  # English tweets only (the original comment said "German" but 'en' is set)
        tso.setIncludeEntities(
            False)  # and don't give us all those entity information

        # add tweets to reviews list
        results = ts.getSearchResults(tso)

    except TwitterSearchException as e:  # take care of all those ugly errors if there are some
        logging.exception(str(e))
        ts.cleanUp()
    else:
        # Keep at most the first 10 tweets of the result set.
        for offset in range(results.getSize()):
            if offset > 9:
                break
            tweet = results.getTweetByIndex(offset)
            reviews.append({
                'author':
                tweet.getUserName(),
                'summary':
                tweet.getText(),
                'text':
                tweet.getText(),
                'date':
                parser.parse(tweet.getCreatedDate(), ignoretz=True),
                'source':
                'Twitter'
            })
    finally:
        ts.disconnect()

    # Search Imdb
    imdb = Imdb()
    try:
        # IndexError here means no search hit for the title.
        response = imdb.search_for_title(title)[0]
        title_id = response['imdb_id']
        response = imdb.get_title_reviews(title_id, max_results=10)
    except IndexError as e:
        logging.exception(str(e))
    else:
        for review in response:
            reviews.append({
                'author': review.username,
                'summary': review.summary,
                'text': review.text,
                'date': parser.parse(review.date, ignoretz=True),
                'source': 'IMDB'
            })

    # Search NYTimes
    url = "https://api.nytimes.com/svc/movies/v2/reviews/search.json"
    data = {'query': title, 'api-key': os.environ.get('NY_TIMES_API_KEY')}
    response = requests.get(url, data)
    count = 0
    # Keep at most 10 NYTimes reviews.
    for review in response.json()['results']:
        if count > 9:
            break
        reviews.append({
            'author':
            review['byline'],
            'summary':
            review['headline'],
            'text':
            review['summary_short'],
            'date':
            parser.parse(review['date_updated'], ignoretz=True),
            'source':
            'NYTimes'
        })
        count += 1

    # Sort reviews by date (Python 2 `cmp` keyword; _cmprev defined elsewhere)
    reviews.sort(cmp=_cmprev)

    # Print reviews
    for review in reviews:
        print('(%s) @%s: %s [Source: %s]' %
              (review['date'].strftime('%Y-%m-%d'), review['author'],
               review['summary'], review['source']))
Exemple #14
0
from imdbpie import Imdb
import json

imdb = Imdb(anonymize=True, cache=True)

for i in xrange(0, 9999999):
	cnt = 0
	movie_id = "tt" + str(i).zfill(7)
	with open('reviews.json', 'w') as fp:
		if imdb.title_exists(movie_id):
			cnt += 1
			title = imdb.get_title_by_id(movie_id)
			print movie_id, title.title
			reviews = imdb.get_title_reviews(movie_id)
			# if reviews == None:
			# 	reviews = []
			# else:
			# 	reviews = [review.__dict__ for review in reviews]
			json.dump([title, reviews], fp, indent=2, default=lambda o: o.__dict__)
			if cnt > 10:
				break
 '''

 #Load data from csvs.

# Load the summary csvs and drop the saved index column.
top100_new = pd.read_csv('top100_summary.csv')
top250_new = pd.read_csv('top250_summary.csv')
top100_new.drop('Unnamed: 0', axis=1, inplace=True)
top250_new.drop('Unnamed: 0', axis=1, inplace=True)

# Pull every review for each top-100 title from the IMDB API.

review_list = []

for title in top100_new.tconst:
    for rev in imdb.get_title_reviews(title, max_results=6000):
        review_list.append((title, rev.rating, rev.summary, rev.text))

review_list[0]

# Sanity-check how many reviews were collected.

len(review_list)  # makes sense

# Put the (title, rating, header, text) tuples into a DataFrame.

reviews = pd.DataFrame(review_list,
                       columns=['title', 'rating', 'header', 'text'])
reviews.info()  # makes sense. Not all reviews have ratings filled out.
Exemple #16
0
from imdbpie import Imdb

# Anonymized client: requests are routed through a proxy.
imdb = Imdb(anonymize=True)

# Take the first search hit for "matrix"...
results = imdb.search_for_title("matrix")
_first = results[0]['imdb_id']
print(_first)

# ...and fetch up to 1000 of its reviews.
reviews = imdb.get_title_reviews(_first, max_results=1000)
print(reviews)
class ImdbClient:
    def __init__(self):
        self.imdbpy = IMDb()
        self.imdb = Imdb(exclude_episodes=False)
        self.imdb = Imdb(anonymize=True)  # to proxy requests
        self.db = api.TVDB('B43FF87DE395DF56')

    def get_tweets_from_mongo(self, show, limit):
        # Connect to mongo
        client = MongoClient()

        # access movRie stream db
        movies = client['movieratings_stream']

        # colletion of tweets
        tweets = movies['tweets']

        tweet_text = []
        counter = 0

        # iterate through cursor that takes the 'limit' most recent tweets with hashtag 'show'
        for tweet in tweets.find({'show_title': show}):  # .sort('created_at', pymongo.DESCENDING):
            if counter < limit:
                tweet_text.append(tweet.get("tweet_text"))
                counter += 1
            else:
                break
        return tweet_text

    def get_show_id(self, show_title):
        title_list = list(self.imdb.search_for_title(show_title))
        index = 0
        show_id = None

        while index < len(title_list) and show_id is None:
            if title_list[index] is not None:
                result = title_list[index][u'title'].lower()
                query = show_title.lower()
                # if result in query:
                if fuzz.ratio(result, query) >= 90:
                    # print title_list
                    show_id = title_list[index][u'imdb_id']
            index += 1
        return show_id

    # TODO: get rid of usage of this
    def searchShow(self, tvshow):
        title_id = self.get_show_id(tvshow)
        print(title_id)
        reviews = []
        print(tvshow)

        if title_id is not None and title_id != '':
            reviews = self.imdb.get_title_reviews(title_id, max_results=sys.maxint)
            print reviews
        else:
            print("Invalid show id")

        return reviews

    def fetch_reviews(self, episode_id):
        reviews = self.imdb.get_title_reviews(episode_id, max_results=sys.maxint)

        return reviews

    def getCurrentImdbRating(self, tvshow):
        tvshowid = self.get_show_id(tvshow)
        title = self.imdb.get_title_by_id(tvshowid)
        return float(title.rating)

    # dont use this, use example from
    # http://imdbpy.sourceforge.net/docs/README.series.txt
    def get_all_episode_names(self, tvshow):
        result = self.db.search(tvshow, 'en')
        show = result[0]
        res = []
        for x in range(1, len(show)):
            season = show[x]
            for y in range(1, len(season) + 1):
                if season[y].EpisodeName is not None and season[y].EpisodeName != '':
                    res.append(season[y].EpisodeName)
        return res

    def get_show(self, show_id):
        show = self.imdbpy.get_movie(show_id.replace('t', ''))
        self.imdbpy.update(show, 'episodes')
        print("show_show(" + show_id + "): " + str(show))

        return show

    # episode names for a specific season of tvshow
    def get_specific_episode_names(self, tvshow, season):
        result = self.db.search(tvshow, 'en')
        show = result[0]
        res = []
        season = show[1]
        for x in range(1, len(season) + 1):
            if season[x].EpisodeName is not None:
                print season[x].EpisodeName
                res.append(season[x].EpisodeName)
        return res

    def get_all_episode_reviews(self, episodelist, tvshow):
        reviews = []
        for episode in episodelist:
            curEpisode = episode + " " + tvshow
            reviews.append(self.searchShow(curEpisode))
            # call searchshow for each

        print("Episodes:\n" + str(reviews))
        return reviews
Exemple #18
0
from imdbpie import Imdb
import json
imdb = Imdb(anonymize=True)  # to proxy requests

reviews = imdb.get_title_reviews("tt0120338", max_results=2500)

classified_reviews = []

# rating > 7 -> POSITIVE, rating < 5 -> NEGATIVE; 5-7 is discarded.
positive_reviews = [x for x in reviews if x.rating > 7]
negative_reviews = [x for x in reviews if x.rating < 5]

# Pair up to 550 positive/negative examples.  The original hard-coded
# range(0, 550) and raised IndexError whenever either class had fewer
# than 550 reviews; clamp to the shorter list instead.
pairs = min(550, len(positive_reviews), len(negative_reviews))
for i in range(pairs):
    classified_reviews.append({
        'text': positive_reviews[i].text,
        'class': 'POSITIVE'
    })
    classified_reviews.append({
        'text': negative_reviews[i].text,
        'class': 'NEGATIVE'
    })

with open('result.json', 'w') as fp:
    json.dump(classified_reviews, fp)