Beispiel #1
0
def predict_rotten_tomatoes(movie):
    """Guess the Rotten Tomatoes URL of a movie from its title.

    Rotten Tomatoes URLs are in the format
    https://www.rottentomatoes.com/m/word1_word2_word3: every word of the
    title, lower-cased and reduced to ASCII letters and digits, joined by
    underscores (no trailing underscore).

    If the plain guess does not pass ``utils.check_link``, the movie's year
    is appended as one more underscore-separated component. That second
    guess is returned without being checked again.

    Args:
        movie: Object exposing ``name`` (the title) and ``year``.

    Returns:
        The predicted URL as a string.
    """
    base_url = 'https://www.rottentomatoes.com/m/'

    words = []
    for token in movie.name.split():
        chars = []
        for c in token:
            # Only alphanumeric characters of the title contribute
            if not c.isalnum():
                continue
            # Transliteration can turn one character into several (or none),
            # possibly including spaces or punctuation, so filter the result
            # again to keep the slug limited to letters and digits.
            chars.extend(ch for ch in unidecode(c).lower() if ch.isalnum())
        # Skip words that end up empty so no doubled separator is produced
        if chars:
            words.append(''.join(chars))

    slug = '_'.join(words)
    new_link = urllib.parse.urljoin(base_url, slug)
    if not utils.check_link(new_link):
        # str() keeps this working when the year is stored as a number
        slug = '_'.join([slug, str(movie.year)])
        new_link = urllib.parse.urljoin(base_url, slug)

    Printer.print_minus(''.join([
        "MISSING ROTTEN TOMATOES: ", movie.name, ", Predicted Link: ", new_link
    ]))

    return new_link
Beispiel #2
0
def parse(file):
    """Load movies and their external links from a JSON file.

    The document is expected to hold a top-level ``films`` list. Each entry
    provides ``Name`` and ``Link`` (the Wikipedia link) and, optionally, the
    keys ``IMDB``, ``Rotten Tomatoes``, ``Metacritic`` and
    ``Box Office Mojo``. Each of those is either null/absent or an object
    with a ``Link`` entry.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A dict mapping each movie title to its ``Movie`` object. A title that
        occurs more than once keeps only the last entry.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as json_file:
        data = json.load(json_file)

    films = {}
    for film in data['films']:
        movie_title = film['Name']
        movie = Movie(film['Link'], movie_title)
        films[movie_title] = movie

        Printer.print_minus(''.join(
            ['PARSING: ', str(len(films)), ". ", movie_title]))

        # A missing key is treated exactly like an explicit null
        imdb = film.get('IMDB')
        if imdb is not None:
            movie.imdb = IMDB(imdb['Link'], movie_title)

        rotten_tomatoes = film.get('Rotten Tomatoes')
        if rotten_tomatoes is not None:
            movie.rotten_tomatoes = RottenTomatoes(rotten_tomatoes['Link'],
                                                   movie_title)

        metacritic = film.get('Metacritic')
        if metacritic is not None:
            movie.metacritic = Metacritic(metacritic['Link'], movie_title)

        box_office_mojo = film.get('Box Office Mojo')
        if box_office_mojo is not None:
            movie.box_office_mojo = BoxOfficeMojo(box_office_mojo['Link'])

    return films
Beispiel #3
0
def predict_metacritic(movie):
    """Guess the Metacritic URL of a movie from its title.

    The predicted URL is the Metacritic movie prefix followed by
    word1-word2-word3: every word of the title, lower-cased and reduced to
    ASCII letters and digits, joined by dashes (no trailing dash). The guess
    is not verified here.

    Args:
        movie: Object exposing ``name`` (the title).

    Returns:
        The predicted URL as a string.
    """
    words = []
    for token in movie.name.split():
        chars = []
        for c in token:
            # Only alphanumeric characters of the title contribute
            if not c.isalnum():
                continue
            # Transliteration can turn one character into several (or none),
            # possibly including spaces or punctuation, so filter the result
            # again to keep the slug limited to letters and digits.
            chars.extend(ch for ch in unidecode(c).lower() if ch.isalnum())
        # Skip words that end up empty so no doubled separator is produced
        if chars:
            words.append(''.join(chars))

    new_link = urllib.parse.urljoin('http://www.metacritic.com/movie/',
                                    '-'.join(words))

    Printer.print_minus(''.join(
        ["MISSING METACRITIC: ", movie.name, ", Predicted Link: ", new_link]))

    return new_link
Beispiel #4
0
def parse_tsv(file):
    """Load movies and their external links from a tab-separated file.

    The first row is treated as a header and skipped. For every other row the
    columns are read by position: 1 = title, 2 = Wikipedia link,
    3 = IMDB link, 5 = Rotten Tomatoes link, 6 = Metacritic link,
    7 = Box Office Mojo link. Columns 0 and 4 are not used.

    Args:
        file: Path of the TSV file to read.

    Returns:
        A dict mapping each movie title to its ``Movie`` object; empty when
        the file has no data rows. A title that occurs more than once keeps
        only the last row.
    """
    films = {}
    # newline='' lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(file, newline='') as tsv_file:
        data = csv.reader(tsv_file, delimiter="\t", quotechar='"')

        # Skip the header row; the default keeps an empty file from raising
        next(data, None)

        for film in data:
            movie_title = film[1]
            movie = Movie(film[2], movie_title)
            films[movie_title] = movie

            Printer.print_minus(''.join(
                ['PARSING: ', str(len(films)), ". ", movie_title]))

            movie.imdb = IMDB(film[3], movie_title)
            movie.rotten_tomatoes = RottenTomatoes(film[5], movie_title)
            movie.metacritic = Metacritic(film[6], movie_title)
            movie.box_office_mojo = BoxOfficeMojo(film[7])

    return films
Beispiel #5
0
def parse_rottentomatoes(url, url_split, movie):
    """Record ``url`` as the movie's Rotten Tomatoes page if it is a movie link.

    Only ``url_split[2]`` is inspected: it has to begin with "com/m/", the
    prefix Rotten Tomatoes uses for movies. Any other link is ignored and
    ``movie`` is left untouched.
    """
    points_to_movie = url_split[2].startswith("com/m/")
    if not points_to_movie:
        return

    movie.rotten_tomatoes = RottenTomatoes(url, movie.name, year=movie.year)
    Printer.print_minus("FOUND ROTTEN TOMATOES: " + url)
Beispiel #6
0
def predict_missing_links(movie):
    """Fill in predicted links for the services ``movie`` has no link for.

    A missing IMDB link is only reported, not predicted. Missing Metacritic
    and Rotten Tomatoes entries are created from the URLs guessed by
    ``predict_metacritic`` and ``predict_rotten_tomatoes``.

    Args:
        movie: Object exposing ``name``, ``year``, ``imdb``, ``metacritic``
            and ``rotten_tomatoes``; the last two are assigned in place when
            they are ``None``.
    """
    if movie.imdb is None:
        # str() keeps the message working when the year is stored as a number
        Printer.print_minus(''.join(
            ["MISSING IMDB: ", movie.name, " - ", str(movie.year)]))

    if movie.metacritic is None:
        movie.metacritic = Metacritic(predict_metacritic(movie),
                                      movie.name,
                                      year=movie.year)

    if movie.rotten_tomatoes is None:
        movie.rotten_tomatoes = RottenTomatoes(predict_rotten_tomatoes(movie),
                                               movie.name,
                                               year=movie.year)
Beispiel #7
0
def parse_imdb(url, url_split, movie):
    """Record ``url`` as the movie's IMDB page and derive its Box Office Mojo link.

    Only ``url_split[2]`` is inspected: it has to begin with "com/title/",
    which marks a title page rather than an article. Any other link is
    ignored and ``movie`` is left untouched.
    """
    if not url_split[2].startswith("com/title/"):
        return

    movie.imdb = IMDB(url, movie.name, year=movie.year)
    Printer.print_minus("FOUND IMDB: " + url)

    # Box Office Mojo indexes movies by their IMDB id, so its link can be
    # built straight from the IMDB one. An IMDB link looks like
    #   https://www.imdb.com/title/ttXXXXXXX/
    # and splitting it on '/' yields
    #   'https:', '', 'www.imdb.com', 'title', 'ttXXXXXXX', ''
    # which puts the id at index 4.
    imdb_id = url.split('/')[4]
    box_office_mojo_link = 'https://www.boxofficemojo.com/title/' + imdb_id + '/'
    movie.box_office_mojo = BoxOfficeMojo(box_office_mojo_link)

    Printer.print_minus("FOUND BOXOFFICEMOJO: " + box_office_mojo_link)
Beispiel #8
0
def scrape_wikipedia(url, year):
    """Collect the movies listed in the sortable tables of a Wikipedia page.

    The page at ``url`` is downloaded and every table whose class attribute
    is "wikitable sortable" is scanned. The expected structure is:

        <table class="wikitable sortable jquery-tablesorter">
        --- <tbody>
        ------- <tr>
        ----------- <td>
        --------------- <i>
        ------------------- <a href="/wiki/Name_Of_The_Movie" ...>Name</a>

    For each such link a ``Movie`` is created, its external links are
    scraped and any missing ones are predicted. Links to ja.wikipedia.org,
    and anchors that have no usable ``href`` or no content, are skipped.

    Args:
        url: Address of the Wikipedia page to scrape.
        year: Year passed on to every ``Movie`` that is created.

    Returns:
        A dict mapping each movie title to its ``Movie`` object.
    """
    # Local import so this function does not rely on a module-level one
    from urllib.parse import urljoin

    Printer.print_equal('SCRAPING WIKIPEDIA')

    movies = {}
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')

    for table in soup.find_all('table', class_="wikitable sortable"):
        # The markup may lack an explicit <tbody>; search the table itself
        # in that case instead of failing on None.
        rows_root = table.find('tbody')
        if rows_root is None:
            rows_root = table

        for tr in rows_root.find_all('tr'):
            for td in tr.find_all('td'):
                italic = td.find('i')
                if italic is None:
                    continue
                a = italic.find('a')
                if a is None or not a.contents:
                    continue

                href = a.get('href')
                if not href:
                    continue
                # '/wiki/Name' splits into ['', 'wiki', 'Name'], while an
                # absolute or protocol-relative link has its host at index 2.
                # Shorter hrefs (e.g. '#anchor') are not article links.
                href_parts = href.split('/')
                if len(href_parts) < 3 or href_parts[2] == 'ja.wikipedia.org':
                    continue

                title = a.contents[0]
                # urljoin resolves site-relative hrefs against the English
                # Wikipedia host and leaves absolute ones intact.
                movie_url = urljoin('https://en.wikipedia.org', href)
                movie = Movie(movie_url, title, year)
                movies[title] = movie

                Printer.print_minus(''.join([
                    "RETRIEVING DATA: ",
                    str(len(movies)), ". ", title
                ]))

                scrape_external_links(movie)
                predict_missing_links(movie)

    return movies
Beispiel #9
0
def parse_metacritic(url, url_split, movie):
    """Record ``url`` as the movie's Metacritic page if it is a movie link.

    Only ``url_split[2]`` is inspected: it has to begin with "com/movie/",
    the prefix Metacritic uses for movies. Any other link is ignored and
    ``movie`` is left untouched.
    """
    points_to_movie = url_split[2].startswith("com/movie/")
    if not points_to_movie:
        return

    movie.metacritic = Metacritic(url, movie.name, year=movie.year)
    Printer.print_minus("FOUND METACRITIC: " + url)
Beispiel #10
0
    def check_url(link, title):
        """Return ``link`` if ``utils.check_link`` accepts it, else ``None``.

        A rejected link is reported together with the movie ``title``.
        """
        if utils.check_link(link):
            return link

        Printer.print_minus("INCORRECT ROTTEN TOMATOES: " + title)
        return None
Beispiel #11
0
    def check_url(link, title):
        """Return ``link`` if ``utils.check_link`` accepts it, else ``None``.

        A rejected link is reported together with the movie ``title``.
        """
        if utils.check_link(link):
            return link

        Printer.print_minus("INCORRECT METACRITIC: " + title)
        return None