コード例 #1
0
ファイル: scraper.py プロジェクト: NTranVUW/MovieDataProject
def scrape(year):
    """Scrape movie data for *year* from Wikipedia.

    For 2019 the yearly article links out to per-region "List of ... 2019"
    pages via a navbox, so each matching list page is scraped and the
    results merged; for any other year the single "<year>_in_film"
    article is scraped directly.

    Args:
        year: Year to scrape, as a string (e.g. '2015').

    Returns:
        dict mapping movie title -> Movie, as built by scrape_wikipedia().
    """
    Printer.print_equal('RETRIEVING NEW DATA')

    if year != '2019':
        wiki_link = ''.join(
            ['https://en.wikipedia.org/wiki/', year, '_in_film'])
        return scrape_wikipedia(wiki_link, year)

    movies = {}
    req = requests.get('https://en.wikipedia.org/wiki/2019_in_film').text
    soup = BeautifulSoup(req, 'html.parser')
    for a in soup.find(class_='navbox-list navbox-odd hlist').findAll(
            'a', href=True):
        path_parts = a['href'].split('/')
        if len(path_parts) <= 2:
            continue
        words = path_parts[2].split('_')
        # Keep only article names of the form "List_of_..._2019".
        # Note: logical 'and' (short-circuiting) replaces the original
        # bitwise '&', which only worked by accident on bools.
        if (len(words) > 4 and words[0] == 'List' and words[1] == 'of'
                and words[-1] == '2019'):
            movies.update(scrape_wikipedia(
                ''.join(['https://en.wikipedia.org', a['href']]), year))
    return movies
コード例 #2
0
ファイル: scraper.py プロジェクト: NTranVUW/MovieDataProject
def scrape_wikipedia(url, year):
    """Scrape one Wikipedia "year in film" page into Movie objects.

    Walks every "wikitable sortable" table looking for the markup
    <tr><td><i><a href="/wiki/Name">Name</a></i></td></tr>, builds a
    Movie for each title, and triggers the follow-up scrapers for it.

    Args:
        url: Full URL of the Wikipedia page to scrape.
        year: Year string the movies belong to.

    Returns:
        dict mapping movie title -> Movie.
    """
    Printer.print_equal('SCRAPING WIKIPEDIA')

    movies = {}
    req = requests.get(url).text
    soup = BeautifulSoup(req, 'html.parser')

    for table in soup.find_all('table', class_="wikitable sortable"):
        for tr in table.find('tbody').find_all('tr'):
            # Guard clauses instead of the original 7-level nesting; the
            # old `if td is not None` was always true (find_all never
            # yields None).
            for td in tr.find_all('td'):
                i = td.find('i')
                if i is None:
                    continue
                a = i.find('a')
                if a is None:
                    continue
                # .get() avoids KeyError on anchors without an href;
                # empty contents would make a.contents[0] raise.
                href = a.get('href')
                if href is None or not a.contents:
                    continue
                parts = href.split('/')
                # Absolute links put the domain at index 2
                # ('https:', '', 'ja.wikipedia.org', ...); relative
                # '/wiki/Name' links put the article name there. Short
                # fragments (e.g. '#cite') have no index 2 — skip the
                # domain test for those instead of crashing.
                if len(parts) > 2 and parts[2] == 'ja.wikipedia.org':
                    continue
                title = a.contents[0]
                link = ''.join(['https://en.wikipedia.org', href])
                movies[title] = Movie(link, title, year)

                Printer.print_minus(''.join([
                    "RETRIEVING DATA: ",
                    str(len(movies)), ". ", title
                ]))

                scrape_external_links(movies[title])
                predict_missing_links(movies[title])

    return movies
コード例 #3
0
def check_file_exists():
    """Populate the global ``data`` dict for every year in ``years``.

    For each year, prefers an existing JSON cache, then a TSV cache,
    and only scrapes Wikipedia (writing a fresh TSV afterwards) when
    neither file exists.
    """
    for y in years:
        Printer.print_equal(''.join(["CREATE YEAR: ", y]))

        # os.path.join replaces the hard-coded 'resources//' double slash.
        file_path = os.path.join('resources', ''.join([str(y), '_data.json']))
        tsv_path = os.path.join('resources', ''.join([str(y), '_data.tsv']))

        if os.path.isfile(file_path):
            Printer.print_equal('FILE EXISTS: Parsing Data...')
            data[y] = movie_parser.parse(file_path)

        elif os.path.isfile(tsv_path):
            Printer.print_equal('TSV EXISTS: Parsing TSV...')
            data[y] = movie_parser.parse_tsv(tsv_path)
        else:
            Printer.print_equal(''.join(
                ['FILE DOES NOT EXIST: Creating ', y, ' File...']))

            data[y] = scraper.scrape(y)

            print_to_tsv(data[y], y)
コード例 #4
0

def save_rotten_tomatoes(movie):
    """Return a serializable dict for the movie's Rotten Tomatoes link,
    or None when the movie has no Rotten Tomatoes entry."""
    rt = movie.rotten_tomatoes
    return None if rt is None else {'Link': rt.link}


def save_box_office_mojo(movie):
    """Return a serializable dict for the movie's Box Office Mojo link,
    or None when the movie has no Box Office Mojo entry."""
    bom = movie.box_office_mojo
    return None if bom is None else {'Link': bom.link}


if __name__ == '__main__':
    Printer.print_equal('PROGRAM START')

    # Build the decade 2010-2019 as year strings. The original
    # enumerate(range(10)) produced an unused `item`; a plain range
    # over the same indices is equivalent.
    for i in range(10):
        years.append(''.join(['201', str(i)]))

    check_file_exists()

    Printer.print_equal('SAVING FILES')

    for year in years:
        save_data(year)

    # Moved inside the guard: at module level this printed (and its
    # dependencies ran) even when the file was merely imported.
    Printer.print_equal('PROGRAM END')