def scrape(year):
    """Scrape Wikipedia for the films of *year* and return {title: Movie}.

    The 2019 year page is split into several "List of ..." sub-pages, so
    for 2019 the navbox links are followed and each matching sub-page is
    scraped; any other year is scraped from its single "<year>_in_film"
    page directly.

    Args:
        year: Four-digit year as a string (e.g. '2019').

    Returns:
        dict mapping movie title -> Movie, as built by scrape_wikipedia().
    """
    Printer.print_equal('RETRIEVING NEW DATA')
    if year != '2019':
        # Regular years: everything lives on one "<year>_in_film" page.
        wiki_link = f'https://en.wikipedia.org/wiki/{year}_in_film'
        return scrape_wikipedia(wiki_link, year)
    movies = {}
    req = requests.get('https://en.wikipedia.org/wiki/2019_in_film').text
    soup = BeautifulSoup(req, 'html.parser')
    nav_links = soup.find(class_='navbox-list navbox-odd hlist').findAll(
        'a', href=True)
    for a in nav_links:
        split_one = a['href'].split('/')
        if len(split_one) <= 2:
            continue
        split_two = split_one[2].split('_')
        # Only follow "List_of_..._2019" sub-pages.
        # FIX: original used bitwise `&`, which does not short-circuit;
        # `and` is the intended logical operator.
        if (len(split_two) > 4 and split_two[0] == 'List'
                and split_two[1] == 'of' and split_two[-1] == '2019'):
            sub_url = f"https://en.wikipedia.org{a['href']}"
            # update() in place instead of rebuilding the dict each time.
            movies.update(scrape_wikipedia(sub_url, year))
    return movies
def scrape_wikipedia(url, year):
    """Scrape one Wikipedia film-list page and return {title: Movie}.

    For every movie link found, a Movie is created and then enriched via
    scrape_external_links() and predict_missing_links().

    Args:
        url: Fully-qualified Wikipedia page URL to scrape.
        year: Four-digit year as a string, forwarded to Movie.

    Returns:
        dict mapping movie title -> Movie.
    """
    Printer.print_equal('SCRAPING WIKIPEDIA')
    movies = {}
    req = requests.get(url).text
    soup = BeautifulSoup(req, 'html.parser')
    # Expected markup:
    # <table class="wikitable sortable">
    #   <tbody><tr><td><i>
    #     <a href="/wiki/Name_Of_The_Movie" title="...">Name Of The Movie</a>
    for table in soup.find_all('table', class_="wikitable sortable"):
        tbody = table.find('tbody')
        for tr in tbody.find_all('tr'):
            for td in tr.find_all('td'):
                # (find_all never yields None, so no need to test td.)
                i = td.find('i')
                if i is None:
                    continue
                a = i.find('a')
                if a is None:
                    continue
                href_parts = a['href'].split('/')
                # Skip links to the Japanese Wikipedia. Relative links
                # ('/wiki/X') split into ['', 'wiki', 'X'], so index 2
                # exists for them; FIX: guard the length so fragment-only
                # hrefs (e.g. '#cite_note-1') can no longer IndexError.
                if len(href_parts) > 2 and href_parts[2] != 'ja.wikipedia.org':
                    title = a.contents[0]
                    # FIX: use a fresh local instead of clobbering the
                    # `url` parameter inside the loop.
                    movie_url = f"https://en.wikipedia.org{a['href']}"
                    movies[title] = Movie(movie_url, title, year)
                    Printer.print_minus(''.join([
                        "RETRIEVING DATA: ",
                        str(len(movies)), ". ", title
                    ]))
                    scrape_external_links(movies[title])
                    predict_missing_links(movies[title])
    return movies
def check_file_exists():
    """For each year in the global `years`, load cached movie data or scrape it.

    Resolution order per year:
      1. resources/<year>_data.json -> movie_parser.parse()
      2. resources/<year>_data.tsv  -> movie_parser.parse_tsv()
      3. otherwise scrape Wikipedia via scraper.scrape() and write a TSV.

    Results are stored in the global `data` dict keyed by the year string.
    """
    for y in years:
        Printer.print_equal(f"CREATE YEAR: {y}")
        # FIX: os.path.join instead of hand-built 'resources//...' strings —
        # the doubled slash was harmless on POSIX but non-portable.
        file_path = os.path.join('resources', f'{y}_data.json')
        tsv_path = os.path.join('resources', f'{y}_data.tsv')
        if os.path.isfile(file_path):
            Printer.print_equal('FILE EXISTS: Parsing Data...')
            data[y] = movie_parser.parse(file_path)
        elif os.path.isfile(tsv_path):
            Printer.print_equal('TSV EXISTS: Parsing TSV...')
            data[y] = movie_parser.parse_tsv(tsv_path)
        else:
            Printer.print_equal(f'FILE DOES NOT EXIST: Creating {y} File...')
            data[y] = scraper.scrape(y)
            print_to_tsv(data[y], y)
def save_rotten_tomatoes(movie):
    """Return {'Link': <url>} for the movie's Rotten Tomatoes page, or None.

    Args:
        movie: object with a `rotten_tomatoes` attribute that is either
            None or an object exposing a `link` attribute.
    """
    if movie.rotten_tomatoes is not None:
        return {'Link': movie.rotten_tomatoes.link}
    return None


def save_box_office_mojo(movie):
    """Return {'Link': <url>} for the movie's Box Office Mojo page, or None.

    Args:
        movie: object with a `box_office_mojo` attribute that is either
            None or an object exposing a `link` attribute.
    """
    if movie.box_office_mojo is not None:
        return {'Link': movie.box_office_mojo.link}
    return None


if __name__ == '__main__':
    Printer.print_equal('PROGRAM START')
    # Build the year list 2010-2019.
    # FIX: original was `for i, item in enumerate(range(10))` — `item` was
    # unused and enumerate over range is redundant.
    for i in range(10):
        years.append(f'201{i}')
    check_file_exists()
    Printer.print_equal('SAVING FILES')
    for year in years:
        save_data(year)
    Printer.print_equal('PROGRAM END')