コード例 #1
0
def fix_oscar_dates():
    """Repair MovieAward ceremony dates using Wikipedia's list of ceremonies.

    Scrapes the Academy Awards ceremonies table, builds a year -> exact-date
    mapping from the date column, and rewrites each MovieAward.award_date to
    the precise ceremony date matching its year. Rows whose year is absent
    from the scraped table keep their existing date.
    """
    soup = BeautifulSoup(rq.get('https://en.wikipedia.org/wiki/List_of_Academy_Awards_ceremonies').text, 'lxml')
    # The ceremonies table is the second 'wikitable' on the page: locate the
    # first, then hop from it to the next one.
    table = soup.find('table', {'class': 'wikitable'}).find('tbody')
    table = table.findNext('table', {'class': 'wikitable'}).find('tbody')
    data = []
    for row in table.find_all('tr'):
        cols = [ele.text.strip() for ele in row.find_all('td')]
        data.append([ele for ele in cols if ele])  # Get rid of empty values

    # Column 1 holds the ceremony date; skip the header row and any row that
    # is too short after empty cells were dropped.
    dates = [x[1] for x in data[1:] if len(x) >= 2]
    # year -> date; if two ceremonies fall in the same year, the later table
    # entry wins (matches the original dict-comprehension behavior).
    parsed_dates_dict = {d.year: d.date() for d in (dateparser.parse(date) for date in dates)}
    with session_scope() as session:
        for award in session.query(MovieAward).all():
            award.award_date = parsed_dates_dict.get(award.award_date.year, award.award_date)
    # NOTE(review): the original ended by re-querying all award dates into an
    # unused set — dead read-only work, removed.
コード例 #2
0
def scrape_imdb_awards():
    """Scrape IMDB award data for every Award row, one ceremony year at a time.

    Walks each award's configured [start_year, end_year] span, dispatching to
    the scraper class registered for the award id in SCRAPER_CLS_DICT (the
    generic IMDBAwardScraper when none is registered). A failure for one
    year is reported and the loop continues with the next.
    """
    with session_scope() as session:
        for award in session.query(Award).all():
            first_year, last_year = award.start_year, award.end_year
            for year in range(first_year, last_year + 1):
                try:
                    print(f"Scraping {award.award_name} {year}")
                    scraper_cls = SCRAPER_CLS_DICT.get(award.award_id, IMDBAwardScraper)
                    scraper = scraper_cls(award.award_id, year,
                                          award.award_name, award.date_timedelta)
                    scraper.scrape()
                except Exception as e:
                    print(f"ERROR, problem with {award.award_name} {year}, Exception: {e}")
                    traceback.print_exc()
コード例 #3
0
ファイル: rt_scraper.py プロジェクト: Licho59/movie_analysis
def scrape_oscar_movies():
    """Scrape Rotten Tomatoes pages for every distinct Best Film nominee.

    Collects (imdb_id, rt_url) pairs for Movies joined to a MovieAward row in
    the OSCARS_BEST_FILM category, then runs the RT scraper for each movie
    that has an RT url on record.
    """
    with session_scope() as session:
        query = (session.query(Movie)
                        .join(MovieAward, Movie.movie_wiki_url == MovieAward.movie_wiki_url)
                        .filter(MovieAward.award_category == OSCARS_BEST_FILM)
                        .distinct())
        # Materialize before the session closes.
        movies_to_scrape = list(query.values(Movie.imdb_id, Movie.rt_url))

    for imdb_id, rt_url in movies_to_scrape:
        if rt_url is None:
            continue  # no RT page recorded for this movie
        print(f"Start scraping {rt_url}, id: {imdb_id}")
        RTMovie(imdb_id, rt_url).scrape()
        print(f"Scraped {rt_url}")
コード例 #4
0
ファイル: rt_scraper.py プロジェクト: Licho59/movie_analysis
    def scrape(self, update_after=None):
        """Scrape this movie's Rotten Tomatoes page (scores and critic reviews).

        A previously logged scrape is skipped, unless *update_after* is given
        and the logged scrape predates it. On a fresh scrape the tomato and
        audience scores are written to the Movie row, critic reviews are
        saved, and the scrape is recorded in ScrapingLog.
        """
        scrape_id = f"rt_{self.imdb_id}_critic_reviews"
        last_scraped = ScrapingLog.get_date(scrape_id)
        # Guard clause: skip when already scraped and not forced stale.
        if last_scraped and not (update_after and last_scraped < update_after):
            print(f"Skip scraping {scrape_id}: Already scraped")
            return

        soup = BeautifulSoup(rq.get(self.url).text, 'lxml')
        with session_scope() as session:
            movie = session.query(Movie).get(self.imdb_id)
            movie.rt_tomato_score = self.criticScore(soup)
            movie.rt_audience_score = self.audienceScore(soup)
            session.bulk_save_objects([movie])

        self.save_reviews(self.criticReviews())
        ScrapingLog.add_log(scrape_id)
コード例 #5
0
 def save_results(self, results):
     """Persist scraped award results in a single bulk insert.

     *results* yields (award_category, movie_imdb_id, person_imdb_id,
     person_name, winner) tuples; each becomes a MovieAward stamped with
     this scraper's award id, award name, and ceremony date.
     """
     # Fields shared by every row of this scrape.
     shared = dict(award_id=self.award_id,
                   award_name=self.award_name,
                   award_date=self.award_date)
     with session_scope() as session:
         session.bulk_save_objects([
             MovieAward(award_category=category,
                        movie_imdb_id=movie_id,
                        person_imdb_id=person_id,
                        person_name=person,
                        winner=winner,
                        **shared)
             for category, movie_id, person_id, person, winner in results
         ])
コード例 #6
0
ファイル: rt_scraper.py プロジェクト: Licho59/movie_analysis
 def save_reviews(self, reviews):
     """Bulk-insert RT critic reviews for this movie.

     Duplicated reviews are removed first. If the insert fails, every review
     is printed in a stable sort order (reviewer type, url, date) so the
     offending record can be located, then the exception is re-raised.
     """
     reviews = self.removeDuplicatedReviews(reviews)
     try:
         with session_scope() as session:
             session.bulk_save_objects([
                 RTReview(type=r.reviewer_type,
                          movie_imdb_id=self.imdb_id,
                          reviewer_url=r.reviewer_url,
                          reviewer_name=r.reviewer_name,
                          fresh=r.fresh,
                          original_score=r.original_score,
                          review_text=r.text,
                          review_date=r.date) for r in reviews
             ])
     # Was a bare `except:`, which also swallowed KeyboardInterrupt and
     # SystemExit before re-raising; Exception is the right boundary here.
     except Exception:
         print(f"Problem saving")
         for r in sorted(reviews,
                         key=lambda x:
                         (x.reviewer_type, x.reviewer_url, x.date)):
             print(
                 f"{r.reviewer_url}, {r.date}, {r.reviewer_name}, {r.fresh}, {r.original_score} {r.text}"
             )
         raise