def follow_imdb(mov_id):
    """Resolve the Amazon detail page linked from an IMDb title and
    return the audio languages listed there.

    The Amazon page is cached under ``lists/prime/amzn/<mov_id>``; when a
    cached copy exists it is read from disk instead of re-downloaded.

    Parameters
    ----------
    mov_id : str
        IMDb title identifier (e.g. ``tt0111161``).

    Returns
    -------
    list of str or None
        Values of the 'Sprachen' (languages) table row, split on commas.
        Falls through to an implicit ``None`` when no such row exists.
    """
    amzn_page = 'lists/prime/amzn/{}'.format(mov_id)
    if os.path.exists(amzn_page):
        # Cached Amazon page available on disk.
        soup = read_html(amzn_page)
    else:
        imdb_soup = save_soup(
            'https://www.imdb.com/title/{}/'.format(mov_id), None)
        time.sleep(10)  # throttle requests
        offsite = imdb_soup.find(
            'div', class_='winner-option')['data-href']
        print("\t{}".format(mov_id), end='\r')
        soup = save_soup(
            'https://www.imdb.com/{}/'.format(offsite), amzn_page)
        time.sleep(10)
    try:
        # Fix: look the title up once (the original assigned an unused
        # `temp` and then repeated the identical find()).  A missing
        # title heading raises AttributeError, which is ignored.
        title_h1 = soup.find('h1', id='aiv-content-title')
        print('\t{}'.format(title_h1.text.strip().split('\n')[0]))
    except AttributeError:
        pass
    table = soup.find('table', class_='a-keyvalue')
    for tr in table.find_all('tr'):
        head = tr.find('th').text.strip()
        val = tr.find('td').text.strip()
        if head == 'Sprachen':
            if "," in val:
                return val.split(',')
            return [val]
def follow_imdb(mov_id):
    """Return the audio languages listed on the Amazon page of *mov_id*.

    NOTE(review): this re-defines ``follow_imdb`` from earlier in the
    file; at import time this later definition shadows the first one.
    """
    amzn_page = 'lists/prime/amzn/{}'.format(mov_id)
    if not os.path.exists(amzn_page):
        # No cached copy yet: follow the off-site link from IMDb and
        # save the Amazon page to disk for next time.
        imdb_soup = save_soup(
            'https://www.imdb.com/title/{}/'.format(mov_id), None)
        time.sleep(10)
        offsite = imdb_soup.find('div', class_='winner-option')['data-href']
        print("\t{}".format(mov_id), end='\r')
        soup = save_soup(
            'https://www.imdb.com/{}/'.format(offsite), amzn_page)
        time.sleep(10)
    else:
        soup = read_html(amzn_page)
    heading = soup.find('h1', id='aiv-content-title')
    try:
        # Missing heading -> AttributeError on .text, silently skipped.
        print('\t{}'.format(heading.text.strip().split('\n')[0]))
    except AttributeError:
        pass
    details = soup.find('table', class_='a-keyvalue')
    for row in details.find_all('tr'):
        key = row.find('th').text.strip()
        value = row.find('td').text.strip()
        if key == 'Sprachen':
            return value.split(',') if ',' in value else [value]
def main():
    """Download the IMDb 'available on Prime DE' index pages and store
    the film ids found on each page in a ``page-N.html.ids`` file.

    Pages already fetched (html or ids present) are skipped.  After all
    47 index pages are processed, ``get_missing()`` runs a catch-up pass.
    """
    general_save_dir = 'lists/prime'
    imdb_save_dir = '{}/imdb_index'.format(general_save_dir)
    for page_num in range(1, 48):
        filename = '{}/page-{}.html'.format(imdb_save_dir, page_num)
        ids_file = '{}.ids'.format(filename)
        # Skip pages that were already downloaded or already distilled.
        if os.path.exists(filename) or os.path.exists(ids_file):
            print(": {}.".format(filename), end='\r')
            continue
        imdb_index = (
            'https://www.imdb.com/search/title?online_availability=DE/'
            'today/Amazon/subs&title_type=feature&sort=release_date,asc'
            '&page={}&view=simple').format(page_num)
        save_soup(imdb_index, filename)
        print("Downloaded: {}".format(filename))
        print("")
        films_on_page = get_films_from_index(read_html(filename))
        with open(ids_file, 'w') as wf:
            for film in films_on_page:
                wf.write('{}\n'.format(film))
        # The raw html is no longer needed once the ids are extracted.
        os.remove(filename)
        time.sleep(20)
    get_missing()
def main():
    """Fetch all IMDb Prime-DE index pages and extract film ids.

    NOTE(review): this re-defines ``main`` from earlier in the file; the
    later definition shadows the first one at import time.
    """
    general_save_dir = 'lists/prime'
    imdb_save_dir = '{}/imdb_index'.format(general_save_dir)
    page_num = 1
    while page_num < 48:
        filename = '{}/page-{}.html'.format(imdb_save_dir, page_num)
        already_done = (os.path.exists(filename)
                        or os.path.exists('{}.ids'.format(filename)))
        if already_done:
            print(": {}.".format(filename), end='\r')
        else:
            imdb_index = (
                'https://www.imdb.com/search/title?online_availability=DE/'
                'today/Amazon/subs&title_type=feature&sort=release_date,asc'
                '&page={}&view=simple').format(page_num)
            save_soup(imdb_index, filename)
            print("Downloaded: {}".format(filename))
            print("")
            # Distill the html into a plain list of film ids, then drop
            # the bulky html file.
            films_on_page = get_films_from_index(read_html(filename))
            with open('{}.ids'.format(filename), 'w') as wf:
                wf.writelines('{}\n'.format(film) for film in films_on_page)
            os.remove(filename)
            time.sleep(20)
        page_num += 1
    get_missing()
def update_user(user, list_type, date):
    """Append a user's newest Letterboxd entries to a dated csv.

    Reads the most recent csv under ``user/<user>/`` to find the last
    recorded film, scrapes the first page of the user's *list_type*
    list, and writes every film that appeared since then (newest last)
    to ``user/<user>/<user>_<date>.csv``.  An empty result file is
    deleted again.
    """
    print(">> {}\n".format(user))
    csv_files = sorted(os.listdir('user/{}/'.format(user)), reverse=True)
    recent_csv = 'user/{}/{}'.format(user, csv_files[0])
    # Walk the newest csv; after the loop, last_addition holds the title
    # column of its final row.
    with open(recent_csv, 'r') as rcsv:
        for row in csv.reader(rcsv, delimiter='\t', quotechar='|',
                              quoting=csv.QUOTE_MINIMAL):
            last_addition = row[0]
    if str(last_addition).startswith('film:'):
        # Placeholder id: resolve it to the canonical slug via the
        # film's og:url meta tag.
        print("Retrieving new title for {} ...".format(last_addition))
        soup = save_soup(
            'http://letterboxd.com/film/{}/'.format(last_addition), None)
        last_addition = soup.find(
            'meta', property='og:url')['content'].split('/')[-2]
    new_csv = 'user/{}/{}_{}.csv'.format(user, user, date)
    with open(new_csv, 'a+') as wcsv:
        out = csv.writer(wcsv, delimiter='\t', quotechar='|',
                         quoting=csv.QUOTE_MINIMAL)
        soup = save_soup(
            'http://letterboxd.com/{}/{}/'.format(user, list_type), None)
        posters = list(soup.find_all('li', class_='poster-container'))
        new_num = 0
        fresh = []
        for poster in posters:
            rating = poster['data-owner-rating']
            slug = poster.div['data-target-link'].split('/')[2]
            # Stop as soon as we reach the film we already have.
            if last_addition == slug:
                break
            fresh.append((slug, rating))
            print("\tAdded {} (Rating: {}).".format(slug, rating))
            new_num += 1
            time.sleep(1)
        # The page lists newest first; write oldest first.
        for slug, rating in reversed(fresh):
            out.writerow([slug, rating])
    # Nothing new -> remove the empty csv created by 'a+'.
    if os.stat(new_csv).st_size == 0:
        os.remove(new_csv)
    print("")
def traverse_network(from_user, date, start=None):
    """Collect *from_user* plus every account they follow.

    Pages through ``/<from_user>/following/page/N/`` until no 'next'
    link is present.

    Parameters
    ----------
    from_user : str
        Letterboxd username whose following list is crawled.
    date : unused
        Kept for interface compatibility with callers.
    start : int, optional
        Page number to resume crawling from (default: page 1).

    Returns
    -------
    list of str
        ``[from_user]`` followed by the usernames found.
    """
    friends = [from_user]
    # Bug fix: the original assigned `page_num = start` INSIDE the while
    # loop, resetting the counter on every iteration so pagination never
    # advanced past the start page.  Apply `start` once, before looping.
    page_num = 1 if start is None else start
    while True:
        soup = save_soup(
            'http://letterboxd.com/{}/following/page/{}/'.format(
                from_user, page_num), None)
        friends += [
            lm['href'].strip('/')
            for lm in soup.find_all('a', class_='avatar -a40')
        ]
        # No 'next' link means this was the last page.
        if not soup.find('a', class_='next'):
            break
        page_num += 1
    return friends
def check_ratings(user_folder, initial_retrieval_date):
    """Reconcile a user's live Letterboxd ratings with the cached csvs.

    Builds a title->rating database from every csv in
    ``user/<user_folder>/``, then pages through the user's online
    ratings.  Ratings that changed (and were made after
    *initial_retrieval_date*) are appended to a csv named after the
    rating's date; titles missing from the database are added likewise.

    Parameters
    ----------
    user_folder : str
        Letterboxd username / folder name under ``user/``.
    initial_retrieval_date : str
        ISO date (YYYY-MM-DD); ratings at or before it stop the crawl.
    """
    def _append_row(date_str, title, rating):
        # One csv per rating date; 'a+' appends without clobbering.
        path = 'user/{}/{}_{}.csv'.format(user_folder, user_folder,
                                          date_str)
        with open(path, 'a+') as wcsv:
            writer = csv.writer(wcsv, delimiter='\t', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            writer.writerow([title, rating])

    list_parts = []
    first_csv_date = None
    for user_csv in sorted(os.listdir('user/{}'.format(user_folder))):
        list_parts.append(
            pd.read_csv('user/{}/{}'.format(user_folder, user_csv),
                        sep=r'\t', names=["title", user_folder],
                        engine='python'))
        # Fix: the original used str.strip('.csv'), which strips a
        # CHARACTER SET, not a suffix; cut the exact '.csv' ending.
        current_csv_date = user_csv.split('_')[-1]
        if current_csv_date.endswith('.csv'):
            current_csv_date = current_csv_date[:-4]
        if first_csv_date is None:
            first_csv_date = current_csv_date
        last_csv_date = current_csv_date
    # Latest csv wins on duplicate titles.
    db = pd.concat(list_parts, ignore_index=True).drop_duplicates(
        subset='title', keep='last').reset_index(drop=True)
    page_num = 1
    rating_time = first_csv_date
    while True:
        soup = save_soup(
            'http://letterboxd.com/{}/films/ratings/page/{}'.format(
                user_folder, page_num), None)
        # All movie posters / links on this page.
        movie_li = soup.find_all('li', class_='poster-container')
        if len(movie_li) == 0:
            break
        for mov in movie_li:
            rating_time = mov.find('time')['datetime'].split('T')[0]
            if rating_time < last_csv_date:
                this_user_rating = mov.find(
                    'meta', itemprop='ratingValue')['content']
                mov_str = mov.div['data-target-link'].split('/')[2]
                try:
                    db_r = db[db['title'] == mov_str][user_folder].iloc[0]
                    if (int(db_r) != int(this_user_rating)
                            and int(this_user_rating) != 0
                            and rating_time > initial_retrieval_date):
                        time.sleep(1)
                        print("\tUpdated {} (Rating: {}, "
                              "previously: {}) in csv {}.".format(
                                  mov_str, this_user_rating, db_r,
                                  rating_time))
                        _append_row(rating_time, mov_str,
                                    this_user_rating)
                except IndexError:
                    # Title not in the local database yet: add it.
                    time.sleep(2)
                    print("\tAdded {}.".format(mov_str), end='\r')
                    # Bug fix: the rating is a str, so the original
                    # `this_user_rating != 0` was always True and the
                    # rating got printed even when it was zero.
                    if int(this_user_rating) != 0:
                        print(this_user_rating)
                    else:
                        print("")
                    _append_row(rating_time, mov_str, this_user_rating)
        page_num += 1
        # Stop once we've crawled back past the initial retrieval.
        if rating_time <= initial_retrieval_date:
            break
def get_all_movies_from_page(user, list_title, save_dir='lists',
                             output_name=None, with_ratings=True,
                             to_reverse=True, create_meta_file=False,
                             return_path=False):
    """Scrape every film of a Letterboxd list into a tab-separated csv.

    Pages through ``/<user>/<list_title>/page/N/``, appending one row
    per film (slug, and optionally the owner's rating) to
    ``<save_dir>/<user>/<output_name>.csv``.

    Parameters
    ----------
    user, list_title : str
        Letterboxd username and list path to scrape.
    save_dir : str
        Root output directory.
    output_name : str, optional
        Basename for the csv; defaults to *list_title* with '/'->'_'.
    with_ratings : bool
        Also record the owner's rating per film.
    to_reverse : bool
        Reverse the csv afterwards so the oldest entry comes first.
    create_meta_file : bool
        Save the first page's html as ``<output>_META.html``.
    return_path : bool
        Return the csv path (without extension) when True.
    """
    if not output_name:
        output_name = list_title.replace('/', '_')
    full_path = '{}/{}/{}'.format(save_dir, user, output_name)
    csv_name = "{}.csv".format(full_path)
    with open(csv_name, 'a+') as wcsv:
        out = csv.writer(wcsv, delimiter='\t', quotechar='|',
                         quoting=csv.QUOTE_MINIMAL)
        page = 1
        while True:
            url = 'http://letterboxd.com/{}/{}/page/{}/'.format(
                user, list_title, page)
            if create_meta_file:
                soup = save_soup(url, '{}_META.html'.format(full_path))
                create_meta_file = False  # only keep the first page
            else:
                soup = save_soup(url, None)
            # All movie posters / links on this page.
            posters = [p for p in
                       soup.find_all('li', class_='poster-container')]
            for poster in posters:
                row = [poster.div['data-target-link'].split('/')[2]]
                if with_ratings:
                    row.append(poster['data-owner-rating'])
                out.writerow(row)
            print("\tPage {} : {} movies".format(page, len(posters)))
            try:
                last_page = soup.find_all(class_='paginate-page')[-1].text
                if page >= int(last_page):
                    break
                page += 1
                time.sleep(5)
            except IndexError:
                # No pagination widget at all: single-page list.
                break
    if to_reverse:
        print("\tReversing order of movies in csv...\n")
        temp_file = "{}.temp".format(csv_name)
        os.rename(csv_name, temp_file)
        with open(temp_file, 'r') as cf:
            lines = cf.readlines()
        with open(csv_name, 'w') as wf:
            wf.writelines(reversed(lines))
        os.remove(temp_file)
    if return_path:
        return full_path