def add_years_to_movies(self, movies):
    """Add a 'year' key to each movie that is not already in the database.

    Netflix show year is recent not first air year, so can't use in tmdb
    search.

    Exact-duplicate dicts are dropped first (first occurrence wins, input
    order is kept). For every remaining movie that has a 'link' for which
    ``flaskapp.db_lookup_via_link`` returns a falsy value, the link is opened
    with ``self.driver`` and the text of the ``<span class="year">`` element
    is stored under ``movie['year']``.

    Args:
        movies: list of dicts; values must be hashable for de-duplication.

    Returns:
        A new list of new dicts (the input dicts are not mutated).
    """
    logging.info('getting year for movies if not in database')

    # De-duplicate on the full (key, value) content, keeping first-seen order
    # so that the output (and the log numbering) is deterministic.
    seen = set()
    unique_movies = []
    for movie in movies:
        fingerprint = tuple(movie.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_movies.append(dict(movie))
    movies = unique_movies
    logging.info('unique movies count: {}'.format(len(movies)))

    count = 0
    for i, movie in enumerate(movies):
        if 'link' not in movie:
            continue
        if flaskapp.db_lookup_via_link(movie['link']):
            continue

        # Random pause before every page load.
        sleep(randint(10, 15))
        count += 1
        try:
            self.driver.get(movie['link'])
            year = self.driver.find_element_by_xpath(
                "//span[@class='year']").text
        except Exception as exc:
            # Best effort: a page without a year (or a failed load) must not
            # abort the whole run, but the failure should be visible.
            logging.warning('year lookup failed for {}: {}'.format(
                movie['link'], exc))
            continue

        movie['year'] = year
        logging.info('Media #{}, YEAR LOOKUP #{}: {}'.format(
            i, count, movie))
    return movies
def get_netflix_year(medias):
    """Get netflix year on movie page if record not already in database.

    Exact-duplicate dicts are dropped first (first occurrence wins, input
    order is kept). A PhantomJS browser is started, and for every media that
    has a 'link' for which ``flaskapp.db_lookup_via_link`` returns a falsy
    value, the page is loaded and the text of the first ``<span>`` with class
    ``year`` is stored under ``media['year']``.

    Args:
        medias: list of dicts; values must be hashable for de-duplication.

    Returns:
        A new list of new dicts (the input dicts are not mutated).
    """
    # netflix show year is recent not first air year, cant use in tmdb search

    # De-duplicate on the full (key, value) content, keeping first-seen order.
    seen = set()
    unique_medias = []
    for media in medias:
        fingerprint = tuple(media.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_medias.append(dict(media))
    medias = unique_medias
    logging.info('unique medias in get_netflix_year(): {}'.format(len(medias)))

    # use browser, no need to sign in
    driver2 = webdriver.PhantomJS(service_log_path='log/phantomjs.log')
    try:
        driver2.implicitly_wait(10)  # seconds
        driver2.set_window_size(1920, 1080)

        count = 0
        for i, media in enumerate(medias):
            # Hard cap on the number of page loads in a single call.
            if count >= 190:
                logging.error('Exiting get_netflix_year early via counter')
                break
            if 'link' not in media:
                continue
            if flaskapp.db_lookup_via_link(media['link']):
                continue

            # only for new media not in database
            time.sleep(float(random.randrange(20000, 30000, 1)) / 1000)
            count += 1
            try:
                driver2.get(media['link'])
                soup = BeautifulSoup(driver2.page_source, 'html.parser')
                # .find() returns None when the span is missing, in which
                # case .text raises and the media is left without a year.
                year = soup.find('span', 'year').text
            except Exception as exc:
                # Best effort: keep going, but make the failure visible.
                logging.warning('year lookup failed for {}: {}'.format(
                    media['link'], exc))
                continue

            media['year'] = year
            logging.info('Media #{} YEAR LOOKUP #{}: {}'.format(
                i, count, media))
    finally:
        # Always shut the browser process down, even if a lookup or the
        # database check raised.
        driver2.quit()
    return medias
def add_years_to_movies(self, movies):
    """Get year for each movie dict in movies list, if movie not already in
    database.

    For every movie that has a 'link' for which
    ``flaskapp.db_lookup_via_link`` returns a falsy value, the link is opened
    with ``self.driver`` and the page body text is scanned line by line. The
    first line that starts with four digits and ends with ``min`` supplies the
    year (its first four characters), stored under ``m['year']``.

    Movies without a 'link' key are skipped.

    Args:
        movies: list of movie dicts; they are updated in place.

    Returns:
        The same ``movies`` list that was passed in.
    """
    logging.info('getting year for movies if not in database')
    # Compiled once; matches lines that begin with a 4-digit year and end
    # with 'min'.
    year_line = re.compile(r'^\d{4}.+min$')

    for m in movies:
        if 'link' not in m:
            # Nothing to look up or open for this entry.
            continue
        if flaskapp.db_lookup_via_link(m['link']):
            continue

        self.driver.get(m['link'])
        # Random pause after the page load, before reading the body text.
        sleep(randint(5, 10))
        body_text = self.driver.find_element_by_tag_name("body").text
        for line in body_text.split('\n'):
            if year_line.search(line):
                m['year'] = line[:4]
                break
        logging.info('year lookup: {}: {}'.format(
            m.get('title', ''), m.get('year', '')))
    return movies
def lookup_and_write_medias(medias, mtype, source):
    """Match scraped medias against the database / TMDB and record the source.

    For each unique media dict:

    1. If it has a 'link' and ``flaskapp.db_lookup_via_link`` finds a record,
       that record is updated with the source and nothing else is done.
    2. Otherwise ``flaskapp.themoviedb_search`` is called with the title,
       ``mtype`` and the optional 'year'.
    3. With no TMDB result, a placeholder record keyed by the link is written
       so the same title is not searched again on the next run. With a
       result, the top hit is used.
    4. The record is inserted if new and updated with the source.

    Args:
        medias: list of dicts with a 'title' and optionally 'link' / 'year';
            values must be hashable for de-duplication.
        mtype: 'movie' selects the movie fields of a TMDB result; any other
            value selects the show fields ('name', 'first_air_date').
        source: dict describing the provider; copied per media, with 'link'
            overridden by the media's own link when it has one.

    Returns:
        None.
    """
    # get unique: de-duplicate on full (key, value) content, keeping
    # first-seen order so processing order is deterministic
    logging.info('len(medias) before take unique: {}'.format(len(medias)))
    seen = set()
    unique_medias = []
    for media in medias:
        fingerprint = tuple(media.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_medias.append(dict(media))
    medias = unique_medias
    logging.info('len(medias) after take unique: {}'.format(len(medias)))

    for m in medias:
        source_to_write = dict(source)

        # if media link exists, set source link, try link db lookup / update
        if 'link' in m:
            source_to_write['link'] = m['link']
            full_media = flaskapp.db_lookup_via_link(m['link'])
            if full_media:
                flaskapp.update_media_with_source(full_media, source_to_write)
                continue

        # link url was not in database, therefore do themoviedb search
        time.sleep(0.2)
        year = m.get('year', '')
        results = flaskapp.themoviedb_search(m['title'], mtype, year=year)

        # exit iteration if search not complete or no results
        if 'total_results' not in results:
            logging.error(u'tmdb search not complete for {}: {} {}'.format(
                mtype, m['title'], year))
            continue

        # An empty 'results' list is treated like a zero count so that the
        # top-hit access below cannot fail.
        if results['total_results'] < 1 or not results.get('results'):
            logging.warning(u'tmdb 0 results for {}: {} {}'.format(
                mtype, m['title'], year))
            if 'link' not in m:
                # The placeholder is keyed by the link; without one there is
                # nothing to key it on.
                logging.warning(
                    u'no link for {}: {}, skipping placeholder write'.format(
                        mtype, m['title']))
                continue
            # empty media for db write, prevent re-searching
            full_media = {
                'title': m['title'],
                'mtype': mtype,
                'year': year,
                'id': m['link'],
                'sources': [],
            }
        else:
            # assume top result is best match and use it
            full_media = results['results'][0]

            # append data so dict can be saved to database
            full_media['mtype'] = mtype
            full_media['sources'] = []
            # Date fields may be missing or None in a result; fall back to ''.
            if mtype == 'movie':
                full_media['year'] = (
                    full_media.get('release_date') or '')[:4]
            else:
                full_media['title'] = full_media['name']
                full_media['year'] = (
                    full_media.get('first_air_date') or '')[:4]

            # check if titles are not exact match, in future may not append
            # these
            if not flaskapp.doTitlesMatch(m['title'], full_media['title']):
                logging.warning(u'not exact titles: {} | {}'.format(
                    m['title'], full_media['title']))

        # write db media if new
        flaskapp.insert_media_if_new(full_media)

        # update db media with source
        flaskapp.update_media_with_source(full_media, source_to_write)
def search_hbo():
    """Scrape the HBO listing pages and write the found medias to the database.

    For each of the movies, series and documentaries pages the page is
    loaded in a PhantomJS browser and scrolled, and the title and link of
    every media box are collected. For movie-type pages the release year of
    each media not yet in the database is read from its detail page. The
    collected list is handed to ``lookup_and_write_medias``.

    After all pages were processed successfully,
    ``flaskapp.remove_old_sources('hbo')`` is called.

    Returns:
        None.
    """
    base_url = 'https://play.hbogo.com'
    source = {'name': 'hbo', 'display_name': 'HBO', 'link': base_url}
    pages = [
        {'url': '/movies', 'mtype': 'movie'},
        {'url': '/series', 'mtype': 'show'},
        {'url': '/documentaries', 'mtype': 'movie'},
    ]
    # Compiled once; matches body-text lines that begin with a 4-digit year
    # and end with 'min'.
    year_line = re.compile(r'^\d{4}.+min$')

    driver = webdriver.PhantomJS(service_log_path='log/phantomjs.log')
    try:
        driver.implicitly_wait(10)  # seconds
        driver.set_window_size(1920, 15000)

        for page in pages:
            logging.info('HBO SEARCH OF ' + page['url'])
            driver.get(base_url + page['url'])
            time.sleep(5)
            driver.execute_script("window.scrollTo(0, 10000);")
            time.sleep(15)

            # get all boxes with media image and text
            boxes = driver.find_elements_by_xpath(
                "//a[@class='default class2 class4']")
            logging.info(u'num of media boxes found: {}'.format(len(boxes)))

            # create list of titles and links, replacing newline
            medias = [
                {'title': b.text.replace('\n', ' '),
                 'link': b.get_attribute('href')}
                for b in boxes
            ]

            # remove non-media, TODO make not catch false positives
            medias = [m for m in medias if not m['title'].isupper()]

            # get year if not already in database
            logging.info('getting year for all media not in database')
            for m in medias:
                if page['mtype'] != 'movie':
                    continue
                if flaskapp.db_lookup_via_link(m['link']):
                    continue
                driver.get(m['link'])
                time.sleep(float(random.randrange(5000, 10000, 1)) / 1000)
                body_text = driver.find_element_by_tag_name("body").text
                for line in body_text.split('\n'):
                    if year_line.search(line):
                        m['year'] = line[:4]
                        break
                logging.info('year lookup: {}: {}'.format(
                    m['title'], m.get('year', '')))

            lookup_and_write_medias(medias, mtype=page['mtype'], source=source)
    finally:
        # Always shut the browser process down, even when scraping failed.
        driver.quit()

    # remove any sources not just updated: media this provider no longer has
    # (deliberately outside the finally: a failed scrape must not remove
    # sources)
    flaskapp.remove_old_sources('hbo')