def download_thumbnails(self, video_ids):
    """
    Download video thumbnails

    The IDs are pre-processed with `get_yt_compatible_ids` and every
    resulting ID string is requested from the YouTube API (`videos().list`,
    `snippet` part, at most 50 results per request). For each returned
    item the "high" thumbnail is saved in the dataset's staging area as
    `<video id>.<extension of the thumbnail URL>`. Afterwards the staging
    area is handed to `write_archive_and_finish`.

    A request is attempted up to `self.max_retries` times per batch, with
    `self.sleep_time` seconds between attempts. A batch that keeps failing
    is reported via the dataset status and skipped; the remaining batches
    are still processed.

    :param video_ids list, list of YouTube video IDs
    :raises ProcessorInterruptedException: if `self.interrupted` is set
        when a new batch is about to be requested
    """
    # prepare staging area
    results_path = self.dataset.get_staging_area()

    # Use the YouTube API to request video data
    youtube = build(config.YOUTUBE_API_SERVICE_NAME,
                    config.YOUTUBE_API_VERSION,
                    developerKey=config.YOUTUBE_DEVELOPER_KEY)

    ids_list = get_yt_compatible_ids(video_ids)

    for i, ids_string in enumerate(ids_list):
        if self.interrupted:
            raise ProcessorInterruptedException(
                "Interrupted while downloading thumbnails from YouTube")

        # The retry budget is per batch: a counter shared across batches
        # would let early failures silently disable all later requests.
        retries = 0
        response = None
        while retries < self.max_retries:
            try:
                response = youtube.videos().list(
                    part="snippet", id=ids_string, maxResults=50).execute()
                break
            except Exception as error:
                self.dataset.update_status(
                    "Encountered exception " + str(error)
                    + ".\nSleeping for " + str(self.sleep_time))
                retries += 1
                time.sleep(self.sleep_time)  # Wait a bit before trying again

        # Nothing to download if the request kept failing; move on to the
        # next batch.
        if response is None:
            self.dataset.update_status("Error during YouTube API request")
            continue

        # A response without an "items" key simply yields no thumbnails.
        for metadata in response.get("items", []):
            # Get the URL of the thumbnail; skip items that do not offer a
            # "high" rendition instead of aborting the whole run.
            thumb_url = (metadata.get("snippet", {})
                         .get("thumbnails", {})
                         .get("high", {})
                         .get("url"))
            if not thumb_url:
                continue

            # Format the path to save the thumbnail to
            save_path = results_path.joinpath(
                metadata["id"] + "." + str(thumb_url.split('.')[-1]))

            # Download the image
            urllib.request.urlretrieve(thumb_url, save_path)

        self.dataset.update_status(
            "Downloaded thumbnails for " + str(i * 50) + "/"
            + str(len(video_ids)))

    # create zip of archive and delete temporary files and folder
    self.dataset.update_status("Compressing results into archive")
    self.write_archive_and_finish(results_path)
def request_youtube_api(self, ids, custom_key=None, object_type="video"):
    """
    Use the YouTube API to fetch metadata from videos or channels.

    The IDs are pre-processed with `get_yt_compatible_ids`; every resulting
    ID string is sent in one request (max 50 results per request). Each
    request is attempted up to `self.max_retries` times, sleeping
    `self.sleep_time` seconds after a failure.

    Side effects: sets `self.invalid_api_key` to True when building the API
    client fails with HTTP 400, and keeps `self.api_limit_reached` in sync
    with whether the last request hit HTTP 403.

    :param ids, list:  A list of valid YouTube IDs
    :param custom_key, str:  A custom API key which can be provided by the
                             user; falls back to
                             `config.YOUTUBE_DEVELOPER_KEY` when empty.
    :param object_type, str:  The type of object to query. Currently only
                              `video` or `channel`.

    :return dict:  Maps the ID of every returned item to a dict with its
                   metadata. If a batch cannot be fetched, the results
                   collected up to that point are returned. For an
                   unsupported `object_type` an error message (str) is
                   returned instead.
    """
    # Reject unsupported types before doing any other work.
    if object_type not in ("video", "channel"):
        return "No valid YouTube object type (currently only 'channel' and 'video' are supported)"

    ids_list = get_yt_compatible_ids(ids)

    # Dict of dicts for all video/channel data, keyed by item ID
    results = {}

    # Use standard key or custom key
    api_key = custom_key if custom_key else config.YOUTUBE_DEVELOPER_KEY

    for i, ids_string in enumerate(ids_list):
        retries = 0

        # Step 1: set up the API client. Failures draw from the same retry
        # budget as the request itself, so `youtube` is never used unbound.
        youtube = None
        while youtube is None and retries < self.max_retries:
            try:
                youtube = build(config.YOUTUBE_API_SERVICE_NAME,
                                config.YOUTUBE_API_VERSION,
                                developerKey=api_key)
            # Catch invalid API keys
            except HttpError as e:
                if e.resp.status == 400:  # "Bad Request"
                    self.invalid_api_key = True
                    return results
                retries += 1
                time.sleep(self.sleep_time)  # Wait a bit before trying again
            # Google API's also throws other weird errors that might be
            # resolved by retrying, like SSLEOFError
            except Exception:
                retries += 1
                time.sleep(self.sleep_time)  # Wait a bit before trying again

        # Step 2: request the metadata for this batch
        response = None
        while youtube is not None and retries < self.max_retries:
            try:
                if object_type == "video":
                    request = youtube.videos().list(
                        part='snippet,contentDetails,statistics',
                        id=ids_string, maxResults=50)
                else:
                    request = youtube.channels().list(
                        part="snippet,topicDetails,statistics,brandingSettings",
                        id=ids_string, maxResults=50)
                response = request.execute()
                self.api_limit_reached = False
                break

            # Check rate limits
            except HttpError as httperror:
                status_code = httperror.resp.status
                if status_code == 403:
                    # "Forbidden", what Google returns with rate limits
                    self.api_limit_reached = True
                    message = "API quota limit might be reached (HTTP"
                else:
                    message = "API error encoutered (HTTP"
                retries += 1
                self.dataset.update_status(
                    message + str(status_code) + "), sleeping for "
                    + str(self.sleep_time))
                time.sleep(self.sleep_time)  # Wait a bit before trying again

            # Google API's also throws other weird errors that might be
            # resolved by retrying, like SSLEOFError
            except Exception:
                retries += 1
                self.dataset.update_status(
                    "Error encoutered, sleeping for " + str(self.sleep_time))
                time.sleep(self.sleep_time)  # Wait a bit before trying again

        # Stop if the request kept failing and hand back what was collected
        # so far. Checking `response` (rather than `retries > max_retries`,
        # which can never be true) prevents reading an unbound or stale
        # response from an earlier batch.
        if response is None:
            if getattr(self, "api_limit_reached", False):
                self.dataset.update_status(
                    "Daily YouTube API requests exceeded.")
            return results

        # Sometimes there's no results,
        # and "response" won't have an item key.
        if "items" not in response:
            continue

        # Collect the results for each video/channel
        for metadata in response["items"]:
            result = {}

            # This will become the key
            result_id = metadata["id"]

            # Requested parts may be absent from an item; fall back to empty
            # dicts so a sparse item yields None values instead of a crash.
            snippet = metadata.get("snippet") or {}
            statistics = metadata.get("statistics") or {}

            if object_type == "video":
                content_details = metadata.get("contentDetails") or {}

                result["type"] = "video"
                result["upload_time"] = snippet.get("publishedAt")
                result["channel_id"] = snippet.get("channelId")
                result["channel_title"] = snippet.get("channelTitle")
                # The API reference lists no `videoId` field in a video's
                # snippet, so fall back to the item's own ID.
                result["video_id"] = snippet.get("videoId", result_id)
                result["video_title"] = snippet.get("title")
                result["video_duration"] = content_details.get("duration")
                result["video_view_count"] = statistics.get("viewCount")
                result["video_comment_count"] = statistics.get("commentCount")
                result["video_likes_count"] = statistics.get("likeCount")
                result["video_dislikes_count"] = statistics.get("dislikeCount")
                result["video_topic_ids"] = metadata.get("topicDetails")
                result["video_category_id"] = snippet.get("categoryId")
                result["video_tags"] = snippet.get("tags")

            elif object_type == "channel":
                branding = metadata.get("brandingSettings") or {}
                branding_channel = branding.get("channel") or {}

                result["type"] = "channel"
                # Same as for videos: fall back to the item's own ID when
                # the snippet has no `channelId` field.
                result["channel_id"] = snippet.get("channelId", result_id)
                result["channel_title"] = snippet.get("title")
                result["channel_description"] = snippet.get("description")
                result["channel_default_language"] = snippet.get(
                    "defaultLanguage")
                result["channel_country"] = snippet.get("country")
                result["channel_viewcount"] = statistics.get("viewCount")
                result["channel_commentcount"] = statistics.get("commentCount")
                result["channel_subscribercount"] = statistics.get(
                    "subscriberCount")
                result["channel_videocount"] = statistics.get("videoCount")
                # topicDetails is not always part of the response
                if "topicDetails" in metadata:
                    result["channel_topic_ids"] = metadata[
                        "topicDetails"].get("topicIds")
                    result["channel_topic_categories"] = metadata[
                        "topicDetails"].get("topicCategories")
                result["channel_branding_keywords"] = branding_channel.get(
                    "keywords")

            results[result_id] = result

        # Update status once per processed batch
        self.dataset.update_status(
            "Got metadata from " + str(i * 50) + "/" + str(len(ids)) + " "
            + object_type + " YouTube URLs")

    return results