def test_private_vid_marked_as_private(self):
    """A private YouTube video is recorded with PRIVATE status and marked checked.

    Fix: removed a leftover debug ``print(vid)`` and a redundant ``vid.save()``
    on a freshly fetched, unmodified object.
    """
    record_download_video(self.private_vid, self.temp_dir.name)
    vid: Videos = Videos.objects.get(url=self.private_vid)
    # CheckStatus[...] looks up the enum member by name — presumably
    # check_status stores the member name; verify against the model field.
    self.assertEqual(CheckStatus[vid.check_status], CheckStatus.PRIVATE)
    self.assertEqual(vid.checked, True)
def test_present_vid_is_downloaded(self):
    """An available video is downloaded: path recorded, file on disk, status FOUND.

    Fix: dropped the needless ``f`` prefix on a format string with no
    placeholders (lint F541).
    """
    record_download_video(self.present_vid, self.temp_dir.name)
    present_vid: Videos = Videos.objects.get(url=self.present_vid)
    self.assertEqual(present_vid.AdFile_ID.ad_filepath, "CyAds/kPBtDHiHJuM.mkv")
    # ad_filepath is stored relative to the download directory.
    self.assertTrue(
        Path(self.temp_dir.name).joinpath(
            present_vid.AdFile_ID.ad_filepath).exists())
    self.assertEqual(CheckStatus[present_vid.check_status], CheckStatus.FOUND)
    self.assertEqual(present_vid.checked, True)
def handle(self, *args, **options):
    """Download every not-yet-checked YouTube video that was watched as an ad.

    Only urls exactly 11 characters long (bare YouTube video ids) with
    ``watched_as_ad >= 1`` and ``checked == False`` are considered.

    Fixes: removed the dead ``assert download_dir is not None``
    (``os.environ[key]`` raises KeyError when unset, it never returns None),
    and routed output through ``self.stdout``/``self.stderr`` as Django
    management-command convention requires, instead of bare ``print``.
    """
    # Raises KeyError with a clear key name if the variable is missing.
    download_dir = os.environ["AD_ARCHIVE_FILESTORE_DIR"]
    # YouTube video ids are exactly 11 characters; longer urls are other hosts.
    youtube_urls = Videos.objects.annotate(url_len=Length("url")).filter(
        url_len=11, watched_as_ad__gte=1, checked=False)
    vid: Videos
    for vid in youtube_urls:
        try:
            video_with_adfile = record_download_video(vid.url, download_dir)
            video_with_adfile.save()
            if video_with_adfile.check_status == CheckStatus.FOUND.value:
                self.stdout.write(
                    f"downloaded ad to: {video_with_adfile.AdFile_ID.ad_filepath}"
                    f" for video: {video_with_adfile.url}")
        except Exception as e:
            # Best-effort batch: log the failure and keep downloading the rest.
            self.stderr.write(
                f"Got error while downloading video {vid.url}: {e}")
    self.stdout.write(self.style.SUCCESS("Downloaded ads"))
def test_missing_vid_gets_marked_as_missing_downloaded(self):
    """A video no longer on YouTube is recorded as MISSING and marked checked.

    Fix: removed the trailing ``missing_video.save()`` — the object was just
    fetched and never modified, so re-saving it was a no-op.
    """
    record_download_video(self.missing_vid, self.temp_dir.name)
    missing_video: Videos = Videos.objects.get(url=self.missing_vid)
    self.assertEqual(CheckStatus[missing_video.check_status],
                     CheckStatus.MISSING)
    self.assertEqual(missing_video.checked, True)
def test_duplicate_videos_raises_errors(self):
    """Recording the same video url twice must raise DuplicateVideoError."""
    # First recording goes through normally.
    record_download_video(self.double_vid, self.temp_dir.name)
    # A second attempt for the identical url must be rejected.
    with self.assertRaises(DuplicateVideoError):
        record_download_video(self.double_vid, self.temp_dir.name)
def test_video_terminated_account_marked(self):
    """A video whose channel was terminated gets ACCOUNT_TERMINATED status."""
    recorded = record_download_video(self.terminated_account_vid,
                                     self.temp_dir.name)
    recorded.save()
    # Re-read from the database to confirm the status was persisted.
    stored = Videos.objects.get(url=self.terminated_account_vid)
    self.assertEqual(CheckStatus[stored.check_status],
                     CheckStatus.ACCOUNT_TERMINATED)
def test_video_removed_by_user_marked(self):
    """A video deleted by its uploader gets USER_REMOVED status."""
    record_download_video(self.user_removed_vid, self.temp_dir.name)
    stored = Videos.objects.get(url=self.user_removed_vid)
    self.assertEqual(CheckStatus[stored.check_status],
                     CheckStatus.USER_REMOVED)
def save_video_metadata(self, video_list: List[str], is_ad: bool):
    """Save metadata for every video the bot watched in this batch.

    Counts how often each video id appears in ``video_list``, marks videos
    already in the database as watched, fetches YouTube Data API metadata
    (in chunks of 50) for the rest, and triggers an ad download for videos
    watched as ads that have not been downloaded yet.

    :param video_list: raw list of video ids, one entry per watch (may repeat)
    :param is_ad: True if this batch was watched as ads, False as organic videos
    """
    self.logger.info("Enter save_video_metadata")
    # Only need to look each video up once; counting repeats here reduces
    # queries to the YouTube data API.
    # all videos + ads watched in this batch, id -> times seen
    viewed_videos: DefaultDict[str, int] = defaultdict(int)
    # videos/ads we have no DB row for yet, id -> times seen
    not_viewed: DefaultDict[str, int] = defaultdict(int)
    for video in video_list:
        # Store video_id and times seen for later.
        # This also deduplicates the batch into a set of unique ids.
        viewed_videos[video] += 1
    self.logger.info("videos_watched", counts=viewed_videos)
    self.logger.info("Starting to check if videos already saved")
    vid_id: str
    times_seen: int
    for vid_id, times_seen in viewed_videos.items():
        # Do we already have the video info?
        try:
            vid: Videos = Videos.objects.get(url=vid_id)
            # Video already in db: fall through and update its watched flag.
        except Videos.DoesNotExist:
            # No row yet — remember the count and look the metadata up later.
            not_viewed[vid_id] = times_seen
            continue
        except Videos.MultipleObjectsReturned:
            # Workaround for multiple entries for the same url. There should
            # only be one! Use the first of the duplicates.
            vids: QuerySet[Videos] = Videos.objects.filter(url=vid_id)
            vid = vids[0]
        # NOTE(review): the comment said "save our new count of times seen",
        # but this assigns True rather than adding times_seen — watched_as_ad
        # appears to be used as a count elsewhere (watched_as_ad__gte=1);
        # confirm whether the per-batch count should be accumulated here.
        if is_ad:
            vid.watched_as_ad = True
        else:
            vid.watched_as_video = True
        vid.save()
    self.logger.info("Finished checking if videos already saved")
    self.logger.info("need to lookup videos", number=len(not_viewed),
                     videos=list(not_viewed.keys()))
    # Benchmark: compare actual API queries made against the maximum possible.
    max_queries = len(not_viewed.keys())
    actual_queries = 0
    self.logger.info("Starting to grab YT metadata for videos not saved")
    # Get and save info on videos we don't have info on yet.
    # Can only get info for 50 videos at a time from the YouTube data API.
    for chunk in chunked(not_viewed.keys(), n=50):
        chunk = list(chunk)
        all_metadata = VideoMetadata(chunk, self.api_key)
        metadata: VideoMetadata
        # Tally how many ids this chunk actually queried.
        actual_queries += len(all_metadata)
        for idx, metadata in enumerate(all_metadata):
            self.logger.info(
                f"idx: {idx}, vid_id: {metadata.id}, available: {metadata.available()}"
            )
            # Create the video entry since it doesn't exist yet.
            if not metadata.available():
                # Video was removed from YouTube: record it as missing.
                vid = Videos.objects.missing(metadata.id)
            else:
                cat = Categories.objects.from_valid_category_and_name(
                    metadata.category_id, metadata.category_name)
                channel = Channels.objects.from_valid_channel_and_name(
                    metadata.channel_id, metadata.channel_title)
                vid, created = Videos.objects.get_or_create(
                    url=metadata.id, category=cat, channel=channel)
                vid.keywords = json.dumps(metadata.keywords)
                vid.description = metadata.description
                vid.title = metadata.title
            # Use youtube video id as key to look up total times seen in batch.
            # NOTE(review): times_viewed is computed but never used below —
            # it probably should feed the watched_as_* fields; confirm intent.
            times_viewed = not_viewed[metadata.id]
            if is_ad:
                vid.watched_as_ad = True
            else:
                vid.watched_as_video = True
            vid.save()
            # Download ads only: skip videos already FOUND or never seen as ads.
            should_download = vid.check_status != CheckStatus.FOUND.value and vid.watched_as_ad >= 1
            if should_download:
                self.logger.info(f"Downloading video: {vid.url}")
                vid_with_adfile = record_download_video(
                    vid.url, self.download_path)
                vid_with_adfile.save()
                # NOTE(review): this assignment happens after vid.save() and is
                # never persisted; it also sets FOUND unconditionally even if
                # the download marked the video missing/private — verify.
                vid.check_status = CheckStatus.FOUND.value
                self.logger.info(
                    f"Downloaded video: {vid_with_adfile.url}, status={vid_with_adfile.check_status}"
                )
    self.logger.info("Finished grabbing YT metadata for videos not saved")
    self.logger.info(
        f"Made {actual_queries} youtube queries. Max should be: {max_queries}"
    )
    self.logger.info("exit save_video_metadata")