def update_channel(channel=None, link: str = None):
    """
    Connect to the Channel's host website and pull a catalog of all videos.  Insert any
    new videos into the DB.  It is expected that any missing videos will be downloaded later.
    """
    with get_db_context() as (engine, session):
        if not channel:
            channel = session.query(Channel).filter_by(link=link).one()

        logger.info(f'Downloading video list for {channel.name} at {channel.url}.  '
                    'This may take several minutes.')
        info = YDL.extract_info(channel.url, download=False, process=False)
        if 'url' in info:
            url = info['url']
            info = YDL.extract_info(url, download=False, process=False)

    # Resolve all entries to dictionaries.
    entries = info['entries'] = list(info['entries'])

    # Youtube-DL may hand back a list of URLs; let's use the "Uploads" URL, if available.
    try:
        entries[0]['id']
    except Exception:
        for entry in entries:
            if entry['title'] == 'Uploads':
                logger.info('Youtube-DL gave back a list of URLs, found the "Uploads" URL and using it.')
                info = YDL.extract_info(entry['url'], download=False, process=False)
                entries = info['entries'] = list(info['entries'])
                break

    # These are all the source ids that are currently available.
    try:
        all_source_ids = {i['id'] for i in entries}
    except KeyError as e:
        logger.warning('No ids for entries!  Was the channel update successful?  Is the channel URL correct?')
        logger.warning(f'entries: {entries}')
        raise KeyError('No id key for entry!') from e

    with get_db_context(commit=True) as (engine, session):
        # Get the channel in this new context.
        channel = session.query(Channel).filter_by(id=channel.id).one()

        download_frequency = channel.download_frequency
        channel.info_json = info
        channel.info_date = datetime.now()
        channel.next_download = today() + timedelta(seconds=download_frequency)

        with get_db_curs() as curs:
            # Insert any new videos.
            query = 'SELECT source_id FROM video WHERE channel_id=%s AND source_id IS NOT NULL'
            curs.execute(query, (channel.id,))
            known_source_ids = {i[0] for i in curs.fetchall()}

        new_source_ids = all_source_ids.difference(known_source_ids)

        logger.info(f'Got {len(new_source_ids)} new videos for channel {channel.name}')
        channel_id = channel.id
        for source_id in new_source_ids:
            session.add(Video(source_id=source_id, channel_id=channel_id))
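# update_channel() above relies on a module-level YDL object.  A minimal sketch of how
# it might be constructed, assuming the youtube-dl library; the options shown are an
# assumption, the code above depends only on extract_info(url, download=False,
# process=False):
from youtube_dl import YoutubeDL

YDL = YoutubeDL({
    'quiet': True,     # suppress youtube-dl's own console output
    'logger': logger,  # route youtube-dl messages through this module's logger
})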
def refresh_channel_calculate_size() -> bool:
    with get_db_curs() as curs:
        query = 'SELECT id FROM video WHERE video_path IS NOT NULL AND size IS NULL'
        curs.execute(query)
        missing_size = [i for (i, ) in curs.fetchall()]

        if missing_size:
            coro = get_bulk_video_size(missing_size)
            asyncio.ensure_future(coro)
            logger.info('Scheduled get_bulk_video_size()')
            return True
        else:
            logger.info('No videos missing size.')
            return False
def refresh_channel_generate_posters() -> bool:
    with get_db_curs() as curs:
        query = 'SELECT id FROM video WHERE video_path IS NOT NULL AND poster_path IS NULL'
        curs.execute(query)
        missing_posters = [i for (i, ) in curs.fetchall()]

        if missing_posters:
            coro = generate_bulk_posters(missing_posters)
            asyncio.ensure_future(coro)
            logger.info('Scheduled generate_bulk_posters()')
            return True
        else:
            logger.info('No missing posters to generate.')
            return False
def refresh_channel_video_captions() -> bool:
    with get_db_curs() as curs:
        query = 'SELECT id FROM video WHERE caption IS NULL AND caption_path IS NOT NULL'
        curs.execute(query)
        missing_captions = [i for (i, ) in curs.fetchall()]

        if missing_captions:
            coro = insert_bulk_captions(missing_captions)
            asyncio.ensure_future(coro)
            logger.info('Scheduled insert_bulk_captions()')
            return True
        else:
            logger.info('No missing captions to process.')
            return False
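# The three refresh_channel_* helpers above share one shape: select the ids of videos
# missing a column, then schedule a bulk coroutine to fill it in.  A hypothetical
# factoring of that pattern (_schedule_bulk_fill and its signature are assumptions,
# not project API):
def _schedule_bulk_fill(query: str, coro_func, what: str) -> bool:
    with get_db_curs() as curs:
        curs.execute(query)
        missing = [i for (i, ) in curs.fetchall()]

    if missing:
        asyncio.ensure_future(coro_func(missing))
        logger.info(f'Scheduled {coro_func.__name__}()')
        return True
    logger.info(f'No missing {what} to process.')
    return False

# e.g. refresh_channel_video_captions() could then become:
#     return _schedule_bulk_fill(
#         'SELECT id FROM video WHERE caption IS NULL AND caption_path IS NOT NULL',
#         insert_bulk_captions, 'captions')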
def test_process_captions(self):
    with get_db_context(commit=True) as (engine, session):
        video1 = Video(title='scream', caption_path=str(self.vtt_path1))
        session.add(video1)
        with mock.patch('api.videos.captions.get_absolute_video_caption', lambda *a: self.vtt_path1):
            captions.process_captions(video1)
        video2 = Video(title='bar', caption_path=str(self.vtt_path2))
        session.add(video2)
        with mock.patch('api.videos.captions.get_absolute_video_caption', lambda *a: self.vtt_path2):
            captions.process_captions(video2)

        session.flush()
        session.refresh(video1)
        session.refresh(video2)

        # Get the video from the DB
        video1 = session.query(Video).filter_by(id=video1.id).one()
        self.assertIsNotNone(video1.caption)
        video2 = session.query(Video).filter_by(id=video2.id).one()
        self.assertIsNotNone(video2.caption)

    with get_db_curs() as curs:
        def select_textsearch(*args):
            curs.execute('SELECT id FROM video WHERE textsearch @@ to_tsquery(%s) ORDER BY id', args)

        # Search using the tsvector; "sessions" never actually appears in the text, but "session" does
        select_textsearch('sessions')
        self.assertEqual(curs.fetchall(), [[1, ]])
        # Matches video1.title and video2.caption
        select_textsearch('scream')
        self.assertEqual(curs.fetchall(), [[1, ], [2, ]])
        # Matches video1.title and video2.caption
        select_textsearch('scream | sessions')
        self.assertEqual(curs.fetchall(), [[1, ], [2, ]])
        # Only matches video1.title
        select_textsearch('scream & sessions')
        self.assertEqual(curs.fetchall(), [[1, ]])
        # Matches neither
        select_textsearch('scream & sess')
        self.assertEqual(curs.fetchall(), [])
        # Matches video2.caption
        select_textsearch('yawn | sess')
        self.assertEqual(curs.fetchall(), [[2, ]])
        # Matches video2.caption
        select_textsearch('yawn')
        self.assertEqual(curs.fetchall(), [[2, ]])
        # Matches video2.title
        select_textsearch('bar')
        self.assertEqual(curs.fetchall(), [[2, ]])
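# A note on the stemming the test above relies on: to_tsvector and to_tsquery both
# normalize words to lexemes (with Postgres's default 'english' configuration), which
# is why 'sessions' matches text that only contains 'session'.  Illustration in psql:
#
#     SELECT to_tsvector('english', 'session') @@ to_tsquery('english', 'sessions');
#     -- t
#
# 'sess' matches nothing because stemming is not prefix matching; the prefix form of
# a tsquery term would be 'sess:*'.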
async def get_minimal_channels() -> List[dict]:
    """
    Get the minimum amount of information necessary about all channels.
    """
    with get_db_curs() as curs:
        # Get all channels, even if they don't have videos.
        query = '''
            SELECT
                c.id, name, link, directory, url, download_frequency
            FROM
                channel AS c
            ORDER BY LOWER(name)
        '''
        curs.execute(query)
        channels = list(map(dict, curs.fetchall()))

        # Add video counts to all channels
        query = '''
            SELECT
                c.id, COUNT(v.id) AS video_count
            FROM
                channel AS c
                LEFT JOIN video AS v ON v.channel_id = c.id
            WHERE
                v.video_path IS NOT NULL
            GROUP BY 1
        '''
        curs.execute(query)
        video_counts = {i['id']: i['video_count'] for i in curs.fetchall()}

    for channel in channels:
        channel_id = channel['id']
        try:
            channel['video_count'] = video_counts[channel_id]
        except KeyError:
            # No videos for this channel
            channel['video_count'] = 0

    return channels
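# The two queries in get_minimal_channels() could be collapsed into one.  The WHERE
# clause in the second query is what forces the KeyError fallback above: it turns the
# LEFT JOIN into an effective inner join, so channels without videos drop out.  A
# filtered aggregate avoids that; a sketch, not the project's current query:
#
#     SELECT
#         c.id, c.name, c.link, c.directory, c.url, c.download_frequency,
#         COUNT(v.id) FILTER (WHERE v.video_path IS NOT NULL) AS video_count
#     FROM channel AS c
#     LEFT JOIN video AS v ON v.channel_id = c.id
#     GROUP BY c.id
#     ORDER BY LOWER(c.name)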
def convert_invalid_posters() -> bool:
    """
    Searches the DB for all videos with an invalid poster type (e.g. webp) and converts them to JPEGs.
    A video with a valid poster will be marked as such in its column "validated_poster".
    """
    with get_db_curs() as curs:
        query = "SELECT id FROM video WHERE poster_path IS NOT NULL AND validated_poster = FALSE"
        curs.execute(query)
        invalid_posters = [i for (i, ) in curs.fetchall()]

        if invalid_posters:
            # Wrap the call so it can be scheduled on the event loop.
            async def _():
                return bulk_validate_posters(invalid_posters)

            coro = _()
            asyncio.ensure_future(coro)
            logger.info('Scheduled bulk_validate_posters()')
            return True
        else:
            logger.info('No invalid posters to replace.')
            return False
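# bulk_validate_posters() is defined elsewhere; the per-file conversion it performs
# might look like the sketch below, assuming Pillow is installed (convert_image and
# its signature are illustrative, not the project's API):
import pathlib
from PIL import Image

def convert_image(existing_path: pathlib.Path, destination: pathlib.Path):
    """Convert a poster (e.g. webp) to a JPEG at `destination`, removing the original."""
    # convert('RGB') drops any alpha channel, which JPEG cannot store; the output
    # format is inferred from the destination suffix.
    Image.open(existing_path).convert('RGB').save(destination)
    existing_path.unlink()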
async def get_statistics():
    with get_db_curs() as curs:
        curs.execute('''
            SELECT
                -- total videos
                COUNT(id) AS "videos",
                -- total videos that are marked as favorite
                COUNT(id) FILTER (WHERE favorite IS NOT NULL) AS "favorites",
                -- total videos downloaded over the past week/month/year
                COUNT(id) FILTER (WHERE upload_date >= current_date - interval '1 week') AS "week",
                COUNT(id) FILTER (WHERE upload_date >= current_date - interval '1 month') AS "month",
                COUNT(id) FILTER (WHERE upload_date >= current_date - interval '1 year') AS "year",
                -- sum of all video lengths in seconds
                COALESCE(SUM(duration), 0) AS "sum_duration",
                -- sum of all video file sizes
                COALESCE(SUM(size), 0)::BIGINT AS "sum_size",
                -- largest video
                COALESCE(MAX(size), 0) AS "max_size"
            FROM
                video
            WHERE
                video_path IS NOT NULL
        ''')
        video_stats = dict(curs.fetchone())

        # Get the total videos downloaded every month for the past two years.
        curs.execute('''
            SELECT
                DATE_TRUNC('month', months.a),
                COUNT(id)::BIGINT,
                SUM(size)::BIGINT AS "size"
            FROM
                generate_series(
                    date_trunc('month', current_date) - interval '2 years',
                    date_trunc('month', current_date) - interval '1 month',
                    '1 month'::interval) AS months(a),
                video
            WHERE
                video.upload_date >= date_trunc('month', months.a)
                AND video.upload_date < date_trunc('month', months.a) + interval '1 month'
                AND video.upload_date IS NOT NULL
                AND video.video_path IS NOT NULL
            GROUP BY 1
            ORDER BY 1
        ''')
        monthly_videos = [dict(i) for i in curs.fetchall()]
        historical_stats = dict(monthly_videos=monthly_videos)
        historical_stats['average_count'] = (sum(i['count'] for i in monthly_videos) // len(monthly_videos)) \
            if monthly_videos else 0
        historical_stats['average_size'] = (sum(i['size'] for i in monthly_videos) // len(monthly_videos)) \
            if monthly_videos else 0

        curs.execute('''
            SELECT
                COUNT(id) AS "channels"
            FROM
                channel
        ''')
        channel_stats = dict(curs.fetchone())

    ret = dict(statistics=dict(
        videos=video_stats,
        channels=channel_stats,
        historical=historical_stats,
    ))
    return ret
def refresh_channel_videos(channel: Channel, reporter: ProgressReporter):
    """
    Find all video files in a channel's directory.  Add any videos not in the DB to the DB.
    """
    # This function is hard to predict, so we will simply progress in chunks :(
    reporter.set_progress_total(1, 6)
    reporter.send_progress(1, 0, 'Preparing channel.')

    # Set the idempotency key so we can remove any videos not touched during this search
    with get_db_curs(commit=True) as curs:
        curs.execute('UPDATE video SET idempotency=NULL WHERE channel_id=%s', (channel.id, ))

    reporter.send_progress(1, 1, 'Finding all videos, checking for duplicates.')

    idempotency = str(uuid1())
    directory = get_absolute_media_path(channel.directory)

    # A set of absolute paths that exist in the file system
    possible_new_paths = generate_video_paths(directory)
    possible_new_paths = remove_duplicate_video_paths(possible_new_paths)

    reporter.send_progress(1, 2, 'Matching all videos to the database.')

    # Update all videos that match the current video paths
    relative_new_paths = [str(i.relative_to(directory)) for i in possible_new_paths]
    with get_db_curs(commit=True) as curs:
        query = 'UPDATE video SET idempotency = %s WHERE channel_id = %s AND video_path = ANY(%s) RETURNING video_path'
        curs.execute(query, (idempotency, channel.id, relative_new_paths))
        existing_paths = {i for (i, ) in curs.fetchall()}

    reporter.send_progress(1, 3)

    # Get the paths for any video not yet in the DB
    # (paths in DB are relative, but we need to pass an absolute path)
    new_videos = {p for p in possible_new_paths if str(p.relative_to(directory)) not in existing_paths}

    reporter.send_progress(1, 4, f'Inserting {len(new_videos)} new videos.')

    for video_path in new_videos:
        with get_db_context(commit=True) as (engine, session):
            upsert_video(session, pathlib.Path(video_path), channel, idempotency=idempotency)
        logger.debug(f'{channel.name}: Added {video_path}')

    reporter.send_progress(1, 5, 'Deleting unnecessary video entries.')

    with get_db_curs(commit=True) as curs:
        curs.execute('DELETE FROM video WHERE channel_id=%s AND idempotency IS NULL RETURNING id', (channel.id, ))
        deleted_count = len(curs.fetchall())

    if deleted_count:
        deleted_status = f'Deleted {deleted_count} video records from channel {channel.name}'
        logger.info(deleted_status)

    logger.info(f'{channel.name}: {len(new_videos)} new videos, {len(existing_paths)} already existed.')

    reporter.send_progress(1, 6, f'Processed all videos for {channel.name}')
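# refresh_channel_videos() leans on two path helpers defined elsewhere.  A minimal
# sketch of what they might look like; the names come from the calls above, but the
# bodies, the extension set, and the duplicate-resolution rule are all assumptions:
import pathlib

VIDEO_EXTENSIONS = ('mp4', 'webm', 'flv')  # assumed; the real set lives elsewhere

def generate_video_paths(directory: pathlib.Path):
    """Yield the absolute path of every video file under `directory`."""
    for ext in VIDEO_EXTENSIONS:
        yield from directory.glob(f'**/*.{ext}')

def remove_duplicate_video_paths(paths) -> list:
    """When one video exists with several extensions, keep only the path whose
    extension appears earliest in VIDEO_EXTENSIONS."""
    def rank(path: pathlib.Path) -> int:
        return VIDEO_EXTENSIONS.index(path.suffix.lstrip('.'))

    best = {}
    for path in paths:
        stem = path.with_suffix('')
        if stem not in best or rank(path) < rank(best[stem]):
            best[stem] = path
    return sorted(best.values())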