Beispiel #1
0
def update_channel(channel=None, link: str = None):
    """
    Connect to the Channel's host website and pull a catalog of all videos.  Insert any new videos into the DB.

    It is expected that any missing videos will be downloaded later.

    :param channel: The Channel to update.  When falsy, the channel is looked up in the DB using `link`.
    :param link: The link of the channel to update; only used when `channel` is not provided.
    :raises KeyError: When the entries handed back for the channel have no "id" key.
    """
    with get_db_context() as (engine, session):
        if not channel:
            channel = session.query(Channel).filter_by(link=link).one()

    logger.info(f'Downloading video list for {channel.name} at {channel.url}  This may take several minutes.')
    info = YDL.extract_info(channel.url, download=False, process=False)
    if 'url' in info:
        # The response points at another URL, fetch the info from there instead.
        url = info['url']
        info = YDL.extract_info(url, download=False, process=False)

    # Materialize the entries so they can be indexed and iterated more than once.
    entries = info['entries'] = list(info['entries'])

    # Youtube-DL may hand back a list of URLs, lets use the "Uploads" URL, if available.
    try:
        # Probe only: an empty list or a first entry without an id means these are not video entries.
        entries[0]['id']
    except (IndexError, KeyError):
        for entry in entries:
            # An entry without a title can't be the "Uploads" list; skip it rather than fail here, the id check
            # below reports the problem with a clearer message.
            if entry.get('title') == 'Uploads':
                logger.info('Youtube-DL gave back a list of URLs, found the "Uploads" URL and using it.')
                info = YDL.extract_info(entry['url'], download=False, process=False)
                entries = info['entries'] = list(info['entries'])
                break

    # This is all the source id's that are currently available.
    try:
        all_source_ids = {i['id'] for i in entries}
    except KeyError as e:
        logger.warning('No ids for entries!  Was the channel update successful?  Is the channel URL correct?')
        logger.warning(f'entries: {entries}')
        raise KeyError('No id key for entry!') from e

    with get_db_context(commit=True) as (engine, session):
        # Get the channel in this new context.
        channel = session.query(Channel).filter_by(id=channel.id).one()

        download_frequency = channel.download_frequency

        channel.info_json = info
        channel.info_date = datetime.now()
        channel.next_download = today() + timedelta(seconds=download_frequency)

        with get_db_curs() as curs:
            # Fetch the source ids this channel already has in the DB.
            query = 'SELECT source_id FROM video WHERE channel_id=%s AND source_id IS NOT NULL'
            curs.execute(query, (channel.id,))
            known_source_ids = {i[0] for i in curs.fetchall()}

        # Insert any new videos.
        new_source_ids = all_source_ids.difference(known_source_ids)

        logger.info(f'Got {len(new_source_ids)} new videos for channel {channel.name}')
        channel_id = channel.id
        for source_id in new_source_ids:
            session.add(Video(source_id=source_id, channel_id=channel_id))
Beispiel #2
0
def refresh_channel_calculate_size() -> bool:
    """
    Schedule the size calculation of every video which has a video file, but no size.

    :return: True if get_bulk_video_size() was scheduled, False if no video is missing its size.
    """
    with get_db_curs() as curs:
        curs.execute('SELECT id FROM video WHERE video_path IS NOT NULL AND size IS NULL')
        video_ids = [video_id for (video_id, ) in curs.fetchall()]

    if not video_ids:
        logger.info('No videos missing size.')
        return False

    # The coroutine is only scheduled here, it is not awaited.
    asyncio.ensure_future(get_bulk_video_size(video_ids))
    logger.info('Scheduled get_bulk_video_size()')
    return True
Beispiel #3
0
def refresh_channel_generate_posters() -> bool:
    """
    Schedule the poster generation of every video which has a video file, but no poster.

    :return: True if generate_bulk_posters() was scheduled, False if no video is missing its poster.
    """
    with get_db_curs() as curs:
        curs.execute('SELECT id FROM video WHERE video_path IS NOT NULL AND poster_path IS NULL')
        video_ids = [video_id for (video_id, ) in curs.fetchall()]

    if not video_ids:
        logger.info('No missing posters to generate.')
        return False

    # The coroutine is only scheduled here, it is not awaited.
    asyncio.ensure_future(generate_bulk_posters(video_ids))
    logger.info('Scheduled generate_bulk_posters()')
    return True
Beispiel #4
0
def refresh_channel_video_captions() -> bool:
    """
    Schedule the caption insertion of every video which has a caption file, but no caption in the DB.

    :return: True if insert_bulk_captions() was scheduled, False if no video is missing its caption.
    """
    with get_db_curs() as curs:
        curs.execute('SELECT id FROM video WHERE caption IS NULL AND caption_path IS NOT NULL')
        video_ids = [video_id for (video_id, ) in curs.fetchall()]

    if not video_ids:
        logger.info('No missing captions to process.')
        return False

    # The coroutine is only scheduled here, it is not awaited.
    asyncio.ensure_future(insert_bulk_captions(video_ids))
    logger.info('Scheduled insert_bulk_captions()')
    return True
    def test_process_captions(self):
        """
        Processing the caption files of two videos stores a caption on each video, and both videos can then be found
        through the `textsearch` column.
        """
        patch_target = 'api.videos.captions.get_absolute_video_caption'

        with get_db_context(commit=True) as (engine, session):
            first = Video(title='scream', caption_path=str(self.vtt_path1))
            session.add(first)
            with mock.patch(patch_target, lambda *a: self.vtt_path1):
                captions.process_captions(first)

            second = Video(title='bar', caption_path=str(self.vtt_path2))
            session.add(second)
            with mock.patch(patch_target, lambda *a: self.vtt_path2):
                captions.process_captions(second)

            session.flush()
            session.refresh(first)
            session.refresh(second)

            # Read both videos back from the DB, each must have its caption.
            first = session.query(Video).filter_by(id=first.id).one()
            self.assertIsNotNone(first.caption)
            second = session.query(Video).filter_by(id=second.id).one()
            self.assertIsNotNone(second.caption)

        # Each tsquery is paired with the video ids it is expected to find, in id order.
        expected_results = (
            # "sessions" never actually appears in the text, but "session" does
            ('sessions', [[1, ]]),
            # Matches video1.title and video2.caption
            ('scream', [[1, ], [2, ]]),
            # Either term is enough, so both videos match
            ('scream | sessions', [[1, ], [2, ]]),
            # Only video1 satisfies both terms
            ('scream & sessions', [[1, ]]),
            # Matches neither
            ('scream & sess', []),
            # Matches video2.caption
            ('yawn | sess', [[2, ]]),
            # Matches video2.caption
            ('yawn', [[2, ]]),
            # Matches video2.title
            ('bar', [[2, ]]),
        )

        # Search using the tsvector
        with get_db_curs() as curs:
            for tsquery, expected_ids in expected_results:
                curs.execute('SELECT id FROM video WHERE textsearch @@ to_tsquery(%s) ORDER BY id', (tsquery, ))
                self.assertEqual(curs.fetchall(), expected_ids)
Beispiel #6
0
async def get_minimal_channels() -> List[dict]:
    """
    Fetch a small set of columns for every channel, ordered by lowercased name.  Each channel also gets a
    "video_count": the number of its videos which have a video path.
    """
    with get_db_curs() as curs:
        # Get all channels, even if they don't have videos.
        curs.execute('''
            SELECT
                c.id, name, link, directory, url, download_frequency
            FROM
                channel AS c
            ORDER BY LOWER(name)
        ''')
        channels = [dict(row) for row in curs.fetchall()]

        # Count the videos of each channel.  Channels without any matching video are absent from this result.
        curs.execute('''
            SELECT
                c.id, COUNT(v.id) AS video_count
            FROM
                channel AS c
                LEFT JOIN video AS v ON v.channel_id = c.id
            WHERE
                v.video_path IS NOT NULL
            GROUP BY 1
        ''')
        counts_by_channel = {row['id']: row['video_count'] for row in curs.fetchall()}

        for channel in channels:
            # A channel missing from the counts has no videos.
            channel['video_count'] = counts_by_channel.get(channel['id'], 0)

    return channels
Beispiel #7
0
def convert_invalid_posters() -> bool:
    """
    Searches the DB for all videos with an invalid poster type (i.e. webp) and converts them to JPEGs.  A video with a
    valid poster will be marked as such in its column "validated_poster".

    :return: True if the validation was scheduled, False if no video has an unvalidated poster.
    """
    with get_db_curs() as curs:
        query = "SELECT id FROM video WHERE poster_path IS NOT NULL AND validated_poster = FALSE"
        curs.execute(query)
        invalid_posters = [i for (i, ) in curs.fetchall()]

    if not invalid_posters:
        logger.info('No invalid posters to replace.')
        return False

    async def _():
        # NOTE(review): bulk_validate_posters() is called without await, so it is presumably a regular (blocking)
        # function wrapped in a coroutine only so that it can be scheduled -- confirm.
        return bulk_validate_posters(invalid_posters)

    asyncio.ensure_future(_())
    # Name the function that is actually scheduled.
    logger.info('Scheduled bulk_validate_posters()')
    return True
Beispiel #8
0
async def get_statistics():
    """
    Gather statistics about the videos which have a video file, and about the channels.

    :return: A dict with a single "statistics" key, which contains:
        "videos": totals (counts, summed duration, summed/largest size) from the video table,
        "channels": the total count of channels,
        "historical": the per-month video count and size for the past two years, as well as their averages
            ("average_count" and "average_size") over the months that had videos.
    """
    with get_db_curs() as curs:
        curs.execute('''
        SELECT
            -- total videos
            COUNT(id) AS "videos",
            -- total videos that are marked as favorite
            COUNT(id) FILTER (WHERE favorite IS NOT NULL) AS "favorites",
            -- total videos downloaded over the past week/month/year
            COUNT(id) FILTER (WHERE upload_date >= current_date - interval '1 week') AS "week",
            COUNT(id) FILTER (WHERE upload_date >= current_date - interval '1 month') AS "month",
            COUNT(id) FILTER (WHERE upload_date >= current_date - interval '1 year') AS "year",
            -- sum of all video lengths in seconds
            COALESCE(SUM(duration), 0) AS "sum_duration",
            -- sum of all video file sizes
            COALESCE(SUM(size), 0)::BIGINT AS "sum_size",
            -- largest video
            COALESCE(MAX(size), 0) AS "max_size"
        FROM
            video
        WHERE
            video_path IS NOT NULL
        ''')
        video_stats = dict(curs.fetchone())

        # Get the total videos downloaded every month for the past two years.
        curs.execute('''
        SELECT
            DATE_TRUNC('month', months.a),
            COUNT(id)::BIGINT,
            SUM(size)::BIGINT AS "size"
        FROM
            generate_series(
                date_trunc('month', current_date) - interval '2 years',
                date_trunc('month', current_date) - interval '1 month',
                '1 month'::interval) AS months(a),
            video
        WHERE
            video.upload_date >= date_trunc('month', months.a)
            AND video.upload_date < date_trunc('month', months.a) + interval '1 month'
            AND video.upload_date IS NOT NULL
            AND video.video_path IS NOT NULL
        GROUP BY
            1
        ORDER BY
            1
        ''')
        monthly_videos = [dict(i) for i in curs.fetchall()]

        historical_stats = dict(monthly_videos=monthly_videos)
        if monthly_videos:
            month_count = len(monthly_videos)
            historical_stats['average_count'] = sum(i['count'] for i in monthly_videos) // month_count
            # SUM(size) is NULL (None here) for a month in which no video has a size yet.  Count such a month as 0
            # rather than failing on `int + None`.
            historical_stats['average_size'] = sum(i['size'] or 0 for i in monthly_videos) // month_count
        else:
            # No videos in the past two years, avoid dividing by zero.
            historical_stats['average_count'] = 0
            historical_stats['average_size'] = 0

        curs.execute('''
        SELECT
            COUNT(id) AS "channels"
        FROM
            channel
        ''')
        channel_stats = dict(curs.fetchone())
    ret = dict(statistics=dict(
        videos=video_stats,
        channels=channel_stats,
        historical=historical_stats,
    ))
    return ret
Beispiel #9
0
def refresh_channel_videos(channel: Channel, reporter: ProgressReporter):
    """
    Synchronize the video records of a channel with the video files in the channel's directory.

    Files which have no record yet are inserted.  Records of this channel which were not matched to a file during
    this run are deleted.  Progress is sent to `reporter` in six steps.
    """
    # The amount of work can't be known beforehand, so progress is reported in coarse steps.
    reporter.set_progress_total(1, 6)
    reporter.send_progress(1, 0, 'Preparing channel.')

    # Clear the idempotency key of all of the channel's videos.  Any record whose key is still NULL at the end of
    # this function was not touched by this run, and is deleted.
    with get_db_curs(commit=True) as curs:
        curs.execute('UPDATE video SET idempotency=NULL WHERE channel_id=%s', (channel.id, ))

    reporter.send_progress(1, 1, 'Finding all videos, checking for duplicates.')

    run_key = str(uuid1())
    channel_directory = get_absolute_media_path(channel.directory)

    # The absolute paths of the video files found in the channel's directory, without duplicates.
    found_paths = remove_duplicate_video_paths(generate_video_paths(channel_directory))

    reporter.send_progress(1, 2, 'Matching all videos to the database.')

    # Mark every record which matches one of the found files.  (paths in the DB are relative to the directory)
    relative_paths = [str(path.relative_to(channel_directory)) for path in found_paths]
    mark_query = 'UPDATE video SET idempotency = %s WHERE channel_id = %s AND video_path = ANY(%s) RETURNING video_path'
    with get_db_curs(commit=True) as curs:
        curs.execute(mark_query, (run_key, channel.id, relative_paths))
        existing_paths = {video_path for (video_path, ) in curs.fetchall()}

    reporter.send_progress(1, 3)

    # Any found file which did not match a record is new.  The absolute path is kept, because that is what is
    # passed on when inserting.
    new_videos = set()
    for path in found_paths:
        if str(path.relative_to(channel_directory)) not in existing_paths:
            new_videos.add(path)

    reporter.send_progress(1, 4, f'Inserting {len(new_videos)} new videos.')

    # Each new video is inserted (and committed) in its own DB context.
    for video_path in new_videos:
        with get_db_context(commit=True) as (engine, session):
            upsert_video(session, pathlib.Path(video_path), channel, idempotency=run_key)
            logger.debug(f'{channel.name}: Added {video_path}')

    reporter.send_progress(1, 5, 'Deleting unnecessary video entries.')

    delete_query = 'DELETE FROM video WHERE channel_id=%s AND idempotency IS NULL RETURNING id'
    with get_db_curs(commit=True) as curs:
        curs.execute(delete_query, (channel.id, ))
        deleted_count = len(curs.fetchall())

    if deleted_count:
        logger.info(f'Deleted {deleted_count} video records from channel {channel.name}')

    logger.info(f'{channel.name}: {len(new_videos)} new videos, {len(existing_paths)} already existed. ')

    reporter.send_progress(1, 6, f'Processed all videos for {channel.name}')