Example #1
    def handle(self, *args, **options):
        if settings.CENTRAL_SERVER:
            raise CommandError("videoscan should be run on the distributed server only.")

        caching_enabled = (settings.CACHE_TIME != 0)
        touched_video_ids = []

        # Filesystem
        files = glob.glob(os.path.join(settings.CONTENT_ROOT, "*.mp4"))
        youtube_ids_in_filesystem = set([os.path.splitext(os.path.basename(f))[0] for f in files])

        # Database
        videos_marked_at_all = set([video.youtube_id for video in VideoFile.objects.all()])

        def delete_objects_for_incomplete_videos():
            # delete VideoFile objects that are not marked as in progress, but are neither 0% nor 100% done; they're broken
            video_files_to_delete = VideoFile.objects.filter(download_in_progress=False, percent_complete__gt=0, percent_complete__lt=100)
            deleted_video_ids = [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_to_delete]
            video_files_to_delete.delete()
            if deleted_video_ids:
                self.stdout.write("Deleted %d VideoFile models (to mark them as not downloaded, since they were in a bad state)\n" % len(deleted_video_ids))
            return deleted_video_ids
        touched_video_ids += delete_objects_for_incomplete_videos()


        def add_missing_objects_to_db(youtube_ids_in_filesystem, videos_marked_at_all):
            # Files that exist, but are not in the DB, should be assumed to be good videos,
            #   and just needing to be added to the DB.  Add them to the DB in this way,
            #   so that these files also trigger the update code below (and trigger cache invalidation)
            youtube_ids_needing_model_creation = list(youtube_ids_in_filesystem - videos_marked_at_all)
            new_video_files = []
            if youtube_ids_needing_model_creation:
                for lang_code, youtube_ids in divide_videos_by_language(youtube_ids_needing_model_creation).iteritems():
                    # OK to do bulk_create; cache invalidation triggered via save download
                    lang_video_files = [VideoFile(youtube_id=id, percent_complete=100, download_in_progress=False, language=lang_code) for id in youtube_ids]
                    VideoFile.objects.bulk_create(lang_video_files)
                    new_video_files += lang_video_files
                    caching.invalidate_all_caches()  # Do this within the loop, to update users ASAP
                self.stdout.write("Created %d VideoFile models (and marked them as complete, since the files exist)\n" % len(new_video_files))

            return [i18n.get_video_id(video_file.youtube_id) for video_file in new_video_files]

        touched_video_ids += add_missing_objects_to_db(youtube_ids_in_filesystem, videos_marked_at_all)

        def update_objects_to_be_complete(youtube_ids_in_filesystem):
            # Files that exist, are in the DB, but have percent_complete=0, download_in_progress=False
            updated_video_ids = []
            for chunk in break_into_chunks(youtube_ids_in_filesystem):
                video_files_needing_model_update = VideoFile.objects.filter(percent_complete=0, download_in_progress=False, youtube_id__in=chunk)
                # Collect the ids before calling .update(); the queryset is lazy, so
                #   re-evaluating it after the update would no longer match percent_complete=0.
                updated_video_ids += [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_needing_model_update]
                video_files_needing_model_update.update(percent_complete=100, flagged_for_download=False)

            if updated_video_ids:
                caching.invalidate_all_caches()
                self.stdout.write("Updated %d VideoFile models (to mark them as complete, since the files exist)\n" % len(updated_video_ids))
            return updated_video_ids
        touched_video_ids += update_objects_to_be_complete(youtube_ids_in_filesystem)

        def delete_objects_for_missing_videos(youtube_ids_in_filesystem, videos_marked_at_all):
            # Delete VideoFile objects that claim to be downloaded, but whose videos don't actually exist in the filesystem.
            deleted_video_ids = []
            videos_flagged_for_download = set([video.youtube_id for video in VideoFile.objects.filter(flagged_for_download=True)])
            videos_needing_model_deletion_chunked = break_into_chunks(videos_marked_at_all - youtube_ids_in_filesystem - videos_flagged_for_download)
            for chunk in videos_needing_model_deletion_chunked:
                video_files_needing_model_deletion = VideoFile.objects.filter(youtube_id__in=chunk)
                # Collect the ids before calling .delete(); afterwards the queryset would evaluate to empty.
                deleted_video_ids += [video_file.video_id for video_file in video_files_needing_model_deletion]
                video_files_needing_model_deletion.delete()
            if deleted_video_ids:
                self.stdout.write("Deleted %d VideoFile models (because the videos didn't exist in the filesystem)\n" % len(deleted_video_ids))
            return deleted_video_ids
        touched_video_ids += delete_objects_for_missing_videos(youtube_ids_in_filesystem, videos_marked_at_all)

        if options["auto_cache"] and caching_enabled and touched_video_ids:
            caching.regenerate_all_pages_related_to_videos(video_ids=list(set(touched_video_ids)))
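
Both videoscan examples on this page group newly discovered files by language with divide_videos_by_language() before bulk-creating VideoFile rows. That helper lives elsewhere in the project; the sketch below only illustrates the idea, and get_language_for_video() is an assumed stand-in for the project's real i18n lookup.

from collections import defaultdict

def get_language_for_video(youtube_id):
    # Assumed stand-in: the real project resolves this from its i18n video metadata.
    return "en"

def divide_videos_by_language(youtube_ids):
    """Group youtube ids into a {lang_code: [youtube_id, ...]} mapping."""
    videos_by_lang = defaultdict(list)
    for youtube_id in youtube_ids:
        videos_by_lang[get_language_for_video(youtube_id)].append(youtube_id)
    return dict(videos_by_lang)
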
Example #2
    def handle(self, *args, **options):
        if settings.CENTRAL_SERVER:
            raise CommandError(
                "videoscan should be run on the distributed server only.")

        caching_enabled = (settings.CACHE_TIME != 0)
        touched_video_ids = []

        # Filesystem
        files = glob.glob(os.path.join(settings.CONTENT_ROOT, "*.mp4"))
        youtube_ids_in_filesystem = set(
            [os.path.splitext(os.path.basename(f))[0] for f in files])

        # Database
        videos_marked_at_all = set(
            [video.youtube_id for video in VideoFile.objects.all()])

        def delete_objects_for_incomplete_videos():
            # delete VideoFile objects that are not marked as in progress, but are neither 0% nor 100% done; they're broken
            video_files_to_delete = VideoFile.objects.filter(
                download_in_progress=False,
                percent_complete__gt=0,
                percent_complete__lt=100)
            deleted_video_ids = [
                i18n.get_video_id(video_file.youtube_id)
                for video_file in video_files_to_delete
            ]
            video_files_to_delete.delete()
            if deleted_video_ids:
                self.stdout.write(
                    "Deleted %d VideoFile models (to mark them as not downloaded, since they were in a bad state)\n"
                    % len(deleted_video_ids))
            return deleted_video_ids

        touched_video_ids += delete_objects_for_incomplete_videos()

        def add_missing_objects_to_db(youtube_ids_in_filesystem,
                                      videos_marked_at_all):
            # Files that exist, but are not in the DB, should be assumed to be good videos,
            #   and just needing to be added to the DB.  Add them to the DB in this way,
            #   so that these files also trigger the update code below (and trigger cache invalidation)
            youtube_ids_needing_model_creation = list(
                youtube_ids_in_filesystem - videos_marked_at_all)
            new_video_files = []
            if youtube_ids_needing_model_creation:
                for lang_code, youtube_ids in divide_videos_by_language(
                        youtube_ids_needing_model_creation).iteritems():
                    # OK to do bulk_create; cache invalidation triggered via save download
                    lang_video_files = [
                        VideoFile(youtube_id=id,
                                  percent_complete=100,
                                  download_in_progress=False,
                                  language=lang_code) for id in youtube_ids
                    ]
                    VideoFile.objects.bulk_create(lang_video_files)
                    new_video_files += lang_video_files
                    caching.invalidate_all_caches(
                    )  # Do this within the loop, to update users ASAP
                self.stdout.write(
                    "Created %d VideoFile models (and marked them as complete, since the files exist)\n"
                    % len(new_video_files))

            return [
                i18n.get_video_id(video_file.youtube_id)
                for video_file in new_video_files
            ]

        touched_video_ids += add_missing_objects_to_db(
            youtube_ids_in_filesystem, videos_marked_at_all)

        def update_objects_to_be_complete(youtube_ids_in_filesystem):
            # Files that exist, are in the DB, but have percent_complete=0, download_in_progress=False
            updated_video_ids = []
            for chunk in break_into_chunks(youtube_ids_in_filesystem):
                video_files_needing_model_update = VideoFile.objects.filter(
                    percent_complete=0,
                    download_in_progress=False,
                    youtube_id__in=chunk)
                # Collect the ids before calling .update(); the queryset is lazy, so
                #   re-evaluating it after the update would no longer match percent_complete=0.
                updated_video_ids += [
                    i18n.get_video_id(video_file.youtube_id)
                    for video_file in video_files_needing_model_update
                ]
                video_files_needing_model_update.update(
                    percent_complete=100, flagged_for_download=False)

            if updated_video_ids:
                caching.invalidate_all_caches()
                self.stdout.write(
                    "Updated %d VideoFile models (to mark them as complete, since the files exist)\n"
                    % len(updated_video_ids))
            return updated_video_ids

        touched_video_ids += update_objects_to_be_complete(
            youtube_ids_in_filesystem)

        def delete_objects_for_missing_videos(youtube_ids_in_filesystem,
                                              videos_marked_at_all):
            # Delete VideoFile objects that claim to be downloaded, but whose videos don't actually exist in the filesystem.
            deleted_video_ids = []
            videos_flagged_for_download = set([
                video.youtube_id for video in VideoFile.objects.filter(
                    flagged_for_download=True)
            ])
            videos_needing_model_deletion_chunked = break_into_chunks(
                videos_marked_at_all - youtube_ids_in_filesystem -
                videos_flagged_for_download)
            for chunk in videos_needing_model_deletion_chunked:
                video_files_needing_model_deletion = VideoFile.objects.filter(
                    youtube_id__in=chunk)
                # Collect the ids before calling .delete(); afterwards the queryset would evaluate to empty.
                deleted_video_ids += [
                    video_file.video_id
                    for video_file in video_files_needing_model_deletion
                ]
                video_files_needing_model_deletion.delete()
            if deleted_video_ids:
                self.stdout.write(
                    "Deleted %d VideoFile models (because the videos didn't exist in the filesystem)\n"
                    % len(deleted_video_ids))
            return deleted_video_ids

        touched_video_ids += delete_objects_for_missing_videos(
            youtube_ids_in_filesystem, videos_marked_at_all)

        if options["auto_cache"] and caching_enabled and touched_video_ids:
            caching.regenerate_all_pages_related_to_videos(
                video_ids=list(set(touched_video_ids)))
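
The update and delete passes in both videoscan examples run their youtube_id__in filters over chunks produced by break_into_chunks(), so each query stays within SQL parameter limits. A minimal sketch of such a helper, assuming a default chunk size of 500 (the project's real default may differ):

def break_into_chunks(ids, chunk_size=500):
    # Yield successive lists of at most chunk_size items from ids.
    ids = list(ids)
    for i in range(0, len(ids), chunk_size):
        yield ids[i:i + chunk_size]
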
Example #3
    def handle(self, *args, **options):
        self.video = None

        handled_youtube_ids = []  # stored to deal with caching
        failed_youtube_ids = []  # stored to avoid requerying failures.

        set_process_priority.lowest(logging=settings.LOG)

        try:
            while True:  # loop until the method is aborted
                # Grab any video that hasn't been tried yet
                videos = VideoFile.objects \
                    .filter(flagged_for_download=True, download_in_progress=False) \
                    .exclude(youtube_id__in=failed_youtube_ids)
                video_count = videos.count()
                if video_count == 0:
                    self.stdout.write(
                        _("Nothing to download; exiting.") + "\n")
                    break

                # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
                # Update the video logging
                video = videos[0]
                video.download_in_progress = True
                video.percent_complete = 0
                video.save()
                self.stdout.write(
                    (_("Downloading video '%(youtube_id)s'...") + "\n") %
                    {"youtube_id": video.youtube_id})

                # Update the progress logging
                self.set_stages(
                    num_stages=video_count + len(handled_youtube_ids) +
                    len(failed_youtube_ids) + int(options["auto_cache"]))
                if not self.started():
                    self.start(stage_name=video.youtube_id)

                # Initiate the download process
                try:
                    ensure_dir(settings.CONTENT_ROOT)

                    progress_callback = partial(
                        self.download_progress_callback, video)
                    try:
                        # Download via urllib
                        download_video(video.youtube_id,
                                       callback=progress_callback)

                    except URLNotFound:
                        # Video was not found on amazon cloud service,
                        #   either due to a KA mistake, or due to the fact
                        #   that it's a dubbed video.
                        #
                        # We can use youtube-dl to get that video!!
                        logging.debug(
                            _("Retrieving youtube video %(youtube_id)s via youtube-dl"
                              ) % {"youtube_id": video.youtube_id})

                        def youtube_dl_cb(stats, progress_callback, *args,
                                          **kwargs):
                            if stats['status'] == "finished":
                                percent = 100.
                            elif stats['status'] == "downloading":
                                percent = 100. * stats[
                                    'downloaded_bytes'] / stats['total_bytes']
                            else:
                                percent = 0.
                            progress_callback(percent=percent)

                        scrape_video(video.youtube_id,
                                     quiet=not settings.DEBUG,
                                     callback=partial(
                                         youtube_dl_cb,
                                         progress_callback=progress_callback))

                    # If we got here, we downloaded ... somehow :)
                    handled_youtube_ids.append(video.youtube_id)
                    self.stdout.write(_("Download is complete!") + "\n")

                    # caching.invalidate_all_caches()  # Unnecessary; we have a database listener for this.

                except DownloadCancelled:
                    # Cancellation event
                    video.percent_complete = 0
                    video.flagged_for_download = False
                    video.download_in_progress = False
                    video.save()
                    failed_youtube_ids.append(video.youtube_id)

                except Exception as e:
                    # On error, report the error, mark the video as not downloaded,
                    #   and allow the loop to try other videos.
                    msg = _(
                        "Error in downloading %(youtube_id)s: %(error_msg)s"
                    ) % {
                        "youtube_id": video.youtube_id,
                        "error_msg": unicode(e)
                    }
                    self.stderr.write("%s\n" % msg)

                    # If a connection error, we should retry.
                    if isinstance(e, DownloadError):
                        connection_error = "[Errno 8]" in e.message
                    elif isinstance(e, IOError) and hasattr(e, "strerror"):
                        connection_error = e.strerror[0] == 8
                    else:
                        connection_error = False

                    video.download_in_progress = False
                    video.flagged_for_download = connection_error  # Any error other than a connection error is fatal.
                    video.save()

                    # Rather than getting stuck on one video, continue to the next video.
                    self.update_stage(
                        stage_status="error",
                        notes=_("%(error_msg)s; continuing to next video.") %
                        {"error_msg": msg})
                    failed_youtube_ids.append(video.youtube_id)
                    continue

            # This can take a long time, without any further update, so ... best to avoid.
            if options["auto_cache"] and caching.caching_is_enabled(
            ) and handled_youtube_ids:
                self.update_stage(
                    stage_name=self.video.youtube_id,
                    stage_percent=0,
                    notes=_("Generating all pages related to videos."))
                caching.regenerate_all_pages_related_to_videos(video_ids=list(
                    set([
                        i18n.get_video_id(yid) or yid
                        for yid in handled_youtube_ids
                    ])))

            # Update
            self.complete(
                notes=
                _("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully."
                  ) % {
                      "num_handled_videos":
                      len(handled_youtube_ids),
                      "num_total_videos":
                      len(handled_youtube_ids) + len(failed_youtube_ids),
                  })

        except Exception as e:
            self.cancel(stage_status="error",
                        notes=_("Error: %(error_msg)s") % {"error_msg": e})
            raise
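
The download loop above binds the current video into a progress callback via functools.partial, and the callbacks invoke it as progress_callback(percent=...). The real download_progress_callback is defined on the command class elsewhere; a hypothetical sketch consistent with that calling convention:

    def download_progress_callback(self, video, percent=0, *args, **kwargs):
        # Hypothetical: persist progress on the VideoFile row and mirror it into
        # the command's progress stage (stage_percent assumed to be on a 0..1 scale).
        video.percent_complete = int(percent)
        video.save()
        self.update_stage(stage_name=video.youtube_id, stage_percent=percent / 100.)
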
Example #4
    def handle(self, *args, **options):
        self.video = None

        handled_youtube_ids = []  # stored to deal with caching
        failed_youtube_ids = []  # stored to avoid requerying failures.

        set_process_priority.lowest(logging=settings.LOG)

        try:
            while True:  # loop until the method is aborted
                # Grab any video that hasn't been tried yet
                videos = VideoFile.objects.filter(flagged_for_download=True, download_in_progress=False).exclude(
                    youtube_id__in=failed_youtube_ids
                )
                video_count = videos.count()
                if video_count == 0:
                    self.stdout.write(_("Nothing to download; exiting.") + "\n")
                    break

                # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
                # Update the video logging
                video = videos[0]
                video.download_in_progress = True
                video.percent_complete = 0
                video.save()
                self.stdout.write(
                    (_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.youtube_id}
                )

                # Update the progress logging
                self.set_stages(
                    num_stages=video_count
                    + len(handled_youtube_ids)
                    + len(failed_youtube_ids)
                    + int(options["auto_cache"])
                )
                if not self.started():
                    self.start(stage_name=video.youtube_id)

                # Initiate the download process
                try:
                    ensure_dir(settings.CONTENT_ROOT)

                    progress_callback = partial(self.download_progress_callback, video)
                    try:
                        # Download via urllib
                        download_video(video.youtube_id, callback=progress_callback)

                    except URLNotFound:
                        # Video was not found on amazon cloud service,
                        #   either due to a KA mistake, or due to the fact
                        #   that it's a dubbed video.
                        #
                        # We can use youtube-dl to get that video!!
                        logging.debug(
                            _("Retrieving youtube video %(youtube_id)s via youtube-dl")
                            % {"youtube_id": video.youtube_id}
                        )

                        def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
                            if stats["status"] == "finished":
                                percent = 100.0
                            elif stats["status"] == "downloading":
                                percent = 100.0 * stats["downloaded_bytes"] / stats["total_bytes"]
                            else:
                                percent = 0.0
                            progress_callback(percent=percent)

                        scrape_video(
                            video.youtube_id,
                            quiet=not settings.DEBUG,
                            callback=partial(youtube_dl_cb, progress_callback=progress_callback),
                        )

                    # If we got here, we downloaded ... somehow :)
                    handled_youtube_ids.append(video.youtube_id)
                    self.stdout.write(_("Download is complete!") + "\n")

                    # caching.invalidate_all_caches()  # Unnecessary; we have a database listener for this.

                except DownloadCancelled:
                    # Cancellation event
                    video.percent_complete = 0
                    video.flagged_for_download = False
                    video.download_in_progress = False
                    video.save()
                    failed_youtube_ids.append(video.youtube_id)

                except Exception as e:
                    # On error, report the error, mark the video as not downloaded,
                    #   and allow the loop to try other videos.
                    msg = _("Error in downloading %(youtube_id)s: %(error_msg)s") % {
                        "youtube_id": video.youtube_id,
                        "error_msg": unicode(e),
                    }
                    self.stderr.write("%s\n" % msg)

                    # If a connection error, we should retry.
                    if isinstance(e, DownloadError):
                        connection_error = "[Errno 8]" in e.message
                    elif isinstance(e, IOError) and hasattr(e, "strerror"):
                        connection_error = e.strerror[0] == 8
                    else:
                        connection_error = False

                    video.download_in_progress = False
                    video.flagged_for_download = connection_error  # Any error other than a connection error is fatal.
                    video.save()

                    # Rather than getting stuck on one video, continue to the next video.
                    self.update_stage(
                        stage_status="error", notes=_("%(error_msg)s; continuing to next video.") % {"error_msg": msg}
                    )
                    failed_youtube_ids.append(video.youtube_id)
                    continue

            # This can take a long time, without any further update, so ... best to avoid.
            if options["auto_cache"] and caching.caching_is_enabled() and handled_youtube_ids:
                self.update_stage(
                    stage_name=self.video.youtube_id,
                    stage_percent=0,
                    notes=_("Generating all pages related to videos."),
                )
                caching.regenerate_all_pages_related_to_videos(
                    video_ids=list(set([i18n.get_video_id(yid) or yid for yid in handled_youtube_ids]))
                )

            # Update
            self.complete(
                notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.")
                % {
                    "num_handled_videos": len(handled_youtube_ids),
                    "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
                }
            )

        except Exception as e:
            self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
            raise
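
For reference, the stats dict handled by youtube_dl_cb in the two download examples has the shape youtube-dl passes to its progress_hooks ("status", "downloaded_bytes", "total_bytes"). The examples call scrape_video(), which presumably wraps something like the snippet below; the options and URL are illustrative only.

import youtube_dl

def progress_hook(stats):
    if stats["status"] == "downloading" and stats.get("total_bytes"):
        print("%5.1f%%" % (100.0 * stats["downloaded_bytes"] / stats["total_bytes"]))
    elif stats["status"] == "finished":
        print("download finished")

ydl_opts = {"progress_hooks": [progress_hook], "quiet": True}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(["https://www.youtube.com/watch?v=VIDEO_ID"])  # illustrative URL only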