def save(self, update_userlog=False, *args, **kwargs):
    # To deal with backwards compatibility,
    # check video_id, whether imported or not.
    if not self.video_id:
        assert kwargs.get("imported", False), "video_id better be set by internal code."
        assert self.youtube_id, "If not video_id, you better have set youtube_id!"
        self.video_id = i18n.get_video_id(self.youtube_id) or self.youtube_id  # for unknown videos, default to the youtube_id

    if not kwargs.get("imported", False):
        self.full_clean()

        # Compute learner status
        already_complete = self.complete
        self.complete = self.points >= VideoLog.POINTS_PER_VIDEO
        if not already_complete and self.complete:
            self.completion_timestamp = datetime.now()

        # Tell logins that they are still active (ignoring validation failures).
        # TODO(bcipolli): Could log video information in the future.
        if update_userlog:
            try:
                UserLog.update_user_activity(
                    self.user,
                    activity_type="login",
                    update_datetime=(self.completion_timestamp or datetime.now()),
                    language=self.language,
                )
            except ValidationError as e:
                logging.error("Failed to update userlog during video: %s" % e)

    super(VideoLog, self).save(*args, **kwargs)
def add_missing_objects_to_db(youtube_ids_in_filesystem, videos_marked_at_all):
    # Files that exist, but are not in the DB, should be assumed to be good videos,
    # and just needing to be added to the DB. Add them to the DB in this way,
    # so that these files also trigger the update code below (and trigger cache invalidation)
    youtube_ids_needing_model_creation = list(youtube_ids_in_filesystem - videos_marked_at_all)
    new_video_files = []
    if youtube_ids_needing_model_creation:
        for lang_code, youtube_ids in divide_videos_by_language(youtube_ids_needing_model_creation).iteritems():
            # OK to do bulk_create; cache invalidation triggered via save download
            lang_video_files = [VideoFile(youtube_id=id, percent_complete=100, download_in_progress=False, language=lang_code) for id in youtube_ids]
            VideoFile.objects.bulk_create(lang_video_files)
            new_video_files += lang_video_files
            caching.invalidate_all_caches()  # Do this within the loop, to update users ASAP
        self.stdout.write("Created %d VideoFile models (and marked them as complete, since the files exist)\n" % len(new_video_files))

    return [i18n.get_video_id(video_file.youtube_id) for video_file in new_video_files]
def save(self, update_userlog=False, *args, **kwargs):
    # To deal with backwards compatibility,
    # check video_id, whether imported or not.
    if not self.video_id:
        assert kwargs.get("imported", False), "video_id better be set by internal code."
        assert self.youtube_id, "If not video_id, you better have set youtube_id!"
        self.video_id = i18n.get_video_id(self.youtube_id) or self.youtube_id  # for unknown videos, default to the youtube_id

    if not kwargs.get("imported", False):
        self.full_clean()

        # Tell logins that they are still active (ignoring validation failures).
        # TODO(bcipolli): Could log video information in the future.
        if update_userlog:
            try:
                UserLog.update_user_activity(
                    self.user,
                    activity_type="login",
                    update_datetime=(self.completion_timestamp or datetime.now()),
                    language=self.language,
                )
            except ValidationError as e:
                logging.error("Failed to update userlog during video: %s" % e)

    super(VideoLog, self).save(*args, **kwargs)
def delete_objects_for_incomplete_videos():
    # delete VideoFile objects that are not marked as in progress, but are neither 0% nor 100% done; they're broken
    video_files_to_delete = VideoFile.objects.filter(download_in_progress=False, percent_complete__gt=0, percent_complete__lt=100)
    deleted_video_ids = [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_to_delete]
    video_files_to_delete.delete()

    if deleted_video_ids:
        self.stdout.write("Deleted %d VideoFile models (to mark them as not downloaded, since they were in a bad state)\n" % len(deleted_video_ids))

    return deleted_video_ids
def update_objects_to_be_complete(youtube_ids_in_filesystem):
    # Files that exist, are in the DB, but have percent_complete=0, download_in_progress=False
    updated_video_ids = []
    for chunk in break_into_chunks(youtube_ids_in_filesystem):
        video_files_needing_model_update = VideoFile.objects.filter(percent_complete=0, download_in_progress=False, youtube_id__in=chunk)
        # Collect the ids before calling update(): the queryset is lazy, and re-running
        # its percent_complete=0 filter after the update would match no rows.
        updated_video_ids += [i18n.get_video_id(video_file.youtube_id) for video_file in video_files_needing_model_update]
        video_files_needing_model_update.update(percent_complete=100, flagged_for_download=False)

    if updated_video_ids:
        self.stdout.write("Updated %d VideoFile models (to mark them as complete, since the files exist)\n" % len(updated_video_ids))

    return updated_video_ids
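# The loop above depends on a break_into_chunks() helper defined elsewhere in the codebase.
# A minimal sketch of such a chunking utility (the name, signature, and chunk size here are
# assumptions for illustration, not the project's actual implementation); the point is simply
# to keep each youtube_id__in lookup to a bounded size:
def break_into_chunks(items, chunk_size=500):
    """Yield successive lists of at most chunk_size items."""
    items = list(items)
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]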
def add_missing_objects_to_db(youtube_ids_in_filesystem, videos_marked_at_all):
    # Files that exist, but are not in the DB, should be assumed to be good videos,
    # and just needing to be added to the DB. Add them to the DB in this way,
    # so that these files also trigger the update code below (and trigger cache invalidation)
    youtube_ids_needing_model_creation = list(youtube_ids_in_filesystem - videos_marked_at_all)
    new_video_files = []
    if youtube_ids_needing_model_creation:
        for lang_code, youtube_ids in divide_videos_by_language(youtube_ids_needing_model_creation).iteritems():
            # OK to do bulk_create; cache invalidation triggered via save download
            lang_video_files = [VideoFile(youtube_id=id, percent_complete=100, download_in_progress=False, language=lang_code) for id in youtube_ids]
            VideoFile.objects.bulk_create(lang_video_files)
            new_video_files += lang_video_files
        self.stdout.write("Created %d VideoFile models (and marked them as complete, since the files exist)\n" % len(new_video_files))

    return [i18n.get_video_id(video_file.youtube_id) for video_file in new_video_files]
def get_video_node_by_youtube_id(youtube_id):
    """Returns the video node corresponding to the video_id of the given youtube_id, or None"""
    video_id = i18n.get_video_id(youtube_id=youtube_id)
    return topic_tools.get_node_cache("Video").get(video_id, [None])[0]
def forwards(self, orm):
    # Setting the video ID
    for vlog in orm["main.VideoLog"].objects.all():
        vlog.video_id = i18n.get_video_id(vlog.youtube_id) or vlog.youtube_id
        vlog.save()
def update_all_central_callback(request):
    """
    Callback after authentication.

    Parses out the request token verification.
    Then finishes the request by getting an auth token.
    """
    if "ACCESS_TOKEN" not in request.session:
        finish_auth(request)

    exercises = get_api_resource(request, "/api/v1/user/exercises")
    videos = get_api_resource(request, "/api/v1/user/videos")
    node_cache = get_node_cache()

    # Collate videos
    video_logs = []
    for video in videos:
        # Assume that KA videos are all english-language, not dubbed (for now)
        youtube_id = video.get('video', {}).get('youtube_id', "")
        video_id = get_video_id(youtube_id)  # map from youtube_id to video_id (across all languages)

        # Only save videos with progress
        if not video.get('seconds_watched', None):
            continue

        # Only save video logs for videos that we recognize.
        if video_id not in node_cache["Video"]:
            logging.warn("Skipping unknown video %s" % video_id)
            continue

        try:
            video_logs.append({
                "video_id": video_id,
                "youtube_id": youtube_id,
                "total_seconds_watched": video['seconds_watched'],
                "points": VideoLog.calc_points(video['seconds_watched'], video['duration']),
                "complete": video['completed'],
                "completion_timestamp": convert_ka_date(video['last_watched']) if video['completed'] else None,
            })
            logging.debug("Got video log for %s: %s" % (video_id, video_logs[-1]))
        except KeyError:
            logging.error("Could not save video log for data with missing values: %s" % video)

    # Collate exercises
    exercise_logs = []
    for exercise in exercises:
        # Only save exercises that have any progress.
        if not exercise.get('last_done', None):
            continue

        # Only save exercise logs for exercises that we recognize.
        slug = exercise.get('exercise', "")
        if slug not in node_cache['Exercise']:
            logging.warn("Skipping unknown exercise %s" % slug)
            continue

        try:
            completed = exercise['streak'] >= 10
            basepoints = node_cache['Exercise'][slug][0]['basepoints']
            exercise_logs.append({
                "exercise_id": slug,
                "streak_progress": min(100, 100 * exercise['streak'] / 10),  # duplicates logic elsewhere
                "attempts": exercise['total_done'],
                "points": ExerciseLog.calc_points(basepoints, ncorrect=exercise['streak'], add_randomness=False),  # no randomness when importing from KA
                "complete": completed,
                "attempts_before_completion": exercise['total_done'] if not exercise['practiced'] else None,  # can't figure this out if they practiced after mastery.
                "completion_timestamp": convert_ka_date(exercise['proficient_date']) if completed else None,
            })
            logging.debug("Got exercise log for %s: %s" % (slug, exercise_logs[-1]))
        except KeyError:
            logging.error("Could not save exercise log for data with missing values: %s" % exercise)

    # POST the data back to the distributed server
    try:
        dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None
        logging.debug("POST'ing to %s" % request.session["distributed_callback_url"])
        response = requests.post(
            request.session["distributed_callback_url"],
            cookies={"csrftoken": request.session["distributed_csrf_token"]},
            data={
                "csrfmiddlewaretoken": request.session["distributed_csrf_token"],
                "video_logs": json.dumps(video_logs, default=dthandler),
                "exercise_logs": json.dumps(exercise_logs, default=dthandler),
                "user_id": request.session["distributed_user_id"],
            },
        )
        logging.debug("Response (%d): %s" % (response.status_code, response.content))
    except requests.exceptions.ConnectionError as e:
        return HttpResponseRedirect(set_query_params(request.session["distributed_redirect_url"], {
            "message_type": "error",
            "message": _("Could not connect to your KA Lite installation to share Khan Academy data."),
            "message_id": "id_khanload",
        }))
    except Exception as e:
        return HttpResponseRedirect(set_query_params(request.session["distributed_redirect_url"], {
            "message_type": "error",
            "message": _("Failure to send data to your KA Lite installation: %s") % e,
            "message_id": "id_khanload",
        }))

    try:
        json_response = json.loads(response.content)
        if not isinstance(json_response, dict) or len(json_response) != 1:
            # Could not validate the message is a single key-value pair
            raise Exception(_("Unexpected response format from your KA Lite installation."))
        message_type = json_response.keys()[0]
        message = json_response.values()[0]
    except ValueError as e:
        message_type = "error"
        message = unicode(e)
    except Exception as e:
        message_type = "error"
        message = _("Loading json object: %s") % e

    # If something broke on the distributed server, we have no way to recover.
    # For now, just show the error to users.
    #
    # Ultimately, we have a message, would like to share with the distributed server.
    # if response.status_code != 200:
    #     return HttpResponseServerError(response.content)

    return HttpResponseRedirect(set_query_params(request.session["distributed_redirect_url"], {
        "message_type": message_type,
        "message": message,
        "message_id": "id_khanload",
    }))
class TestSaveVideoLog(KALiteTestCase):

    ORIGINAL_POINTS = 84
    ORIGINAL_SECONDS_WATCHED = 32
    NEW_POINTS = 32
    NEW_SECONDS_WATCHED = 15
    YOUTUBE_ID = "aNqG4ChKShI"
    VIDEO_ID = i18n.get_video_id(YOUTUBE_ID) or "dummy"
    YOUTUBE_ID2 = "b22tMEc6Kko"
    VIDEO_ID2 = i18n.get_video_id(YOUTUBE_ID2) or "dummy2"
    USERNAME = "******"
    PASSWORD = "******"

    def setUp(self):
        super(TestSaveVideoLog, self).setUp()
        # create a facility and user that can be referred to in models across tests
        self.facility = Facility(name="Test Facility")
        self.facility.save()
        self.user = FacilityUser(username=self.USERNAME, facility=self.facility)
        self.user.set_password(self.PASSWORD)
        self.user.save()

        # create an initial VideoLog instance so we have something to update later
        self.original_videolog = VideoLog(video_id=self.VIDEO_ID, youtube_id=self.YOUTUBE_ID, user=self.user)
        self.original_videolog.points = self.ORIGINAL_POINTS
        self.original_videolog.total_seconds_watched = self.ORIGINAL_SECONDS_WATCHED
        self.original_videolog.save()

    def test_new_videolog(self):
        # make sure the target video log does not already exist
        videologs = VideoLog.objects.filter(video_id=self.VIDEO_ID2, user__username=self.USERNAME)
        self.assertEqual(videologs.count(), 0, "The target video log to be newly created already exists")

        c = KALiteClient()

        # login
        success = c.login(username=self.USERNAME, password=self.PASSWORD, facility=self.facility.id)
        self.assertTrue(success, "Was not able to login as the test user")

        # save a new video log
        result = c.save_video_log(
            video_id=self.VIDEO_ID2,
            youtube_id=self.YOUTUBE_ID2,
            total_seconds_watched=self.ORIGINAL_SECONDS_WATCHED,
            points=self.NEW_POINTS,
            user=self.USERNAME,
        )
        self.assertEqual(result.status_code, 201, "An error (%d) was thrown while saving the video log." % result.status_code)

        # get a reference to the newly created VideoLog
        videolog = VideoLog.objects.get(video_id=self.VIDEO_ID2, user__username=self.USERNAME)

        # make sure the VideoLog was properly created
        self.assertEqual(videolog.points, self.NEW_POINTS, "The VideoLog's points were not saved correctly.")
        self.assertEqual(videolog.total_seconds_watched, self.ORIGINAL_SECONDS_WATCHED, "The VideoLog's seconds watched was not saved correctly.")

    def test_update_videolog(self):
        # get a new reference to the existing VideoLog
        videolog = VideoLog.objects.get(id=self.original_videolog.id)

        # make sure the VideoLog hasn't already been changed
        self.assertEqual(videolog.points, self.ORIGINAL_POINTS, "The VideoLog's points have already changed.")
        self.assertEqual(videolog.total_seconds_watched, self.ORIGINAL_SECONDS_WATCHED, "The VideoLog's seconds watched already changed.")

        c = KALiteClient()

        # login
        success = c.login(username=self.USERNAME, password=self.PASSWORD, facility=self.facility.id)
        self.assertTrue(success, "Was not able to login as the test user")

        # save a new record onto the video log, with a correct answer (increasing the points and streak)
        result = c.save_video_log(
            video_id=self.VIDEO_ID,
            youtube_id=self.YOUTUBE_ID,
            total_seconds_watched=self.ORIGINAL_SECONDS_WATCHED + self.NEW_SECONDS_WATCHED,
            points=self.ORIGINAL_POINTS + self.NEW_POINTS,
            user=self.USERNAME,
        )
        self.assertEqual(result.status_code, 201, "An error (%d) was thrown while saving the video log." % result.status_code)

        # get a reference to the updated VideoLog
        videolog = VideoLog.objects.get(video_id=self.VIDEO_ID, user__username=self.USERNAME)

        # make sure the VideoLog was properly updated
        self.assertEqual(videolog.points, self.ORIGINAL_POINTS + self.NEW_POINTS, "The VideoLog's points were not updated correctly.")
        self.assertEqual(videolog.total_seconds_watched, self.ORIGINAL_SECONDS_WATCHED + self.NEW_SECONDS_WATCHED, "The VideoLog's seconds watched was not updated correctly.")
class TestVideoLogs(KALiteTestCase):

    ORIGINAL_POINTS = 37
    ORIGINAL_SECONDS_WATCHED = 3
    NEW_POINTS = 22
    NEW_SECONDS_WATCHED = 5
    YOUTUBE_ID = "aNqG4ChKShI"
    VIDEO_ID = i18n.get_video_id(YOUTUBE_ID) or "dummy"

    def setUp(self):
        super(TestVideoLogs, self).setUp()
        # create a facility and user that can be referred to in models across tests
        self.facility = Facility(name="Test Facility")
        self.facility.save()
        self.user = FacilityUser(username="******", facility=self.facility)
        self.user.set_password("dumber")
        self.user.save()

        # create an initial VideoLog instance so we have something to collide with later
        self.original_videolog = VideoLog(video_id=self.VIDEO_ID, youtube_id=self.YOUTUBE_ID, user=self.user)
        self.original_videolog.points = self.ORIGINAL_POINTS
        self.original_videolog.total_seconds_watched = self.ORIGINAL_SECONDS_WATCHED
        self.original_videolog.save()

        # get a new reference to the existing VideoLog
        videolog = VideoLog.objects.get(id=self.original_videolog.id)

        # make sure the VideoLog was created correctly
        self.assertEqual(videolog.points, self.ORIGINAL_POINTS, "The VideoLog's points have already changed.")
        self.assertEqual(videolog.total_seconds_watched, self.ORIGINAL_SECONDS_WATCHED, "The VideoLog's total seconds watched have already changed.")

    def test_videolog_update(self):
        # get a new reference to the existing VideoLog
        videolog = VideoLog.objects.get(id=self.original_videolog.id)

        # update the VideoLog
        videolog.points = self.NEW_POINTS
        videolog.total_seconds_watched = self.NEW_SECONDS_WATCHED
        videolog.save()

        # get a new reference to the existing VideoLog
        videolog2 = VideoLog.objects.get(id=self.original_videolog.id)

        # make sure the VideoLog was updated
        self.assertEqual(videolog2.points, self.NEW_POINTS, "The VideoLog's points were not updated.")
        self.assertEqual(videolog2.total_seconds_watched, self.NEW_SECONDS_WATCHED, "The VideoLog's total seconds watched were not updated.")

    @unittest.skip("Auto-merging is not yet automatic, so skip this")
    def test_videolog_collision(self):
        # create a new video log with the same youtube_id and user, but different points/total seconds watched
        videolog = VideoLog(video_id=self.VIDEO_ID, youtube_id=self.YOUTUBE_ID, user=self.user)
        videolog.points = self.NEW_POINTS
        videolog.total_seconds_watched = self.NEW_SECONDS_WATCHED

        # try saving the new VideoLog: this is where the collision will happen, hopefully leading to a merge
        videolog.save()

        # get a new reference to the existing VideoLog
        videolog2 = VideoLog.objects.get(id=self.original_videolog.id)

        # make sure the VideoLog has been properly merged
        self.assertEqual(videolog.points, max(self.ORIGINAL_POINTS, self.NEW_POINTS), "The VideoLog's points were not properly merged.")
        self.assertEqual(videolog.total_seconds_watched, max(self.ORIGINAL_SECONDS_WATCHED, self.NEW_SECONDS_WATCHED), "The VideoLog's total seconds watched were not properly merged.")
def handle(self, *args, **options):
    self.video = None

    handled_youtube_ids = []  # stored to deal with caching
    failed_youtube_ids = []  # stored to avoid requerying failures.

    set_process_priority.lowest(logging=settings.LOG)

    try:
        while True:  # loop until the method is aborted
            # Grab any video that hasn't been tried yet
            videos = VideoFile.objects.filter(flagged_for_download=True, download_in_progress=False).exclude(youtube_id__in=failed_youtube_ids)
            video_count = videos.count()
            if video_count == 0:
                self.stdout.write(_("Nothing to download; exiting.") + "\n")
                break

            # Grab a video as OURS to handle, set fields to indicate to others that we're on it!
            # Update the video logging
            video = videos[0]
            self.video = video  # remember the video currently being handled (used when regenerating caches below)
            video.download_in_progress = True
            video.percent_complete = 0
            video.save()
            self.stdout.write((_("Downloading video '%(youtube_id)s'...") + "\n") % {"youtube_id": video.youtube_id})

            # Update the progress logging
            self.set_stages(num_stages=video_count + len(handled_youtube_ids) + len(failed_youtube_ids) + int(options["auto_cache"]))
            if not self.started():
                self.start(stage_name=video.youtube_id)

            # Initiate the download process
            try:
                ensure_dir(settings.CONTENT_ROOT)

                progress_callback = partial(self.download_progress_callback, video)

                try:
                    # Download via urllib
                    download_video(video.youtube_id, callback=progress_callback)

                except URLNotFound:
                    # Video was not found on amazon cloud service,
                    # either due to a KA mistake, or due to the fact
                    # that it's a dubbed video.
                    #
                    # We can use youtube-dl to get that video!!
                    logging.debug(_("Retrieving youtube video %(youtube_id)s via youtube-dl") % {"youtube_id": video.youtube_id})

                    def youtube_dl_cb(stats, progress_callback, *args, **kwargs):
                        if stats["status"] == "finished":
                            percent = 100.0
                        elif stats["status"] == "downloading":
                            percent = 100.0 * stats["downloaded_bytes"] / stats["total_bytes"]
                        else:
                            percent = 0.0
                        progress_callback(percent=percent)

                    scrape_video(video.youtube_id, quiet=not settings.DEBUG, callback=partial(youtube_dl_cb, progress_callback=progress_callback))

                # If we got here, we downloaded ... somehow :)
                handled_youtube_ids.append(video.youtube_id)
                self.stdout.write(_("Download is complete!") + "\n")

                # caching.invalidate_all_caches()  # Unnecessary; we have a database listener for this.

            except DownloadCancelled:
                # Cancellation event
                video.percent_complete = 0
                video.flagged_for_download = False
                video.download_in_progress = False
                video.save()
                failed_youtube_ids.append(video.youtube_id)

            except Exception as e:
                # On error, report the error, mark the video as not downloaded,
                # and allow the loop to try other videos.
                msg = _("Error in downloading %(youtube_id)s: %(error_msg)s") % {"youtube_id": video.youtube_id, "error_msg": unicode(e)}
                self.stderr.write("%s\n" % msg)

                # If a connection error, we should retry.
                if isinstance(e, DownloadError):
                    connection_error = "[Errno 8]" in e.message
                elif isinstance(e, IOError) and hasattr(e, "strerror"):
                    connection_error = e.strerror[0] == 8
                else:
                    connection_error = False

                video.download_in_progress = False
                video.flagged_for_download = connection_error  # Any error other than a connection error is fatal.
                video.save()

                # Rather than getting stuck on one video, continue to the next video.
                self.update_stage(stage_status="error", notes=_("%(error_msg)s; continuing to next video.") % {"error_msg": msg})
                failed_youtube_ids.append(video.youtube_id)
                continue

        # This can take a long time, without any further update, so ... best to avoid.
        if options["auto_cache"] and caching.caching_is_enabled() and handled_youtube_ids:
            self.update_stage(stage_name=self.video.youtube_id, stage_percent=0, notes=_("Generating all pages related to videos."))
            caching.regenerate_all_pages_related_to_videos(video_ids=list(set([i18n.get_video_id(yid) or yid for yid in handled_youtube_ids])))

        # Update
        self.complete(notes=_("Downloaded %(num_handled_videos)s of %(num_total_videos)s videos successfully.") % {
            "num_handled_videos": len(handled_youtube_ids),
            "num_total_videos": len(handled_youtube_ids) + len(failed_youtube_ids),
        })

    except Exception as e:
        self.cancel(stage_status="error", notes=_("Error: %(error_msg)s") % {"error_msg": e})
        raise
def show_logs(request, ndays=None):
    """Show file-based logging info for video downloads, language packs, and subtitles"""
    ndays = ndays or int(request.GET.get("days", 7))

    def get_logger_filename(logger_type):
        return stats_logger(logger_type).handlers[0].baseFilename

    def parse_data(logger_type, data_fields, windowsize=128, ndays=None):
        parsed_data = {}
        nparts = len(data_fields)
        summary_data = dict([(fld, {}) for fld in (data_fields + ["date"])])

        filepath = get_logger_filename(logger_type)
        if not os.path.exists(filepath):
            return (parsed_data, summary_data)

        # Group by ip, date, and youtube_id
        old_data = ""
        first_loop = True
        last_loop = False
        with open(filepath, "r") as fp:
            fp.seek(0, 2)  # go to the end of the stream
            while True:
                # Read the next chunk of data
                try:
                    # Get the data
                    try:
                        if first_loop:
                            fp.seek(-windowsize, 1)  # go backwards by a few
                            first_loop = False
                        else:
                            fp.seek(-2 * windowsize, 1)  # go backwards by a few
                        cur_data = fp.read(windowsize) + old_data
                    except:
                        if last_loop and not old_data:
                            raise
                        elif last_loop:
                            cur_data = old_data
                            old_data = ""
                        else:
                            last_loop = True
                            fp.seek(0)
                            cur_data = fp.read(windowsize) + old_data  # could be some overlap...

                    if not cur_data:
                        break
                except:
                    break

                # Parse the data
                lines = cur_data.split("\n")
                old_data = lines[0] if len(lines) > 1 else ""
                new_data = lines[1:] if len(lines) > 1 else lines
                for l in new_data:
                    if not l:
                        continue

                    # All start with a date
                    parts = l.split(" - ", 2)
                    if len(parts) != 2:
                        continue
                    tim = parts[0]
                    dat = tim.split(" ")[0]

                    # Validate that this date is within the accepted range
                    parsed_date = datetime.datetime.strptime(dat, "%Y-%m-%d")
                    #logging.debug("%s %s" % (parsed_date, (datetime.datetime.now() - timedelta(days=ndays))))
                    if ndays is not None and datetime.datetime.now() - timedelta(days=ndays) > parsed_date:
                        last_loop = True
                        old_data = ""
                        break

                    # The rest is semicolon-delimited
                    parts = parts[1].split(";")  # vd;127.0.0.1;xvnpSRO9IDM

                    # Now save things off
                    parsed_data[tim] = dict([(data_fields[idx], parts[idx]) for idx in range(nparts)])
                    summary_data["date"][dat] = 1 + summary_data["date"].get(dat, 0)
                    for idx in range(nparts):
                        summary_data[data_fields[idx]][parts[idx]] = 1 + summary_data[data_fields[idx]].get(parts[idx], 0)

        for key, val in summary_data.iteritems():
            summary_data[key] = sorted_dict(val, key=lambda t: t[0])

        return (parsed_data, summary_data)

    (video_raw_data, video_summary_data) = parse_data("videos", ["task_id", "ip_address", "youtube_id"], ndays=ndays)
    (lp_raw_data, lp_summary_data) = parse_data("language_packs", ["task_id", "ip_address", "lang_code", "version"], ndays=ndays)
    (srt_raw_data, srt_summary_data) = parse_data("subtitles", ["task_id", "ip_address", "lang_code", "youtube_id"], ndays=ndays)

    return {
        "ndays": ndays,
        "videos": {
            "raw": video_raw_data,
            "dates": video_summary_data["date"],
            "ips": video_summary_data["ip_address"],
            "slugs": sum_counter(video_summary_data["youtube_id"], fn=lambda yid: get_id2slug_map().get(get_video_id(yid))),
            "lang_codes": sum_counter(video_summary_data["youtube_id"], fn=lambda yid: get_video_language(yid)),
        },
        "language_packs": {
            "raw": lp_raw_data,
            "dates": lp_summary_data["date"],
            "ips": lp_summary_data["ip_address"],
            "lang_codes": lp_summary_data["lang_code"],
            "versions": lp_summary_data["version"],
        },
        "subtitles": {
            "raw": srt_raw_data,
            "dates": srt_summary_data["date"],
            "ips": srt_summary_data["ip_address"],
            "lang_codes": srt_summary_data["lang_code"],
        },
    }
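# parse_data() above walks each stats log backwards in fixed-size windows so it can stop as
# soon as entries fall outside the requested date range. A self-contained sketch of that
# read-backwards pattern (function name, window size, and encoding are illustrative
# assumptions, not the project's implementation):
import os

def read_lines_backwards(path, windowsize=128):
    """Yield complete lines of a text file, last line first, reading backwards in windowsize-byte chunks."""
    with open(path, "rb") as fp:
        fp.seek(0, os.SEEK_END)
        pos = fp.tell()
        leftover = b""
        while pos > 0:
            step = min(windowsize, pos)
            pos -= step
            fp.seek(pos)
            chunk = fp.read(step) + leftover
            lines = chunk.split(b"\n")
            leftover = lines[0]  # possibly a partial line; completed by the next (earlier) chunk
            for line in reversed(lines[1:]):
                if line:
                    yield line.decode("utf-8")
        if leftover:
            yield leftover.decode("utf-8")
# Each yielded line can then be split on " - " and ";" just as show_logs() does, and
# iteration can stop early once a line's date falls outside the ndays window.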
def get_video_node_by_youtube_id(youtube_id):
    """Returns the video node corresponding to the video_id of the given youtube_id, or None"""
    video_id = i18n.get_video_id(youtube_id=youtube_id)
    return topic_tools.get_node_cache("Content").get(video_id, [None])
def get_video_by_youtube_id(youtube_id):
    # TODO(bcipolli): will need to change for dubbed videos
    video_id = i18n.get_video_id(youtube_id=youtube_id)
    return get_node_cache("Video").get(video_id, [None])[0]
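# Several snippets above rely on the idiom "i18n.get_video_id(youtube_id) or youtube_id" so
# that unrecognized (e.g. dubbed or newly added) videos still get a usable identifier.
# A self-contained illustration of that fallback, using a made-up mapping in place of the
# real i18n module (the dict contents below are hypothetical):
YOUTUBE_TO_VIDEO_ID = {
    "aNqG4ChKShI": "addition_1",  # hypothetical youtube_id -> video_id entry
}

def get_video_id(youtube_id):
    """Return the canonical video_id for a youtube_id, or None if unknown."""
    return YOUTUBE_TO_VIDEO_ID.get(youtube_id)

for yid in ("aNqG4ChKShI", "b22tMEc6Kko"):
    print("%s -> %s" % (yid, get_video_id(yid) or yid))
# aNqG4ChKShI -> addition_1    (known mapping)
# b22tMEc6Kko -> b22tMEc6Kko   (unknown: falls back to the youtube_id itself)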