def upload_converted_to_archive(youtube_id, formats_to_upload):
    # The bucket may not exist yet on archive.org. Unfortunately, create_bucket
    # is broken in boto (it requires all-lowercase names, despite the fact that
    # we're using OrdinaryCallingFormat). Fortunately, get_bucket can be told
    # not to check that the bucket exists via the validate=False flag, and the
    # "x-archive-auto-make-bucket" header we pass below automatically creates
    # the bucket with the first upload.
    dest_bucket = archive_connection.get_bucket(
        "KA-converted-{0}".format(youtube_id), validate=False)

    source_keys_for_format = defaultdict(list)
    for key in list(converted_bucket.list(youtube_id)):
        video_match = re_video_key_name.match(key.name)
        if video_match is None:
            if re_legacy_video_key_name.match(key.name) is None:
                logger.info("Unrecognized file in converted bucket "
                            "{0}".format(key.name))
            continue
        assert video_match.group(1) == youtube_id
        format = video_match.group(2)

        # Zencoder uploads may still be in flight; treat anything modified
        # within the last hour as possibly incomplete.
        modification_date = datetime.strptime(
            key.last_modified, "%Y-%m-%dT%H:%M:%S.%fZ")
        if datetime.utcnow() - modification_date < timedelta(hours=1):
            logger.error(
                "Format {0} for video {1} appeared ready on S3, but further "
                "inspection showed Zencoder may still be uploading it. "
                "Modification date {2}".format(
                    format, youtube_id, modification_date))
            return False

        # Maps format (mp4, m3u8, etc.) to a list of keys
        source_keys_for_format[format].append(key)

    for format in formats_to_upload:
        if len(source_keys_for_format[format]) == 0:
            logger.error(
                "Requested upload of format {0} for video {1} to archive.org, "
                "but unable to find that format in the converted "
                "bucket".format(format, youtube_id))
            return False

    # Fetch the video metadata so we can pass title and description to
    # archive.org
    video_metadata = api.video_metadata(youtube_id)

    # Only pass ASCII titles and descriptions in headers to archive.org, and
    # strip newlines
    def normalize_for_header(s):
        return (unicodedata.normalize("NFKD", s or u"")
                .encode("ascii", "ignore")
                .replace("\n", ""))

    uploaded_filenames = []
    for format in formats_to_upload:
        for key in source_keys_for_format[format]:
            video_match = re_video_key_name.match(key.name)
            assert video_match.group(1) == youtube_id
            assert video_match.group(2) == format
            video_prefix = video_match.group()
            assert key.name.startswith(video_prefix)
            destination_name = key.name[len(video_prefix):]
            # Don't expect more than one level of nesting
            assert "/" not in destination_name

            logger.debug(
                "Copying file {0} to archive.org".format(destination_name))
            with tempfile.TemporaryFile() as t:
                # The first half of the progress bar tracks the download from
                # S3; the second half tracks the upload to archive.org.
                pbar = ProgressBar(maxval=100).start()

                def get_cb(bytes_sent, bytes_total):
                    pbar.update(50.0 * bytes_sent / bytes_total)

                def send_cb(bytes_sent, bytes_total):
                    pbar.update(50.0 + 50.0 * bytes_sent / bytes_total)

                key.get_file(t, cb=get_cb)

                dest_key = Key(dest_bucket, destination_name)
                headers = {
                    "x-archive-auto-make-bucket": "1",
                    "x-archive-meta-collection": "khanacademy",
                    "x-archive-meta-title":
                        normalize_for_header(video_metadata["title"]),
                    "x-archive-meta-description":
                        normalize_for_header(video_metadata["description"]),
                    "x-archive-meta-mediatype": "movies",
                    "x-archive-meta01-subject": "Salman Khan",
                    "x-archive-meta02-subject": "Khan Academy",
                }
                for attempt in xrange(10):
                    try:
                        # Rewind before each attempt in case a failed attempt
                        # left the file pointer mid-stream.
                        t.seek(0)
                        dest_key.set_contents_from_file(
                            t, headers=headers, cb=send_cb)
                        break
                    except BotoServerError as e:
                        logger.error(
                            "Error {0} {1} during upload attempt {2} to "
                            "archive.org.".format(e.status, e.reason, attempt))
                else:
                    # The loop never hit "break", so every attempt failed
                    raise Exception(
                        "Gave up publish attempt due to server errors")
                pbar.finish()

            uploaded_filenames.append(destination_name)
logger.debug("Waiting 10 seconds for uploads to propagate") time.sleep(10) for destination_name in uploaded_filenames: if verify_archive_upload(youtube_id, destination_name): logger.error("Verified upload {0}/{1}".format(youtube_id, destination_name)) else: logger.error("Unable to verify upload {0}/{1}".format(youtube_id, destination_name)) return False return True
def publish_converted_videos(max_videos, dryrun=False, use_archive=True):
    logger.info("Searching for downloadable content that needs to be "
                "published")

    publish_attempts = 0
    converted_formats = s3.list_converted_formats()
    for youtube_id, missing_formats in (api.list_missing_video_content()
                                        .iteritems()):
        if publish_attempts >= max_videos:
            logger.info("Stopping: max videos reached")
            break

        converted_missing_formats = (
            converted_formats[youtube_id] & missing_formats)
        unconverted_formats = missing_formats - converted_missing_formats
        if len(unconverted_formats) > 0:
            logger.info(
                "Video %s is missing formats %s, which are still "
                "unconverted and can't be published"
                % (youtube_id, ",".join(unconverted_formats)))

        # If no converted content is waiting, just continue to the next video
        if len(converted_missing_formats) == 0:
            continue

        if dryrun:
            logger.info(
                "Skipping publish for video {0} formats {1} due to dryrun"
                .format(youtube_id, ", ".join(converted_missing_formats)))
        else:
            if use_archive:
                if s3.upload_converted_to_archive(
                        youtube_id, converted_missing_formats):
                    logger.info("Successfully uploaded to archive.org")
                else:
                    logger.error(
                        "Unable to upload video {0} to archive.org"
                        .format(youtube_id))
                    continue
            else:
                logger.info("Skipping upload to archive.org; assuming API "
                            "points directly to S3 instead.")

            current_format_downloads = (
                api.video_metadata(youtube_id)["download_urls"] or {})
            current_formats = set(current_format_downloads.keys())
            new_formats = current_formats | converted_missing_formats

            if "mp4" in new_formats:
                # PNG thumbnails are generated as part of the MP4
                # conversion process. If mp4 has been uploaded to
                # archive.org, png is guaranteed to be there as well.
                new_formats.add("png")

            if api.update_download_available(youtube_id, new_formats):
                logger.info(
                    "Updated KA download_available, set to {0} for video "
                    "{1}".format(", ".join(new_formats), youtube_id))
            else:
                logger.error(
                    "Unable to update KA download_available to {0} for "
                    "youtube id {1}".format(", ".join(new_formats),
                                            youtube_id))

        publish_attempts += 1
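
# A hypothetical command-line entry point showing how the publish loop might
# be driven; the flag names and defaults are illustrative assumptions, not
# part of the original module.
if __name__ == "__main__":
    import argparse  # stdlib; would normally live at the top of the module

    parser = argparse.ArgumentParser(
        description="Publish converted videos to archive.org")
    parser.add_argument("--max-videos", type=int, default=1,
                        help="maximum number of videos to publish in one run")
    parser.add_argument("--dryrun", action="store_true",
                        help="log what would be published without uploading")
    parser.add_argument("--no-archive", action="store_true",
                        help="assume the API points directly at S3 and skip "
                             "the archive.org upload")
    args = parser.parse_args()

    publish_converted_videos(args.max_videos, dryrun=args.dryrun,
                             use_archive=not args.no_archive)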