Esempio n. 1
0
def upload_converted_to_archive(youtube_id, formats_to_upload):
    """Copy a video's converted files from the converted bucket to archive.org.

    Every key under the ``youtube_id`` prefix of ``converted_bucket`` is
    grouped by video format. The files for each requested format are then
    downloaded to a temporary file, uploaded to the archive.org bucket
    ``KA-converted-<youtube_id>`` and finally checked with
    ``verify_archive_upload``.

    Args:
        youtube_id: Id of the video whose converted files should be copied.
        formats_to_upload: Formats (mp4, m3u8, etc) to upload. It is iterated
            more than once, so pass a collection rather than a one-shot
            iterator.

    Returns:
        True if every uploaded file could be verified. False if any matching
        file in the converted bucket was modified less than an hour ago, if a
        requested format has no files in the converted bucket, or if an
        upload could not be verified.

    Raises:
        Exception: if a single file still fails with ``BotoServerError``
            after 10 upload attempts.
    """
    # The bucket may not exist yet on archive.org. Unfortunately create_bucket
    # is broken in boto (it requires all-lowercase, despite the fact that
    # we're using OrdinaryCallingFormat). Fortunately get_bucket can be told
    # to not check that the bucket exists with the validate=False flag, and
    # the "x-archive-auto-make-bucket" header we pass below automatically
    # creates the bucket with the first upload.
    dest_bucket = archive_connection.get_bucket("KA-converted-{0}".format(youtube_id), validate=False)

    # Maps format (mp4, m3u8, etc) to list of keys
    source_keys_for_format = defaultdict(list)
    for key in converted_bucket.list(youtube_id):
        video_match = re_video_key_name.match(key.name)
        if video_match is None:
            # Legacy key names are silently skipped; anything else is
            # reported before being skipped.
            if re_legacy_video_key_name.match(key.name) is None:
                logger.info("Unrecognized file in converted bucket {0}".format(key.name))
            continue

        assert video_match.group(1) == youtube_id
        video_format = video_match.group(2)

        # A file modified within the last hour is treated as possibly still
        # being written, so the whole upload is abandoned for now.
        modification_date = datetime.strptime(key.last_modified, "%Y-%m-%dT%H:%M:%S.%fZ")
        if datetime.utcnow() - modification_date < timedelta(hours=1):
            logger.error("Format {0} for video {1} appeared ready on S3, but further inspection showed Zencoder may still be uploading it. Modification date {2}".format(video_format, youtube_id, modification_date))
            return False

        source_keys_for_format[video_format].append(key)

    # Refuse to start uploading anything unless every requested format exists.
    for video_format in formats_to_upload:
        if not source_keys_for_format[video_format]:
            logger.error("Requested upload of format {0} for video {1} to archive.org, but unable to find video format in converted bucket".format(video_format, youtube_id))
            return False

    # Fetch the video metadata so we can specify title and description to archive.org
    video_metadata = api.video_metadata(youtube_id)

    # Only pass ascii title and descriptions in headers to archive, and strip
    # line breaks (both "\r" and "\n": neither may appear in a header value).
    def normalize_for_header(s):
        ascii_text = unicodedata.normalize("NFKD", s or u"").encode("ascii", "ignore")
        return ascii_text.replace("\r", "").replace("\n", "")

    # The headers are identical for every file of this video, so build them once.
    headers = {
        "x-archive-auto-make-bucket": "1",
        "x-archive-meta-collection": "khanacademy",
        "x-archive-meta-title": normalize_for_header(video_metadata["title"]),
        "x-archive-meta-description": normalize_for_header(video_metadata["description"]),
        "x-archive-meta-mediatype": "movies",
        "x-archive-meta01-subject": "Salman Khan",
        "x-archive-meta02-subject": "Khan Academy",
    }

    uploaded_filenames = []

    for video_format in formats_to_upload:
        for key in source_keys_for_format[video_format]:
            video_match = re_video_key_name.match(key.name)
            assert video_match.group(1) == youtube_id
            assert video_match.group(2) == video_format
            video_prefix = video_match.group()
            assert key.name.startswith(video_prefix)
            destination_name = key.name[len(video_prefix):]
            assert "/" not in destination_name  # Don't expect more than one level of nesting

            logger.debug("Copying file {0} to archive.org".format(destination_name))

            with tempfile.TemporaryFile() as t:
                pbar = ProgressBar(maxval=100).start()

                # The download fills the first half of the progress bar and
                # the upload the second half. A zero total is skipped so the
                # callbacks can never divide by zero.
                def get_cb(bytes_sent, bytes_total):
                    if bytes_total:
                        pbar.update(50.0 * bytes_sent / bytes_total)

                def send_cb(bytes_sent, bytes_total):
                    if bytes_total:
                        pbar.update(50.0 + 50.0 * bytes_sent / bytes_total)

                key.get_file(t, cb=get_cb)

                dest_key = Key(dest_bucket, destination_name)
                for attempt in xrange(10):
                    # Rewind before every attempt: the upload reads from the
                    # current file position, so a retry after a partially
                    # sent attempt would otherwise upload a truncated file.
                    t.seek(0)
                    try:
                        dest_key.set_contents_from_file(t, headers=headers, cb=send_cb)
                        break
                    except BotoServerError as e:
                        logger.error("Error {0} {1} during upload attempt {2} to archive.org.".format(e.status, e.reason, attempt))
                else:
                    # Reached only when no attempt hit the `break` above.
                    raise Exception("Gave up publish attempt due to server errors")
                pbar.finish()

                uploaded_filenames.append(destination_name)

    logger.debug("Waiting 10 seconds for uploads to propagate")
    time.sleep(10)

    for destination_name in uploaded_filenames:
        if verify_archive_upload(youtube_id, destination_name):
            # Success is informational, not an error.
            logger.info("Verified upload {0}/{1}".format(youtube_id, destination_name))
        else:
            logger.error("Unable to verify upload {0}/{1}".format(youtube_id, destination_name))
            return False

    return True
Esempio n. 2
0
    def publish_converted_videos(max_videos, dryrun=False, use_archive=True):
        """Publish converted formats for videos whose downloads are missing.

        Walks the videos returned by ``api.list_missing_video_content()`` and,
        for each one, intersects its missing formats with the formats that
        ``s3.list_converted_formats()`` reports as converted. Videos with at
        least one such format are optionally uploaded to archive.org and then
        have their available download formats updated through the API.

        Args:
            max_videos: Stop once this many videos have been published.
                Dry-run iterations and failed archive.org uploads are not
                counted towards it.
            dryrun: When true, only log what would be published.
            use_archive: When true, upload to archive.org before updating the
                API and skip the video if that upload fails. When false, the
                upload step is skipped entirely.
        """
        logger.info(
            "Searching for downloadable content that needs to be published")

        published_count = 0
        formats_by_video = s3.list_converted_formats()
        missing_by_video = api.list_missing_video_content()

        for youtube_id, missing_formats in missing_by_video.iteritems():
            if published_count >= max_videos:
                logger.info("Stopping: max videos reached")
                break

            ready_formats = formats_by_video[youtube_id] & missing_formats
            pending_formats = missing_formats - ready_formats

            if pending_formats:
                pending_label = ",".join(pending_formats)
                logger.info(
                    "Video %s missing formats %s which are still unconverted, "
                    "can't be published" % (youtube_id, pending_label))

            # Nothing converted is waiting for this video; move on.
            if not ready_formats:
                continue

            if dryrun:
                ready_label = ", ".join(ready_formats)
                logger.info(
                    "Skipping publish for video {0} formats {1} due to dryrun"
                    .format(youtube_id, ready_label))
                continue

            if not use_archive:
                logger.info(
                    "Skipping upload to archive.org; assuming API points "
                    "directly to S3 instead.")
            elif s3.upload_converted_to_archive(youtube_id, ready_formats):
                logger.info("Successfully uploaded to archive.org")
            else:
                logger.error(
                    "Unable to upload video {0} to archive.org".format(
                        youtube_id))
                continue

            # Merge the formats the API already lists with the ones just
            # published.
            existing_downloads = (
                api.video_metadata(youtube_id)["download_urls"] or {})
            available_formats = set(existing_downloads.keys()) | ready_formats

            # PNG thumbnails are generated as part of the MP4 conversion
            # process.  If mp4 has been uploaded to archive.org, png is
            # guaranteed to be there as well.
            if "mp4" in available_formats:
                available_formats.add("png")

            if api.update_download_available(youtube_id, available_formats):
                logger.info(
                    "Updated KA download_available, set to {0} for video {1}"
                    .format(", ".join(available_formats), youtube_id))
            else:
                logger.error(
                    "Unable to update KA download_available to {0} for "
                    "youtube id {1}"
                    .format(", ".join(available_formats), youtube_id))

            published_count += 1