def download_and_convert_mpeg_file(mpeg_url):
    """
    Kolibri AudioNode only supports .mp3 files and not .mpeg, so we must convert.
    """
    mpeg_filename = mpeg_url.split('/')[-1]  # e.g. something.mpeg
    mpeg_path = os.path.join(DOWNLOADED_MPEG_FILES_DIR, mpeg_filename)
    print(mpeg_path, 'mpeg_path')

    # 1. DOWNLOAD
    download_file(mpeg_url, DOWNLOADED_MPEG_FILES_DIR)
    print("mpeg downloaded")

    # 2. CONVERT
    mp3_filename = mpeg_filename.replace('.mpeg', '.mp3')
    mp3_path = os.path.join(CONVERTED_MP3_FILES_DIR, mp3_filename)
    print(mp3_filename, mp3_path)
    if not os.path.exists(mp3_path):
        try:
            command = [
                "ffmpeg", "-i", mpeg_path,
                "-acodec", "mp3", "-ac", "2", "-ab", "64k",
                "-y", "-hide_banner", "-loglevel", "warning",
                mp3_path,
            ]
            subprocess.check_call(command)
            print("Successfully converted mpeg file to mp3")
        except subprocess.CalledProcessError:
            print("Problem converting " + mpeg_url)
            return None

    # Return path of converted mp3 file
    return mp3_path
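
# A minimal usage sketch (not from the original source): it assumes the
# module-level constants DOWNLOADED_MPEG_FILES_DIR / CONVERTED_MP3_FILES_DIR
# exist, and that the converted file is wrapped in a ricecooker AudioNode.
# The URL, source_id, and title values are illustrative.
#
#     mp3_path = download_and_convert_mpeg_file('https://example.org/media/lesson1.mpeg')
#     if mp3_path:
#         audio_node = nodes.AudioNode(
#             source_id='lesson1',
#             title='Lesson 1',
#             license=CHANNEL_LICENSE,
#             files=[files.AudioFile(path=mp3_path)],
#         )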
def download_and_convert_m4v_file(m4v_url):
    """
    Kolibri VideoNode only supports .mp4 files and not .m4v, so we must convert.
    """
    m4v_filename = m4v_url.split('/')[-1]  # e.g. something.m4v
    m4v_path = os.path.join(DOWNLOADED_M4V_FILES_DIR, m4v_filename)
    print(m4v_path, 'm4v_path')

    # 1. DOWNLOAD M4V file
    download_file(m4v_url, DOWNLOADED_M4V_FILES_DIR)
    print("m4v downloaded")

    # 2. CONVERT (remux only: copy the video and audio streams into an .mp4 container)
    mp4_filename = m4v_filename.replace('.m4v', '.mp4')
    mp4_path = os.path.join(CONVERTED_MP4_FILES_DIR, mp4_filename)
    print(mp4_filename, mp4_path)
    if not os.path.exists(mp4_path):
        try:
            command = [
                "ffmpeg", "-i", m4v_path,
                "-vcodec", "copy", "-acodec", "copy",
                mp4_path,
            ]
            subprocess.check_call(command)
            print("Successfully converted m4v file to mp4")
        except subprocess.CalledProcessError:
            print("Problem converting " + m4v_url)
            return None

    # Return path of converted mp4 file
    return mp4_path
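
# A minimal usage sketch (assumed, not from the original source): wrap the
# remuxed .mp4 in a ricecooker VideoNode. Since `-vcodec copy -acodec copy`
# only changes the container, this is fast and lossless. Names are illustrative.
#
#     mp4_path = download_and_convert_m4v_file('https://example.org/media/clip.m4v')
#     if mp4_path:
#         video_node = nodes.VideoNode(
#             source_id='clip',
#             title='Clip',
#             license=CHANNEL_LICENSE,
#             files=[files.VideoFile(path=mp4_path)],
#         )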
def process_wikipedia_page(content, baseurl, destpath, **kwargs):
    """ Saves images to html zip folder """
    page = BeautifulSoup(content, "html.parser")
    index = 0

    # Add style sheets to zip file
    for link in page.find_all("link"):
        if link.get('href') and 'stylesheet' in link['rel']:
            try:
                subpath = "item_{}".format(index)
                link["href"], _ = download_file(
                    make_fully_qualified_url(link['href']), destpath, subpath=subpath)
                index += 1
            except Exception:
                link["href"] = "#"

    # Add images to zip file
    for image in page.find_all("img"):
        try:
            relpath, _ = download_file(make_fully_qualified_url(image["src"]), destpath)
            image["src"] = relpath
        except Exception:
            image["src"] = "#"

    # Replace links with text to avoid broken links
    content = str(page)
    for link in page.find_all("a"):
        if link.get('href') and not link['href'].startswith("#"):
            content = content.replace(str(link), link.text)

    return content
def download_assets(selector, attr, url_middleware=None,
                    content_middleware=None, node_filter=None):
    nodes = doc.select(selector)
    for i, node in enumerate(nodes):
        if node_filter:
            if not node_filter(node):
                src = node[attr]
                node[attr] = ''
                print(' Skipping node with src ', src)
                continue

        if node[attr].startswith('data:'):
            continue

        url = urljoin(base_url, node[attr])

        if _is_blacklisted(url, url_blacklist):
            print(' Skipping downloading blacklisted url', url)
            node[attr] = ""
            continue

        if url_middleware:
            url = url_middleware(url)

        filename = derive_filename(url)
        node[attr] = filename

        print(" Downloading", url, "to filename", filename)
        download_file(url, destination, request_fn=request_fn,
                      filename=filename, middleware_callbacks=content_middleware)
def download_writing_topic_category(category_doc, title, level_id):
    destination = tempfile.mkdtemp()

    # Download a font
    font_url = make_fully_qualified_url(
        '//fonts.googleapis.com/css?family=Roboto:400,300,300italic,400italic,700,700italic'
    )
    download_file(font_url, destination, request_fn=make_request, filename='roboto.css')

    # Write out the HTML source, based on CSS formatting from
    # https://k12.thoughtfullearning.com/resources/writingtopics
    topics = (("<li>%s</li>" % topic.text)
              for topic in category_doc.select('.views-row'))
    html_source = """
    <!DOCTYPE html>
    <head>
        <link href='roboto.css' rel='stylesheet' type='text/css'>
        <style>
            ul {
                margin: 0 0 0 40px;
                padding: 0;
            }
            li {
                font-family: "Roboto", sans-serif;
                font-weight: 300;
                font-size: 19.2px;
                line-height: 24.96px;
                color: #202020;
                margin-top: 10px;
            }
        </style>
    </head>
    <body>
        <ul>%s</ul>
    </body>
    """ % ''.join(topics)

    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(html_source)

    print(" ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    zip_path = create_predictable_zip(destination)
    return nodes.HTML5AppNode(
        source_id="%s|%s" % (level_id, title),
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        files=[files.HTMLZipFile(zip_path)],
        language="en",
        thumbnail=writing_topic_thumbnail,
    )
def repl(match):
    src = match.group(1)
    if src.startswith('//localhost'):
        return 'src()'
    # Don't download data: files
    if src.startswith('data:'):
        return match.group(0)
    src_url = make_fully_qualified_url(src)
    derived_filename = derive_filename(src_url)
    download_file(src_url, destination, request_fn=make_request,
                  filename=derived_filename)
    return 'src("%s")' % derived_filename
def download_assets(doc, selector, attr, destination, middleware=None):
    """
    Find all assets in `attr` for DOM elements that match `selector` within doc
    and download them to `destination` dir.
    """
    nodes = doc.select(selector)
    for i, node in enumerate(nodes):
        url = make_fully_qualified_url(node[attr])
        filename = "%s_%s" % (i, os.path.basename(url))
        node[attr] = filename
        download_file(url, destination, request_fn=make_request,
                      filename=filename, middleware_callbacks=middleware)
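
# A minimal usage sketch (assumed): rewrite and localize all <img src=...> and
# <script src=...> assets of a parsed page before zipping it up.
#
#     doc = BeautifulSoup(html, "html.parser")
#     destination = tempfile.mkdtemp()
#     download_assets(doc, "img", "src", destination)
#     download_assets(doc, "script[src]", "src", destination)
#     with open(os.path.join(destination, "index.html"), "w") as f:
#         f.write(str(doc))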
def download_wikipedia_page(url, thumbnail, title):
    """ Create zip file to use for html pages """
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Download the main wikipedia page, apply a middleware processor,
    # and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
    )

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # Create an HTML5 app node
    html5app = nodes.HTML5AppNode(
        files=[files.HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=CHANNEL_LICENSE,
    )

    return html5app
def js_middleware(content, url, **kwargs):
    # Download all images referenced in JS files
    for img in IMAGES_IN_JS_RE.findall(content):
        img_url = make_fully_qualified_url('/images/%s' % img)
        print(" Downloading", img_url, "to filename", img)
        download_file(img_url, destination, subpath="images",
                      request_fn=make_request, filename=img)

    # Polyfill localStorage and document.cookie as iframes can't access them
    return (content
            .replace("localStorage", "_localStorage")
            .replace('document.cookie.split', '"".split')
            .replace('document.cookie', 'window._document_cookie'))
def download_wikipedia_page(url, thumbnail, title):
    # create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # download the main wikipedia page, apply a middleware processor,
    # and call it index.html
    localref, _ = download_file(
        url,
        destpath,
        filename="index.html",
        middleware_callbacks=process_wikipedia_page,
        request_fn=make_request,
    )

    # turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    # create an HTML5 app node
    html5app = HTML5AppNode(
        files=[HTMLZipFile(zippath)],
        title=title,
        thumbnail=thumbnail,
        source_id=url.split("/")[-1],
        license=licenses.PublicDomainLicense(),
    )

    return html5app
def get_phet_zip_file(zip_file_url, main_file_and_query):
    """
    Phet simulations are provided in the zip file `phet.zip`, and the entry point
    is passed as a GET parameter in `main_file_and_query`. To make these
    compatible with Kolibri's default behaviour of loading index.html, we will:
      - Rename index.html to phetindex.html
      - Add a custom index.html that uses a javascript redirect to
        phetindex.html?{sim_id}
    """
    u = urlparse(main_file_and_query)
    idk, sim_id = u.query.split('=')
    assert idk == 'id', 'unknown query string format found: ' + main_file_and_query
    main_file = u.scheme + '://' + u.netloc + u.path  # skip querystring

    destpath = tempfile.mkdtemp()
    LOGGER.info('saving phet zip file in dir ' + destpath)
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # Rename main_file to phetindex.html
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'phetindex.html')
        os.rename(src, dest)

        # Create the index.html redirect page
        index_html = PHET_INDEX_HTML_TEMPLATE.format(sim_id=sim_id)
        with open(os.path.join(zip_folder, 'index.html'), 'w') as indexf:
            indexf.write(index_html)

        # Always be zipping!
        return create_predictable_zip(zip_folder)

    except Exception as e:
        LOGGER.error("get_phet_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file_and_query, destpath, e))
        return None
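
# PHET_INDEX_HTML_TEMPLATE is referenced above but not shown in this section.
# Based on the docstring, a plausible definition (an assumption, not the
# original constant) is a tiny page that forwards the browser to the renamed
# entry point with the sim id preserved:
#
#     PHET_INDEX_HTML_TEMPLATE = """
#     <!DOCTYPE html>
#     <html>
#       <head>
#         <script>
#           // Redirect to the renamed PhET entry point, preserving the sim id
#           window.location.href = 'phetindex.html?id={sim_id}';
#         </script>
#       </head>
#       <body></body>
#     </html>
#     """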
def process_wikipedia_page(content, baseurl, destpath, **kwargs):
    page = BeautifulSoup(content, "html.parser")
    for image in page.find_all("img"):
        relpath, _ = download_file(make_fully_qualified_url(image["src"]),
                                   destpath, request_fn=make_request)
        image["src"] = relpath
    return str(page)
def repl(match):
    src = match.group(1)
    if src.startswith('//localhost'):
        return 'url()'
    # Don't download data: files
    if src.startswith('data:'):
        return match.group(0)
    src_url = urljoin(base_url, os.path.join(file_dir, src))
    if _is_blacklisted(src_url, url_blacklist):
        print(' Skipping downloading blacklisted url', src_url)
        return 'url()'
    derived_filename = derive_filename(src_url)
    download_file(src_url, destination, request_fn=request_fn,
                  filename=derived_filename)
    return 'url("%s")' % derived_filename
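
# `repl` is meant to be the replacement callback of a CSS url(...) regex. A
# minimal sketch of how it would be wired up (the regex name and pattern are
# assumptions, not from the original source):
#
#     CSS_URL_RE = re.compile(r"url\(['\"]?([^'\")]+)['\"]?\)")
#
#     def css_content_middleware(content, url, **kwargs):
#         # Rewrites every url(...) reference to a locally downloaded copy
#         return CSS_URL_RE.sub(repl, content)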
def repl(match):
    src = match.group(1)
    if src.startswith('//localhost'):
        return 'url()'
    # Don't download data: files
    if src.startswith('data:'):
        return match.group(0)

    parts = urlparse(src)
    root_url = None
    if url:
        root_parts = urlparse(url)
        root_url = url[:url.rfind('/') + 1]

    if parts.scheme and parts.netloc:
        src_url = src
    elif parts.path.startswith('/') and url:
        src_url = '{}://{}{}'.format(root_parts.scheme, root_parts.netloc, parts.path)
    elif url and root_url:
        src_url = urljoin(root_url, src)
    else:
        src_url = urljoin(base_url, src)

    if _is_blacklisted(src_url, url_blacklist):
        print(' Skipping downloading blacklisted url', src_url)
        return 'url()'

    derived_filename = derive_filename(src_url)

    # The _derive_filename function puts all files in the root, so all URLs need
    # to be rewritten. When using get_archive_filename, relative URLs still work.
    new_url = src
    if derive_filename == _derive_filename:
        if url and parts.path.startswith('/'):
            parent_url = derive_filename(url)
            new_url = os.path.relpath(src, os.path.dirname(parent_url))
        else:
            new_url = derived_filename

    download_file(src_url, destination, request_fn=request_fn,
                  filename=derived_filename)
    return 'url("%s")' % new_url
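
# `_derive_filename` is not shown in this section. Judging from the comment
# above (it "puts all files in the root"), a plausible sketch (an assumption,
# not the actual implementation) hashes the URL and keeps the extension:
#
#     import hashlib
#
#     def _derive_filename(url):
#         # Flatten any URL to a single root-level filename
#         name = os.path.basename(urlparse(url).path)
#         ext = os.path.splitext(name)[1]
#         return "%s%s" % (hashlib.md5(url.encode('utf-8')).hexdigest(), ext)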
def download_assets_from_github(repo_name, repo_path, destination):
    print(' Downloading files from GitHub repo %s/%s ...' % (repo_name, repo_path))
    access_token_param = ''
    if _GITHUB_API_TOKEN:
        access_token_param = '&access_token=%s' % _GITHUB_API_TOKEN
    url = 'https://api.github.com/repos/%s/contents/%s?ref=master%s' % (
        repo_name, repo_path, access_token_param)
    response = make_request(url)
    for item in response.json():
        if item['type'] == 'file':
            filename = item['name']
            download_url = item['download_url']
            print(' Downloading %s' % download_url)
            download_file(download_url, destination, request_fn=make_request,
                          filename=filename)
def js_middleware(content, url, **kwargs):
    if DEBUG_MODE:
        print('in js_middleware', url)

    # Download all images referenced in JS files
    for img in IMAGES_IN_JS_RE.findall(content):
        img_url = make_fully_qualified_url('/images/%s' % img)
        print("Downloading", img_url, "to filename", img)
        download_file(img_url, destination, subpath="images",
                      request_fn=make_request, filename=img)

    # Monkey-patch the js code that uses localStorage and document.cookie to
    # use window._localStorage (a plain js object) instead of the real
    # localStorage. This change primarily affects the functions getStoredValue
    # and setStoredValue, which are used to set the following properties:
    #   - diffRange: sets age-range for stories (needed to avoid a dialog popup)
    #   - lng: set to arabic
    #   - audio: toggles between read-aloud vs. no read-aloud
    return (content
            .replace("localStorage", "_localStorage")
            .replace('document.cookie.split', '"".split')
            .replace('document.cookie', 'window._document_cookie'))
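
# IMAGES_IN_JS_RE is referenced above but not defined in this section. A
# plausible pattern (an assumption, not the original constant) captures bare
# filenames that appear after an images/ path segment inside string literals,
# which matches how the loop rebuilds '/images/%s' URLs:
#
#     IMAGES_IN_JS_RE = re.compile(r"""['"]images/([^'"]+)['"]""")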
def download_assets(selector, attr, url_middleware=None,
                    content_middleware=None, node_filter=None):
    nodes = doc.select(selector)
    for i, node in enumerate(nodes):
        if node_filter:
            if not node_filter(node):
                src = node[attr]
                # node[attr] = ''
                node.decompose()
                print('Skipping node with src ', src)
                continue

        # Remove preconnect and preload links from the header
        relattr = node.get('rel', None)
        if relattr in ['preconnect', 'preload', 'apple-touch-icon']:
            node.decompose()
            continue

        url = make_fully_qualified_url(node[attr])

        if is_blacklisted(url):
            print('Skipping downloading blacklisted url', url)
            node.decompose()
            # node[attr] = ""
            continue

        if url_middleware:
            url = url_middleware(url)

        filename = derive_filename(url)
        node[attr] = filename

        print("Downloading", url, "to filename", filename)
        download_file(url, destination, request_fn=make_request,
                      filename=filename, middleware_callbacks=content_middleware)
def get_zip_file(zip_file_url, main_file):
    """
    HTML games are provided as zip files; the entry point of the game is
    `main_file`, which needs to be renamed to index.html to make it compatible
    with Kolibri.
    """
    destpath = tempfile.mkdtemp()
    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]
        zip_basename = zip_filename.rsplit('.', 1)[0]
        zip_folder = os.path.join(destpath, zip_basename)

        # Extract zip file contents.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename main_file to index.html.
        main_file = main_file.split('/')[-1]
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        return create_predictable_zip(zip_folder)
    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
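
# A minimal usage sketch (assumed): the returned zip path can be wrapped in an
# HTML5AppNode, mirroring the pattern used elsewhere in this file. The URL,
# source_id, and title values are illustrative.
#
#     zippath = get_zip_file('https://example.org/games/Mathematics.zip', 'main.html')
#     if zippath:
#         node = nodes.HTML5AppNode(
#             source_id='Mathematics',
#             title='Mathematics',
#             license=CHANNEL_LICENSE,
#             files=[files.HTMLZipFile(zippath)],
#         )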
def download_wikipedia_page(url, title, writer, thumbnail=None):
    """ Create zip file to use for html pages """
    # Create a temp directory to house our downloaded files
    destpath = tempfile.mkdtemp()

    # Generate details for files
    details = {
        'thumbnail': thumbnail,
        'source_id': url.split("/")[-1],
        'license': CHANNEL_LICENSE,
    }

    # Download the main wikipedia page, apply middleware processor, and call it index.html
    localref, _ = download_file(url, destpath, filename="index.html",
                                middleware_callbacks=process_wikipedia_page)

    # Turn the temp folder into a zip file
    zippath = create_predictable_zip(destpath)

    writer.add_file(str(PATH), title, zippath, **details)
def download_additional_assets(destination, puzzle_name):
    url = make_fully_qualified_url('/third-party/JS-Interpreter/compiled.js')
    download_file(url, os.path.join(destination, 'third-party/JS-Interpreter'),
                  request_fn=make_request, filename='compiled.js')

    dir_name = puzzle_name
    if dir_name == 'pond-tutor' or dir_name == 'pond-duck':
        dir_name = 'pond'

    url = make_fully_qualified_url('/pond/docs/generated/en/compressed.js')
    download_file(url, os.path.join(destination, 'pond/docs/generated/en'),
                  request_fn=make_request, filename='compressed.js')

    url = make_fully_qualified_url('third-party/ace/worker-javascript.js')
    download_file(url, destination, request_fn=make_request,
                  filename='worker-javascript.js')

    download_assets_from_github('google/blockly-games', 'appengine/pond/docs',
                                os.path.join(destination, 'pond/docs'))
    download_assets_from_github('google/blockly-games', 'appengine/%s' % dir_name,
                                os.path.join(destination, dir_name))
    download_assets_from_github('google/blockly-games', 'appengine/%s' % dir_name,
                                destination)
    download_assets_from_github('google/blockly-games', 'appengine/common',
                                os.path.join(destination, 'common'))
    download_assets_from_github('google/blockly', 'media', destination)
    download_assets_from_github('google/blockly', 'media',
                                os.path.join(destination, 'third-party/blockly/media'))
def get_zip_file(zip_file_url, main_file):
    """
    HTML games are provided as zip files; the entry point of the game is
    `main_file`, which needs to be renamed to index.html to make it compatible
    with Kolibri.
    """
    key = zip_file_url + main_file
    destpath = make_temporary_dir_from_key(key)

    # Check for "REPLACE WITH:" correction rule for the current `zip_file_url`
    replacement_url = should_replace_with(zip_file_url)
    if replacement_url:
        zip_file_url = replacement_url

    # Return cached version if already there
    final_webroot_path = os.path.join(destpath, 'webroot.zip')
    if os.path.exists(final_webroot_path):
        return final_webroot_path

    try:
        download_file(zip_file_url, destpath, request_fn=make_request)

        zip_filename = zip_file_url.split('/')[-1]     # e.g. Mathematics.zip
        zip_basename = zip_filename.rsplit('.', 1)[0]  # e.g. Mathematics/

        # July 31: handle edge cases where zip filename doesn't match folder name inside it
        awazchitras = ['Awazchitra_HI', 'Awazchitra_TL', 'Awazchitra_KN',
                       'Awazchitra_BN', 'Awazchitra_OD', 'Awazchitra_PN',
                       'Awazchitra_TM']
        for awazchitra in awazchitras:
            if awazchitra in zip_basename:
                zip_basename = zip_basename.replace('Awazchitra', 'AwazChitra')
        if '_KKS_Hi' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Hi', '_KKS_HI')
        # Mar 2: more edge cases where zip filename doesn't match folder name inside it
        if 'Memorygamekb' in zip_basename:
            zip_basename = zip_basename.replace('Memorygamekb', 'MemoryGamekb')
        if 'cityofstories' in zip_basename:
            zip_basename = zip_basename.replace('cityofstories', 'CityOfStories')
        # Jun 12: fix more edge cases where .zip filename doesn't match dir name
        if '_KKS_Gj' in zip_basename:
            zip_basename = zip_basename.replace('_KKS_Gj', '_KKS_GJ')
        if 'ShabdKhel' in zip_basename:
            zip_basename = zip_basename.replace('ShabdKhel', 'Shabdkhel')

        zip_folder = os.path.join(destpath, zip_basename)  # e.g. destpath/Mathematics/

        main_file = main_file.split('/')[-1]  # e.g. activity_name.html or index.html
        if 'KhelbadiKahaniyan_MR' in zip_basename:
            # Inconsistency --- `main_file` contains dir name, and not index.html
            main_file = 'index.html'
        # Jul 8th: handle weird case-insensitive webserver main_file
        if main_file == 'mainexpand.html':
            main_file = 'mainExpand.html'  # <-- this is the actual filename in the zip

        # Zip files from the Pratham website have the web content inside a subfolder
        # of the same name as the zip filename. We need to recreate these zip files
        # to make sure the index.html is in the root of the zip.
        local_zip_file = os.path.join(destpath, zip_filename)
        with zipfile.ZipFile(local_zip_file) as zf:
            # If main_file is in the root (like zips from the game repository)
            # then we need to extract the zip contents to subfolder zip_basename/
            for zfileinfo in zf.filelist:
                if zfileinfo.filename == main_file:
                    destpath = os.path.join(destpath, zip_basename)
            # Extract zip so main file will be in destpath/zip_basename/index.html
            zf.extractall(destpath)

        # In some cases, the files are under the www directory,
        # let's move them up one level.
        www_dir = os.path.join(zip_folder, 'www')
        if os.path.isdir(www_dir):
            files = os.listdir(www_dir)
            for f in files:
                shutil.move(os.path.join(www_dir, f), zip_folder)

        # Rename `main_file` to index.html
        src = os.path.join(zip_folder, main_file)
        dest = os.path.join(zip_folder, 'index.html')
        os.rename(src, dest)

        # Logic to add margin-top:44px; for games that match Corrections tab
        add_margin_top = False
        for row in PRADIGI_CORRECTIONS_LIST:
            if row[CORRECTIONS_ACTION_KEY] == ADD_MARGIN_TOP_ACTION:
                pat = row[CORRECTIONS_SOURCE_URL_PAT_KEY]
                m = pat.match(zip_file_url)
                if m:
                    add_margin_top = True
        if add_margin_top:
            if zip_file_url.endswith('CourseContent/Games/Mathematics.zip'):
                LOGGER.info("adding body.margin-top:44px; to ALL .html files in: %s"
                            % zip_file_url)
                for root, dirs, files in os.walk(zip_folder):
                    for file in files:
                        if file.endswith(".html"):
                            add_body_margin_top(root, file)
            else:
                LOGGER.info("adding body.margin-top:44px; to index.html in: %s"
                            % zip_file_url)
                add_body_margin_top(zip_folder, 'index.html')

        # Replace occurrences of `main_file` with index.html to avoid broken links
        for root, dirs, files in os.walk(zip_folder):
            for file in files:
                if file.endswith(".html") or file.endswith(".js"):
                    file_path = os.path.join(root, file)
                    # use bytes to avoid Unicode errors "invalid start/continuation byte"
                    bytes_in = open(file_path, 'rb').read()
                    bytes_out = bytes_in.replace(main_file.encode('utf-8'), b'index.html')
                    open(file_path, 'wb').write(bytes_out)

        # Create the predictable zip file and copy it to the cached webroot location
        tmp_predictable_zip_path = create_predictable_zip(zip_folder)
        shutil.copyfile(tmp_predictable_zip_path, final_webroot_path)
        return final_webroot_path

    except Exception as e:
        LOGGER.error("get_zip_file: %s, %s, %s, %s" %
                     (zip_file_url, main_file, destpath, e))
        return None
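
# `add_body_margin_top` is referenced above but not shown in this section. A
# plausible sketch (hypothetical, not the original helper) injects an inline
# style on <body> using BeautifulSoup:
#
#     def add_body_margin_top(containing_dir, filename, margin="44px"):
#         file_path = os.path.join(containing_dir, filename)
#         with open(file_path, 'rb') as inf:
#             page = BeautifulSoup(inf.read(), 'html.parser')
#         if page.body:
#             page.body['style'] = 'margin-top:%s;' % margin
#             with open(file_path, 'wb') as outf:
#                 outf.write(page.encode())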
def scrape_content(title, content_url):
    """
    title: Boys' clothing
    content_url: http://www.touchableearth.org/china-culture-boys-clothing/
    """
    print(" Scraping content node: %s (%s)" % (title, content_url))

    doc = get_parsed_html_from_url(content_url)
    if not doc:  # 404
        return None

    description = create_description(doc)
    source_id = doc.select_one(".current_post.active .post_id")["value"]

    base_node_attributes = {
        "source_id": source_id,
        "title": title,
        "license": TE_LICENSE,
        "description": description,
    }

    youtube_iframe = doc.select_one(".video-container iframe")
    if youtube_iframe:
        youtube_url = youtube_iframe["src"]
        youtube_id = get_youtube_id_from_url(youtube_url)
        if not youtube_id:
            print(" *** WARNING: youtube_id not found for content url", content_url)
            print(" Skipping.")
            return None

        try:
            info = ydl.extract_info(youtube_url, download=False)
            subtitles = info.get("subtitles")
            subtitle_languages = subtitles.keys() if subtitles else []
            print(" ... with subtitles in languages:", subtitle_languages)
        except youtube_dl.DownloadError as e:
            # Some of the videos have been removed from the YouTube channel --
            # skip creating content nodes for them entirely so they don't show up
            # as non-loadable videos in Kolibri.
            print(" NOTE: Skipping video download due to error: ", e)
            return None

        video_node = nodes.VideoNode(
            **base_node_attributes,
            derive_thumbnail=True,
            files=[WatermarkedYouTubeVideoFile(youtube_id=youtube_id)],
        )

        # Add subtitles in whichever languages are available.
        for language in subtitle_languages:
            video_node.add_file(
                files.YouTubeSubtitleFile(youtube_id=youtube_id, language=language))

        return video_node

    img = doc.select_one(".uncode-single-media-wrapper img")
    if img:
        img_src = img.get("data-guid") or img["src"]

        destination = tempfile.mkdtemp()
        download_file(img_src, destination, request_fn=make_request,
                      filename="image.jpg")

        with open(os.path.join(destination, "index.html"), "w") as f:
            f.write("""
                <!doctype html>
                <html>
                <head></head>
                <body>
                    <img src="image.jpg" style="width: 100%; max-width: 1200px;" />
                </body>
                </html>
            """)

        zip_path = create_predictable_zip(destination)

        return nodes.HTML5AppNode(
            **base_node_attributes,
            files=[files.HTMLZipFile(zip_path)],
            thumbnail=img_src,
        )

    return None
def overlay_and_watermark_video(filename, youtube_id):
    # Check if we've processed this file before -- is it in the cache?
    key = files.generate_key("WATERMARKED", filename, settings=WATERMARK_SETTINGS)
    if not config.UPDATE and files.FILECACHE.get(key):
        return files.FILECACHE.get(key).decode('utf-8')

    # Create a temporary filename to write the watermarked video.
    tempf = tempfile.NamedTemporaryFile(suffix=".{}".format(file_formats.MP4),
                                        delete=False)
    tempf.close()
    tempfile_name = tempf.name

    # Now watermark it with the Touchable Earth logo!
    print("\t--- Watermarking and adding overlay ", filename)

    # First add the overlay image -- this is the image shown as the first frame
    # so that when the video hasn't been played yet, it will show this image
    # rather than a black screen (since Touchable Earth's videos start from
    # a blank black screen).

    # Download the overlay image based on the YouTube ID
    overlay_src = 'https://i.ytimg.com/vi_webp/%s/maxresdefault.webp' % youtube_id
    print("\t ... grabbing overlay image from %s" % overlay_src)
    destination = tempfile.mkdtemp()
    overlay_filename = "overlay.webp"
    overlay_file = os.path.join(destination, overlay_filename)
    _, response = download_file(overlay_src, destination, request_fn=sess.get,
                                filename=overlay_filename)

    video_clip = mpe.VideoFileClip(config.get_storage_path(filename), audio=True)

    if response.status_code == 200:
        overlay_clip = mpe.ImageClip(overlay_file).set_duration(0.1)
        concat_clips = mpe.concatenate_videoclips([overlay_clip, video_clip])
    else:
        concat_clips = video_clip
        print("\t WARNING: Could not download overlay image file from %s" % overlay_src)

    # Now create the watermark logo as a clip ...
    logo = (mpe.ImageClip(WATERMARK_SETTINGS["image"])
            .set_duration(concat_clips.duration)
            .resize(height=WATERMARK_SETTINGS["height"])
            .margin(right=WATERMARK_SETTINGS["right"],
                    bottom=WATERMARK_SETTINGS["bottom"],
                    opacity=0)
            .set_pos(WATERMARK_SETTINGS["position"]))

    # And then combine it with the video clip.
    composite = mpe.CompositeVideoClip([concat_clips, logo])
    composite.duration = concat_clips.duration
    composite.write_videofile(tempfile_name, threads=4)

    # Now move the watermarked file to Ricecooker storage and hash its name
    # so it can be validated.
    watermarked_filename = "{}.{}".format(files.get_hash(tempfile_name),
                                          file_formats.MP4)
    files.copy_file_to_storage(watermarked_filename, tempfile_name)
    os.unlink(tempfile_name)
    os.unlink(overlay_file)
    files.FILECACHE.set(key, bytes(watermarked_filename, "utf-8"))
    return watermarked_filename
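
# WATERMARK_SETTINGS is referenced above but not shown in this section. Based
# on the keys used ("image", "height", "right", "bottom", "position"), a
# plausible definition (an assumption, not the original constant) would be:
#
#     WATERMARK_SETTINGS = {
#         "image": "resources/te_logo.png",   # hypothetical path to the logo file
#         "height": 50,                       # logo height in pixels
#         "right": 10,                        # transparent right margin
#         "bottom": 10,                       # transparent bottom margin
#         "position": ("right", "bottom"),    # moviepy set_pos() anchor
#     }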
def download_sim(self, topic, sim, keywords, language):
    """
    Download, zip, and add a node for a sim, as well as any associated video.
    """
    localized_sim = sim["localizedSimulations"][0]

    print("\tProcessing sim:", localized_sim["title"])

    dst = tempfile.mkdtemp()
    download_file(
        localized_sim["downloadUrl"],
        dst,
        filename="index.html",
        request_fn=sess.get,
        middleware_callbacks=[process_sim_html],
    )
    zippath = create_predictable_zip(dst)

    authors = re.sub(r" \(.*?\)", "", sim["credits"]["designTeam"])
    authors = re.sub(r"<br\/?>", ", ", authors)
    title = localized_sim["title"]

    if language == "ar":
        if title in ARABIC_NAME_CATEGORY:
            title = ARABIC_NAME_CATEGORY[title]
        if title in SIM_TYPO:
            title = SIM_TYPO[title]

    # create a node for the sim
    simnode = HTML5AppNode(
        source_id="sim-%d" % localized_sim["id"],
        files=[HTMLZipFile(zippath)],
        title=title,
        description=sim["description"][language][:200],
        license=CC_BYLicense(
            "PhET Interactive Simulations, University of Colorado Boulder"),
        # author=authors,
        # tags=[keywords[topic] for topic in sim["topicIds"]],
        thumbnail=sim["media"]["thumbnailUrl"],
        language=getlang(language),
    )

    # if there's a video, extract it and put it in the topic right before the sim
    videos = sim["media"]["vimeoFiles"]
    if videos:
        video_url = [v for v in videos if v.get("height") == 540][0]["link"]

        videonode = VideoNode(
            source_id="video-%d" % localized_sim["id"],
            files=[VideoFile(video_url)],
            title="Video: %s" % localized_sim["title"],
            license=CC_BYLicense(
                "PhET Interactive Simulations, University of Colorado Boulder"),
            thumbnail=sim["media"]["thumbnailUrl"],
        )

        topic.add_child(videonode)

    # add the sim node into the topic
    topic.add_child(simnode)
def download_content_node(category_node, url, title, thumbnail=None, description=None):
    doc = get_parsed_html_from_url(url)

    destination = tempfile.mkdtemp()
    doc = download_static_assets(doc, destination,
                                 'https://k12.thoughtfullearning.com',
                                 request_fn=make_request,
                                 url_blacklist=url_blacklist)

    remove_node(doc, '#header')
    remove_node(doc, '.subMenuBarContainer')
    remove_node(doc, '.breadbookmarkcontainer')
    remove_node(doc, '.resourcePageTypeTitle')
    remove_node(doc, '.sharethis-wrapper')
    remove_node(doc, '.ccBlock')
    remove_node(doc, '#block-views-resource-info-block-block-1')
    remove_node(doc, '#block-views-resource-info-block-block')
    remove_node(doc, '.productSuggestionContainer')
    remove_node(doc, 'footer')

    # For minilessons
    remove_node(doc, '.field-name-field-minilesson-downloadables')

    # For writing assessments
    remove_node(doc, '.assessmentTGLink')
    remove_node(doc, '.assessmentModelRubrics')
    remove_node(doc, '.view-display-id-attachment_1')

    # Write out the HTML source.
    with open(os.path.join(destination, "index.html"), "w") as f:
        f.write(str(doc))

    print(" ... downloaded to %s" % destination)
    #preview_in_browser(destination)

    thumbnail_path = None
    if thumbnail:
        # Manually download the thumbnail and use it so we can lowercase the
        # extension to be accepted by Ricecooker.
        thumbnail_filename = derive_filename(thumbnail)
        thumbnail_path = os.path.join(destination, thumbnail_filename)
        download_file(thumbnail, destination, request_fn=make_request,
                      filename=thumbnail_filename)

    # If there is an embedded video in the page source, grab it as a video node.
    video_node = None
    iframe = doc.select_one('.embedded-video iframe')
    if iframe:
        youtube_url = iframe['src']
        youtube_id = get_youtube_id_from_url(youtube_url)
        info = ydl.extract_info(youtube_url, download=False)
        video_title = info['title']
        print(" ... and with video titled %s from www.youtube.com/watch?v=%s"
              % (video_title, youtube_id))

        video_node = nodes.VideoNode(
            source_id=youtube_id,
            title=truncate_metadata(info['title']),
            license=licenses.CC_BY_NC_SALicense(
                copyright_holder=truncate_metadata('Thoughtful Learning')),
            description=info['description'],
            language="en",
            derive_thumbnail=True,
            files=[files.YouTubeVideoFile(youtube_id)],
        )
        category_node.add_child(video_node)

    zip_path = create_predictable_zip(destination)
    app_node = nodes.HTML5AppNode(
        source_id=url,
        title=truncate_metadata(title),
        license=licenses.CC_BY_NC_SALicense(
            copyright_holder=truncate_metadata('Thoughtful Learning')),
        description=description,
        thumbnail=thumbnail_path,
        files=[files.HTMLZipFile(zip_path)],
        language="en",
    )
    category_node.add_child(app_node)