Code Example #1
    def write_css_js(self, filepath):
        # Append the shared stylesheet and script to the zip so pages can
        # reference them from the css/ and js/ folders at the archive root.
        with html_writer.HTMLWriter(
                filepath, "a") as zipper, open("chefdata/styles.css") as f:
            content = f.read()
            zipper.write_contents("styles.css", content, directory="css/")

        with html_writer.HTMLWriter(
                filepath, "a") as zipper, open("chefdata/scripts.js") as f:
            content = f.read()
            zipper.write_contents("scripts.js", content, directory="js/")
Code Example #2
 def write_images(self, filepath, content):
     # Collect image URLs from the content (populates self.images), then
     # download each one into the zip, skipping images that fail to download
     self.get_images(content)
     with html_writer.HTMLWriter(filepath, "a") as zipper:
         for img_src, img_filename in self.images.items():
             try:
                 zipper.write_url(img_src, img_filename, directory="files")
             except requests.exceptions.HTTPError:
                 pass
Code Example #3
 def write_contents(self,
                    filepath_index,
                    filename,
                    content,
                    directory="files"):
     # Wrap the content in a minimal HTML shell that points back to the
     # shared stylesheet and script at the archive root
     with html_writer.HTMLWriter(filepath_index, "a") as zipper:
         content = ('<html><head><meta charset="utf-8">'
                    '<link rel="stylesheet" href="../css/styles.css"></head>'
                    '<body>{}<script src="../js/scripts.js"></script>'
                    '</body></html>').format(content)
         zipper.write_contents(filename, content, directory=directory)
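Pages written this way land in the zip's files/ directory by default, so the shell references the shared assets one level up (../css/styles.css, ../js/scripts.js). A hypothetical call, where `scraper`, the page name, and the body are invented for illustration:

    # Writes files/page1.html; inside that page, ../css/styles.css resolves
    # to the css/ folder that write_css_js populated at the archive root.
    scraper.write_contents("chefdata/activity.zip", "page1.html",
                           "<h1>Sample body</h1>")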
Code Example #4
    def _download_file(self, write_to_path):
        with html_writer.HTMLWriter(write_to_path) as zipper:
            try:
                self.zipper = zipper
                self.to_zip(filename='index.html')
            except Exception as e:
                # Errors here would otherwise surface only as a missing
                # index.html, so log them for more descriptive debugging
                LOGGER.error(str(e))
Code Example #5
 def write_images(self, filepath, images):
     with html_writer.HTMLWriter(filepath, "a") as zipper:
         for img_src, img_filename in images.items():
             try:
                 if img_src.startswith("data:image/"):
                     # Data URIs are already inline; nothing to download
                     pass
                 else:
                     # Probe the URL first so unreachable hosts raise
                     # ConnectTimeout before write_url attempts the download
                     requests.get(img_src, timeout=40)
                     zipper.write_url(img_src, img_filename, directory="")
             except requests.exceptions.HTTPError:
                 pass
             except requests.exceptions.ConnectTimeout as e:
                 LOGGER.info(str(e))
Code Example #6
 def write_images(self, filepath, images):
     # `downloader` and `sess` are defined elsewhere in the module
     # (a downloader helper and a shared requests session)
     with html_writer.HTMLWriter(filepath, "a") as zipper:
         for img_src, img_filename in images.items():
             try:
                 if img_src.startswith("data:image/") or img_src.startswith(
                         "file://"):
                     # Inline data URIs and local file:// URIs are skipped
                     pass
                 else:
                     # zipper.write_url(img_src, img_filename, directory="")
                     zipper.write_contents(img_filename,
                                           downloader.read(img_src,
                                                           timeout=5,
                                                           session=sess),
                                           directory="")
             except (requests.exceptions.HTTPError,
                     requests.exceptions.ConnectTimeout,
                     requests.exceptions.ConnectionError, FileNotFoundError,
                     requests.exceptions.ReadTimeout):
                 pass
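Unlike write_url, this variant fetches the bytes itself through a downloader helper so that a shared requests session (retries, headers, cookies) can be reused across images. A sketch of the module-level pieces the snippet assumes; the retry settings here are an assumption:

    import requests
    from requests.adapters import HTTPAdapter
    from ricecooker.utils import downloader, html_writer

    # A shared session like the `sess` used above, with basic retries
    sess = requests.Session()
    sess.mount("http://", HTTPAdapter(max_retries=3))
    sess.mount("https://", HTTPAdapter(max_retries=3))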
Code Example #7
 def write_index(self, filepath, content):
     # "w" mode creates a fresh zip and writes content as its index.html
     with html_writer.HTMLWriter(filepath, "w") as zipper:
         zipper.write_index_contents(content)
Code Example #8
File: base.py Project: learningequality/webmixer
 def download_file(self, write_to_path):
     # Generate a .zip file
     with html_writer.HTMLWriter(write_to_path) as zipper:
         self.zipper = zipper
         self.to_zip(filename='index.html')
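Here the writer is stored on the instance so that helper methods called from to_zip can append assets to the same archive. A hypothetical subclass illustrating that division of labor (the class names and page body are invented, not webmixer's actual API):

    class MyPageScraper(BasicScraper):
        def to_zip(self, filename=None):
            # Helpers can reach self.zipper to add pages and assets
            self.zipper.write_index_contents(
                '<html><body>Scraped page</body></html>')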
Code Example #9
def scrape_snack_page(slug, attempts=5):
    """ Writes activity to a zipfile
        Args:
            slug (str): url slug (e.g. /snacks/drawing-board)
            attempts (int): number of times to attempt a download
        Returns:
            write_to_path (str): path to generated zip
            tags ([str]): list of tags scraped from activity page
    """
    tags = []
    write_to_path = os.path.sep.join(
        [SNACK_DIRECTORY, "{}.zip".format(slug.split('/')[-1])])

    try:
        contents = BeautifulSoup(read(slug), 'html5lib')
        main_contents = contents.find('div', {'class': 'activity'})

        # Gather keywords from page
        tags.extend(
            scrape_keywords(main_contents,
                            'field-name-field-activity-subject'))
        tags.extend(
            scrape_keywords(main_contents, 'field-name-field-activity-tags'))

        # Don't rezip activities that have already been zipped
        if os.path.isfile(write_to_path):
            return write_to_path, tags

        with html_writer.HTMLWriter(write_to_path) as zipper:
            write_contents = BeautifulSoup("", "html5lib")

            # Scrape stylesheets
            for stylesheet in contents.find_all('link', {'rel': 'stylesheet'}):
                # Don't scrape external style sheets (e.g. fontawesome, google fonts)
                if "exploratorium.edu" not in stylesheet['href']:
                    continue
                style_contents = scrape_style(stylesheet['href'], zipper)
                filename = stylesheet['href'].split('/')[-1]
                stylesheet['href'] = zipper.write_contents(filename,
                                                           style_contents,
                                                           directory="css")
                write_contents.head.append(stylesheet)

            # Remove scripts and any unneeded sections
            cluster = main_contents.find('div', {'id': 'curated-cluster'})
            cluster and cluster.decompose()
            service_links = main_contents.find(
                'div', {'class': 'activity-service-links'})
            service_links and service_links.decompose()
            for script in main_contents.find_all("script"):
                script.decompose()

            # Get rid of hardcoded height/width on slideshow element
            slideshow = main_contents.find('div', {'class': 'field-slideshow'})
            if slideshow:
                del slideshow['style']

            # Add images
            for img in main_contents.find_all('img'):
                img['src'] = zipper.write_url(format_url(img['src']),
                                              img['src'].split('/')[-1],
                                              directory="images")

            # Add videos embedded from youtube
            for video in main_contents.find_all('div', {'class': 'yt-player'}):
                yt_video_path = download_web_video(
                    video['data-ytid'], "{}.mp4".format(video['data-ytid']))
                video_tag = generate_video_tag(yt_video_path, zipper)
                video_tag['style'] = video.find('div', {
                    'class': 'placeholder'
                }).get('style')
                video.replaceWith(video_tag)

            # Add videos embedded from brightcove and remove playlist element (if any)
            for k, v in get_brightcove_mapping(main_contents,
                                               get_playlist=True).items():
                video_path = download_web_video(v['url'], "{}.mp4".format(k))
                if v.get('original_el'):
                    v['original_el'].replaceWith(
                        generate_video_tag(video_path, zipper))
                elif v.get('append_to'):
                    if v.get('title'):
                        p_tag = contents.new_tag("p")
                        p_tag.string = v['title']
                        p_tag['style'] = ("margin-top: 40px; "
                                          "margin-bottom: 10px")
                        v['append_to'].parent.append(p_tag)
                    v['append_to'].parent.append(
                        generate_video_tag(video_path, zipper))
            playlist = main_contents.find(
                'div', {'id': 'media-collection-banner-playlist'})
            if playlist:
                playlist.decompose()

            # Handle links (need to start with parent as beautifulsoup returns parent as None on links)
            for paragraph in main_contents.find_all(
                    'p') + main_contents.find_all('li'):
                for link in paragraph.find_all('a'):
                    # Skip any previously parsed links
                    if zipper.contains(link['href']):
                        continue

                    # Just bold activities and remove link
                    elif "exploratorium.edu/snacks/" in link['href']:
                        bold_tag = contents.new_tag("b")
                        bold_tag.string = link.text
                        link.replaceWith(bold_tag)

                    # If it's an image, replace the tag with just the image
                    elif link.find('img'):
                        link.replaceWith(link.find('img'))

                    # Get downloadable files and attach them to new pages
                    elif "/sites/default/files/" in link['href']:
                        link['href'] = generate_download_page(
                            link['href'], zipper)

                    # Get any referenced videos
                    elif "exploratorium.edu" in link['href']:
                        linked_page = BeautifulSoup(read(link['href']),
                                                    'html5lib')
                        link.replaceWith(link.text.replace(link['href'], ''))
                        for k, v in get_brightcove_mapping(
                                linked_page).items():
                            video_path = download_web_video(
                                v['url'], "{}.mp4".format(k))
                            paragraph.append(
                                generate_video_tag(video_path, zipper))

                    # Scrape any images
                    elif next((e for e in IMAGE_EXTENSIONS
                               if link['href'].lower().endswith(e)), None):
                        img_tag = contents.new_tag('img')
                        img_tag['src'] = zipper.write_url(
                            link['href'],
                            link['href'].split('/')[-1],
                            directory="images")
                        img_tag['style'] = "max-width: 100%;"
                        paragraph.append(img_tag)
                        link.replaceWith(link.text)

                    # Remove hyperlink from external links
                    else:
                        if (link['href'] not in link.text
                                and link.text not in link['href']):
                            link.string += " ({}) ".format(link['href'])
                        link.replaceWith(link.text)

            # Write contents and custom tags
            write_contents.body.append(main_contents)
            # Add custom style tag and custom script to handle slideshow
            write_contents.head.append(generate_custom_style_tag())
            write_contents.body.append(generate_custom_script_tag())

            # Write main index.html file
            zipper.write_index_contents(
                write_contents.prettify().encode('utf-8-sig'))

    except Exception as e:
        # Reattempt if there are attempts left
        if attempts > 0:
            return scrape_snack_page(slug, attempts=attempts - 1)
        else:
            LOGGER.error("Could not scrape {} ({})".format(slug, str(e)))
    return write_to_path, tags
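A hedged sketch of driving this scraper, with an invented slug; scrape_snack_page itself skips slugs whose zip already exists:

    # Hypothetical driver: zip one activity and collect its tags
    write_to_path, tags = scrape_snack_page('/snacks/drawing-board')
    print("Wrote {} with tags: {}".format(write_to_path, ", ".join(tags)))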
Code Example #10
 def write_img(self, img_url, filepath, img_filename):
     with html_writer.HTMLWriter(filepath, "a") as zipper:
         # write_url returns the image's relative path inside the zip;
         # return it so callers can reference the archived file
         return zipper.write_url(img_url, img_filename, directory="files")
Code Example #11
 def write_index(self, content):
     with html_writer.HTMLWriter(self.filename, "w") as zipper:
         zipper.write_index_contents(content)
Code Example #12
 def write_img(self, img_url, filename):
     with html_writer.HTMLWriter(self.filename, "a") as zipper:
         zipper.write_url(img_url, filename, directory="files")
Code Example #13
 def write(self, filename, content):
     with html_writer.HTMLWriter(self.filename, "a") as zipper:
         zipper.write_contents(filename, content, directory="files")
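Taken together, the last three helpers follow one pattern: open the archive once in "w" mode to write index.html, then re-open it in "a" mode for each asset. A minimal end-to-end sketch, where MyScraper is an invented class wrapping these three methods around a shared self.filename:

    scraper = MyScraper(filename="activity.zip")
    scraper.write_index('<html><body><img src="files/logo.png"></body></html>')
    scraper.write_img("https://example.com/logo.png", "logo.png")
    scraper.write("notes.txt", "plain-text attachment")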