コード例 #1
0
    def parse_page(self, url, processed_pages=None, index=None):
        """Load a Notion page in the browser, clean it up and export it as HTML.

        The page is opened with ``self.driver``, scrolled and expanded so all
        of its content is in the DOM, then parsed with BeautifulSoup. Unwanted
        tags are removed, images / stylesheets / fonts are cached through
        ``self.cache_file``, custom meta tags, fonts and injects from
        ``self.get_page_config(url)`` are applied, and the result is written
        into ``self.dist_folder``. Links to other pages are collected and,
        unless ``self.args["single_page"]`` is truthy, parsed recursively.

        Args:
            url: address of the page to load and export.
            processed_pages: dict mapping already exported page urls to the
                html file name they were written to. It is updated in place
                and shared with the recursive calls. A new dict is created
                when omitted.
            index: url of the page exported as ``index.html``; defaults to
                ``url`` for the first call.

        Returns:
            The ``processed_pages`` dict, or ``None`` when the page content
            did not load before the timeout.
        """
        # create a fresh dict per top-level call; a mutable default argument
        # would be shared between unrelated calls and leak their results
        if processed_pages is None:
            processed_pages = {}

        # if this is the first page being parsed, set it as the index.html
        if not index:
            index = url

        log.info(f"Parsing page '{url}'")
        log.debug(f"Using page config: {self.get_page_config(url)}")
        self.driver.get(url)

        try:
            WebDriverWait(self.driver, 60).until(notion_page_loaded())
        except TimeoutException:
            log.critical(
                "Timeout waiting for page content to load, or no content found."
                " Are you sure the page is set to public?")
            return

        # scroll at the bottom of the notion-scroller element to load all elements
        # continue once there are no changes in height after a timeout
        # don't do this if the page has a calendar database on it or it will load forever
        calendar = self.driver.find_elements_by_class_name(
            "notion-calendar-view")
        if not calendar:
            scroller = self.driver.find_element_by_css_selector(
                ".notion-frame > .notion-scroller")
            last_height = scroller.get_attribute("scrollHeight")
            log.debug(
                f"Scrolling to bottom of notion-scroller (height: {last_height})"
            )
            while True:
                self.driver.execute_script(
                    "arguments[0].scrollTo(0, arguments[0].scrollHeight)",
                    scroller)
                time.sleep(self.args["timeout"])
                new_height = scroller.get_attribute("scrollHeight")
                log.debug(
                    f"New notion-scroller height after timeout is: {new_height}"
                )
                if new_height == last_height:
                    break
                last_height = new_height

        # function to expand all the toggle blocks in the page to make their content visible
        # so we can hook up our custom toggle logic afterwards
        def open_toggle_blocks(timeout, exclude=None):
            # toggles already handled by a previous pass; never use a mutable
            # default here since the list is appended to below
            opened_toggles = [] if exclude is None else exclude
            toggle_blocks = self.driver.find_elements_by_class_name(
                "notion-toggle-block")
            log.debug(
                f"Opening {len(toggle_blocks)} new toggle blocks in the page")
            for toggle_block in toggle_blocks:
                if toggle_block not in opened_toggles:
                    toggle_button = toggle_block.find_element_by_css_selector(
                        "div[role=button]")
                    # check if the toggle is already open by the direction of its arrow
                    is_toggled = "(180deg)" in (
                        toggle_button.find_element_by_tag_name(
                            "svg").get_attribute("style"))
                    if not is_toggled:
                        # click on it, then wait until all elements are displayed
                        toggle_button.click()
                        try:
                            WebDriverWait(self.driver, timeout).until(
                                toggle_block_has_opened(toggle_block))
                        except TimeoutException:
                            log.warning(
                                "Timeout waiting for toggle block to open."
                                " Likely it's already open, but doesn't hurt to check."
                            )
                        except Exception as exception:
                            # best effort: a toggle failing to open should not
                            # abort the export of the whole page
                            log.error(
                                f"Error trying to open a toggle block: {exception}"
                            )
                        opened_toggles.append(toggle_block)

            # after all toggles have been opened, check the page again to see if
            # any toggle block had nested toggle blocks inside them
            new_toggle_blocks = self.driver.find_elements_by_class_name(
                "notion-toggle-block")
            if len(new_toggle_blocks) > len(toggle_blocks):
                # if so, run the function again
                open_toggle_blocks(timeout, opened_toggles)

        # open the toggle blocks in the page
        open_toggle_blocks(self.args["timeout"])

        # creates soup from the page to start parsing
        soup = BeautifulSoup(self.driver.page_source, "html.parser")

        # remove scripts and other tags we don't want / need
        for unwanted in soup.findAll("script"):
            unwanted.decompose()
        for intercom_frame in soup.findAll("div", {"id": "intercom-frame"}):
            intercom_frame.decompose()
        for intercom_div in soup.findAll(
                "div", {"class": "intercom-lightweight-app"}):
            intercom_div.decompose()
        for overlay_div in soup.findAll("div",
                                        {"class": "notion-overlay-container"}):
            overlay_div.decompose()
        for vendors_css in soup.find_all("link",
                                         href=lambda x: x and "vendors~" in x):
            vendors_css.decompose()

        # clean up the default notion meta tags
        for tag in [
                "description",
                "twitter:card",
                "twitter:site",
                "twitter:title",
                "twitter:description",
                "twitter:image",
                "twitter:url",
                "apple-itunes-app",
        ]:
            unwanted_tag = soup.find("meta", attrs={"name": tag})
            if unwanted_tag:
                unwanted_tag.decompose()
        for tag in [
                "og:site_name",
                "og:type",
                "og:url",
                "og:title",
                "og:description",
                "og:image",
        ]:
            unwanted_og_tag = soup.find("meta", attrs={"property": tag})
            if unwanted_og_tag:
                unwanted_og_tag.decompose()

        # set custom meta tags
        custom_meta_tags = self.get_page_config(url).get("meta", [])
        for custom_meta_tag in custom_meta_tags:
            tag = soup.new_tag("meta")
            for attr, value in custom_meta_tag.items():
                tag.attrs[attr] = value
            log.debug(f"Adding meta tag {str(tag)}")
            soup.head.append(tag)

        # process images & emojis
        cache_images = True
        for img in soup.findAll("img"):
            if img.has_attr("src"):
                if cache_images and "data:image" not in img["src"]:
                    img_src = img["src"]
                    # if the path starts with /, it's one of notion's predefined images
                    if img["src"].startswith("/"):
                        img_src = "https://www.notion.so" + img["src"]

                    cached_image = self.cache_file(img_src)
                    img["src"] = cached_image
                else:
                    if img["src"].startswith("/"):
                        img["src"] = "https://www.notion.so" + img["src"]

            # on emoji images, cache their sprite sheet and re-set their background url
            if img.has_attr("class") and "notion-emoji" in img["class"]:
                style = cssutils.parseStyle(img["style"])
                spritesheet = style["background"]
                # the url is whatever sits between the first pair of parentheses
                spritesheet_url = spritesheet[spritesheet.find("(") +
                                              1:spritesheet.find(")")]
                cached_spritesheet_url = self.cache_file(
                    "https://www.notion.so" + spritesheet_url)
                style["background"] = spritesheet.replace(
                    spritesheet_url, str(cached_spritesheet_url))
                img["style"] = style.cssText

        # process stylesheets
        for link in soup.findAll("link", rel="stylesheet"):
            if link.has_attr("href") and link["href"].startswith("/"):
                # we don't need the vendors stylesheet
                if "vendors~" in link["href"]:
                    continue
                cached_css_file = self.cache_file("https://www.notion.so" +
                                                  link["href"])
                # NOTE(review): the file is opened read-only and the parsed
                # stylesheet is never written back, so the rewritten font
                # urls below do not reach the cached css file - confirm
                # whether that is intended
                with open(self.dist_folder / cached_css_file, "rb") as f:
                    stylesheet = cssutils.parseString(f.read())
                    # open the stylesheet and check for any font-face rule,
                    for rule in stylesheet.cssRules:
                        if rule.type == cssutils.css.CSSRule.FONT_FACE_RULE:
                            # if any are found, download the font file
                            font_file = (rule.style["src"].split("url(/")
                                         [-1].split(") format")[0])
                            cached_font_file = self.cache_file(
                                f"https://www.notion.so/{font_file}")
                            rule.style["src"] = f"url({str(cached_font_file)})"
                link["href"] = str(cached_css_file)

        # add our custom logic to all toggle blocks
        for toggle_block in soup.findAll("div",
                                         {"class": "notion-toggle-block"}):
            toggle_id = uuid.uuid4()
            toggle_button = toggle_block.select_one("div[role=button]")
            toggle_content = toggle_block.find("div", {
                "class": None,
                "style": ""
            })
            if toggle_button and toggle_content:
                # add a custom class to the toggle button and content,
                # plus a custom attribute sharing a unique uuid so
                # we can hook them up with some custom js logic later
                # NOTE(review): the button inherits the classes of the toggle
                # block rather than its own - kept as is, confirm it is wanted
                toggle_button["class"] = toggle_block.get(
                    "class", []) + ["loconotion-toggle-button"]
                toggle_content["class"] = toggle_content.get(
                    "class", []) + ["loconotion-toggle-content"]
                toggle_content.attrs[
                    "loconotion-toggle-id"] = toggle_button.attrs[
                        "loconotion-toggle-id"] = toggle_id

        # if there are any table views in the page, add links to the title rows
        # the link to the row item is equal to its data-block-id without dashes
        for table_view in soup.findAll("div", {"class": "notion-table-view"}):
            for table_row in table_view.findAll(
                    "div", {"class": "notion-collection-item"}):
                table_row_block_id = table_row.get("data-block-id")
                row_target_span = table_row.find("span")
                # skip rows that have no id or no span to wrap instead of crashing
                if not table_row_block_id or row_target_span is None:
                    continue
                table_row_href = "/" + table_row_block_id.replace("-", "")
                row_link_wrapper = soup.new_tag("a",
                                                attrs={
                                                    "href": table_row_href,
                                                    "style": "cursor: pointer;"
                                                })
                row_target_span.wrap(row_link_wrapper)

        # embed custom google font(s)
        fonts_selectors = {
            "site": "div:not(.notion-code-block)",
            "navbar": ".notion-topbar div",
            "title":
            ".notion-page-block > div, .notion-collection_view_page-block > div[data-root]",
            "h1":
            ".notion-header-block div, notion-page-content > notion-collection_view-block > div:first-child div",
            "h2": ".notion-sub_header-block div",
            "h3": ".notion-sub_sub_header-block div",
            "body": ".notion-scroller",
            "code": ".notion-code-block *",
        }
        custom_fonts = self.get_page_config(url).get("fonts", {})
        if custom_fonts:
            # append a stylesheet importing the google font for each unique font
            unique_custom_fonts = set(custom_fonts.values())
            for font in unique_custom_fonts:
                if font:
                    google_fonts_embed_name = font.replace(" ", "+")
                    font_href = f"https://fonts.googleapis.com/css2?family={google_fonts_embed_name}:wght@500;600;700&display=swap"
                    custom_font_stylesheet = soup.new_tag("link",
                                                          rel="stylesheet",
                                                          href=font_href)
                    soup.head.append(custom_font_stylesheet)

            # go through each custom font, and add a css rule overriding the font-family
            # to the font override stylesheet targeting the appropriate selector
            font_override_stylesheet = soup.new_tag("style", type="text/css")
            for target, custom_font in custom_fonts.items():
                if custom_font and target != "site":
                    log.debug(f"Setting {target} font-family to {custom_font}")
                    font_override_stylesheet.append(fonts_selectors[target] +
                                                    " {font-family:" +
                                                    custom_font +
                                                    " !important} ")
            site_font = custom_fonts.get("site", None)
            # process global site font last so more granular settings can override it
            if site_font:
                log.debug(f"Setting global site font-family to {site_font}")
                font_override_stylesheet.append(fonts_selectors["site"] +
                                                " {font-family:" + site_font +
                                                "} ")
            # finally append the font overrides stylesheets to the page
            soup.head.append(font_override_stylesheet)

        # inject any custom elements to the page
        custom_injects = self.get_page_config(url).get("inject", {})

        def injects_custom_tags(section):
            section_custom_injects = custom_injects.get(section, {})
            for tag, elements in section_custom_injects.items():
                for element in elements:
                    injected_tag = soup.new_tag(tag)
                    for attr, value in element.items():
                        injected_tag[attr] = value
                        # if the value refers to a file, copy it to the dist folder
                        if attr.lower() == "href" or attr.lower() == "src":
                            log.debug(f"Copying injected file '{value}'")
                            cached_custom_file = self.cache_file(
                                (Path.cwd() / value.strip("/")))
                            injected_tag[attr] = str(cached_custom_file)
                    log.debug(
                        f"Injecting <{section}> tag: {str(injected_tag)}")
                    soup.find(section).append(injected_tag)

        injects_custom_tags("head")
        injects_custom_tags("body")

        # inject loconotion's custom stylesheet and script
        loconotion_custom_css = self.cache_file(Path("bundles/loconotion.css"))
        custom_css = soup.new_tag("link",
                                  rel="stylesheet",
                                  href=str(loconotion_custom_css))
        soup.head.insert(-1, custom_css)
        loconotion_custom_js = self.cache_file(Path("bundles/loconotion.js"))
        custom_script = soup.new_tag("script",
                                     type="text/javascript",
                                     src=str(loconotion_custom_js))
        soup.body.insert(-1, custom_script)

        # find sub-pages and clean slugs / links
        sub_pages = []
        for anchor in soup.findAll("a"):
            # anchors without an href (or with an external one) are left alone
            href = anchor.get("href")
            if href and href.startswith("/"):
                sub_page_href = "https://www.notion.so" + href
                # if the link is an anchor link,
                # check if the page hasn't already been parsed
                if "#" in sub_page_href:
                    sub_page_href_tokens = sub_page_href.split("#")
                    sub_page_href = sub_page_href_tokens[0]
                    anchor["href"] = "#" + sub_page_href_tokens[-1]
                    anchor["class"] = anchor.get(
                        "class", []) + ["loconotion-anchor-link"]
                    if (sub_page_href in processed_pages
                            or sub_page_href in sub_pages):
                        log.debug(
                            f"Original page for anchor link {sub_page_href}"
                            " already parsed / pending parsing, skipping")
                        continue
                else:
                    anchor["href"] = (self.get_page_slug(sub_page_href) if
                                      sub_page_href != index else "index.html")
                sub_pages.append(sub_page_href)
                log.debug(f"Found link to page {anchor['href']}")

        # exports the parsed page
        html_str = str(soup)
        html_file = self.get_page_slug(url) if url != index else "index.html"
        if html_file in processed_pages.values():
            log.error(
                f"Found duplicate pages with slug '{html_file}' - previous one will be"
                " overwritten. Make sure that your notion pages names or custom slugs"
                " in the configuration files are unique")
        log.info(f"Exporting page '{url}' as '{html_file}'")
        with open(self.dist_folder / html_file, "wb") as f:
            f.write(html_str.encode("utf-8").strip())
        processed_pages[url] = html_file

        # parse sub-pages
        if sub_pages and not self.args.get("single_page", False):
            if processed_pages:
                log.debug(f"Pages processed so far: {len(processed_pages)}")
            for sub_page in sub_pages:
                # re-checked on every iteration since the recursive calls
                # add to processed_pages as they go
                if sub_page not in processed_pages:
                    self.parse_page(sub_page,
                                    processed_pages=processed_pages,
                                    index=index)

        # we're all done!
        return processed_pages
コード例 #2
0
ファイル: notionparser.py プロジェクト: zhming0/loconotion
 def load(self, url):
     """Open ``url`` in the browser and block until the Notion page has loaded.

     Waits up to 60 seconds for ``notion_page_loaded()`` to be satisfied;
     no exception raised by the wait is handled here.
     """
     browser = self.driver
     browser.get(url)
     waiter = WebDriverWait(browser, 60)
     waiter.until(notion_page_loaded())