Example No. 1
    def from_user_id(user_id):
        """
        Transform `user_id` to instance of :class:`User`.

        Returns:
            obj: :class:`User` instance parsed from the `user_id`.
        """
        data = shared.download(url_context("/Profile/" + str(user_id)))
        dom = dhtmlparser.parseString(data)
        dhtmlparser.makeDoubleLinked(dom)

        shared.handle_errors(dom)

        # <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků
        # na abclinuxu.cz</a>
        a_tags = dom.find(
            "a", fn=lambda x: x.params.get("href", "").startswith("/lide/"))

        # pick only links whose content starts with Seznam
        links = [
            a_tag.params["href"] for a_tag in a_tags
            if a_tag.getContent().startswith("Seznam")
        ]

        username = links[-1].split("/")[2]

        return User(username)
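
For context: `makeDoubleLinked`, which every example on this page relies on, walks the parsed tree and sets a `parent` reference on each element, enabling upward navigation. A minimal standalone sketch (not taken from any of the projects below):

import dhtmlparser

dom = dhtmlparser.parseString("<root><item>content</item></root>")
dhtmlparser.makeDoubleLinked(dom)  # every element now carries a .parent reference

item = dom.find("item")[0]
assert item.parent.getTagName() == "root"  # upward navigation now works
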
Example No. 2
def test_predecesors_pattern():
    dom = dhtmlparser.parseString(
        """
        <root>
            <xex>
                <x>content</x>
            </xex>
        </root>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)

    x = dom.find("x")[0]

    res = path_patterns.predecesors_pattern(x, dom)

    assert res
    assert len(res) == 1

    assert isinstance(res[0], path_patterns.PathCall)

    assert res[0].call_type == "match"
    assert res[0].index == 0
    assert res[0].params == [
        ["root", None],
        ["xex", None],
        ["x", None],
    ]
Example No. 3
def test_neighbours_pattern_text_neigh():
    dom = dhtmlparser.parseString(
        """
        asd
        <xex>\tHello</xex>
        <xep></xep>
        asd
        """
    )
    dhtmlparser.makeDoubleLinked(dom)

    xex = dom.find("xex")[0]
    res = path_patterns.neighbours_pattern(xex)

    assert res
    assert len(res) == 2

    left, right = res

    assert left.call_type == "left_neighbour_tag"
    assert left.index == 0
    assert left.params.tag_name == "xex"
    assert left.params.params is None
    assert left.params.fn_params == [None, None, "asd"]

    assert right.call_type == "right_neighbour_tag"
    assert right.index == 0
    assert right.params.tag_name == "xex"
    assert right.params.params is None
    assert right.params.fn_params == ["xep", None, ""]
Example No. 5
        def cut_dom_to_area_of_interest(html):
            dom = html

            # make sure that we don't modify the `html` parameter
            if not isinstance(html, dhtmlparser.HTMLElement):
                dom = dhtmlparser.parseString(html)
            else:
                dom = copy.deepcopy(html)
            dhtmlparser.makeDoubleLinked(dom)

            # comments are not stored in a hierarchical structure, but in
            # somewhat flat, nested lists

            # locate end of article
            ds_toolbox = dom.find("div", {"class": "ds_toolbox"})

            if not ds_toolbox:
                raise ValueError("Couldn't locate ds_toolbox!")

            ds_toolbox = first(ds_toolbox)
            dom = ds_toolbox.parent

            # get rid of everything up to the end of the article
            while dom.childs[0] != ds_toolbox:
                dom.childs.pop(0)

            dom.childs.pop(0)

            return dom
Example No. 6
def test_makeDoubleLinked():
    dom = dhtmlparser.parseString("""<html><tag PARAM="true"></html>""")

    dhtmlparser.makeDoubleLinked(dom)

    assert dom.childs[0].parent == dom
    assert dom.childs[1].parent == dom

    assert dom.childs[0].childs[0].parent == dom.childs[0]
Example No. 8
def test_neighbours_pattern_both_corners():
    dom = dhtmlparser.parseString(
        """
        <xex>\tHello</xex>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)

    xex = dom.find("xex")[0]
    res = path_patterns.neighbours_pattern(xex)

    assert not res
Example No. 9
def test_has_neigh():
    dom = dhtmlparser.parseString(SAUCE)
    dhtmlparser.makeDoubleLinked(dom)

    el = dom.find(
        "container",
        None,
        fn=utils.has_neigh(None, None, "something something", left=False)
    )

    assert el
    assert len(el) == 1

    assert el[0].getContent() == "and this"
Example No. 10
def test_predecesors_pattern_shallow_root():
    dom = dhtmlparser.parseString(
        """
        <root>
            <x>content</x>
        </root>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)

    x = dom.find("x")[0]

    res = path_patterns.predecesors_pattern(x, dom)

    assert not res
Example No. 11
    def transform(cls, virtual_fs: VirtualFS, root: Directory, page: HtmlPage):
        made_doublelinked = False
        add_style_to_the_header = False

        code_code = page.dom.match(["pre", {"class": "code"}], "code")
        code_wrap = page.dom.match(["pre", {
            "class": "code code-wrap"
        }], "code")
        for code_tag in code_code + code_wrap:
            code_content, lang = cls._parse_code_content_and_lang(code_tag)

            if not made_doublelinked:
                dhtmlparser.makeDoubleLinked(page.dom)
                made_doublelinked = True

            add_style_to_the_header = True
            if lang == "python" or lang == "python3" or lang == "py":
                cls._add_syntax_highlight_for(PythonLexer, code_tag,
                                              code_content)
            elif lang == "c":
                cls._add_syntax_highlight_for(CLexer, code_tag, code_content)
            elif lang == "c++" or lang == "cpp":
                cls._add_syntax_highlight_for(CppLexer, code_tag, code_content)
            elif lang == "smalltalk":
                cls._add_syntax_highlight_for(SmalltalkLexer, code_tag,
                                              code_content)
            elif lang == "xml":
                cls._add_syntax_highlight_for(XmlLexer, code_tag, code_content)
            elif lang == "html":
                cls._add_syntax_highlight_for(HtmlLexer, code_tag,
                                              code_content)
            elif lang == "css":
                cls._add_syntax_highlight_for(CssLexer, code_tag, code_content)
            elif lang == "yaml":
                cls._add_syntax_highlight_for(YamlLexer, code_tag,
                                              code_content)
            elif lang:
                settings.logger.error("Unknown lang definition: %s, skipping.",
                                      lang)
                add_style_to_the_header = False
            else:
                add_style_to_the_header = False

        if add_style_to_the_header:
            style = HtmlFormatter().get_style_defs()
            style_html = "<style>\n%s\n</style>" % style
            style_tag = dhtmlparser.parseString(style_html)
            page.dom.find("head")[0].childs.append(style_tag)
Example No. 12
        def cut_dom_to_area_of_interest(html):
            """
            Raises:
                StopIteration: In case of no comments.
                ValueError: In case expected elements are missing from the HTML.
            """
            dom = html

            # make sure that we don't modify the `html` parameter
            if not isinstance(html, dhtmlparser.HTMLElement):
                dom = dhtmlparser.parseString(html)
            else:
                dom = copy.deepcopy(html)
            dhtmlparser.makeDoubleLinked(dom)

            # comments are not stored in a hierarchical structure, but in
            # somewhat flat, nested lists

            # locate end of article
            ds_toolbox = dom.find("div", {"class": "ds_toolbox"})

            if not ds_toolbox:
                # blogposts without any comments
                add_first_comment = dom.find(
                    "a",
                    fn=lambda x:
                        "action=addDiz" in x.params.get("href", "") and
                        x.getContent().strip() == "Vložit první komentář"
                )

                if add_first_comment:
                    raise StopIteration("No comments yet.")

                raise ValueError("Couldn't locate ds_toolbox!")

            ds_toolbox = first(ds_toolbox)
            dom = ds_toolbox.parent

            # get rid of everything up to the end of the article
            while dom.childs[0] != ds_toolbox:
                dom.childs.pop(0)

            dom.childs.pop(0)

            return dom
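
A hedged caveat about the StopIteration signal above: since PEP 479 (the default from Python 3.7), a StopIteration that escapes into a generator body is turned into RuntimeError, so a dedicated exception is the safer signal when callers may be generators. A sketch with a hypothetical exception type:

class NoCommentsYet(Exception):
    """Hypothetical replacement for the StopIteration signal."""

def cut_comments(html):
    # translate the control-flow exception into the dedicated one
    try:
        return cut_dom_to_area_of_interest(html)
    except StopIteration as e:
        raise NoCommentsYet(str(e))
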
Example No. 13
def test_parent_iterator():
    dom = dhtmlparser.parseString("""
        <root>
            <a />
            <sub>
                <a attr=1 />
            </sub>
        </root>
        """)
    dhtmlparser.makeDoubleLinked(dom)

    a_tag = dom.find("a", {"attr": "1"})[0]
    assert a_tag

    parents = list(toc_guesser._parent_iterator(a_tag))
    assert parents

    assert parents == [dom.find("sub")[0], dom.find("root")[0], dom]
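
Judging from these assertions, `_parent_iterator` walks the `parent` references set by `makeDoubleLinked` from the element up to the root. A minimal sketch of such an iterator (an assumption, not the project's code):

def parent_iterator(el):
    while el.parent is not None:
        el = el.parent
        yield el  # in the test above: <sub>, then <root>, then the root container
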
Example No. 14
def _create_dom(data):
    """
    Creates a double-linked DOM from `data`.

    Args:
        data (str/HTMLElement): Either string or HTML element.

    Returns:
        obj: HTMLElement containing the double-linked DOM.
    """
    if not isinstance(data, dhtmlparser.HTMLElement):
        data = dhtmlparser.parseString(
            utils.handle_encodnig(data)
        )

    dhtmlparser.makeDoubleLinked(data)

    return data
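
A hedged usage sketch: strings go through encoding handling and parsing first, while an already-parsed HTMLElement is only double-linked (in place, not copied):

dom = _create_dom("<p>hello</p>")  # str: encoding handled, parsed, linked
dom = _create_dom(dom)             # HTMLElement: just linked and returned
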
Example No. 16
    def from_html(html, lazy=True):
        """
        Convert HTML string to :class:`Blogpost` instance.

        Args:
            html (str): Input data.
            lazy (bool, default True): Be lazy (don't pull the data from the
                 site yourself). Call :meth:`pull` for active download of all
                 required information.

        Returns:
            obj: :class:`Blogpost` instance.
        """
        if not isinstance(html, dhtmlparser.HTMLElement):
            html = dhtmlparser.parseString(html)
            dhtmlparser.makeDoubleLinked(html)

        # support for legacy blogs
        title_tag = html.find("h2", {"class": "st_nadpis"})
        if title_tag:
            title_tag = first(title_tag)
            rel_link = first(title_tag.find("a")).params["href"]
            link = url_context(rel_link)
        else:
            title_tag = first(html.find("h2"))
            link = first(html.find("link", {"rel": "canonical"}))
            link = link.params["href"]

        title = dhtmlparser.removeTags(title_tag).strip()

        # get meta
        meta = html.find("p", {"class": "meta-vypis"})[0]

        blog = Blogpost(url=link, lazy=lazy)

        if lazy:
            blog.title = title
            blog.intro = Blogpost._parse_intro(html, meta, title_tag)
            blog.rating = Blogpost._parse_rating_from_preview(meta)
            blog.created_ts = parse_timestamp(meta)
            blog.comments_n = Blogpost._parse_comments_n(meta)

        return blog
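
A hedged usage sketch, assuming `preview_html` holds one blogpost snippet from a listing page (the variable name is illustrative):

blog = Blogpost.from_html(preview_html)  # lazy=True: only preview fields are set
print(blog.title, blog.rating, blog.comments_n)
blog.pull()  # actively download and parse the rest
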
Example No. 18
    def transform(cls, virtual_fs: VirtualFS, root: Directory, page: HtmlPage):
        dhtmlparser.makeDoubleLinked(page.dom)
        description = page.metadata.page_description

        if not description:
            description = ""

        description = description.replace('"', "&quot;")

        if page.dom.find("img") and page.metadata.image_index != -1:
            meta_html = cls._large_image_card(description, page)
        else:
            meta_html = cls.summary_card_html.format(
                title=page.title,
                description=description,
                user=settings.twitter_handle)

        meta_tags = dhtmlparser.parseString(meta_html)

        page.dom.find("head")[0].childs.extend(meta_tags.find("meta"))
Example No. 19
def test_neighbours_pattern_right_corner():
    dom = dhtmlparser.parseString(
        """
        asd
        <xex>\tHello</xex>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)

    xex = dom.find("xex")[0]
    res = path_patterns.neighbours_pattern(xex)

    assert res
    assert len(res) == 1

    assert res[0].call_type == "left_neighbour_tag"
    assert res[0].index == 0
    assert res[0].params.tag_name == "xex"
    assert res[0].params.params is None
    assert res[0].params.fn_params == [None, None, "asd"]
Example No. 20
    def transform(cls, virtual_fs: VirtualFS, root: Directory, page: HtmlPage):
        if not settings.generate_thumbnails:
            return

        if cls.resource_registry is None:
            cls.resource_registry = virtual_fs.resource_registry

        dhtmlparser.makeDoubleLinked(page.dom)

        for img in page.dom.find("img"):
            if not img.params.get("src"):
                settings.logger.warning("Image without src: `%s`",
                                        img.tagToString())
                continue

            src = img.params["src"]
            if src.startswith("http://") or src.startswith("https://"):
                continue

            cls._add_thumbnail_for_image(img, src)
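
The remote-image check above has a close standard-library equivalent; a hedged sketch (note that urlparse also tolerates uppercase schemes, which the startswith() checks do not):

from urllib.parse import urlparse

def is_remote(src):
    # roughly equivalent to the two startswith() checks in the example
    return urlparse(src).scheme in ("http", "https")
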
Example No. 21
def guess_toc_element(document):
    """
    For given `document`, guess which HTMLElement holds TOC (Table Of Content).

    This function picks the most-used cluster with the highest jump in ``<a>``
    element count.

    Args:
        document (str): Document which should contain TOC somewhere.

    Returns:
        obj: HTMLElement instance which looks like it *may* contain the TOC.
    """
    dom = dhtmlparser.parseString(document)
    dhtmlparser.makeDoubleLinked(dom)

    links = dom.find("a")

    # construct parent tree
    tree = {}
    for link in links:
        tree[link] = []

        for parent in _parent_iterator(link):
            num_of_links = _number_of_links(parent)

            tree[link].append(
                (num_of_links, parent)
            )

    # find biggest jumps in number of elements in <a> clusters
    jumps = {}
    for link in links:
        jump = _identify_jump(tree[link])

        jumps[jump] = jumps.get(jump, 0) + 1

    # pick element containing most links
    return max(jumps, key=lambda k: jumps[k])
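
A hedged usage sketch with a made-up document, where the dense link cluster should be the pick:

html = """
<div id="menu"><a href="/a">A</a><a href="/b">B</a><a href="/c">C</a></div>
<p>Body text with a single <a href="/x">link</a>.</p>
"""
toc = guess_toc_element(html)  # expected: the <div> holding the link cluster
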
Example No. 22
    def pull(self):
        """
        Download page with blogpost. Parse text, comments and everything else.

        Until this is called, the following attributes are not known/parsed:

            - :attr:`text`
            - :attr:`tags`
            - :attr:`has_tux`
            - :attr:`comments`
            - :attr:`last_modified_ts`
        """
        data = download(url=self.url)

        # this is because of f***s who forgot to close elements like in this
        # blogpost: https://www.abclinuxu.cz/blog/EmentuX/2005/10/all-in-one
        blog_data, comments_data = data.split('<p class="page_tools">')

        self._dom = dhtmlparser.parseString(blog_data)
        self._content_tag = None
        dhtmlparser.makeDoubleLinked(self._dom)

        self._parse_uid()
        self._parse_title()
        self._parse_text()
        self._parse_rating()
        self._parse_meta()

        self._tags = self._get_tags()

        # there are blogs with f****d up HTML which is basically unparsable
        if self.relative_url not in COMMENT_BANLIST:
            self.comments = Comment.comments_from_html(comments_data)
            self.comments_n = len(self.comments)

        # memory cleanup - this saves a LOT of memory
        self._dom = None
        self._content_tag = None
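
A hedged usage sketch, reusing the blogpost URL mentioned in the comment above:

blog = Blogpost(url="https://www.abclinuxu.cz/blog/EmentuX/2005/10/all-in-one", lazy=True)
blog.pull()  # after this, text, tags, comments etc. are available
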
Example No. 25
def test_parsers():
    # Test parsers against http://www.zonerpress.cz/e-kniha-vyvoj-univerzalnych-aplikacii-pre-windows-8-a-windows-phone-8.1
    html = handle_encodnig(
        _get_source('http://www.zonerpress.cz/e-kniha-vyvoj-univerzalnych-aplikacii-pre-windows-8-a-windows-phone-8.1')
    )
    dom = dhtmlparser.parseString(html)
    dhtmlparser.makeDoubleLinked(dom)

    publisher = get_publisher(dom)
    assert publisher.getContent().strip() == 'Zoner Press'

    ISBN = get_ISBN(dom)
    assert ISBN.getContent().strip() == '978-80-7413-282-7'

    title = get_title(dom)
    assert title.getContent().strip() == 'E-kniha: V\xc3\xbdvoj univerz\xc3\xa1lnych aplik\xc3\xa1ci\xc3\xad pre windows 8 a Windows Phone 8.1'

    price = get_price(dom)
    assert price.getContent().strip() == '199&nbsp;K\xc4\x8d'

    author = get_author(dom)
    assert author.getContent().strip().split() == ['Luboslav', 'Lacko', '(<a', 'href="http://www.zonerpress.cz/inshop/scripts/shop.aspx?action=DoSearch&limitedlevels=1&ParamID_1=Luboslav', 'Lacko">Zobrazit', 'v\xc5\xa1echny', 'knihy', 'tohoto', 'autora</a>)']

    binding = get_binding(dom)
    assert binding.getContent().strip() == 'bro\xc5\xbeovan\xc3\xa1'

    pub_date = get_pub_date(dom)
    assert pub_date.getContent().strip() == '2014 (e-kniha)'

    pages = get_pages(dom)
    assert pages.getContent().strip() == '96 (pap\xc3\xadrov\xc3\xa1 kniha)'

    # Test parsers against http://www.zonerpress.cz/ocima-fotografa-graficky-pruvodce
    html = handle_encodnig(
        _get_source('http://www.zonerpress.cz/ocima-fotografa-graficky-pruvodce')
    )
    dom = dhtmlparser.parseString(html)
    dhtmlparser.makeDoubleLinked(dom)

    publisher = get_publisher(dom)
    assert publisher.getContent().strip() == 'Zoner Press'

    ISBN = get_ISBN(dom)
    assert ISBN.getContent().strip() == '978-80-7413-275-9'

    title = get_title(dom)
    assert title.getContent().strip() == 'O\xc4\x8dima fotografa: Grafick\xc3\xbd pr\xc5\xafvodce'

    price = get_price(dom)
    assert price.getContent().strip() == '360&nbsp;K\xc4\x8d'

    author = get_author(dom)
    assert author.getContent().strip().split() == ['Michael', 'Freeman', '(<a', 'href="http://www.zonerpress.cz/inshop/scripts/shop.aspx?action=DoSearch&limitedlevels=1&ParamID_1=Michael', 'Freeman">Zobrazit', 'v\xc5\xa1echny', 'knihy', 'tohoto', 'autora</a>)']

    binding = get_binding(dom)
    assert binding.getContent().strip() == 'M\xc4\x9bkk\xc3\xa1 s klopnami'

    pub_date = get_pub_date(dom)
    assert pub_date.getContent().strip() == '2014'

    pages = get_pages(dom)
    assert pages.getContent().strip() == '192'

    # Test parsers against http://www.zonerpress.cz/konec-prokrastinace
    html = handle_encodnig(
        _get_source('http://www.zonerpress.cz/konec-prokrastinace')
    )
    dom = dhtmlparser.parseString(html)
    dhtmlparser.makeDoubleLinked(dom)

    publisher = get_publisher(dom)
    assert publisher.getContent().strip() == 'Jan Melvil Publishing'

    ISBN = get_ISBN(dom)
    assert ISBN.getContent().strip() == '978-80-87270-51-6'

    title = get_title(dom)
    assert title.getContent().strip() == 'Konec prokrastinace'

    price = get_price(dom)
    assert price.getContent().strip() == '349&nbsp;K\xc4\x8d'

    author = get_author(dom)
    assert author.getContent().strip().split() == ['Petr', 'Ludwig', '(<a', 'href="http://www.zonerpress.cz/inshop/scripts/shop.aspx?action=DoSearch&limitedlevels=1&ParamID_1=Petr', 'Ludwig">Zobrazit', 'v\xc5\xa1echny', 'knihy', 'tohoto', 'autora</a>)']

    binding = get_binding(dom)
    assert binding.getContent().strip() == 'bro\xc5\xbeovan\xc3\xa1 s chlopn\xc4\x9bmi'

    pub_date = get_pub_date(dom)
    assert pub_date.getContent().strip() == '2013'

    pages = get_pages(dom)
    assert pages.getContent().strip() == '272'