def from_user_id(user_id):
    """
    Transform `user_id` to an instance of :class:`User`.

    Returns:
        obj: :class:`User` instance parsed from the `user_id`.
    """
    data = shared.download(url_context("/Profile/" + str(user_id)))
    dom = dhtmlparser.parseString(data)
    dhtmlparser.makeDoubleLinked(dom)

    shared.handle_errors(dom)

    # <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků
    #     na abclinuxu.cz</a>
    # ("Seznam příspěvků na abclinuxu.cz" = "List of posts on abclinuxu.cz")
    a_tags = dom.find(
        "a",
        fn=lambda x: x.params.get("href", "").startswith("/lide/")
    )

    # pick only links whose content starts with "Seznam"
    links = [
        a_tag.params["href"]
        for a_tag in a_tags
        if a_tag.getContent().startswith("Seznam")
    ]

    username = links[-1].split("/")[2]

    return User(username)
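# The link-filtering step of `from_user_id` can be exercised on its own
# without network access. A minimal, self-contained sketch (the HTML snippet
# and the helper name `username_from_profile_html` are made up for
# illustration; only plain dhtmlparser calls are used):

import dhtmlparser


def username_from_profile_html(html):
    dom = dhtmlparser.parseString(html)

    a_tags = dom.find(
        "a",
        fn=lambda x: x.params.get("href", "").startswith("/lide/")
    )

    links = [
        a_tag.params["href"]
        for a_tag in a_tags
        if a_tag.getContent().startswith("Seznam")
    ]

    # /lide/<username>/... -> <username>
    return links[-1].split("/")[2]


assert username_from_profile_html(
    '<a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků</a>'
) == "unittest"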
def test_predecesors_pattern():
    dom = dhtmlparser.parseString(
        """
        <root>
            <xex>
                <x>content</x>
            </xex>
        </root>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)
    x = dom.find("x")[0]

    res = path_patterns.predecesors_pattern(x, dom)

    assert res
    assert len(res) == 1
    assert isinstance(res[0], path_patterns.PathCall)
    assert res[0].call_type == "match"
    assert res[0].index == 0
    assert res[0].params == [
        ["root", None],
        ["xex", None],
        ["x", None],
    ]
def test_neighbours_pattern_text_neigh():
    dom = dhtmlparser.parseString(
        """
        asd
        <xex>\tHello</xex>
        <xep></xep>
        asd
        """
    )
    dhtmlparser.makeDoubleLinked(dom)
    xex = dom.find("xex")[0]

    res = path_patterns.neighbours_pattern(xex)

    assert res
    assert len(res) == 2

    left, right = res

    assert left.call_type == "left_neighbour_tag"
    assert left.index == 0
    assert left.params.tag_name == "xex"
    assert left.params.params is None
    assert left.params.fn_params == [None, None, "asd"]

    assert right.call_type == "right_neighbour_tag"
    assert right.index == 0
    assert right.params.tag_name == "xex"
    assert right.params.params is None
    assert right.params.fn_params == ["xep", None, ""]
def cut_dom_to_area_of_interest(html):
    dom = html

    # make sure that the `html` parameter is not modified
    if not isinstance(html, dhtmlparser.HTMLElement):
        dom = dhtmlparser.parseString(html)
    else:
        dom = copy.deepcopy(dom)

    dhtmlparser.makeDoubleLinked(dom)

    # comments are not stored in a hierarchical structure, but in somewhat
    # flat, nested lists

    # locate the end of the article
    ds_toolbox = dom.find("div", {"class": "ds_toolbox"})

    if not ds_toolbox:
        raise ValueError("Couldn't locate ds_toolbox!")

    ds_toolbox = first(ds_toolbox)
    dom = ds_toolbox.parent

    # get rid of everything up to the end of the article
    while dom.childs[0] != ds_toolbox:
        dom.childs.pop(0)

    dom.childs.pop(0)

    return dom
def test_makeDoubleLinked():
    dom = dhtmlparser.parseString("""<html><tag PARAM="true"></html>""")
    dhtmlparser.makeDoubleLinked(dom)

    assert dom.childs[0].parent == dom
    assert dom.childs[1].parent == dom
    assert dom.childs[0].childs[0].parent == dom.childs[0]
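# What the test above exercises: after `makeDoubleLinked`, every element
# carries a `.parent` back-reference, so the tree can be walked upwards.
# A small self-contained sketch (plain dhtmlparser API as used throughout
# this file; the HTML is made up):

import dhtmlparser

dom = dhtmlparser.parseString("<html><body><p>hi</p></body></html>")
dhtmlparser.makeDoubleLinked(dom)

p = dom.find("p")[0]

# climb from <p> back to the root container via the back-references
chain = []
element = p
while element.parent is not None:
    chain.append(element.parent)
    element = element.parent

assert chain                     # <body>, <html>, root container
assert chain[-1].parent is None  # the topmost element has no parent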
def test_neighbours_pattern_both_corners():
    dom = dhtmlparser.parseString(
        """
        <xex>\tHello</xex>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)
    xex = dom.find("xex")[0]

    res = path_patterns.neighbours_pattern(xex)

    assert not res
def test_has_neigh():
    dom = dhtmlparser.parseString(SAUCE)
    dhtmlparser.makeDoubleLinked(dom)

    el = dom.find(
        "container",
        None,
        fn=utils.has_neigh(None, None, "something something", left=False)
    )

    assert el
    assert len(el) == 1
    assert el[0].getContent() == "and this"
def test_predecesors_pattern_shallow_root():
    dom = dhtmlparser.parseString(
        """
        <root>
            <x>content</x>
        </root>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)
    x = dom.find("x")[0]

    res = path_patterns.predecesors_pattern(x, dom)

    assert not res
def transform(cls, virtual_fs: VirtualFS, root: Directory, page: HtmlPage):
    made_doublelinked = False
    add_style_to_the_header = False

    code_code = page.dom.match(["pre", {"class": "code"}], "code")
    code_wrap = page.dom.match(["pre", {"class": "code code-wrap"}], "code")

    for code_tag in code_code + code_wrap:
        code_content, lang = cls._parse_code_content_and_lang(code_tag)

        if not made_doublelinked:
            dhtmlparser.makeDoubleLinked(page.dom)
            made_doublelinked = True

        add_style_to_the_header = True

        if lang == "python" or lang == "python3" or lang == "py":
            cls._add_syntax_highlight_for(PythonLexer, code_tag, code_content)
        elif lang == "c":
            cls._add_syntax_highlight_for(CLexer, code_tag, code_content)
        elif lang == "c++" or lang == "cpp":
            cls._add_syntax_highlight_for(CppLexer, code_tag, code_content)
        elif lang == "smalltalk":
            cls._add_syntax_highlight_for(SmalltalkLexer, code_tag, code_content)
        elif lang == "xml":
            cls._add_syntax_highlight_for(XmlLexer, code_tag, code_content)
        elif lang == "html":
            cls._add_syntax_highlight_for(HtmlLexer, code_tag, code_content)
        elif lang == "css":
            cls._add_syntax_highlight_for(CssLexer, code_tag, code_content)
        elif lang == "yaml":
            cls._add_syntax_highlight_for(YamlLexer, code_tag, code_content)
        elif lang:
            settings.logger.error("Unknown lang definition: %s, skipping.", lang)
            add_style_to_the_header = False
        else:
            add_style_to_the_header = False

    if add_style_to_the_header:
        style = HtmlFormatter().get_style_defs()
        style_html = "<style>\n%s\n</style>" % style
        style_tag = dhtmlparser.parseString(style_html)
        page.dom.find("head")[0].childs.append(style_tag)
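# The long elif chain above maps a language tag to a Pygments lexer. An
# equivalent and arguably easier-to-extend variant is a plain dict dispatch;
# this is only a sketch of an alternative, not the project's code
# (`_add_syntax_highlight_for` and `settings.logger` are the project's own
# names and are merely referenced here):

from pygments.lexers import (CLexer, CppLexer, CssLexer, HtmlLexer,
                             PythonLexer, SmalltalkLexer, XmlLexer, YamlLexer)

LANG_TO_LEXER = {
    "python": PythonLexer,
    "python3": PythonLexer,
    "py": PythonLexer,
    "c": CLexer,
    "c++": CppLexer,
    "cpp": CppLexer,
    "smalltalk": SmalltalkLexer,
    "xml": XmlLexer,
    "html": HtmlLexer,
    "css": CssLexer,
    "yaml": YamlLexer,
}

# the per-tag branch inside the loop would then collapse to:
#
#     lexer = LANG_TO_LEXER.get(lang)
#     if lexer:
#         cls._add_syntax_highlight_for(lexer, code_tag, code_content)
#     elif lang:
#         settings.logger.error("Unknown lang definition: %s, skipping.", lang)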
def cut_dom_to_area_of_interest(html):
    """
    Raises:
        StopIteration: In case of no comments.
        ValueError: When expected elements are missing from the HTML.
    """
    dom = html

    # make sure that the `html` parameter is not modified
    if not isinstance(html, dhtmlparser.HTMLElement):
        dom = dhtmlparser.parseString(html)
    else:
        dom = copy.deepcopy(dom)

    dhtmlparser.makeDoubleLinked(dom)

    # comments are not stored in a hierarchical structure, but in somewhat
    # flat, nested lists

    # locate the end of the article
    ds_toolbox = dom.find("div", {"class": "ds_toolbox"})

    if not ds_toolbox:
        # blogposts without any comments
        # ("Vložit první komentář" = "Insert the first comment")
        add_first_comment = dom.find(
            "a",
            fn=lambda x: "action=addDiz" in x.params.get("href", "") and
                         x.getContent().strip() == "Vložit první komentář"
        )

        if add_first_comment:
            raise StopIteration("No comments yet.")

        raise ValueError("Couldn't locate ds_toolbox!")

    ds_toolbox = first(ds_toolbox)
    dom = ds_toolbox.parent

    # get rid of everything up to the end of the article
    while dom.childs[0] != ds_toolbox:
        dom.childs.pop(0)

    dom.childs.pop(0)

    return dom
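# A hedged sketch of how a caller might drive `cut_dom_to_area_of_interest`
# (the surrounding parsing code is project-specific; `html` stands for an
# already downloaded abclinuxu.cz blogpost page):
#
#     try:
#         comment_area = cut_dom_to_area_of_interest(html)
#     except StopIteration:
#         comment_area = None   # blogpost has no comments yet
#
#     if comment_area is not None:
#         ...  # parse individual comments from `comment_area`
#
# Note that raising StopIteration for "no comments" is unusual: since PEP 479
# (default from Python 3.7), a StopIteration escaping inside a generator is
# turned into a RuntimeError, so a dedicated exception or a None return value
# would be a safer signal.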
def test_parent_iterator():
    dom = dhtmlparser.parseString("""
        <root>
            <a />
            <sub>
                <a attr=1 />
            </sub>
        </root>
    """)
    dhtmlparser.makeDoubleLinked(dom)

    a_tag = dom.find("a", {"attr": "1"})[0]
    assert a_tag

    parents = list(toc_guesser._parent_iterator(a_tag))

    assert parents
    assert parents == [dom.find("sub")[0], dom.find("root")[0], dom]
def _create_dom(data):
    """
    Create a double-linked DOM from `data`.

    Args:
        data (str/HTMLElement): Either a string or an HTML element.

    Returns:
        obj: HTMLElement containing the double-linked DOM.
    """
    if not isinstance(data, dhtmlparser.HTMLElement):
        data = dhtmlparser.parseString(
            utils.handle_encodnig(data)
        )

    dhtmlparser.makeDoubleLinked(data)

    return data
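# A minimal usage sketch for `_create_dom` (assumption: it is importable from
# the surrounding module; the HTML below is made up for illustration). Both a
# raw string and an already parsed HTMLElement are accepted; the element form
# is returned as-is, only with the parent back-references filled in:
#
#     dom = _create_dom("<div><a href='#'>x</a></div>")
#     assert dom.find("a")[0].parent is dom.find("div")[0]
#
#     same = _create_dom(dom)   # HTMLElement input is passed through
#     assert same is dom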
def from_html(html, lazy=True):
    """
    Convert an HTML string to a :class:`Blogpost` instance.

    Args:
        html (str): Input data.
        lazy (bool, default True): Be lazy (don't pull data from the site by
             yourself). Call :meth:`pull` for an active download of all
             required information.

    Returns:
        obj: :class:`Blogpost` instance.
    """
    if not isinstance(html, dhtmlparser.HTMLElement):
        html = dhtmlparser.parseString(html)
        dhtmlparser.makeDoubleLinked(html)

    # support for legacy blogs
    title_tag = html.find("h2", {"class": "st_nadpis"})

    if title_tag:
        title_tag = first(title_tag)

        rel_link = first(title_tag.find("a")).params["href"]
        link = url_context(rel_link)
    else:
        title_tag = first(html.find("h2"))

        link = first(html.find("link", {"rel": "canonical"}))
        link = link.params["href"]

    title = dhtmlparser.removeTags(title_tag).strip()

    # get meta
    meta = html.find("p", {"class": "meta-vypis"})[0]

    blog = Blogpost(url=link, lazy=lazy)

    if lazy:
        blog.title = title
        blog.intro = Blogpost._parse_intro(html, meta, title_tag)
        blog.rating = Blogpost._parse_rating_from_preview(meta)
        blog.created_ts = parse_timestamp(meta)
        blog.comments_n = Blogpost._parse_comments_n(meta)

    return blog
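# A hedged usage sketch (assumption: `Blogpost` is the class this method
# belongs to and `html` is one blogpost preview snippet cut out of an already
# downloaded listing page; this method only parses, it does not download):
#
#     blog = Blogpost.from_html(html)   # lazy: only the preview fields are set
#     print(blog.title, blog.rating, blog.comments_n)
#
#     blog.pull()                       # full download and parse of the post
#     print(blog.text)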
def transform(cls, virtual_fs: VirtualFS, root: Directory, page: HtmlPage):
    dhtmlparser.makeDoubleLinked(page.dom)

    description = page.metadata.page_description
    if not description:
        description = ""

    description = description.replace('"', "&quot;")

    if page.dom.find("img") and page.metadata.image_index != -1:
        meta_html = cls._large_image_card(description, page)
    else:
        meta_html = cls.summary_card_html.format(
            title=page.title,
            description=description,
            user=settings.twitter_handle
        )

    meta_tags = dhtmlparser.parseString(meta_html)
    page.dom.find("head")[0].childs.extend(meta_tags.find("meta"))
def test_neighbours_pattern_right_corner():
    dom = dhtmlparser.parseString(
        """
        asd
        <xex>\tHello</xex>
        """
    )
    dhtmlparser.makeDoubleLinked(dom)
    xex = dom.find("xex")[0]

    res = path_patterns.neighbours_pattern(xex)

    assert res
    assert len(res) == 1

    assert res[0].call_type == "left_neighbour_tag"
    assert res[0].index == 0
    assert res[0].params.tag_name == "xex"
    assert res[0].params.params is None
    assert res[0].params.fn_params == [None, None, "asd"]
def transform(cls, virtual_fs: VirtualFS, root: Directory, page: HtmlPage):
    if not settings.generate_thumbnails:
        return

    if cls.resource_registry is None:
        cls.resource_registry = virtual_fs.resource_registry

    dhtmlparser.makeDoubleLinked(page.dom)

    for img in page.dom.find("img"):
        if not img.params.get("src"):
            settings.logger.warning("Image without src: `%s`", img.tagToString())
            continue

        src = img.params["src"]
        if src.startswith("http://") or src.startswith("https://"):
            continue

        cls._add_thumbnail_for_image(img, src)
def guess_toc_element(document):
    """
    For a given `document`, guess which HTMLElement holds the TOC (Table of
    Contents).

    For each ``<a>`` element, the function finds the ancestor where the count
    of contained ``<a>`` elements jumps the most, and then returns the
    ancestor chosen most often.

    Args:
        document (str): Document which should contain a TOC somewhere.

    Returns:
        obj: HTMLElement instance which looks like it *may* contain the TOC.
    """
    dom = dhtmlparser.parseString(document)
    dhtmlparser.makeDoubleLinked(dom)

    links = dom.find("a")

    # construct the parent tree
    tree = {}
    for link in links:
        tree[link] = []

        for parent in _parent_iterator(link):
            num_of_links = _number_of_links(parent)
            tree[link].append((num_of_links, parent))

    # find the biggest jumps in the number of <a> elements in each cluster
    jumps = {}
    for link in links:
        jump = _identify_jump(tree[link])
        jumps[jump] = jumps.get(jump, 0) + 1

    # pick the element containing the most links
    return max(jumps, key=lambda k: jumps[k])
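# A usage sketch (assumption: `guess_toc_element` and its helpers
# `_parent_iterator` / `_number_of_links` / `_identify_jump` are importable
# from the toc_guesser module referenced elsewhere in this file; the HTML
# and the expected result are made up for illustration):

TOC_DOCUMENT = """
<body>
  <p><a href="/unrelated">one stray link</a></p>
  <ul id="toc">
    <li><a href="#ch1">Chapter 1</a></li>
    <li><a href="#ch2">Chapter 2</a></li>
    <li><a href="#ch3">Chapter 3</a></li>
  </ul>
</body>
"""

toc = guess_toc_element(TOC_DOCUMENT)
print(toc.params.get("id"))   # ideally "toc", the densest <a> cluster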
def pull(self):
    """
    Download the page with the blogpost. Parse the text, comments and
    everything else.

    Until this is called, the following attributes are not known/parsed:

    - :attr:`text`
    - :attr:`tags`
    - :attr:`has_tux`
    - :attr:`comments`
    - :attr:`last_modified_ts`
    """
    data = download(url=self.url)

    # this is because of f***s who forgot to close elements like in this
    # blogpost: https://www.abclinuxu.cz/blog/EmentuX/2005/10/all-in-one
    blog_data, comments_data = data.split('<p class="page_tools">')

    self._dom = dhtmlparser.parseString(blog_data)
    self._content_tag = None
    dhtmlparser.makeDoubleLinked(self._dom)

    self._parse_uid()
    self._parse_title()
    self._parse_text()
    self._parse_rating()
    self._parse_meta()

    self._tags = self._get_tags()

    # there are blogs with f****d up HTML which is basically unparsable
    if self.relative_url not in COMMENT_BANLIST:
        self.comments = Comment.comments_from_html(comments_data)
        self.comments_n = len(self.comments)

    # memory cleanup - this saves a LOT of memory
    self._dom = None
    self._content_tag = None
def test_parsers():
    # Test parsers against http://www.zonerpress.cz/e-kniha-vyvoj-univerzalnych-aplikacii-pre-windows-8-a-windows-phone-8.1
    html = handle_encodnig(
        _get_source('http://www.zonerpress.cz/e-kniha-vyvoj-univerzalnych-aplikacii-pre-windows-8-a-windows-phone-8.1')
    )
    dom = dhtmlparser.parseString(html)
    dhtmlparser.makeDoubleLinked(dom)

    publisher = get_publisher(dom)
    assert publisher.getContent().strip() == 'Zoner Press'

    ISBN = get_ISBN(dom)
    assert ISBN.getContent().strip() == '978-80-7413-282-7'

    title = get_title(dom)
    assert title.getContent().strip() == 'E-kniha: V\xc3\xbdvoj univerz\xc3\xa1lnych aplik\xc3\xa1ci\xc3\xad pre windows 8 a Windows Phone 8.1'

    price = get_price(dom)
    assert price.getContent().strip() == '199 K\xc4\x8d'

    author = get_author(dom)
    assert author.getContent().strip().split() == [
        'Luboslav',
        'Lacko',
        '(<a',
        'href="http://www.zonerpress.cz/inshop/scripts/shop.aspx?action=DoSearch&limitedlevels=1&ParamID_1=Luboslav',
        'Lacko">Zobrazit',
        'v\xc5\xa1echny',
        'knihy',
        'tohoto',
        'autora</a>)',
    ]

    binding = get_binding(dom)
    assert binding.getContent().strip() == 'bro\xc5\xbeovan\xc3\xa1'

    pub_date = get_pub_date(dom)
    assert pub_date.getContent().strip() == '2014 (e-kniha)'

    pages = get_pages(dom)
    assert pages.getContent().strip() == '96 (pap\xc3\xadrov\xc3\xa1 kniha)'

    # Test parsers against http://www.zonerpress.cz/ocima-fotografa-graficky-pruvodce
    html = handle_encodnig(
        _get_source('http://www.zonerpress.cz/ocima-fotografa-graficky-pruvodce')
    )
    dom = dhtmlparser.parseString(html)
    dhtmlparser.makeDoubleLinked(dom)

    publisher = get_publisher(dom)
    assert publisher.getContent().strip() == 'Zoner Press'

    ISBN = get_ISBN(dom)
    assert ISBN.getContent().strip() == '978-80-7413-275-9'

    title = get_title(dom)
    assert title.getContent().strip() == 'O\xc4\x8dima fotografa: Grafick\xc3\xbd pr\xc5\xafvodce'

    price = get_price(dom)
    assert price.getContent().strip() == '360 K\xc4\x8d'

    author = get_author(dom)
    assert author.getContent().strip().split() == [
        'Michael',
        'Freeman',
        '(<a',
        'href="http://www.zonerpress.cz/inshop/scripts/shop.aspx?action=DoSearch&limitedlevels=1&ParamID_1=Michael',
        'Freeman">Zobrazit',
        'v\xc5\xa1echny',
        'knihy',
        'tohoto',
        'autora</a>)',
    ]

    binding = get_binding(dom)
    assert binding.getContent().strip() == 'M\xc4\x9bkk\xc3\xa1 s klopnami'

    pub_date = get_pub_date(dom)
    assert pub_date.getContent().strip() == '2014'

    pages = get_pages(dom)
    assert pages.getContent().strip() == '192'

    # Test parsers against http://www.zonerpress.cz/konec-prokrastinace
    html = handle_encodnig(
        _get_source('http://www.zonerpress.cz/konec-prokrastinace')
    )
    dom = dhtmlparser.parseString(html)
    dhtmlparser.makeDoubleLinked(dom)

    publisher = get_publisher(dom)
    assert publisher.getContent().strip() == 'Jan Melvil Publishing'

    ISBN = get_ISBN(dom)
    assert ISBN.getContent().strip() == '978-80-87270-51-6'

    title = get_title(dom)
    assert title.getContent().strip() == 'Konec prokrastinace'

    price = get_price(dom)
    assert price.getContent().strip() == '349 K\xc4\x8d'

    author = get_author(dom)
    assert author.getContent().strip().split() == [
        'Petr',
        'Ludwig',
        '(<a',
        'href="http://www.zonerpress.cz/inshop/scripts/shop.aspx?action=DoSearch&limitedlevels=1&ParamID_1=Petr',
        'Ludwig">Zobrazit',
        'v\xc5\xa1echny',
        'knihy',
        'tohoto',
        'autora</a>)',
    ]

    binding = get_binding(dom)
    assert binding.getContent().strip() == 'bro\xc5\xbeovan\xc3\xa1 s chlopn\xc4\x9bmi'

    pub_date = get_pub_date(dom)
    assert pub_date.getContent().strip() == '2013'

    pages = get_pages(dom)
    assert pages.getContent().strip() == '272'