コード例 #1
0
    def _get_last_five_tags(self):
        """
        Build the top and bottom containers of the "last five" listing.

        Both containers wrap the same ``self.last_five_html`` markup; they
        differ only in the ``id`` of the enclosing ``<div>``.

        Returns:
            tuple: ``(top_tag, bottom_tag)``, each being the first ``<div>``
                   found in its freshly parsed snippet.
        """
        top_markup = '<div id="last_five_top">\n%s\n</div>' % self.last_five_html
        top_div = dhtmlparser.parseString(top_markup).find("div")[0]

        bottom_markup = '<div id="last_five_bottom">\n%s\n</div>' % self.last_five_html
        bottom_div = dhtmlparser.parseString(bottom_markup).find("div")[0]

        return top_div, bottom_div
コード例 #2
0
ファイル: user.py プロジェクト: Bystroushaak/abclinuxuapi
    def add_concept(self, text, title, ts_of_pub=None):
        """
        Adds new concept into your concepts.

        Args:
            text (str): Text of your concept.
            title (str): Title of your concept. Do not use HTML in title!
            ts_of_pub (int/float, default None): Timestamp of the publication.

        Raises:
            ValueError: if the user doesn't have a blog.
            UserWarning: if the site is broken or user was logged out.
        """
        if not self.has_blog:
            raise ValueError("User doesn't have blog!")

        self.login()

        dom = dhtmlparser.parseString(self._get(self.blog_url))

        # get section with links to new blog; a real list is built (instead of
        # calling filter()) because the result is truth-tested and indexed
        # below, which a lazy filter object does not support
        s_sekce = [
            div for div in dom.find("div", {"class": "s_sekce"})
            if "Vlož nový zápis" in div.getContent()
        ]
        if not s_sekce:
            raise UserWarning("Can't resolve right div tag!")

        # get link to "add blog" page
        add_blog_links = [
            link.params["href"] for link in s_sekce[0].find("a")
            if "href" in link.params and
            link.params["href"].endswith("action=add")
        ]
        if not add_blog_links:
            raise UserWarning("Can't resolve user number!")
        add_blog_link = add_blog_links[0]

        # get "add blog" page
        data = self._get(ABCLINUXU_URL + add_blog_link)
        dom = dhtmlparser.parseString(data)

        form_action = dom.find("form", {"name": "form"})[0].params["action"]

        data = self.session.post(
            ABCLINUXU_URL + form_action,
            data={
                "cid": 0,
                "publish": shared.ts_to_concept_date(ts_of_pub),
                "content": text,
                # the title is sent with all tags stripped
                "title": dhtmlparser.removeTags(title),
                "delay": "Do konceptů",
                "action": "add2"
            },
            verify=False,
        )
        data = data.text.encode("utf-8")

        # NOTE(review): check_error_div presumably raises when the response
        # contains the given error div - confirm against its definition
        check_error_div(data, '<div class="error" id="contentError">')
        check_error_div(data, '<div class="error" id="titleError">')
        check_error_div(data, '<div class="error" id="titleError">')
コード例 #3
0
def test_remove_tags():
    """removeTags() keeps the text content and drops all the markup."""
    parsed = dhtmlparser.parseString("a<b>xax<i>xe</i>xi</b>d")
    assert dhtmlparser.removeTags(parsed) == "axaxxexid"

    # markup which holds no text at all has to give a falsy result
    for markup in ("<b></b>", "<b><i></b>", "<b><!-- asd --><i></b>"):
        parsed = dhtmlparser.parseString(markup)
        assert not dhtmlparser.removeTags(parsed)
コード例 #4
0
ファイル: user.py プロジェクト: vojtechkral/abclinuxuapi
    def add_concept(self, text, title, ts_of_pub=None):
        """
        Adds new concept into your concepts.

        Args:
            text (str): Text of your concept.
            title (str): Title of your concept. Do not use HTML in title!
            ts_of_pub (int/float, default None): Timestamp of the publication.

        Raises:
            ValueError: if the user doesn't have a blog.
            UserWarning: if the site is broken or user was logged out.
        """
        if not self.has_blog:
            raise ValueError("User doesn't have blog!")

        self.login()

        dom = dhtmlparser.parseString(self._get(self.blog_url))

        # get section with links to new blog; a real list is built (instead of
        # calling filter()) because the result is truth-tested and indexed
        # below, which a lazy filter object does not support
        s_sekce = [
            div for div in dom.find("div", {"class": "s_sekce"})
            if "Vlož nový zápis" in div.getContent()
        ]
        if not s_sekce:
            raise UserWarning("Can't resolve right div tag!")

        # get link to "add blog" page
        add_blog_links = [
            link.params["href"] for link in s_sekce[0].find("a")
            if "href" in link.params
            and link.params["href"].endswith("action=add")
        ]
        if not add_blog_links:
            raise UserWarning("Can't resolve user number!")
        add_blog_link = add_blog_links[0]

        # get "add blog" page
        data = self._get(ABCLINUXU_URL + add_blog_link)
        dom = dhtmlparser.parseString(data)

        form_action = dom.find("form", {"name": "form"})[0].params["action"]

        data = self.session.post(
            ABCLINUXU_URL + form_action,
            data={
                "cid": 0,
                "publish": shared.ts_to_concept_date(ts_of_pub),
                "content": text,
                # the title is sent with all tags stripped
                "title": dhtmlparser.removeTags(title),
                "delay": "Do konceptů",
                "action": "add2"
            },
            verify=False,
        )
        data = data.text.encode("utf-8")

        # NOTE(review): check_error_div presumably raises when the response
        # contains the given error div - confirm against its definition
        check_error_div(data, '<div class="error" id="contentError">')
コード例 #5
0
    def parse(self):
        """
        Parse ``self.content`` and prepare the cropped variant of the document.

        Sets ``self.dom``, ``self.cropped_content`` and ``self.cropped_dom``.
        When parsing raises ``UnicodeDecodeError``, ``self.content`` is
        replaced by its UTF-8 encoded form and the parsing is retried once.
        """
        try:
            parsed = dhtmlparser.parseString(self.content)
        except UnicodeDecodeError:
            # retry with the UTF-8 encoded form of the content
            self.content = self.content.encode("utf-8")
            parsed = dhtmlparser.parseString(self.content)
        self.dom = parsed

        self.cropped_content = self.crop_content(content=self.content,
                                                 dom=self.dom)
        self.cropped_dom = dhtmlparser.parseString(self.cropped_content)
コード例 #6
0
def parse_to_url(to_url='', title_word=''):
    """
    Download a page, pick the picture lists out of it and save every picture.

    Example page:
    http://gaoxiao.jokeji.cn/GrapHtml/quweigaoxiao/20140709211210.htm

    The page is decoded from GBK and re-encoded to UTF-8 before parsing. A
    ``<ul>`` is a target when its content holds no ``<a>``, at least one
    ``<b>`` and at least one ``<img style="CURSOR: hand">``. For every
    ``<img>`` of a target list, a record with ``src`` (prefixed by the
    module-level ``base_url``), ``title`` (the ``alt`` attribute) and
    ``overview`` (`title_word`) is handed over to ``save_img``.

    Args:
        to_url (str): URL of the page to download.
        title_word (str): Stored as ``overview`` of every saved picture.
    """
    raw_page = urllib2.urlopen(to_url).read()
    page = raw_page.decode('gbk').encode('utf8')
    dom = d.parseString(page)

    def is_target(ul):
        inner = d.parseString(ul.getContent())

        # rule 1: no <a> tag may be found under the ul
        if len(inner.find('a')) > 0:
            return False

        # rule 2: a <b> tag has to be found under the ul
        if len(inner.find('b')) == 0:
            return False

        # rule 3: at least one img with style="CURSOR: hand" under the ul
        return len(inner.find('img', {"style": "CURSOR: hand"})) != 0

    # whatever passes all the rules is considered to be a target list
    target_uls = [ul for ul in dom.find('ul') if is_target(ul)]

    # collect all the img tags of the target lists
    records = []
    for ul in target_uls:
        for img in d.parseString(ul.getContent()).find('img'):
            records.append({
                'src': base_url + img.params['src'],
                'title': img.params['alt'],
                'overview': title_word,
            })

    # store the pictures
    for record in records:
        save_img(record)
コード例 #7
0
def test_remove_tags():
    """removeTags() keeps the text content and drops all the markup."""
    parsed = dhtmlparser.parseString("a<b>xax<i>xe</i>xi</b>d")
    assert dhtmlparser.removeTags(parsed) == "axaxxexid"

    # markup which holds no text at all has to give a falsy result
    for markup in ("<b></b>", "<b><i></b>", "<b><!-- asd --><i></b>"):
        parsed = dhtmlparser.parseString(markup)
        assert not dhtmlparser.removeTags(parsed)
コード例 #8
0
    def _add_sidebar_skeletons_to_page(
            self, page: 'HtmlPage') -> Tuple[HTMLElement, HTMLElement]:
        """
        Put empty sidebar containers at the start and at the end of the body.

        Args:
            page: Page whose ``dom`` holds the ``<body>`` to be extended.

        Returns:
            The ``(top, bottom)`` ``<div>`` elements which were inserted.
        """
        def parse_div(markup):
            return dhtmlparser.parseString(markup).find("div")[0]

        top_div = parse_div("""<div id="sidebar_top"></div>""")
        bottom_div = parse_div('<div id="sidebar_bottom">\n</div>')

        # top container goes first, bottom container goes last
        body_children = page.dom.find("body")[0].childs
        body_children.insert(0, top_div)
        body_children.append(bottom_div)

        return top_div, bottom_div
コード例 #9
0
    def _add_syntax_highlight_for(cls, lexer, code, code_content):
        """
        Replace the parent of `code` with a highlighted ``<pre class="code">``.

        Args:
            lexer: Lexer class; it is instantiated without arguments.
            code: Element whose ``parent`` gets replaced.
            code_content: Source text handed over to the highlighter.
        """
        html_formatter = HtmlFormatter(wrapcode=False)
        highlighted_html = highlight(code_content, lexer(), html_formatter)
        pre = dhtmlparser.parseString(highlighted_html).find("pre")[0]

        # move the content of the <pre> into a fresh <code> wrapper, which
        # then becomes the only child of the <pre>
        wrapper = dhtmlparser.parseString("<code></code>").find("code")[0]
        wrapper.childs, pre.childs = pre.childs, [wrapper]
        pre.params["class"] = "code"

        code.parent.replaceWith(pre)
コード例 #10
0
def test_parseString_cip():
    """With ``cip=False`` the parameter names keep their original case."""
    markup = """<html><tag PARAM="true"></html>"""
    dom = dhtmlparser.parseString(markup, cip=False)

    assert dom.childs
    assert len(dom.childs) == 2

    # opening and closing <html> are both stored as top-level children
    opening, closing = dom.childs[0], dom.childs[1]

    assert opening.getTagName() == "html"
    assert closing.getTagName() == "html"

    assert opening.isOpeningTag()
    assert closing.isEndTag()

    assert opening.childs
    assert not closing.childs

    tag = opening.childs[0]

    assert tag.getTagName() == "tag"
    assert tag.params
    assert not tag.childs

    # only the exact spelling of the parameter name is available
    assert "param" not in tag.params
    assert "PARAM" in tag.params

    assert tag.params["PARAM"] == "true"

    with pytest.raises(KeyError):
        tag.params["param"]
コード例 #11
0
ファイル: blogpost.py プロジェクト: vojtechkral/abclinuxuapi
    def _parse_tags(tags_xml):
        """
        Convert the XML listing of tags into a list of ``Tag`` objects.

        Each ``<s>`` element gives one ``Tag`` built from its ``l`` and ``i``
        attributes (in this order).

        See http://www.abclinuxu.cz/ajax/tags/list for details.
        """
        parsed_tags = []
        for element in dhtmlparser.parseString(tags_xml).find("s"):
            attributes = element.params
            parsed_tags.append(Tag(attributes["l"], attributes["i"]))

        return parsed_tags
コード例 #12
0
ファイル: aleph.py プロジェクト: grimmo/edeposit.amqp.aleph
def getListOfBases():
    """
    This function is here mainly for purposes of unittest

    The downloaded page is lowercased before parsing, so the returned bases
    are lowercase as well.

    Returns:
        list of str: Valid bases as they are used as URL parameters in links at
                     Aleph main page.

    Raises:
        IndexError: if a link mentions ``local_base`` in its href, but holds
                    no ``local_base=`` section.
    """
    downer = Downloader()
    data = downer.download(ALEPH_URL + "/F/?func=file&file_name=base-list")
    dom = dhtmlparser.parseString(data.lower())

    # an explicit loop is used instead of chained filter()/map() calls, which
    # can't be indexed where they return lazy iterators
    bases = set()
    for link in dom.find("a"):
        # from default aleph page pick links containing local_base in href
        if "href" not in link.params:
            continue

        href = link.params["href"]
        if "local_base" not in href:
            continue

        # split link by & - only XXX from link.tld/..&local_base=XXX is needed
        sections = href.replace("?", "&", 1).split("&")
        base_section = [
            section for section in sections if "local_base=" in section
        ][0]

        bases.add(base_section.split("=")[1].strip())

    return list(bases)  # the set guarantees uniqueness of the bases
コード例 #13
0
ファイル: user.py プロジェクト: vojtechkral/abclinuxuapi
    def get_blogposts(self):
        """
        Return every PUBLISHED blogpost of the user. For unpublished ones, see
        :meth:`get_concepts`.

        Listing pages are downloaded one after another (the offset grows by
        ``BLOG_STEP``) until a page with no blogpost is reached.

        Returns:
            list: Blogpost objects sorted by ``created_ts`` (old->new). Empty \
                  list when the user has no blog.
        """
        if not self.has_blog:
            return []

        footer_mark = '<div class="s_nadpis linkbox_nadpis">Píšeme jinde</div>'
        listing_mark = '<div class="st" id="st">'

        def listing_part(page):
            # keep only what lies between the listing mark and the footer box
            before_footer = page.split(footer_mark)[0]
            return before_footer.split(listing_mark)[1]

        collected = []
        offset = 0
        while True:
            page = self._get(self._compose_blogposts_url(offset))
            listing = dhtmlparser.parseString(listing_part(page))

            found = []
            for blog_div in listing.find("div", {"class": "cl"}):
                found.append(Blogpost.from_html(blog_div))

            # a page without blogposts means the end of the listing
            if not found:
                break

            collected.extend(found)
            offset += BLOG_STEP

        return sorted(collected, key=lambda post: post.created_ts)
コード例 #14
0
    def remove_fluff(self, body):
        """
        Blank out the page furniture in `body` and return the article element.

        Every element matched by the selectors below is replaced with an empty
        parsed document. The selectors are applied one by one, in the listed
        order.

        Args:
            body: Parsed DOM which is searched and modified in place.

        Returns:
            First ``<article id="article">`` element found in `body`.
        """
        blank = dhtmlparser.parseString("")

        def drop(*find_args, **find_kwargs):
            for element in body.find(*find_args, **find_kwargs):
                element.replaceWith(blank)

        leading_selectors = (
            ("p", {"id": "copyright"}),
            ("aside", {"id": "sidebar"}),
            ("nav", {"id": "next-page"}),
            ("div", {"id": "comment_bubble_wrapper"}),
            ("div", {"class": "nocontent"}),
            ("div", {"class": "tertiary-content-wrapper"}),
            ("div", {"class": "more-link"}),
            ("div", {"class": "view-content"}),
            ("div", {"class": "block-content content"}),
            ("div", {"class": "region region-content-aside"}),
            ("div", {"role": "search"}),
        )
        for tag_name, attributes in leading_selectors:
            drop(tag_name, attributes)

        # this one is matched by a substring of the class parameter
        drop(
            "div",
            fn=lambda tag: "block-Eggplant-navigation" in tag.params.get(
                "class", ""),
        )
        drop("header")

        trailing_selectors = (
            ("div", {"id": "tertiary-content-wrapper"}),
            ("nav", {"class": "clearfix"}),
        )
        for tag_name, attributes in trailing_selectors:
            drop(tag_name, attributes)

        return body.find("article", {"id": "article"})[0]
コード例 #15
0
    def add_pic(self, opened_file):
        """
        Upload a picture to the Concept.

        The upload URL is read from the ``action`` of the multipart form found
        on the page linked as "Přidej obrázek" in the metadata.

        Args:
            opened_file (file): opened file object
        """
        # make sure the metadata are available
        if not self._meta:
            self._init_metadata()

        # download the page with the picture form
        form_page = download(
            url_context(self._meta["Přidej obrázek"]),
            session=self._session,
        )

        # read the target of the upload from the form
        forms = dhtmlparser.parseString(form_page).find(
            "form",
            {"enctype": "multipart/form-data"},
        )
        upload_url = first(forms).params["action"]

        # upload the picture
        form_fields = {
            "action": "addScreenshot2",
            "finish": "Nahrát"
        }
        response = self._session.post(
            url_context(upload_url),
            data=form_fields,
            files={"screenshot": opened_file},
        )

        response_body = response.text.encode("utf-8")
        check_error_div(response_body,
                        '<div class="error" id="screenshotError">')
コード例 #16
0
ファイル: user.py プロジェクト: vojtechkral/abclinuxuapi
    def get_concepts(self):
        """
        Return all concepts (unpublished blogs).

        Returns:
            list: List of Concept objects. Empty list when the blog page holds
                  no "Rozepsané zápisy" heading.

        Raises:
            ValueError: if the user doesn't have a blog.
        """
        if not self.has_blog:
            raise ValueError("User doesn't have blog!")

        self.login()

        # links to the concepts sit in a part of the page which has no wrapper
        # of its own, so everything up to the section heading is cut off first
        heading = '<div class="s_nadpis">Rozepsané zápisy</div>'
        page = self._get(self.blog_url)

        if heading not in page:
            return []

        after_heading = page.split(heading)[1]
        section = dhtmlparser.parseString(after_heading).find(
            "div",
            {"class": "s_sekce"}
        )[0]

        def to_concept(li):
            # each <li> stores the link to one concept
            anchor = li.find("a")[0]

            return Concept(
                title=anchor.getContent().strip(),
                link=anchor.params["href"],
                session=self.session,
            )

        return [to_concept(li) for li in section.find("li")]
コード例 #17
0
ファイル: user.py プロジェクト: vojtechkral/abclinuxuapi
    def from_user_id(user_id):
        """
        Transform `user_id` to instance of :class:`User`.

        The username is read from the last link of the profile page which
        points to ``/lide/...`` and whose content starts with "Seznam".

        Returns:
            obj: :class:`User` instance parsed from the `user_id`.
        """
        profile_page = shared.download(url_context("/Profile/" + str(user_id)))
        dom = dhtmlparser.parseString(profile_page)
        dhtmlparser.makeDoubleLinked(dom)

        shared.handle_errors(dom)

        def is_people_link(tag):
            return tag.params.get("href", "").startswith("/lide/")

        # example of the wanted link:
        # <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků
        # na abclinuxu.cz</a>
        listing_links = []
        for anchor in dom.find("a", fn=is_people_link):
            if anchor.getContent().startswith("Seznam"):
                listing_links.append(anchor.params["href"])

        # /lide/<username>/... -> <username>
        return User(listing_links[-1].split("/")[2])
コード例 #18
0
ファイル: user.py プロジェクト: vojtechkral/abclinuxuapi
    def _get_user_id(self):
        """
        Resolve user's ID number for logged user.

        The value is resolved only once; it is then kept in ``self._user_id``
        and returned directly by the following calls.

        Returns:
            str: USER id as string.

        Raises:
            ValueError: if no ``/Profile`` link is matched in the navigation.
        """
        if self._user_id is None:
            self.login()
            front_page = dhtmlparser.parseString(self._get(ABCLINUXU_URL))

            def is_profile_link(tag):
                return tag.params.get("href", "").startswith("/Profile")

            # links to the profile stored in user's navigation panel
            profile_links = front_page.match(
                ["div", {"class": "hl_vpravo"}],
                {"tag_name": "a", "fn": is_profile_link},
            )

            if not profile_links:
                raise ValueError("Can't parse user's navigation bar!")

            href = first(profile_links).params["href"]

            # transform /Profile/24642?action=myPage -> 24642
            self._user_id = href.split("?")[0].split("/")[-1]

        return self._user_id
コード例 #19
0
def get_html_lang_tags(index_page):
    """
    Return `languages` stored in ``<meta>`` tags.

    ``<meta http-equiv="Content-language" content="cs">`` -> ``cs``

    The ``http-equiv`` value is compared case-insensitively and tags with no
    ``content`` parameter are skipped.

    Args:
        index_page (str): HTML content of the page you wish to analyze.

    Returns:
        list: List of :class:`.SourceString` objects.
    """
    wanted = "content-language"

    def is_lang_meta(tag):
        return tag.params.get("http-equiv", "").lower() == wanted

    languages = []
    for meta in dhtmlparser.parseString(index_page).find("meta",
                                                         fn=is_lang_meta):
        if "content" in meta.params:
            languages.append(SourceString(meta.params["content"], "HTML"))

    return languages
コード例 #20
0
def _parse_format_pages_isbn(html_chunk):
    """
    Parse format, number of pages and ISBN.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        tuple: (format, pages, isbn); each item is a string, or None when it \
               couldn't be parsed.
    """
    ppi = get_first_content(
        html_chunk.find("div", {"class": "price-overflow"})
    )

    if not ppi:
        return None, None, None

    # all information this function should parse are at one line - the first
    # non-blank "<br />" separated segment; a list is built (instead of
    # filter()) because it is indexed, which a lazy filter object can't be
    segments = [
        segment for segment in ppi.split("<br />") if segment.strip()
    ]
    if not segments:  # content made only of whitespace / separators
        return None, None, None

    ppi = segments[0]

    # parse isbn - content of the first <b> tag, if there is any
    isbn = dhtmlparser.parseString(ppi)
    isbn = isbn.find("b")
    isbn = isbn[0].getContent() if isbn else None

    # parse pages and format - first two "|" separated items
    pages = None
    book_format = None
    details = ppi.split("|")

    if len(details) >= 2:
        book_format = details[0].strip()
        pages = details[1].strip()

    return book_format, pages, isbn
コード例 #21
0
        def cut_dom_to_area_of_interest(html):
            """
            Return the parent of the ``ds_toolbox`` div, stripped of all the
            children up to and including that div.

            The `html` parameter is never modified: a string is parsed into a
            new DOM and an ``HTMLElement`` is deep-copied first.

            Raises:
                ValueError: if no ``<div class="ds_toolbox">`` is found.
            """
            if isinstance(html, dhtmlparser.HTMLElement):
                tree = copy.deepcopy(html)
            else:
                tree = dhtmlparser.parseString(html)
            dhtmlparser.makeDoubleLinked(tree)

            # comments are not stored in hierarchical structure, but in somehow
            # flat-nested lists, which follow the end of the article (toolbox)
            toolboxes = tree.find("div", {"class": "ds_toolbox"})

            if not toolboxes:
                raise ValueError("Couldn't locate ds_toolbox!")

            toolbox = first(toolboxes)
            container = toolbox.parent
            siblings = container.childs

            # get rid of everything until the end of the article ..
            while siblings[0] != toolbox:
                siblings.pop(0)

            # .. and of the toolbox itself
            siblings.pop(0)

            return container
コード例 #22
0
ファイル: spotifier.py プロジェクト: Bystroushaak/spotifier
def login(username, password, http_proxy = None):
	"""
	Log into Spotify.

	This is useful, because users from unsupported countries have to log in
	through an IP from a supported country roughly every two weeks, otherwise
	their account is frozen until they do so.

	Function supports http_proxy parameter in format "http://server:port".

	Raise:
	 - SpotifierException if the login response reports an error.
	"""
	downloader = Downloader(http_proxy = http_proxy)

	# value of the "utm-keywords" input of the login page is sent back
	login_page = downloader.download(
		"https://www.spotify.com/us/login/?forward_url=%2Fus%2F",
	)
	keywords_input = html.parseString(login_page).find(
		"input",
		{"name": "utm-keywords"}
	)[0]

	response = downloader.download(
		"https://www.spotify.com/us/xhr/json/login.php",
		post = {
			"referrer": "",
			"utm-keywords": keywords_input.params["value"],
			"user_name": username,
			"password": password
		},
	)

	reply = json.loads(response)
	if reply["error"]:
		raise SpotifierException(reply["msg"])
コード例 #23
0
def _process_book(html_chunk):
    """
    Parse available information about the book from the book details page.

    Title, URL and authors are read from `html_chunk`; the remaining fields
    come from the ``<div id="kniha_detail">`` of the downloaded details page.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        obj: :class:`structures.Publication` instance with book details.
    """
    title, book_url = _parse_title_url(html_chunk)

    # the details are stored at a separate page
    details_page = handle_encodnig(DOWNER.download(book_url))
    details = dhtmlparser.parseString(details_page).find(
        "div",
        {"id": "kniha_detail"}
    )[0]

    # required parameters
    authors = _parse_authors(html_chunk)
    price = _parse_price(details)
    pub = Publication(
        title=title,
        authors=authors,
        price=price,
        publisher="CPress",
    )

    # optional parameters
    pub.optionals.URL = book_url
    optional_parsers = (
        ("EAN", _parse_ean),
        ("format", _parse_format),
        ("pub_date", _parse_date),
        ("description", _parse_description),
    )
    for field_name, parser in optional_parsers:
        value = parser(details)
        setattr(pub.optionals, field_name, value)

    return pub
コード例 #24
0
ファイル: mips_extractor.py プロジェクト: fm4d/clanky
def get_table():
    """
    Download the Wikipedia article "Instructions per second" and return the
    first table with class ``wikitable sortable`` found in it.
    """
    url = "https://en.wikipedia.org/wiki/Instructions_per_second"
    article = dhtmlparser.parseString(downloader.download(url))

    tables = article.find("table", {"class": "wikitable sortable"})
    return tables[0]
コード例 #25
0
ファイル: __init__.py プロジェクト: Bystroushaak/abclinuxuapi
def iter_blogposts(start=0, end=None, lazy=True):
    """
    Iterate over blogs. Based at bloglist.

    Args:
        start (int, default 0): Start at this page.
        end (int, default None): End at this page. The iteration stops after
            a page whose zero-based position in the sequence of downloaded
            pages is greater than or equal to this value.
        lazy (bool, default True): Initialize :class:`.Blogpost` objects only
             with information from listings. Don't download full text and
             comments.

    Yields:
        obj: :class:`.Blogpost` objects.
    """
    for page_index, page_url in enumerate(_next_blog_url(start)):
        listing = _remove_crap_from_bloglist(_shared.download(page_url))

        # parse basic info about all blogs at the page
        dom = _dhtmlparser.parseString(listing)

        # every page has 25 blogposts, but sometimes more of them are found;
        # anything after the 25th one is ignored
        for position, blog_div in enumerate(dom.findB("div", {"class": "cl"})):
            yield Blogpost.from_html(blog_div, lazy=lazy)

            if position >= 24:
                break

        # stop at the end of the pagination (detected at the bottom of the
        # page), or when the requested last page was processed
        last_page = not _should_continue(dom)
        if last_page or (end is not None and page_index >= end):
            break
コード例 #26
0
ファイル: user.py プロジェクト: Bystroushaak/abclinuxuapi
    def _get_user_id(self):
        """
        Resolve user's ID number for logged user.

        The value is resolved only once; it is then kept in ``self._user_id``
        and returned directly by the following calls.

        Returns:
            str: USER id as string.

        Raises:
            ValueError: if no ``/Profile`` link is matched in the navigation.
        """
        if self._user_id is None:
            self.login()
            front_page = dhtmlparser.parseString(self._get(ABCLINUXU_URL))

            def is_profile_link(tag):
                return tag.params.get("href", "").startswith("/Profile")

            # links to the profile stored in user's navigation panel
            profile_links = front_page.match(
                ["div", {"class": "hl_vpravo"}],
                {"tag_name": "a", "fn": is_profile_link},
            )

            if not profile_links:
                raise ValueError("Can't parse user's navigation bar!")

            href = first(profile_links).params["href"]

            # transform /Profile/24642?action=myPage -> 24642
            self._user_id = href.split("?")[0].split("/")[-1]

        return self._user_id
コード例 #27
0
def reg_header():
    """
    Return the first ``<div>`` parsed from a sample comment header whose
    author is written as a link to ``/lide/manasekp``.
    """
    header_html = """    <div class="ds_hlavicka" id="9">
        <div class="ds_reseni" style="display:none">
        </div>


        11.2. 15:21

<a href="/lide/manasekp">manasekp</a>             | skóre: 27
             | blog: <a href="/blog/manasekp">manasekp</a>
             | Brno

        <br>

            <span class="ds_control_sbalit2" id="comment9_toggle2">
                <a onClick="schovej_vlakno(9)" title="Schová nebo rozbalí celé vlákno">Rozbalit</a>
                <a onClick="rozbal_vse(9)" title="Schová nebo rozbalí vše pod tímto komentářem">Rozbalit vše</a>
            </span>

        Re: Bolest proxy


            <div id="comment9_controls">
                
                <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=add&amp;dizId=210591&amp;threadId=9">Odpovědět</a>
                    | <a onClick="schovej_vlakno(9)" id="comment9_toggle1" title="Schová nebo rozbalí celé vlákno" class="ds_control_sbalit3">Sbalit</a>
                    | <a href="#2" title="Odkaz na komentář o jednu úroveň výše">Výše</a>
                    | <a href="#9" title="Přímá adresa na tento komentář">Link</a>
                    | <a href="/EditUser;jsessionid=kufis2spplnh6gu671mxqe2j?action=toBlacklist&amp;bUid=9480&amp;url=/blog/show/400959#9" title="Přidá autora na seznam blokovaných uživatelů">Blokovat</a>
                | <a href="/blog/EditRequest/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=comment&amp;threadId=9" title="Žádost o přesun diskuse, stížnost na komentář">Admin</a>
            </div>

    </div>"""

    parsed = dhtmlparser.parseString(header_html)
    return parsed.find("div")[0]
コード例 #28
0
ファイル: user.py プロジェクト: Bystroushaak/abclinuxuapi
    def get_blogposts(self):
        """
        Return every PUBLISHED blogpost of the user. For unpublished ones, see
        :meth:`get_concepts`.

        Listing pages are downloaded one after another (the offset grows by
        ``BLOG_STEP``) until a page with no blogpost is reached.

        Returns:
            list: Blogpost objects sorted by ``created_ts`` (old->new). Empty \
                  list when the user has no blog.
        """
        if not self.has_blog:
            return []

        footer_mark = '<div class="s_nadpis linkbox_nadpis">Píšeme jinde</div>'
        listing_mark = '<div class="st" id="st">'

        def listing_part(page):
            # keep only what lies between the listing mark and the footer box
            before_footer = page.split(footer_mark)[0]
            return before_footer.split(listing_mark)[1]

        collected = []
        offset = 0
        while True:
            page = self._get(self._compose_blogposts_url(offset))
            listing = dhtmlparser.parseString(listing_part(page))

            found = []
            for blog_div in listing.find("div", {"class": "cl"}):
                found.append(Blogpost.from_html(blog_div))

            # a page without blogposts means the end of the listing
            if not found:
                break

            collected.extend(found)
            offset += BLOG_STEP

        return sorted(collected, key=lambda post: post.created_ts)
コード例 #29
0
def unreg_header():
    """
    Return the first ``<div>`` parsed from a sample comment header whose
    author is written as plain text (there is no link to a profile).
    """
    header_html = """    <div class="ds_hlavicka" id="3">
        <div class="ds_reseni" style="display:none">
        </div>


        10.2. 21:53

               Tomáškova máma

        <br>

            <span class="ds_control_sbalit2" id="comment3_toggle2">
                <a onClick="schovej_vlakno(3)" title="Schová nebo rozbalí celé vlákno">Rozbalit</a>
                <a onClick="rozbal_vse(3)" title="Schová nebo rozbalí vše pod tímto komentářem">Rozbalit vše</a>
            </span>

        Re: Bolest proxy


            <div id="comment3_controls">
                
                <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=add&amp;dizId=210591&amp;threadId=3">Odpovědět</a>
                    | <a onClick="schovej_vlakno(3)" id="comment3_toggle1" title="Schová nebo rozbalí celé vlákno" class="ds_control_sbalit3">Sbalit</a>
                    
                    | <a href="#3" title="Přímá adresa na tento komentář">Link</a>
                    | <a href="/EditUser;jsessionid=kufis2spplnh6gu671mxqe2j?action=toBlacklist&amp;bName=Tom%C3%A1%C5%A1kova%20m%C3%A1ma&amp;url=/blog/show/400959#3" title="Přidá autora na seznam blokovaných uživatelů">Blokovat</a>
                | <a href="/blog/EditRequest/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=comment&amp;threadId=3" title="Žádost o přesun diskuse, stížnost na komentář">Admin</a>
            </div>

    </div>
    """

    parsed = dhtmlparser.parseString(header_html)
    return parsed.find("div")[0]
コード例 #30
0
def test_parseString_cip():
    """With ``cip=False`` the parameter names keep their original case."""
    markup = """<html><tag PARAM="true"></html>"""
    dom = dhtmlparser.parseString(markup, cip=False)

    assert dom.childs
    assert len(dom.childs) == 2

    # opening and closing <html> are both stored as top-level children
    opening, closing = dom.childs[0], dom.childs[1]

    assert opening.getTagName() == "html"
    assert closing.getTagName() == "html"

    assert opening.isOpeningTag()
    assert closing.isEndTag()

    assert opening.childs
    assert not closing.childs

    tag = opening.childs[0]

    assert tag.getTagName() == "tag"
    assert tag.params
    assert not tag.childs

    # only the exact spelling of the parameter name is available
    assert "param" not in tag.params
    assert "PARAM" in tag.params

    assert tag.params["PARAM"] == "true"

    with pytest.raises(KeyError):
        tag.params["param"]
コード例 #31
0
def test_findOneB():
    """
    Check ``findOneB()``: a missing tag gives None, and the `params` and `skip`
    arguments select among the matches.

    The expected ids below correspond to the matches being ordered level by
    level (shallower tags before the deeper ones).
    """
    dom = dhtmlparser.parseString("""
        <root>
            <some id="first">
                <something id="first">
                    <xe id="wanted xe" />
                </something>
                <something id="second">
                    <xe id="another wanted xe" />
                    <something id="super" />
                </something>
                <xe id="another xe" />
            </some>
            <some id="second">
                <something id="last">
                    <xe id="last xe" />
                </something>
            </some>
        </root>
        """)

    none = dom.findOneB("nono")
    some = dom.findOneB("some")
    some2 = dom.findOneB("some", {"id": "second"})
    something = dom.findOneB("something", skip=2)
    another = dom.findOneB("xe")
    xe = dom.findOneB("xe", skip=1)

    # child counts include the whitespace between the tags
    assert len(some.childs) == 9
    assert len(some2.childs) == 4

    # identity check - `==` would go through the element's own __eq__
    assert none is None

    assert something.params["id"] == "last"
    assert another.params["id"] == "another xe"
    assert xe.params["id"] == "wanted xe"
コード例 #32
0
def test_findOne():
    """
    Check ``findOne()``: a missing tag gives None, and the `params` and `skip`
    arguments select among the matches.

    The expected ids below correspond to the matches being ordered as they
    appear in the document.
    """
    dom = dhtmlparser.parseString(
        """
        <root>
            <some id="first">
                <something id="first">
                    <xe id="wanted xe" />
                </something>
                <something id="second">
                    <xe id="another wanted xe" />
                    <something id="super" />
                </something>
                <xe id="another xe" />
            </some>
            <some id="second">
                <something id="last">
                    <xe id="last xe" />
                </something>
            </some>
        </root>
        """
    )

    none = dom.findOne("nono")
    some = dom.findOne("some")
    some2 = dom.findOne("some", {"id": "second"})
    something = dom.findOne("something", skip=2)
    xe = dom.findOne("xe")

    # child counts include the whitespace between the tags
    assert len(some.childs) == 9
    assert len(some2.childs) == 4

    # identity check - `==` would go through the element's own __eq__
    assert none is None

    assert something.params["id"] == "super"
    assert xe.params["id"] == "wanted xe"
コード例 #33
0
def test_match():
    """match() returns only the <xe> tags found at root/some/something/xe."""
    markup = """
        <root>
            <some>
                <something>
                    <xe id="wanted xe" />
                </something>
                <something>
                    <xe id="another wanted xe" />
                </something>
                <xe id="another xe" />
            </some>
            <some>
                <something>
                    <xe id="last wanted xe" />
                </something>
            </some>
        </root>
        """
    dom = dhtmlparser.parseString(markup)

    matched = dom.match("root", "some", "something", "xe")

    # "another xe" sits directly under <some>, so it must not be matched
    assert len(matched) == 3
    assert [tag.params["id"] for tag in matched] == [
        "wanted xe",
        "another wanted xe",
        "last wanted xe",
    ]
コード例 #34
0
def test_match_parameters():
    """
    Check ``match()`` with path steps given as a string, a dict and a list.

    The dict and list steps carry tag parameters which narrow the match.
    """
    markup = """
        <root>
            <div id="1">
                <div id="5">
                    <xe id="wanted xe" />
                </div>
                <div id="10">
                    <xe id="another wanted xe" />
                </div>
                <xe id="another xe" />
            </div>
            <div id="2">
                <div id="20">
                    <xe id="last wanted xe" />
                </div>
            </div>
        </root>
        """
    document = dhtmlparser.parseString(markup)

    # each step of the path uses a different notation
    path = (
        "root",
        {"tag_name": "div", "params": {"id": "1"}},
        ["div", {"id": "5"}],
        "xe",
    )
    matched = document.match(*path)

    assert len(matched) == 1
    assert first(matched).params["id"] == "wanted xe"
コード例 #35
0
def test_wfind_multiple_matches():
    """Check that chained ``wfind()`` calls collect every matching ``<xe>``."""
    markup = """
        <root>
            <some>
                <something>
                    <xe id="wanted xe" />
                </something>
                <something>
                    <xe id="another wanted xe" />
                </something>
                <xe id="another xe" />
            </some>
            <some>
                <something>
                    <xe id="last wanted xe" />
                </something>
            </some>
        </root>
        """
    container = dhtmlparser.parseString(markup)

    # descend one level per tag name, same as chaining the calls by hand
    for tag_name in ("root", "some", "something", "xe"):
        container = container.wfind(tag_name)

    assert len(container.childs) == 3

    expected_ids = ["wanted xe", "another wanted xe", "last wanted xe"]
    for position, expected_id in enumerate(expected_ids):
        assert container.childs[position].params["id"] == expected_id
コード例 #36
0
def test_wfind_complicated():
    """
    Check ``wfind()`` chains finished by ``find()`` and chains with no match.
    """
    markup = """
        <root>
            <some>
                <something>
                    <xe id="wanted xe" />
                </something>
                <something>
                    asd
                </something>
                <xe id="another xe" />
            </some>
            <some>
                else
                <xe id="yet another xe" />
            </some>
        </root>
        """
    document = dhtmlparser.parseString(markup)

    somethings = document.wfind("root").wfind("some").wfind("something")
    matched = somethings.find("xe")

    assert len(matched) == 1
    assert first(matched).params["id"] == "wanted xe"

    # a chain through non-existent tags must end with no children
    missing = document.wfind("root").wfind("pink").wfind("unicorn")

    assert not missing.childs
コード例 #37
0
def get_publications():
    """
    Get list of publication offered by ben.cz.

    Returns:
        list: List of :class:`structures.Publication` objects.

    Raises:
        AssertionError: If the downloaded page doesn't contain the expected
            book ``<div>`` tags, or a book without link to its details.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(data)

    book_list = dom.find("div", {"class": "seznamKniha"})

    # Raised explicitly (instead of using the `assert` statement) so the
    # check is not stripped when python runs with -O. The exception type is
    # kept for backward compatibility.
    if not book_list:
        raise AssertionError("Can't find <div> with class 'seznamKniha'!")

    books = []
    for html_chunk in book_list:
        links = html_chunk.find("a")

        if not links:
            raise AssertionError("Can't find link to the details of the book!")

        detail_link = links[0]

        # books whose link contains the "ruzek pripravujeme" span are skipped
        if detail_link.find("span", {"class": "ruzek pripravujeme"}):
            continue

        books.append(
            _process_book(detail_link.params["href"])
        )

    return books
コード例 #38
0
def test_findNextB():
    """
    Check that ``findNextB()`` returns a generator and the tags it yields.
    """
    markup = """
        <root>
            <div>
                <something />
                <div id=2>
                    <xe />
                </div>
            </div>
            <div id="three">
            </div>
            <div id=4>
                <some>
                    <div>foo</div>
                </some>
                <div />
            </div>
        </root>
        """
    dom = dhtmlparser.parseString(markup)

    div_iterator = dom.findNextB("div")

    # the result has to be lazy, not a ready-made list
    assert isinstance(div_iterator, GeneratorType)

    divs = list(div_iterator)

    assert len(divs) == 11
    assert len(divs[0].childs) == 6
    assert divs[4].params["id"] == "4"
コード例 #39
0
ファイル: concept.py プロジェクト: Bystroushaak/abclinuxuapi
    def add_pic(self, opened_file):
        """
        Upload a picture and attach it to the Concept.

        Args:
            opened_file (file): Opened file object, sent as the
                ``screenshot`` upload field.
        """
        # metadata are initialized on first use
        if not self._meta:
            self._init_metadata()

        # download the page with the picture form
        form_page = download(
            url_context(self._meta["Přidej obrázek"]),
            session=self._session
        )

        # the upload form is recognized by its multipart encoding
        upload_forms = dhtmlparser.parseString(form_page).find(
            "form",
            {"enctype": "multipart/form-data"}
        )
        upload_url = first(upload_forms).params["action"]

        # send the picture
        response = self._session.post(
            url_context(upload_url),
            data={
                "action": "addScreenshot2",
                "finish": "Nahrát"
            },
            files={"screenshot": opened_file}
        )

        # look for the error div in the server's answer
        check_error_div(
            response.text.encode("utf-8"),
            '<div class="error" id="screenshotError">'
        )
コード例 #40
0
def test_match():
    """Check that ``match()`` returns the ``<xe>`` tags on the given path."""
    html = """
        <root>
            <some>
                <something>
                    <xe id="wanted xe" />
                </something>
                <something>
                    <xe id="another wanted xe" />
                </something>
                <xe id="another xe" />
            </some>
            <some>
                <something>
                    <xe id="last wanted xe" />
                </something>
            </some>
        </root>
        """
    parsed = dhtmlparser.parseString(html)

    hits = parsed.match("root", "some", "something", "xe")

    # <xe id="another xe"> is not inside <something>, so it is not expected
    assert len(hits) == 3

    hit_ids = [hit.params["id"] for hit in hits]
    assert hit_ids == [
        "wanted xe",
        "another wanted xe",
        "last wanted xe",
    ]
コード例 #41
0
def test_wfind_multiple_matches():
    """Check that a ``wfind()`` chain gathers all ``<xe>`` tags on the path."""
    html = """
        <root>
            <some>
                <something>
                    <xe id="wanted xe" />
                </something>
                <something>
                    <xe id="another wanted xe" />
                </something>
                <xe id="another xe" />
            </some>
            <some>
                <something>
                    <xe id="last wanted xe" />
                </something>
            </some>
        </root>
        """
    parsed = dhtmlparser.parseString(html)

    roots = parsed.wfind("root")
    somes = roots.wfind("some")
    somethings = somes.wfind("something")
    result = somethings.wfind("xe")

    assert len(result.childs) == 3

    # the order of the matches follows the order in the document
    first_hit, second_hit, third_hit = result.childs[:3]
    assert first_hit.params["id"] == "wanted xe"
    assert second_hit.params["id"] == "another wanted xe"
    assert third_hit.params["id"] == "last wanted xe"
コード例 #42
0
def reg_header():
    """
    Parse a sample comment header and return its first ``<div>`` element.

    In this sample, the author of the comment is a link to the user's profile.
    """
    header_html = """    <div class="ds_hlavicka" id="9">
        <div class="ds_reseni" style="display:none">
        </div>


        11.2. 15:21

<a href="/lide/manasekp">manasekp</a>             | skóre: 27
             | blog: <a href="/blog/manasekp">manasekp</a>
             | Brno

        <br>

            <span class="ds_control_sbalit2" id="comment9_toggle2">
                <a onClick="schovej_vlakno(9)" title="Schová nebo rozbalí celé vlákno">Rozbalit</a>
                <a onClick="rozbal_vse(9)" title="Schová nebo rozbalí vše pod tímto komentářem">Rozbalit vše</a>
            </span>

        Re: Bolest proxy


            <div id="comment9_controls">
                
                <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=add&amp;dizId=210591&amp;threadId=9">Odpovědět</a>
                    | <a onClick="schovej_vlakno(9)" id="comment9_toggle1" title="Schová nebo rozbalí celé vlákno" class="ds_control_sbalit3">Sbalit</a>
                    | <a href="#2" title="Odkaz na komentář o jednu úroveň výše">Výše</a>
                    | <a href="#9" title="Přímá adresa na tento komentář">Link</a>
                    | <a href="/EditUser;jsessionid=kufis2spplnh6gu671mxqe2j?action=toBlacklist&amp;bUid=9480&amp;url=/blog/show/400959#9" title="Přidá autora na seznam blokovaných uživatelů">Blokovat</a>
                | <a href="/blog/EditRequest/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=comment&amp;threadId=9" title="Žádost o přesun diskuse, stížnost na komentář">Admin</a>
            </div>

    </div>"""

    divs = dhtmlparser.parseString(header_html).find("div")
    return divs[0]
コード例 #43
0
def test_wfind_complicated():
    """
    Check mixing of ``wfind()`` with ``find()`` and a chain without matches.
    """
    html = """
        <root>
            <some>
                <something>
                    <xe id="wanted xe" />
                </something>
                <something>
                    asd
                </something>
                <xe id="another xe" />
            </some>
            <some>
                else
                <xe id="yet another xe" />
            </some>
        </root>
        """
    parsed = dhtmlparser.parseString(html)

    hits = parsed.wfind("root").wfind("some").wfind("something").find("xe")

    # only one <something> actually contains an <xe> tag
    assert len(hits) == 1
    assert first(hits).params["id"] == "wanted xe"

    # none of the "pink" / "unicorn" tags exist in the document
    nothing = parsed.wfind("root").wfind("pink").wfind("unicorn")

    assert not nothing.childs
コード例 #44
0
def unreg_header():
    """
    Parse a sample comment header and return its first ``<div>`` element.

    In this sample, the author of the comment is a plain text, not a link.
    """
    header_html = """    <div class="ds_hlavicka" id="3">
        <div class="ds_reseni" style="display:none">
        </div>


        10.2. 21:53

               Tomáškova máma

        <br>

            <span class="ds_control_sbalit2" id="comment3_toggle2">
                <a onClick="schovej_vlakno(3)" title="Schová nebo rozbalí celé vlákno">Rozbalit</a>
                <a onClick="rozbal_vse(3)" title="Schová nebo rozbalí vše pod tímto komentářem">Rozbalit vše</a>
            </span>

        Re: Bolest proxy


            <div id="comment3_controls">
                
                <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=add&amp;dizId=210591&amp;threadId=3">Odpovědět</a>
                    | <a onClick="schovej_vlakno(3)" id="comment3_toggle1" title="Schová nebo rozbalí celé vlákno" class="ds_control_sbalit3">Sbalit</a>
                    
                    | <a href="#3" title="Přímá adresa na tento komentář">Link</a>
                    | <a href="/EditUser;jsessionid=kufis2spplnh6gu671mxqe2j?action=toBlacklist&amp;bName=Tom%C3%A1%C5%A1kova%20m%C3%A1ma&amp;url=/blog/show/400959#3" title="Přidá autora na seznam blokovaných uživatelů">Blokovat</a>
                | <a href="/blog/EditRequest/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=comment&amp;threadId=3" title="Žádost o přesun diskuse, stížnost na komentář">Admin</a>
            </div>

    </div>
    """

    divs = dhtmlparser.parseString(header_html).find("div")
    return divs[0]
コード例 #45
0
def test_predecesors_pattern():
    """
    Check that ``predecesors_pattern()`` describes ``<x>`` by its parent tags.

    Exactly one "match" call listing root, xex and x is expected.
    """
    markup = """
        <root>
            <xex>
                <x>content</x>
            </xex>
        </root>
        """
    dom = dhtmlparser.parseString(markup)
    dhtmlparser.makeDoubleLinked(dom)

    target = dom.find("x")[0]

    calls = path_patterns.predecesors_pattern(target, dom)

    assert calls
    assert len(calls) == 1

    path_call = calls[0]
    assert isinstance(path_call, path_patterns.PathCall)

    assert path_call.call_type == "match"
    assert path_call.index == 0

    # path from the root of the document down to the <x> tag
    expected_path = [
        ["root", None],
        ["xex", None],
        ["x", None],
    ]
    assert path_call.params == expected_path
コード例 #46
0
def test_neighbours_pattern_text_neigh():
    """
    Check ``neighbours_pattern()`` for a tag with text and tag neighbours.

    The ``<xex>`` tag has plain text ("asd") on the left and the ``<xep>`` tag
    on the right, so one left and one right neighbour call are expected.
    """
    dom = dhtmlparser.parseString(
        """
        asd
        <xex>\tHello</xex>
        <xep></xep>
        asd
        """
    )
    dhtmlparser.makeDoubleLinked(dom)

    xex = dom.find("xex")[0]
    res = path_patterns.neighbours_pattern(xex)

    assert res
    assert len(res) == 2

    left, right = res

    assert left.call_type == "left_neighbour_tag"
    assert left.index == 0
    assert left.params.tag_name == "xex"
    assert left.params.params is None
    assert left.params.fn_params == [None, None, "asd"]

    # The original repeated the `res[0]` (left) checks here, which left the
    # tag description of the right call untested.
    # NOTE(review): assumes the right call describes the same <xex> element
    # as the left one - confirm against neighbours_pattern().
    assert right.call_type == "right_neighbour_tag"
    assert right.index == 0
    assert right.params.tag_name == "xex"
    assert right.params.params is None
    assert right.params.fn_params == ["xep", None, ""]
コード例 #47
0
ファイル: reddit_filter.py プロジェクト: Bystroushaak/rss_gen
def filter_feed(chan_id, filter_item):
    """
    Download the feed and blank out entries picked by `filter_item`.

    Args:
        chan_id: Identifier passed to ``_download_feed()``.
        filter_item (callable): Called for each ``<entry>`` with keyword
            arguments ``title``, ``link``, ``real_link``, ``pub_date`` and
            ``description``. When it returns a true value, the entry is
            replaced with an empty element.

    Returns:
        str: XML declaration followed by the prettified feed without its
            first line.
    """
    feed_dom = dhtmlparser.parseString(_download_feed(chan_id))

    for entry in feed_dom.find("entry"):
        title = _pick_item_property(entry, "title")
        link_tag = _pick_item_property(entry, "link")
        published = _pick_item_property(entry, "published")
        content = _pick_item_property(entry, "content")
        real_link = _parse_description_link(content)

        # the address is stored in the href parameter of the link tag
        link = link_tag.params.get("href", None) if link_tag else link_tag

        should_blank = filter_item(
            title=title,
            link=link,
            real_link=real_link,
            pub_date=published,
            description=content,
        )
        if not should_blank:
            continue

        entry.replaceWith(dhtmlparser.HTMLElement(""))

    # first line of the prettified output is swapped for the declaration
    pretty_lines = feed_dom.prettify().splitlines()
    feed_body = "\n".join(pretty_lines[1:])

    return '<?xml version="1.0" encoding="UTF-8"?>' + feed_body
コード例 #48
0
ファイル: user.py プロジェクト: Bystroushaak/abclinuxuapi
    def from_user_id(user_id):
        """
        Create :class:`User` instance from the `user_id`.

        The username is read from the profile page, where it is the third
        part of the path in the last ``/lide/...`` link whose content starts
        with "Seznam".

        Args:
            user_id: Identifier appended to the ``/Profile/`` address.

        Returns:
            obj: :class:`User` instance parsed from the `user_id`.
        """
        profile_url = url_context("/Profile/" + str(user_id))
        dom = dhtmlparser.parseString(shared.download(profile_url))
        dhtmlparser.makeDoubleLinked(dom)

        shared.handle_errors(dom)

        def _points_to_user(tag):
            return tag.params.get("href", "").startswith("/lide/")

        # example of the wanted tag:
        # <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků
        # na abclinuxu.cz</a>
        links = []
        for a_tag in dom.find("a", fn=_points_to_user):
            # keep only links which have content that starts with Seznam
            if a_tag.getContent().startswith("Seznam"):
                links.append(a_tag.params["href"])

        # "/lide/<username>/..." -> "<username>"
        path_parts = links[-1].split("/")

        return User(path_parts[2])
コード例 #49
0
def test_match_parameters():
    """
    Check ``match()`` when the path steps also restrict the tag parameters.
    """
    html = """
        <root>
            <div id="1">
                <div id="5">
                    <xe id="wanted xe" />
                </div>
                <div id="10">
                    <xe id="another wanted xe" />
                </div>
                <xe id="another xe" />
            </div>
            <div id="2">
                <div id="20">
                    <xe id="last wanted xe" />
                </div>
            </div>
        </root>
        """
    parsed = dhtmlparser.parseString(html)

    # dict notation of a path step
    outer_div = {"tag_name": "div", "params": {"id": "1"}}
    # list notation of a path step
    inner_div = ["div", {"id": "5"}]

    hits = parsed.match("root", outer_div, inner_div, "xe")

    assert len(hits) == 1
    assert first(hits).params["id"] == "wanted xe"
コード例 #50
0
def test_findNextB():
    """
    Check the type and the content of the ``findNextB()`` result.
    """
    html = """
        <root>
            <div>
                <something />
                <div id=2>
                    <xe />
                </div>
            </div>
            <div id="three">
            </div>
            <div id=4>
                <some>
                    <div>foo</div>
                </some>
                <div />
            </div>
        </root>
        """
    parsed = dhtmlparser.parseString(html)

    found = parsed.findNextB("div")

    # elements have to be produced lazily
    assert isinstance(found, GeneratorType)

    collected = list(found)

    assert len(collected) == 11
    assert len(collected[0].childs) == 6
    assert collected[4].params["id"] == "4"
コード例 #51
0
ファイル: concept.py プロジェクト: Bystroushaak/abclinuxuapi
    def edit(self, text, title=None, date_of_pub=None):
        """
        Edit concept.

        Args:
            text (str): New text of the concept.
            title (str, default None): New title of the concept. If not set,
                  old title is used.
            date_of_pub (str/int, default None): Date string in abclinuxu
                        format or timestamp determining when the concept should
                        be automatically published. If not set, the value
                        from the edit form is kept.

        Note:
            `date_of_pub` can be string in format ``"%Y-%m-%d %H:%M"``.

        Raises:
            AssertionError: If the edit form can't be found in the page.
        """
        if not self._meta:
            self._init_metadata()

        data = download(
            url_context(self._meta["Uprav zápis"]),
            session=self._session
        )
        dom = dhtmlparser.parseString(data)

        form = dom.find("form", {"name": "form"})

        # Raised explicitly (instead of using the `assert` statement) so the
        # check is not stripped when python runs with -O. The exception type
        # is kept for backward compatibility.
        if not form:
            raise AssertionError("Can't find edit form!")
        form = first(form)

        form_action = form.params["action"]

        # reuse the old title from the form
        if title is None:
            title = first(form.find("input", {"name": "title"}))
            title = title.params["value"]

        # every branch assigns `date`, so no default value is required
        if date_of_pub is None:
            date = first(form.find("input", {"name": "publish"}))
            date = date.params["value"]
        elif isinstance(date_of_pub, basestring):
            date = date_of_pub
        else:
            date = ts_to_concept_date(date_of_pub)

        data = download(
            url=url_context(form_action),
            method="POST",
            data={
                "cid": 0,
                "publish": date,
                "content": text,
                "title": title,
                "delay": "Ulož",
                "action": "edit2"
            },
            session=self._session
        )
        check_error_div(data, '<div class="error" id="contentError">')
        check_error_page(data)
コード例 #52
0
def test_containsParamSubset():
    """Check ``containsParamSubset()`` with matching and non-matching dicts."""
    parsed = dhtmlparser.parseString("<div id=x class=xex></div>")
    div = first(parsed.find("div"))

    # every subset of the real parameters has to be accepted
    matching_subsets = [
        {"id": "x"},
        {"class": "xex"},
        {"id": "x", "class": "xex"},
    ]
    for subset in matching_subsets:
        assert div.containsParamSubset(subset)

    # one unknown parameter is enough to reject the whole dict
    superset = {"asd": "bsd", "id": "x", "class": "xex"}
    assert not div.containsParamSubset(superset)
コード例 #53
0
def test_parse_description_missing():
    """Check that an empty description ``<div>`` is parsed as ``None``."""
    dom = d.parseString("""
    <div class="detailPopis"></div>
    """)

    description = ben_cz._parse_description(dom)

    assert description is None
コード例 #54
0
def test_equality_of_output_with_comment():
    """Check that a document with an HTML comment is serialized unchanged."""
    source = """<head>
    <!-- <link rel="stylesheet" type="text/css" href="style.css"> -->
</head>
"""
    parsed = dhtmlparser.parseString(source)

    # serialized output has to be the same as the parsed input
    assert str(parsed) == source
コード例 #55
0
def test_params():
    """Check reading of tag parameters and their removal by assignment."""
    element = first(dhtmlparser.parseString("<xe id=1 />").find("xe"))

    # unquoted value is available as a string
    assert element.params["id"] == "1"

    # tag without parameters is serialized without them
    element.params = {}
    assert str(element) == "<xe />"
コード例 #56
0
def test_params():
    """Check that ``params`` can be read and replaced with an empty dict."""
    parsed = dhtmlparser.parseString("<xe id=1 />")
    tags = parsed.find("xe")
    tag = first(tags)

    assert tag.params["id"] == "1"

    # after the reset, no parameter may appear in the output
    tag.params = {}
    serialized = str(tag)
    assert serialized == "<xe />"
コード例 #57
0
    def _apply_blacklist(self, text):
        """
        Replace all blacklisted tags in `text` with an empty parsed document.

        Args:
            text (str): Markup to be filtered.

        Returns:
            str: Serialized document after the replacement.
        """
        blacklisted_tags = ("i", "a", "bq", "pre", "italic", "blockquote")

        dom = dhtmlparser.parseString(text)

        def _is_blacklisted(element):
            return element.getTagName() in blacklisted_tags

        # empty tag name; the selection is done by the callback
        for element in dom.find("", fn=_is_blacklisted):
            element.replaceWith(dhtmlparser.parseString(""))

        return str(dom)
    def _get_twitter_button_tag(cls):
        """
        Return parsed markup of the twitter share button.

        The image address is taken from ``AddStaticFiles.tweet_button_ref``.
        """
        template = (
            '<a class="twitter-share-button" id="twitter_button" href="#">'
            '<img src="%s" />'
            '</a>\n'
        )

        return dhtmlparser.parseString(
            template % AddStaticFiles.tweet_button_ref
        )