コード例 #1
0
    def _parse_meta(self):
        """
        Read the ``<p class="meta-vypis">`` paragraph of the content tag.

        Sets ``has_tux`` to True when the paragraph contains an
        ``<img class="blog_digest">``, always sets ``created_ts`` and, if the
        matching text lines are present, also ``last_modified_ts`` and
        ``readed``. Returns without touching anything when the paragraph is
        not found.
        """
        meta_paragraphs = self._parse_content_tag().find(
            "p",
            {"class": "meta-vypis"}
        )
        if not meta_paragraphs:
            return

        meta_paragraph = first(meta_paragraphs)

        # presence of the blog_digest image is the only thing checked here
        if meta_paragraph.find("img", {"class": "blog_digest"}):
            self.has_tux = True

        # the values are not marked up, so continue with tag-free text
        plain_text = dhtmlparser.removeTags(meta_paragraph)
        self.created_ts = parse_timestamp(plain_text)

        # remaining values are looked up line by line
        rows = plain_text.strip().splitlines()

        def rows_with(label):
            return [row for row in rows if label in row]

        # last modification time - text after the last ": " on the line
        modified_rows = rows_with("poslední úprava:")
        if modified_rows:
            modified_text = first(modified_rows).rsplit(": ", 1)[-1]
            self.last_modified_ts = parse_timestamp(modified_text)

        # number of reads - text between the last ":" and the first "&"
        read_rows = rows_with("Přečteno:")
        if read_rows:
            after_colon = first(read_rows).rpartition(":")[2]
            self.readed = int(after_colon.partition("&")[0])
コード例 #2
0
ファイル: blogpost.py プロジェクト: vojtechkral/abclinuxuapi
    def _parse_meta(self):
        """
        Read the ``<p class="meta-vypis">`` paragraph of the content tag.

        Sets ``has_tux`` to True when the paragraph contains an
        ``<img class="blog_digest">``, always sets ``created_ts`` and, if the
        matching text lines are present, also ``last_modified_ts`` and
        ``readed``. Returns without touching anything when the paragraph is
        not found.
        """
        meta_paragraphs = self._parse_content_tag().find(
            "p",
            {"class": "meta-vypis"}
        )
        if not meta_paragraphs:
            return

        meta_paragraph = first(meta_paragraphs)

        # presence of the blog_digest image is the only thing checked here
        if meta_paragraph.find("img", {"class": "blog_digest"}):
            self.has_tux = True

        # the values are not marked up, so continue with tag-free text
        plain_text = dhtmlparser.removeTags(meta_paragraph)
        self.created_ts = parse_timestamp(plain_text)

        # remaining values are looked up line by line
        rows = plain_text.strip().splitlines()

        def rows_with(label):
            return [row for row in rows if label in row]

        # last modification time - text after the last ": " on the line
        modified_rows = rows_with("poslední úprava:")
        if modified_rows:
            modified_text = first(modified_rows).rsplit(": ", 1)[-1]
            self.last_modified_ts = parse_timestamp(modified_text)

        # number of reads - text between the last ":" and the first "&"
        read_rows = rows_with("Přečteno:")
        if read_rows:
            after_colon = first(read_rows).rpartition(":")[2]
            self.readed = int(after_colon.partition("&")[0])
コード例 #3
0
    def _izolate_timestamp(head_tag):
        """
        Parse timestamp from the text (non-tag) elements under `head_tag`.

        Args:
            head_tag (obj): Element whose ``.find()`` is used to collect every
                     sub-element for which ``isTag()`` is false. Presumably a
                     ``dhtmlparser.HTMLElement`` - confirm against callers.

        Returns:
            Result of ``parse_timestamp()`` called with the list of all text
            lines of those elements, in document order of ``.find()``.
        """
        text_elements = head_tag.find(None, fn=lambda x: not x.isTag())

        # Flatten the per-element lines in linear time. The previous
        # ``sum(list_of_lists, [])`` copied the accumulated list on every
        # step, which is quadratic in the number of lines.
        lines = []
        for element in text_elements:
            lines.extend(str(element).splitlines())

        return parse_timestamp(lines)
コード例 #4
0
    def from_html(html, lazy=True):
        """
        Create :class:`Blogpost` instance from HTML.

        Args:
            html (str): HTML source. An already parsed
                 ``dhtmlparser.HTMLElement`` is used as it is.
            lazy (bool, default True): Don't download anything from the site;
                 the instance is filled only with values found in `html`.
                 Use :meth:`pull` to actively download the rest.

        Returns:
            obj: :class:`Blogpost` instance.
        """
        dom = html
        if not isinstance(dom, dhtmlparser.HTMLElement):
            dom = dhtmlparser.parseString(dom)
            dhtmlparser.makeDoubleLinked(dom)

        legacy_headings = dom.find("h2", {"class": "st_nadpis"})
        if not legacy_headings:
            # no legacy heading - take first <h2> and the canonical link
            heading = first(dom.find("h2"))
            canonical = first(dom.find("link", {"rel": "canonical"}))
            url = canonical.params["href"]
        else:
            # legacy blogs - link from the heading goes thru url_context()
            heading = first(legacy_headings)
            anchor = first(heading.find("a"))
            url = url_context(anchor.params["href"])

        title = dhtmlparser.removeTags(heading).strip()

        # meta paragraph is required - IndexError is raised when missing
        meta = dom.find("p", {"class": "meta-vypis"})[0]

        post = Blogpost(url=url, lazy=lazy)
        if not lazy:
            return post

        post.title = title
        post.intro = Blogpost._parse_intro(dom, meta, heading)
        post.rating = Blogpost._parse_rating_from_preview(meta)
        post.created_ts = parse_timestamp(meta)
        post.comments_n = Blogpost._parse_comments_n(meta)

        return post
コード例 #5
0
ファイル: blogpost.py プロジェクト: vojtechkral/abclinuxuapi
    def from_html(html, lazy=True):
        """
        Create :class:`Blogpost` instance from HTML.

        Args:
            html (str): HTML source. An already parsed
                 ``dhtmlparser.HTMLElement`` is used as it is.
            lazy (bool, default True): Don't download anything from the site;
                 the instance is filled only with values found in `html`.
                 Use :meth:`pull` to actively download the rest.

        Returns:
            obj: :class:`Blogpost` instance.
        """
        dom = html
        if not isinstance(dom, dhtmlparser.HTMLElement):
            dom = dhtmlparser.parseString(dom)
            dhtmlparser.makeDoubleLinked(dom)

        legacy_headings = dom.find("h2", {"class": "st_nadpis"})
        if not legacy_headings:
            # no legacy heading - take first <h2> and the canonical link
            heading = first(dom.find("h2"))
            canonical = first(dom.find("link", {"rel": "canonical"}))
            url = canonical.params["href"]
        else:
            # legacy blogs - link from the heading goes thru url_context()
            heading = first(legacy_headings)
            anchor = first(heading.find("a"))
            url = url_context(anchor.params["href"])

        title = dhtmlparser.removeTags(heading).strip()

        # meta paragraph is required - IndexError is raised when missing
        meta = dom.find("p", {"class": "meta-vypis"})[0]

        post = Blogpost(url=url, lazy=lazy)
        if not lazy:
            return post

        post.title = title
        post.intro = Blogpost._parse_intro(dom, meta, heading)
        post.rating = Blogpost._parse_rating_from_preview(meta)
        post.created_ts = parse_timestamp(meta)
        post.comments_n = Blogpost._parse_comments_n(meta)

        return post