Ejemplo n.º 1
0
def parse(html):
    '''Return a list of dictionaries describing the stories on the front page.

    Each dictionary has the keys ``pos`` (1-based position among the stories
    kept), ``title``, ``url``, ``domain`` (netloc with everything up to and
    including the last ``www.`` removed), ``comments``, ``submitter``,
    ``points``, ``id`` and ``ago``.

    Title anchors whose following table row has no ``.subtext`` element, or
    whose subtext has fewer than two child elements, are skipped.
    '''
    elements = []
    p = PyQuery(html)
    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml
        # objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:
            # More link
            continue
        # Build a real list: on Python 3 ``map`` returns an iterator, which
        # cannot be indexed (that would raise TypeError, not IndexError).
        children = [PyQuery(child) for child in subtext.children()]
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:
            # filter out ads
            continue
        # Keep everything before the last space of the link text; when the
        # text contains no space this is empty and the count falls back to 0.
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
            'pos': len(elements) + 1,
            'title': a.text(),
            'url': url,
            'domain': urlparse(url).netloc.rpartition('www.')[2],
            'comments': comments,
            'submitter': submitted.text(),
            'points': int(span.text().split()[0]),
            # the numeric id is whatever follows the first underscore
            'id': int(span.attr('id').split('_', 1)[1]),
            # text trailing the submitter element, cut at the word "ago"
            'ago': submitted[0].tail.split('ago')[0].strip(),
        })
    logging.warning('parsed %s elements', len(elements))
    return elements
Ejemplo n.º 2
0
    def get_subforums_infos(self, html):
        """
        Collect information (description, number of topics and posts, ...)
        about the forums listed on a page.

        For every ``a.forumlink`` anchor in ``html`` whose cleaned href fully
        matches ``/<f or c><digits>-...``, the corresponding entry of
        ``self.forums`` is updated in place: ``status``, ``description``,
        ``num_topics`` and ``num_posts`` are set from the anchor's table row.
        Anchors whose href does not match are skipped. Nothing is returned.
        """
        document = PyQuery(html)

        idpattern = re.compile(r"/([fc]\d+)-.*")

        for element in document("a.forumlink"):
            e = PyQuery(element)

            match = idpattern.fullmatch(clean_url(e.attr("href")))
            if not match:
                continue

            oldid = match.group(1)

            row = e.closest("tr")

            # Get forum status. attr() yields None when the first cell has no
            # image or the image has no alt text; fall back to "" so the
            # membership test cannot raise TypeError (forum counts as open).
            alt = row("td:nth-of-type(1) img").eq(0).attr("alt") or ""
            # "verrouillé" is French for "locked"
            self.forums[oldid].status = 1 if "verrouillé" in alt else 0

            # Get subforum description (empty string when there is none)
            self.forums[oldid].description = row("td:nth-of-type(2) span").eq(
                1).html() or ""

            # TODO : Get subforum icon

            # Get subforum numbers of topics and posts (third and fourth cells)
            self.forums[oldid].num_topics = int(row("td").eq(2).text())
            self.forums[oldid].num_posts = int(row("td").eq(3).text())
Ejemplo n.º 3
0
def parse(html):
    '''Return a list of dictionaries describing the stories on the front page.

    Each dictionary has the keys ``pos`` (1-based position among the stories
    kept), ``title``, ``url``, ``domain`` (netloc with everything up to and
    including the last ``www.`` removed), ``comments``, ``submitter``,
    ``points``, ``id`` and ``ago``.

    Title anchors whose following table row has no ``.subtext`` element, or
    whose subtext has fewer than two child elements, are skipped.
    '''
    elements = []
    p = PyQuery(html)
    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml
        # objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:
            # More link
            continue
        # Build a real list: on Python 3 ``map`` returns an iterator, which
        # cannot be indexed (that would raise TypeError, not IndexError).
        children = [PyQuery(child) for child in subtext.children()]
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:
            # filter out ads
            continue
        # Keep everything before the last space of the link text; when the
        # text contains no space this is empty and the count falls back to 0.
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
            'pos': len(elements) + 1,
            'title': a.text(),
            'url': url,
            'domain': urlparse(url).netloc.rpartition('www.')[2],
            'comments': comments,
            'submitter': submitted.text(),
            'points': int(span.text().split()[0]),
            # the numeric id is whatever follows the first underscore
            'id': int(span.attr('id').split('_', 1)[1]),
            # text trailing the submitter element, cut at the word "ago"
            'ago': submitted[0].tail.split('ago')[0].strip(),
        })
    logging.warning('parsed %s elements', len(elements))
    return elements
Ejemplo n.º 4
0
    def get_subforums_infos(self, html):
        """
        Collect information (description, number of topics and posts, ...)
        about the forums listed on a page.

        For every ``a.forumlink`` anchor in ``html`` whose cleaned href fully
        matches ``/<f or c><digits>-...``, the corresponding entry of
        ``self.forums`` is updated in place: ``status``, ``description``,
        ``num_topics`` and ``num_posts`` are set from the anchor's table row.
        Anchors whose href does not match are skipped. Nothing is returned.
        """
        document = PyQuery(html)

        idpattern = re.compile(r"/([fc]\d+)-.*")

        for element in document("a.forumlink"):
            e = PyQuery(element)

            match = idpattern.fullmatch(clean_url(e.attr("href")))
            if not match:
                continue

            oldid = match.group(1)

            row = e.closest("tr")

            # Get forum status. attr() yields None when the first cell has no
            # image or the image has no alt text; fall back to "" so the
            # membership test cannot raise TypeError (forum counts as open).
            alt = row("td:nth-of-type(1) img").eq(0).attr("alt") or ""
            # "verrouillé" is French for "locked"
            self.forums[oldid].status = 1 if "verrouillé" in alt else 0

            # Get subforum description (empty string when there is none)
            self.forums[oldid].description = row("td:nth-of-type(2) span").eq(1).html() or ""

            # TODO : Get subforum icon

            # Get subforum numbers of topics and posts (third and fourth cells)
            self.forums[oldid].num_topics = int(row("td").eq(2).text())
            self.forums[oldid].num_posts = int(row("td").eq(3).text())