Esempio n. 1
0
def parse_post(content):
    """Extract a post, its author and its comment thread from page markup.

    ``content`` is fed to a recovering lxml HTML parser and a fixed set of
    XPath queries pulls the individual fields out of the resulting tree.

    Returns:
        A ``(post, users, comments)`` tuple. ``users`` starts with the post
        author and then gains one entry per comment, in the order the
        comment query yields them; no de-duplication is performed, so an
        account appears once for every comment it wrote.

    Raises:
        IndexError: an XPath query that is expected to match found nothing.
        AttributeError: a URL or element id did not fit the expected
            pattern, so the regex result was ``None``.
    """

    def sole(node, query):
        """Return the first node matched by ``query`` relative to ``node``."""
        return node.xpath(query)[0]

    def captured_int(found):
        """Convert the first capture group of a regex match to ``int``."""
        return int(found.group(1))

    post = Post()
    users = []
    comments = []

    # Note: this code needs patched lxml with support for huge_tree in
    # HTMLParser.
    html_parser = lxml.etree.HTMLParser(recover=True, huge_tree=True)
    tree = lxml.etree.HTML(content, parser=html_parser)

    # --- the post itself ---
    entry = sole(tree, '//li[@class="hentry"]')
    thread = sole(entry, './/div[@class="entry-comments"]')
    byline = sole(entry, './/p[@class="author"]')

    # The numeric post id is the last path segment of the title link.
    permalink = sole(entry, './/a[@class="entry-title"]').get('href')
    post.post_id = captured_int(re.search(r'/(\d+)$', permalink))

    # The comment list id is embedded in the id attribute of the thread's <ul>.
    thread_list_id = sole(thread, 'ul').get("id")
    post.comment_list_id = captured_int(
        re.match(r'comments_(\d+)$', thread_list_id)
    )

    post.language = sole(entry, './/a[@rel="chapter"]').text

    post.code = sole(entry, 'div[@class="entry-content"]/pre/code').text
    post.text = inner_html(sole(entry, 'p[@class="description"]'))

    post.posted = parse_date(sole(byline, 'abbr').get('title'))

    post_votes = sole(entry, 'p[@class="vote"]/strong')
    post.vote_plus, post.vote_minus, post.rating = parse_rating(post_votes)

    # --- the post author ---
    author = User()

    # First link in the byline carries the profile URL and the avatar image,
    # the second one carries the display name.
    profile_href = sole(byline, 'a[1]').get('href')
    author.user_id = captured_int(re.search(r'/user/(\d+)$', profile_href))
    author.name = sole(byline, 'a[2]').text
    author.avatar_hash = parse_avatar(sole(byline, 'a[1]/img').get('src'))

    post.user_id = author.user_id
    users.append(author)

    # --- the comments ---
    for wrapper in thread.xpath('.//div[@class="entry-comment-wrapper"]'):
        comment = Comment()

        comment.comment_id = captured_int(
            re.match(r'comment-(\d+)$', wrapper.get('id'))
        )
        comment.post_id = post.post_id

        # Climb three levels up. If that ancestor is an <li>, its own direct
        # wrapper child identifies the comment being replied to; any other
        # tag means this is a top-level comment.
        holder = wrapper
        for _ in range(3):
            holder = holder.getparent()

        if holder.tag != 'li':
            comment.parent_id = None
        else:
            replied_to = sole(holder, 'div[@class="entry-comment-wrapper"]')
            comment.parent_id = captured_int(
                re.match(r'comment-(\d+)$', replied_to.get('id'))
            )

        comment.text = inner_html(
            sole(wrapper, './/span[@class="comment-text"]')
        )

        meta = sole(wrapper, 'p[@class="entry-info"]')

        published = sole(meta, 'abbr[@class="published"]')
        comment.posted = parse_date(published.get('title'))

        comment_votes = sole(meta, 'span[@class="comment-vote"]/strong')
        comment.vote_plus, comment.vote_minus, comment.rating = parse_rating(
            comment_votes
        )

        profile_link = sole(meta, 'strong[@class="entry-author"]/a')

        commenter = User()
        commenter.user_id = captured_int(
            re.search(r'/user/(\d+)$', profile_link.get('href'))
        )
        commenter.name = profile_link.text
        commenter.avatar_hash = parse_avatar(
            sole(meta, 'img[@class="avatar"]').get('src')
        )

        comment.user_id = commenter.user_id
        users.append(commenter)
        comments.append(comment)

    return post, users, comments