Esempio n. 1
0
def parse_profile_html(document):
    """
    Parse an LXML document to retrieve the profile data

    Every key is optional: it is only present when the matching element (or
    the matching pattern inside the info text) was found in the document.

    :param document: the LXML document to parse
    :return: a dictionary representing the profile; possible keys are
        id, username, registered, avatar, is_newbie, post_count, post_rate,
        last_post, gender and profile_link
    """
    username_elements = document.xpath("//*[@class='author']")
    registered_elements = document.xpath("//*[@class='registered']")
    avatar_elements = document.xpath("//*[@class='title']//img")
    info_elements = document.xpath("//*[@class='info']")
    userid_elements = document.xpath("//*[@name='userid']")

    profile = {}

    if userid_elements:
        # .get() so that an element without a "value" attribute is skipped
        # instead of raising KeyError.
        user_id = userid_elements[0].attrib.get("value")
        if user_id is not None:
            profile["id"] = user_id

    if username_elements:
        profile["username"] = username_elements[0].text_content()

    if registered_elements:
        profile["registered"] = registered_elements[0].text_content()

    if avatar_elements:
        # A missing src attribute and an empty one both collapse to "".
        profile["avatar"] = avatar_elements[0].attrib.get("src") or ""
        profile["is_newbie"] = profile["avatar"].endswith("/images/newbie.gif")

    if info_elements:
        info_text = info_elements[0].text_content()

        # (result key, pattern) pairs; group 1 of the first match is stored.
        # The labels run straight into their values, presumably because
        # text_content() joins adjacent nodes without a separator -- confirm.
        # The gender pattern only admits lowercase characters, so no case
        # folding is needed on its capture.
        info_patterns = (
            ("post_count", r"Post Count(\d+)"),
            ("post_rate", r"Post Rate([\d\.]+)"),
            ("last_post", r"Last Post(.+)"),
            ("gender", r"claims to be a ([-a-z0-9 ]+)"),
        )
        for key, pattern in info_patterns:
            found = re.search(pattern, info_text)
            if found:
                profile[key] = found.group(1)

    # Build the profile link from the user id when known, otherwise fall
    # back to the username; with neither, no link is produced.
    if "id" in profile:
        profile["profile_link"] = http.prepare_url(PROFILE_URL, {"action": "getinfo", "userid": profile["id"]})
    elif "username" in profile:
        profile["profile_link"] = http.prepare_url(PROFILE_URL, {"action": "getinfo", "username": profile["username"]})

    return profile
Esempio n. 2
0
def parse_profile_html(document):
    """
    Parse an LXML document to retrieve the profile data

    Every key is optional: it is only present when the matching element (or
    the matching pattern inside the info text) was found in the document.

    :param document: the LXML document to parse
    :return: a dictionary representing the profile; possible keys are
        id, username, registered, avatar, is_newbie, post_count, post_rate,
        last_post, gender and profile_link
    """
    username_elements = document.xpath("//*[@class='author']")
    registered_elements = document.xpath("//*[@class='registered']")
    avatar_elements = document.xpath("//*[@class='title']//img")
    info_elements = document.xpath("//*[@class='info']")
    userid_elements = document.xpath("//*[@name='userid']")

    profile = {}

    if userid_elements:
        # .get() so that an element without a "value" attribute is skipped
        # instead of raising KeyError.
        user_id = userid_elements[0].attrib.get("value")
        if user_id is not None:
            profile["id"] = user_id

    if username_elements:
        profile["username"] = username_elements[0].text_content()

    if registered_elements:
        profile["registered"] = registered_elements[0].text_content()

    if avatar_elements:
        # A missing src attribute and an empty one both collapse to "".
        profile["avatar"] = avatar_elements[0].attrib.get("src") or ""
        profile["is_newbie"] = profile["avatar"].endswith("/images/newbie.gif")

    if info_elements:
        info_text = info_elements[0].text_content()

        # (result key, pattern) pairs; group 1 of the first match is stored.
        # The labels run straight into their values, presumably because
        # text_content() joins adjacent nodes without a separator -- confirm.
        # The gender pattern only admits lowercase characters, so no case
        # folding is needed on its capture.
        info_patterns = (
            ("post_count", r"Post Count(\d+)"),
            ("post_rate", r"Post Rate([\d\.]+)"),
            ("last_post", r"Last Post(.+)"),
            ("gender", r"claims to be a ([-a-z0-9 ]+)"),
        )
        for key, pattern in info_patterns:
            found = re.search(pattern, info_text)
            if found:
                profile[key] = found.group(1)

    # Build the profile link from the user id when known, otherwise fall
    # back to the username; with neither, no link is produced.
    if "id" in profile:
        profile["profile_link"] = http.prepare_url(PROFILE_URL, {"action": "getinfo", "userid": profile["id"]})
    elif "username" in profile:
        profile["profile_link"] = http.prepare_url(PROFILE_URL, {"action": "getinfo", "username": profile["username"]})

    return profile
Esempio n. 3
0
def parse_thread_html(document):
    """
    Parse an LXML document to retrieve the thread data

    None is returned when the page does not look like a thread page: fewer
    than two breadcrumb links, no author element, or a last breadcrumb link
    that carries no numeric thread id.

    :param document: the LXML document to parse
    :return: a dictionary representing the thread (keys: id, breadcrumbs,
        forum_title, thread_title, author, post_count, posts, thread_link),
        or None
    """
    import re  # function-scope import keeps this block self-contained

    breadcrumbs_elements = document.xpath("//div[@class='breadcrumbs']//a")
    # 'author' must be a quoted string literal. Unquoted, XPath reads it as a
    # child-element node test whose string value is "", and contains(x, "")
    # is always true -- so every <dt> on the page would match.
    author_elements = document.xpath("//dt[contains(@class, 'author')]")
    last_page_elements = document.xpath("//a[@title='Last page']")

    # The last two breadcrumbs are used as forum title and thread title, so
    # at least two links are required (this also covers the empty case).
    if len(breadcrumbs_elements) < 2 or not author_elements:
        return

    # Pull the id out of the thread link. Look for an explicit threadid
    # parameter first so extra query parameters cannot confuse the parse;
    # otherwise accept a trailing "=<digits>".
    href = breadcrumbs_elements[-1].attrib.get('href') or ''
    id_match = (re.search(r"\bthreadid=(\d+)", href)
                or re.search(r"=(\d+)\s*$", href))
    if not id_match:
        return
    thread_id = int(id_match.group(1))

    breadcrumbs = [e.text_content() for e in breadcrumbs_elements]

    thread_title = breadcrumbs[-1]
    forum_title = breadcrumbs[-2]

    # author_elements is guaranteed non-empty by the guard above.
    author = author_elements[0].text_content().strip()

    # Handle GBS / FYAD / E/N / etc: keep only the part before the colon
    if ':' in forum_title:
        forum_title = forum_title.split(':')[0].strip()

    if forum_title in FORUM_ABBREVS:
        forum_title = FORUM_ABBREVS[forum_title]

    # NOTE(review): this is the leading number of the "Last page" link text,
    # which looks like a page count rather than a post count -- confirm. The
    # key name is kept for callers.
    if last_page_elements:
        post_count = int(last_page_elements[0].text_content().split(" ")[0])
    else:
        post_count = 1

    # Map each post table's id attribute to an (author text, post date text,
    # post body text) tuple.
    posts = {x.attrib['id']: (
        x.xpath('.//dt[contains(@class, "author")]')[0].text_content(),
        x.xpath('.//*[@class="postdate"]')[0].text_content().strip('\n #?'),
        x.xpath('.//*[@class="postbody"]')[0].text_content().strip())
        for x in document.xpath('//table[contains(@class, "post")]')}

    return {
        "id": thread_id,
        "breadcrumbs": breadcrumbs,
        "forum_title": forum_title,
        "thread_title": thread_title,
        "author": author,
        "post_count": post_count,
        "posts": posts,
        "thread_link": http.prepare_url(THREAD_URL, {'threadid': thread_id}),
    }
Esempio n. 4
0
def parse_thread_html(document):
    """
    Parse an LXML document to retrieve the thread data

    None is returned when the page does not look like a thread page: fewer
    than two breadcrumb links, no author element, or a last breadcrumb link
    that carries no numeric thread id.

    :param document: the LXML document to parse
    :return: a dictionary representing the thread (keys: id, breadcrumbs,
        forum_title, thread_title, author, post_count, thread_link), or None
    """
    import re  # function-scope import keeps this block self-contained

    breadcrumbs_elements = document.xpath("//div[@class='breadcrumbs']//a")
    # 'author' must be a quoted string literal. Unquoted, XPath reads it as a
    # child-element node test whose string value is "", and contains(x, "")
    # is always true -- so every <dt> on the page would match.
    author_elements = document.xpath("//dt[contains(@class, 'author')]")
    last_page_elements = document.xpath("//a[@title='Last page']")

    # The last two breadcrumbs are used as forum title and thread title, so
    # at least two links are required (this also covers the empty case).
    if len(breadcrumbs_elements) < 2 or not author_elements:
        return

    # Pull the id out of the thread link. Look for an explicit threadid
    # parameter first so extra query parameters cannot confuse the parse;
    # otherwise accept a trailing "=<digits>".
    href = breadcrumbs_elements[-1].attrib.get('href') or ''
    id_match = (re.search(r"\bthreadid=(\d+)", href)
                or re.search(r"=(\d+)\s*$", href))
    if not id_match:
        return
    thread_id = int(id_match.group(1))

    breadcrumbs = [e.text_content() for e in breadcrumbs_elements]

    thread_title = breadcrumbs[-1]
    forum_title = breadcrumbs[-2]

    # author_elements is guaranteed non-empty by the guard above.
    author = author_elements[0].text_content().strip()

    # Handle GBS / FYAD / E/N / etc: keep only the part before the colon
    if ':' in forum_title:
        forum_title = forum_title.split(':')[0].strip()

    if forum_title in FORUM_ABBREVS:
        forum_title = FORUM_ABBREVS[forum_title]

    # NOTE(review): this is the leading number of the "Last page" link text,
    # which looks like a page count rather than a post count -- confirm. The
    # key name is kept for callers.
    if last_page_elements:
        post_count = int(last_page_elements[0].text_content().split(" ")[0])
    else:
        post_count = 1

    return {
        "id": thread_id,
        "breadcrumbs": breadcrumbs,
        "forum_title": forum_title,
        "thread_title": thread_title,
        "author": author,
        "post_count": post_count,
        "thread_link": http.prepare_url(THREAD_URL, {'threadid': thread_id})
    }
Esempio n. 5
0
def qrcode(inp):
    """qrcode [link] returns a link for a QR code."""
    # NOTE(review): the docstring above is left verbatim in case the
    # surrounding framework surfaces it as user-facing help text -- confirm.

    # Query parameters for the chart endpoint: chart type, image size and
    # the data to encode.
    chart_params = {
        "cht": "qr",
        "chs": "200x200",
        "chl": inp,
    }

    # Build the full chart URL, then pass it through web.try_isgd
    # (presumably a URL-shortening attempt; its fallback behavior is not
    # visible from here).
    return web.try_isgd(
        http.prepare_url("http://chart.googleapis.com/chart", chart_params)
    )