Ejemplo n.º 1
0
def joke_360wa_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    The page is fetched with ``http.download_html`` and parsed with
    BeautifulSoup (lxml).  Each ``div#recent > div.p1`` element yields one
    record carrying a title, the joke text and a like count.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    title_rule = css("div.p_left > p.title1 > a")
    text_rule = css("div.p_left > p:nth-of-type(2)")
    like_rule = css("p.p_ding span")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div#recent > div.p1"):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_rule, "text")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        results.append(item)
    return results
Ejemplo n.º 2
0
def joke_khdx_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``dl.main-list`` element of the downloaded page yields one record
    with author name, author icon, text, like/dislike counts and a
    publish time.

    :param url: address of the listing page; also used as the base when
        resolving the author icon address with ``urljoin``.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    text_rule = css("dd.content")
    author_rule = css("p.user > a")
    icon_rule = css("img")
    like_rule = css("a.ding > div > i")
    dislike_rule = css("a.cai > div > i")
    time_rule = css("span.fr")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="dl.main-list"):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.publish_ori_icon = get_tag_attribute(node, icon_rule, "src")
        # The icon "src" is resolved against the page address.
        item.publish_ori_icon = urljoin(url, item.publish_ori_icon)
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        raw_time = get_tag_attribute(node, time_rule, "text")
        item.publish_time = format_datetime_string(raw_time)
        results.append(item)
    return results
Ejemplo n.º 3
0
def joke_pengfu_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div.list-item`` element of the downloaded page yields one record
    with title, author name, author icon, text and comment/like/dislike
    counts.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    # Currently unused: kept for the (disabled) id lookup that comment
    # collection would need.
    id_config = {"method": "select", "attribute": "id"}
    title_rule = css("h1.dp-b > a")
    text_rule = css("div.content-img")
    author_rule = css("p.user_name_list > a")
    icon_rule = css("a.mem-header > img")
    like_rule = css("span.ding em")
    dislike_rule = css("span.cai em")
    comment_rule = css("span.commentClick em")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div.list-item"):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_rule, "text")
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.publish_ori_icon = get_tag_attribute(node, icon_rule, "src")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_comment = get_tag_attribute_int(node, comment_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        # NOTE: the element id (see id_config) is needed to fetch comments;
        # it is not collected here.
        results.append(item)
    return results
Ejemplo n.º 4
0
def joke_biedoul_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div.lcommon.dz-bg > div`` element of the downloaded page yields
    one record with title, author name, author icon, text, like/dislike
    counts and a publish time.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    title_rule = css("div.dz-list-con > a > p")
    text_rule = css("div.dz-list-con > p")
    author_rule = css("div.dz-username > a")
    icon_rule = css("div.user-portrait > img.avatar")
    like_rule = css("a.zanUp")
    dislike_rule = css("a.zanDown")
    time_rule = css("div.dz-username > span")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div.lcommon.dz-bg > div"):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_rule, "text")
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.publish_ori_icon = get_tag_attribute(node, icon_rule, "src")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        raw_time = get_tag_attribute(node, time_rule, "text")
        item.publish_time = format_datetime_string(raw_time)
        results.append(item)
    return results
Ejemplo n.º 5
0
def joke_neihan_parser(url):
    """Build ``JokeFields`` records from the JSON feed at *url*.

    The feed is fetched with ``http.download_json``; entries are read from
    ``document["data"]["data"]`` and each entry's ``"group"`` object
    supplies the author, creation time, text and counters.

    :param url: address of the JSON feed to download.
    :return: list of ``JokeFields`` objects, one per feed entry.
    :raises KeyError: if an expected key is missing from the payload.
    """
    payload = http.download_json(url=url)
    entries = payload["data"]["data"]

    results = []
    for entry in entries:
        group = entry["group"]
        author = group["user"]

        item = JokeFields()
        item.publish_ori_name = author["name"]
        item.publish_ori_icon = author["avatar_url"]
        item.publish_time = format_datetime_string(group["create_time"])
        item.text = group["text"]
        item.n_comment = int(group["comment_count"])
        item.n_like = int(group["digg_count"])
        item.n_dislike = int(group["bury_count"])
        # NOTE: group["code"] is needed to fetch comments; it is not
        # collected here.
        results.append(item)
    return results
Ejemplo n.º 6
0
def joke_netease_parser(url):
    """Build ``JokeFields`` records from the JSON feed at *url*.

    Entries are read from the ``u"段子"`` key of the downloaded document.
    Only entries whose ``"imgsum"`` is 0 (or absent) are kept.

    :param url: address of the JSON feed to download.
    :return: list of ``JokeFields`` objects for the retained entries.
    :raises KeyError: if an expected key is missing from the payload.
    """
    payload = http.download_json(url=url)
    entries = payload[u"段子"]

    results = []
    for entry in entries:
        # Skip every entry that reports a non-zero image count.
        if entry.get("imgsum", 0) != 0:
            continue
        item = JokeFields()
        item.title = entry["title"]
        item.publish_ori_name = entry["source"]
        item.text = entry["digest"]
        item.n_comment = int(entry["replyCount"])
        item.n_like = int(entry["upTimes"])
        item.n_dislike = int(entry["downTimes"])
        # NOTE: entry["docid"] is needed to fetch comments; it is not
        # collected here.
        results.append(item)
    return results
Ejemplo n.º 7
0
def joke_xiha_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Author, icon and text come from the listing page itself
    (``div.min > div.section`` elements).  The comment/like/dislike counters
    are fetched afterwards with one extra request keyed by the ids found on
    the page.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    :raises KeyError: if the counter response lacks an entry for a joke id.
    """

    def fetch_counters(joke_ids):
        # One request for all ids; rows are matched to ids by position.
        counters_url = "http://dg.xxhh.com/getcnums/?__jsonp__=fn&ids={ids}".format(
            ids=",".join(joke_ids))
        # skip=(3, -1) presumably trims the JSONP wrapper -- TODO confirm
        # against http.download_json.
        payload = http.download_json(url=counters_url, skip=(3, -1))
        counters = {}
        for pos, row in enumerate(payload.get("d", [])):
            # Row layout: comment, like, dislike.
            triple = (int(row[0]), int(row[1]), int(row[2]))
            counters[joke_ids[pos]] = triple
        return counters

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute().
        return {"params": {"selector": selector}, "method": "select"}

    author_rule = css("div.user-info-username > a")
    icon_rule = css("div.user-avatar40 > a > img")
    text_rule = css("div.article > pre")
    id_rule = css("div.comment")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div.min > div.section"):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.publish_ori_icon = get_tag_attribute(node, icon_rule, "src")
        item.text = get_tag_attribute(node, text_rule, "text")
        raw_id = get_tag_attribute(node, id_rule, "id")
        # Temporary attribute: only used to join with the counters below
        # (comment collection would need it as well).
        item.id = raw_id.replace("comment-", "")
        results.append(item)

    counters = fetch_counters([item.id for item in results])
    for item in results:
        item.n_comment, item.n_like, item.n_dislike = counters[item.id]
        # Remove the helper attribute before handing the record back.
        del item.id
    return results
Ejemplo n.º 8
0
def joke_nbsw_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``ul#postlist > li`` element of the downloaded page yields one
    record with author name, author icon, text, like count, comment count
    and a publish time.  A literal ``[...]`` marker at either end of the
    text is removed.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def drop_marker(text, marker="[...]"):
        # str.strip(marker) treats its argument as a *set of characters* and
        # would also eat a legitimate trailing "." or "]" of the joke, so
        # the exact marker is removed instead.
        if not text:
            # Nothing to clean (covers the case where no text was found).
            return text
        if text.startswith(marker):
            text = text[len(marker):]
        if text.endswith(marker):
            text = text[:-len(marker)]
        return text

    text_config = {"params": {"selector": "div.ecae > p"}, "method": "select"}
    user_config = {"params": {"selector": "a.local-link"}, "method": "select"}
    user_icon_config = {
        "params": {
            "selector": "img.avatar"
        },
        "method": "select"
    }
    like_config = {"params": {"selector": "div.count-box"}, "method": "select"}
    comment_config = {
        "params": {
            "selector": "span.wppviews"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "span.meta > abbr"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="ul#postlist > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = drop_marker(get_tag_attribute(tag, text_config, "text"))
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Ejemplo n.º 9
0
def joke_duanzidao_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div#main > div.panel`` element of the downloaded page yields one
    record with author name, author icon, text, like/dislike counts and a
    publish time (taken from the text of the panel's ``table``).

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    text_rule = css("div.article")
    author_rule = css("table.author td > ul > li > a")
    icon_rule = css("td.avatar img")
    like_rule = css("em.good-btn > span")
    dislike_rule = css("em.bad-btn > span")
    time_rule = css("table")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div#main > div.panel"):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.publish_ori_icon = get_tag_attribute(node, icon_rule, "src")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        raw_time = get_tag_attribute(node, time_rule, "text")
        item.publish_time = format_datetime_string(raw_time)
        results.append(item)
    return results
Ejemplo n.º 10
0
def joke_qiushi_parser(url):
    """Build ``JokeFields`` records from the JSON feed at *url*.

    The request is sent with a fixed set of client headers.  Entries come
    from ``document["items"]``; an entry is skipped when it has no
    ``"user"`` or when that user has no ``"thumb"`` avatar.

    :param url: address of the JSON feed to download.
    :return: list of ``JokeFields`` objects for the retained entries.
    :raises KeyError: if an expected key is missing from a retained entry.
    """
    # Presumably imitates the site's Android client -- verify before
    # changing any value.
    # NOTE(review): the "Deviceidinfo" value looks like malformed JSON
    # around "SERIAL"; kept verbatim.
    request_headers = {
        "User-Agent": "qiushibalke_10.8.1_WIFI_auto_19",
        "Source": "android_10.8.1",
        "Model": "Xiaomi/hydrogen/hydrogen:6.0.1/MMB29M/V7.5.6.0.MBCCNDE:user/release-keys",
        "Uuid": "IMEI_8728c26518fa3ae795a7f787073d375f",
        "Deviceidinfo": '{"DEVICEID": "862535037295724","SIMNO": "89860112817005617959","IMSI": "460012225499106","ANDROID_ID": "27dafccd6e32bfb2","SDK_INT": 23,"SERIAL"a882d7f9","MAC": "02:00:00:00:00:00","RANDOM": ""}'
    }
    request = http.Request(url=url, headers=request_headers)
    payload = http.download_json(request=request)
    entries = payload["items"]

    results = []
    for entry in entries:
        author = entry.get("user")
        if not author:
            continue

        item = JokeFields()
        item.publish_ori_name = author["login"]

        icon = author.get("thumb")
        if not icon:
            continue
        # Protocol-relative avatar addresses get an explicit http scheme.
        item.publish_ori_icon = "http:" + icon if icon.startswith("//") else icon

        item.publish_time = format_datetime_string(entry["created_at"])
        item.text = entry["content"]
        item.n_comment = int(entry.get("comments_count", 0))

        votes = entry.get("votes")
        if votes:
            item.n_like = int(votes["up"])
            item.n_dislike = int(votes["down"])
        results.append(item)
    return results
Ejemplo n.º 11
0
def joke_waduanzi_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div.post-item`` element of the downloaded page yields one record
    with title, author name, text and like/dislike counts.  The author icon
    is not collected.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    title_rule = css("h2.item-title > a")
    text_rule = css("div.item-content")
    author_rule = css("div.post-author > a")
    # NOTE: an icon lookup on "div.post-author > img" ("src") was disabled
    # in the original code and is still not performed.
    like_rule = css("div.item-toolbar > ul > li:nth-of-type(1) > a")
    dislike_rule = css("div.item-toolbar > ul > li:nth-of-type(2) > a")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div.post-item"):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_rule, "text")
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        results.append(item)
    return results
Ejemplo n.º 12
0
def joke_fun48_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div#isonormal > div`` element of the listing yields one record.
    The listing only supplies title, link and publish time; the joke text
    is read from the linked article page (``article.article``), which costs
    one extra download per record.  A literal ``[...]`` marker at either
    end of the text is removed.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def get_full_content(ori_url):
        # Download the article page and return the text of its body element.
        text_config = {
            "params": {
                "selector": "article.article"
            },
            "method": "select"
        }
        document = http.download_html(url=ori_url)
        soup = BeautifulSoup(document, "lxml")
        text = get_tag_attribute(soup, text_config, "text")
        return text

    def drop_marker(text, marker="[...]"):
        # str.strip(marker) treats its argument as a *set of characters* and
        # would also eat a legitimate trailing "." or "]" of the joke, so
        # the exact marker is removed instead.
        if not text:
            # Nothing to clean (covers the case where no text was found).
            return text
        if text.startswith(marker):
            text = text[len(marker):]
        if text.endswith(marker):
            text = text[:-len(marker)]
        return text

    title_config = {
        "params": {
            "selector": "div.texttitle > a"
        },
        "method": "select"
    }
    # Same element as the title; the link is read from its "href".
    ori_url_config = {
        "params": {
            "selector": "div.texttitle > a"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "div.card-info"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#isonormal > div")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_url = get_tag_attribute(tag, ori_url_config, "href")
        joke.text = get_full_content(joke.publish_ori_url)
        joke.title = get_tag_attribute(tag, title_config, "text")
        joke.text = drop_marker(joke.text)
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Ejemplo n.º 13
0
def joke_helegehe_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``article.post`` element of the downloaded page yields one record
    with author name, author icon, text, like/dislike counts and a publish
    time.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    text_rule = css("a.contentHerf")
    author_rule = css("h2")
    icon_rule = css("img")
    like_rule = css("a.output-leftSupport")
    dislike_rule = css("a.output-leftOpposition")
    time_rule = css("div.publishedIn")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="article.post"):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        item.publish_ori_icon = get_tag_attribute(node, icon_rule, "src")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        raw_time = get_tag_attribute(node, time_rule, "text")
        item.publish_time = format_datetime_string(raw_time)
        results.append(item)
    return results
Ejemplo n.º 14
0
def joke_3jy_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div#zb > div.xh`` element of the downloaded page yields one
    record with title, text, like/dislike counts and author name.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    title_rule = css("h2 > a")
    text_rule = css("div.c")
    author_rule = css("a.u_name")
    like_rule = css("p.zan")
    dislike_rule = css("p.bs")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div#zb > div.xh"):
        item = JokeFields()
        item.title = get_tag_attribute(node, title_rule, "text")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        item.publish_ori_name = get_tag_attribute(node, author_rule, "text")
        results.append(item)
    return results
Ejemplo n.º 15
0
def joke_caoegg_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div#wrap_info > div.infobox`` element of the downloaded page
    yields one record with text, like/dislike counts and a publish time.
    The literal phrase ``What a f*****g day!`` is removed from either end
    of the text.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def drop_marker(text, marker="What a f*****g day!"):
        # str.strip(marker) treats its argument as a *set of characters*
        # ("W", "h", "a", "t", " ", "f", "*", "g", "d", "y", "!") and would
        # chew into any text that starts or ends with those characters, so
        # the exact phrase is removed instead.
        if not text:
            # Nothing to clean (covers the case where no text was found).
            return text
        if text.startswith(marker):
            text = text[len(marker):]
        if text.endswith(marker):
            text = text[:-len(marker)]
        return text

    text_config = {
        "params": {
            "selector": "div.c > a > span"
        },
        "method": "select"
    }
    like_config = {
        "params": {
            "selector": "div#dateright span.voteyes > font"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "div#dateright span.voteno > font"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "div#dateright"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#wrap_info > div.infobox")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.text = drop_marker(get_tag_attribute(tag, text_config, "text"))
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Ejemplo n.º 16
0
def joke_budejie_parser(url):
    """Scrape the joke listing at *url* into a list of ``JokeFields``.

    Each ``div.j-r-list > ul > li`` element of the downloaded page yields
    one record with author name, author icon, text, like/dislike counts,
    publish time, repost count and comment count.

    :param url: address of the listing page to download.
    :return: list of ``JokeFields`` objects, one per selected element.
    """

    def css(selector):
        # Lookup config in the shape passed to get_tag_attribute*().
        return {"params": {"selector": selector}, "method": "select"}

    text_rule = css("div.j-r-list-c-desc > a")
    # Name and icon both come from the same <img>: "alt" holds the name,
    # "data-original" the icon address.
    author_rule = css("img.u-logo")
    icon_rule = css("img.u-logo")
    like_rule = css("li.j-r-list-tool-l-up")
    dislike_rule = css("li.j-r-list-tool-l-down")
    comment_rule = css("li.j-comment")
    time_rule = css("span.u-time")
    repost_rule = css("div.j-r-list-tool-ct-share-c")

    html = http.download_html(url=url)
    page = BeautifulSoup(html, "lxml")

    results = []
    for node in page.select(selector="div.j-r-list > ul > li"):
        item = JokeFields()
        item.publish_ori_name = get_tag_attribute(node, author_rule, "alt")
        item.publish_ori_icon = get_tag_attribute(
            node, icon_rule, "data-original")
        item.text = get_tag_attribute(node, text_rule, "text")
        item.n_like = get_tag_attribute_int(node, like_rule, "text")
        item.n_dislike = get_tag_attribute_int(node, dislike_rule, "text")
        raw_time = get_tag_attribute(node, time_rule, "text")
        item.publish_time = format_datetime_string(raw_time)
        item.n_repost = get_tag_attribute_int(node, repost_rule, "text")
        item.n_comment = get_tag_attribute_int(node, comment_rule, "text")
        results.append(item)
    return results