Example #1
0
def joke_khdx_parser(url):
    """Parse joke entries from a khdx listing page.

    Downloads *url*, walks every ``dl.main-list`` node and extracts
    author, avatar, body text, like/dislike counts and publish time
    into ``JokeFields`` records.
    """
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    text_config = conf("dd.content")
    user_config = conf("p.user > a")
    user_icon_config = conf("img")
    like_config = conf("a.ding > div > i")
    dislike_config = conf("a.cai > div > i")
    pb_time_config = conf("span.fr")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="dl.main-list"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        icon = get_tag_attribute(item, user_icon_config, "src")
        # Avatar URLs on the page may be relative; resolve against the page URL.
        joke.publish_ori_icon = urljoin(url, icon)
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        raw_time = get_tag_attribute(item, pb_time_config, "text")
        joke.publish_time = format_datetime_string(raw_time)
        results.append(joke)
    return results
Example #2
0
def joke_pengfu_parser(url):
    """Parse joke entries from a pengfu listing page into JokeFields."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    # Kept for reference; per-item id extraction is currently disabled.
    id_config = {"method": "select", "attribute": "id"}
    title_config = conf("h1.dp-b > a")
    text_config = conf("div.content-img")
    user_config = conf("p.user_name_list > a")
    user_icon_config = conf("a.mem-header > img")
    like_config = conf("span.ding em")
    dislike_config = conf("span.cai em")
    comment_config = conf("span.commentClick em")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div.list-item"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_comment = get_tag_attribute_int(item, comment_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        results.append(joke)
    return results
Example #3
0
def joke_biedoul_parser(url):
    """Parse joke entries from a biedoul listing page into JokeFields."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    title_config = conf("div.dz-list-con > a > p")
    text_config = conf("div.dz-list-con > p")
    user_config = conf("div.dz-username > a")
    user_icon_config = conf("div.user-portrait > img.avatar")
    like_config = conf("a.zanUp")
    dislike_config = conf("a.zanDown")
    pb_time_config = conf("div.dz-username > span")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div.lcommon.dz-bg > div"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        raw_time = get_tag_attribute(item, pb_time_config, "text")
        joke.publish_time = format_datetime_string(raw_time)
        results.append(joke)
    return results
Example #4
0
def joke_360wa_parser(url):
    """Parse joke entries (title, body, like count) from a 360wa page."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    title_config = conf("div.p_left > p.title1 > a")
    text_config = conf("div.p_left > p:nth-of-type(2)")
    like_config = conf("p.p_ding span")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div#recent > div.p1"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        results.append(joke)
    return results
Example #5
0
def joke_waduanzi_parser(url):
    """Parse joke entries from a waduanzi listing page into JokeFields."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    title_config = conf("h2.item-title > a")
    text_config = conf("div.item-content")
    user_config = conf("div.post-author > a")
    # Avatar extraction ("div.post-author > img") is currently disabled.
    like_config = conf("div.item-toolbar > ul > li:nth-of-type(1) > a")
    dislike_config = conf("div.item-toolbar > ul > li:nth-of-type(2) > a")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div.post-item"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        results.append(joke)
    return results
Example #6
0
def joke_duanzidao_parser(url):
    """Parse joke entries from a duanzidao listing page into JokeFields."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    text_config = conf("div.article")
    user_config = conf("table.author td > ul > li > a")
    user_icon_config = conf("td.avatar img")
    like_config = conf("em.good-btn > span")
    dislike_config = conf("em.bad-btn > span")
    pb_time_config = conf("table")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div#main > div.panel"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        raw_time = get_tag_attribute(item, pb_time_config, "text")
        joke.publish_time = format_datetime_string(raw_time)
        results.append(joke)
    return results
Example #7
0
def joke_3jy_parser(url):
    """Parse joke entries from a 3jy listing page into JokeFields."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    title_config = conf("h2 > a")
    text_config = conf("div.c")
    user_config = conf("a.u_name")
    like_config = conf("p.zan")
    dislike_config = conf("p.bs")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div#zb > div.xh"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        results.append(joke)
    return results
Example #8
0
def joke_caoegg_parser(url):
    """Parse joke entries from a caoegg listing page.

    Extracts body text, like/dislike counts and publish time from every
    ``div#wrap_info > div.infobox`` node into JokeFields records.
    """
    text_config = {
        "params": {
            "selector": "div.c > a > span"
        },
        "method": "select"
    }
    like_config = {
        "params": {
            "selector": "div#dateright span.voteyes > font"
        },
        "method": "select"
    }
    dislike_config = {
        "params": {
            "selector": "div#dateright span.voteno > font"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "div#dateright"
        },
        "method": "select"
    }
    # Site boilerplate phrase attached to joke bodies; stripped below.
    boilerplate = "What a f*****g day!"
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="div#wrap_info > div.infobox")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.text = get_tag_attribute(tag, text_config, "text")
        # BUG FIX: str.strip(chars) treats its argument as a *character
        # set*, so the old ``strip("What a f*****g day!")`` could eat
        # arbitrary leading/trailing letters from the joke itself.
        # Remove the exact phrase from either end instead.
        if joke.text.startswith(boilerplate):
            joke.text = joke.text[len(boilerplate):]
        if joke.text.endswith(boilerplate):
            joke.text = joke.text[:-len(boilerplate)]
        joke.text = joke.text.strip()
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #9
0
def joke_nbsw_parser(url):
    """Parse joke entries from an nbsw listing page.

    Collects author, avatar, body text, like/comment counts and publish
    time from every ``ul#postlist > li`` node into JokeFields records.
    """
    text_config = {"params": {"selector": "div.ecae > p"}, "method": "select"}
    user_config = {"params": {"selector": "a.local-link"}, "method": "select"}
    user_icon_config = {
        "params": {
            "selector": "img.avatar"
        },
        "method": "select"
    }
    like_config = {"params": {"selector": "div.count-box"}, "method": "select"}
    comment_config = {
        "params": {
            "selector": "span.wppviews"
        },
        "method": "select"
    }
    pb_time_config = {
        "params": {
            "selector": "span.meta > abbr"
        },
        "method": "select"
    }
    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    tags = soup.select(selector="ul#postlist > li")
    jokes = list()
    for tag in tags:
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        # BUG FIX: str.strip(chars) treats its argument as a *character
        # set*, so ``strip("[...]")`` also removed legitimate leading or
        # trailing '[', ']' and '.' characters.  Remove the exact
        # truncation marker instead.
        if joke.text.endswith("[...]"):
            joke.text = joke.text[:-len("[...]")].strip()
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
Example #10
0
def joke_helegehe_parser(url):
    """Parse joke entries from a helegehe listing page into JokeFields."""
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    text_config = conf("a.contentHerf")
    user_config = conf("h2")
    user_icon_config = conf("img")
    like_config = conf("a.output-leftSupport")
    dislike_config = conf("a.output-leftOpposition")
    pb_time_config = conf("div.publishedIn")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="article.post"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        raw_time = get_tag_attribute(item, pb_time_config, "text")
        joke.publish_time = format_datetime_string(raw_time)
        results.append(joke)
    return results
Example #11
0
def video_weibo_parser(url):
    """Parse a Weibo video channel page into a list of VideoFields.

    For each entry on the channel page, fetch its detail page, pull the
    raw stream URL out of the embedded player query string, and collect
    repost/comment/like/read counters.  Entries whose detail page fails
    to fetch/parse, or whose stream is not hosted on miaopai, are
    skipped.
    """
    body = video_weibo_downloader(url)
    # The player URL embeds the real stream: video_src=<url>&playerType=...
    weibo_video_url_re = re.compile(r"video_src=(.*?)&playerType")
    title_config = {"params": {"selector": "div.txt_cut"}, "method": "select"}
    publish_name_config = {
        "params": {
            "selector": "div.item_a"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "img.face_pho"
        },
        "method": "select"
    }
    thumbnail_config = {
        "params": {
            "selector": "img.piccut"
        },
        "method": "select"
    }
    repost_config = {
        "params": {
            "selector": "li:nth-of-type(1) > a em:nth-of-type(2)"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "li:nth-of-type(2) > a em:nth-of-type(2)"
        },
        "method": "select"
    }
    like_config = {
        "params": {
            "selector": "li:nth-of-type(3) > a em:nth-of-type(2)"
        },
        "method": "select"
    }
    read_config = {
        "params": {
            "selector": "div.bot_number > em:nth-of-type(2)"
        },
        "method": "select"
    }
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.weibo_tv_frame > ul.li_list_1 > a")
    videos = list()
    for tag in tags:
        video = VideoFields()
        # NOTE(review): rebinds the ``url`` parameter to this entry's
        # detail URL; later iterations resolve relative hrefs against it.
        url = urljoin(url, extract_tag_attribute(tag, name="href"))
        try:
            content = video_weibo_downloader(url)
            video_url = unquote_plus(weibo_video_url_re.findall(content)[0])
            # ``soup`` is also rebound here to the *detail* page.
            soup = BeautifulSoup(content, "lxml")
            root = soup.select_one("div.WB_handle > ul")
            # Counter lookups omit the attribute argument -- presumably
            # get_tag_attribute_int defaults to "text"; TODO confirm.
            video.n_repost = get_tag_attribute_int(root, repost_config)
            video.n_comment = get_tag_attribute_int(root, comment_config)
            video.n_like = get_tag_attribute_int(root, like_config)
            video.n_read = get_tag_attribute_int(soup, read_config)
        except Exception:
            # Best-effort crawl: skip entries whose detail page fails.
            continue
        if "miaopai" not in video_url:
            # Only miaopai-hosted streams are kept.
            continue
        video.src = remove_url_query_params(video_url)
        video.publish_ori_url = url
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_ori_name = get_tag_attribute(tag, publish_name_config,
                                                   "text")
        video.thumbnail = get_tag_attribute(tag, thumbnail_config, "src")
        video.publish_ori_icon = get_tag_attribute(tag, publish_icon_config,
                                                   "src")
        video.duration = 0
        video.tags = g_tags(video.title)
        videos.append(video)
    return videos
Example #12
0
def video_miaopai_parser(url):
    """Crawl a miaopai user's video list page into VideoFields records.

    The video id (``data-scid``) and cover image (``data-img``) are
    scraped from the raw item markup; the stream and detail URLs are
    derived from the id via fixed templates.
    """
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    body = http.download_html(url=url)
    video_url_template = "http://gslb.miaopai.com/stream/{id}.mp4"
    detail_url_template = "http://www.miaopai.com/show/{id}.htm"
    vid_re = re.compile('data-scid="(.*?)"')
    cover_re = re.compile('data-img="(.*?)"')
    title_config = conf("div.viedoAbout > p")
    publish_name_config = conf("p.personalDataN")
    publish_icon_config = conf("a.pic > img")
    read_config = conf("p.personalDataT > span.red")
    tag_config = conf("div.viedoAbout > p.orange")
    num_like_config = conf("ul.commentLike > li > a")
    num_comment_config = conf("ul.commentLike a.commentIco")

    soup = BeautifulSoup(body, "lxml")
    videos = list()
    for item in soup.select(selector="div.contentLeft > div.videoCont"):
        video = VideoFields()
        markup = str(item)
        scid = vid_re.findall(markup)[0]
        video.title = get_tag_attribute(item, title_config, "text")
        video.n_comment = get_tag_attribute_int(item, num_comment_config,
                                                "text")
        video.n_read = get_tag_attribute_int(item, read_config, "text")
        video.n_like = get_tag_attribute_int(item, num_like_config, "text")
        # Tags appear as "#tag1 #tag2 ..."; normalize to ";"-separated.
        raw_tags = get_tag_attribute(item, tag_config, "text")
        pieces = [piece.strip() for piece in raw_tags.split("#")]
        video.tags = ";".join(piece for piece in pieces if piece != "")
        # Publisher info lives once per page, so look it up on ``soup``.
        video.publish_ori_name = get_tag_attribute(soup, publish_name_config,
                                                   "text")
        video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config,
                                                   "src")
        video.src = video_url_template.format(id=scid)
        video.publish_ori_url = detail_url_template.format(id=scid)
        video.thumbnail = cover_re.findall(markup)[0]
        videos.append(video)
        sleep(0.2)
    return videos
Example #13
0
def video_autohome_parser(url):
    """Parse an autohome video listing page into VideoFields records.

    For each list item: read title/time/counters from the listing, fetch
    the detail page to recover the video id and publisher info, then call
    the gmi API for the playable stream URL, thumbnail and duration.
    Entries that fail at any stage are skipped.
    """
    body = http.download_html(url=url)
    # Two alternations -> findall() yields (group1, group2) tuples; only
    # one side matches per hit.
    autohome_vid_re = re.compile(r'vid=(.*?)&|vid: \"(.*?)\"')
    video_info_url_template = "http://p-vp.autohome.com.cn/api/gmi?mid={mid}&useragent=Android"
    title_config = {
        "params": {
            "selector": "div.video-item-tit > a"
        },
        "method": "select"
    }
    detail_config = {
        "params": {
            "selector": "div.video-item-tit > a"
        },
        "method": "select"
    }
    publish_time_config = {
        "params": {
            "selector": "div:nth-of-type(3) span:nth-of-type(3)"
        },
        "method": "select"
    }
    publish_name_config = {
        "params": {
            "selector": "a#author_nickName"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "img#author_headimageurl"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "span.videocom"
        },
        "method": "select"
    }
    read_config = {
        "params": {
            "selector": "span.count-eye"
        },
        "method": "select"
    }
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.video-item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_time = get_tag_attribute(tag, publish_time_config,
                                               "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.n_read = get_tag_attribute_int(tag, read_config, "text")
        detail_url = urljoin(url, get_tag_attribute(tag, detail_config,
                                                    "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            # Exactly one side of the alternation matched; take whichever.
            vid_one, vid_two = autohome_vid_re.findall(content)[0]
            vid = vid_one if vid_one else vid_two
            # ``soup`` is rebound to the *detail* page from here on.
            soup = BeautifulSoup(content, "lxml")
            ts = soup.select("div.card-label > a") or soup.select(
                "a.video-label")
            video.tags = ";".join(
                [extract_tag_attribute(t, "text") for t in ts])
            kinenames = ";".join([
                extract_tag_attribute(t, "text")
                for t in soup.select("a.kindname")
            ])
            if kinenames:
                video.tags += ";" + kinenames
            video.publish_ori_name = get_tag_attribute(soup,
                                                       publish_name_config,
                                                       "text")
            video.publish_ori_icon = get_tag_attribute(soup,
                                                       publish_icon_config,
                                                       "src")
            if video.publish_ori_icon:
                _u = urljoin(url, video.publish_ori_icon)
                video.publish_ori_icon = remove_url_query_params(_u)
        except Exception:
            # Skip items whose detail page cannot be fetched/parsed.
            continue
        info_url = video_info_url_template.format(mid=vid)
        try:
            req = http.Request(url=info_url)
            response = http.download(req)
            # Slicing [5:-1] presumably strips a JSONP-style wrapper
            # around the JSON payload -- TODO confirm against the API.
            content = response.body[5:-1]
            info = json.loads(content)
        except Exception as e:
            # Fallback: some responses are plain JSON without the wrapper.
            # NOTE(review): if http.download itself raised, ``response``
            # is stale from the previous attempt; the bare except below
            # then silently skips the item.
            try:
                content = response.body
                info = json.loads(content)
            except:
                continue
        if int(info["status"]) == 0:
            # API reports the video as unavailable.
            continue
        video.src = remove_url_query_params(info["copies"][-1]["playurl"])
        video.publish_ori_url = detail_url
        video.thumbnail = info["img"]
        video.duration = int(info["duration"])
        videos.append(video)
        sleep(0.2)
    return videos
Example #14
0
def joke_budejie_parser(url):
    """Parse joke entries from a budejie listing page into JokeFields.

    The author name comes from the avatar <img> ``alt`` attribute and
    the avatar URL from its lazy-load ``data-original`` attribute.
    """
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    text_config = conf("div.j-r-list-c-desc > a")
    user_config = conf("img.u-logo")
    user_icon_config = conf("img.u-logo")
    like_config = conf("li.j-r-list-tool-l-up")
    dislike_config = conf("li.j-r-list-tool-l-down")
    comment_config = conf("li.j-comment")
    pb_time_config = conf("span.u-time")
    repost_config = conf("div.j-r-list-tool-ct-share-c")

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    results = []
    for item in page.select(selector="div.j-r-list > ul > li"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "alt")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config,
                                                  "data-original")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        raw_time = get_tag_attribute(item, pb_time_config, "text")
        joke.publish_time = format_datetime_string(raw_time)
        joke.n_repost = get_tag_attribute_int(item, repost_config, "text")
        joke.n_comment = get_tag_attribute_int(item, comment_config, "text")
        results.append(joke)
    return results
Example #15
0
def video_budejie_parser(url):
    """Parse video entries from a budejie listing page into VideoFields.

    Stream URL, poster and duration come from ``data-*`` attributes on
    the embedded player node of each list item.
    """
    def conf(css):
        # Selector config consumed by get_tag_attribute()/get_tag_attribute_int().
        return {"params": {"selector": css}, "method": "select"}

    detail_url_config = conf("div.j-r-list-c-desc > a")
    title_config = conf("div.j-r-list-c-desc > a")
    publish_name_config = conf("div.u-txt > a")
    publish_icon_config = conf("div.u-img img")
    publish_time_config = conf("div.u-txt > span")
    src_config = conf("div.j-video-c > div.j-video")
    cover_config = conf("div.j-video-c > div.j-video")
    duration_config = conf("div.j-r-list-c > div.j-video-c")
    num_like_config = conf("li.j-r-list-tool-l-up > span")
    num_dislike_config = conf("li.j-r-list-tool-l-down > span")
    num_comment_config = conf("span.comment-counts")
    num_repost_config = conf("div.j-r-list-tool-ct-share-c > span")

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.j-r-list > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        # NOTE(review): publisher fields are looked up on the whole page
        # (``soup``), not the current item -- preserved as-is, but this
        # looks like it yields the first author for every video; verify.
        video.publish_ori_name = get_tag_attribute(soup, publish_name_config,
                                                   "text")
        video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config,
                                                   "src")
        video.publish_time = get_tag_attribute(soup, publish_time_config,
                                               "text")
        video.src = get_tag_attribute(tag, src_config, "data-mp4")
        video.thumbnail = get_tag_attribute(tag, cover_config, "data-poster")
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        video.n_dislike = get_tag_attribute_int(tag, num_dislike_config,
                                                "text")
        video.n_comment = get_tag_attribute_int(tag, num_comment_config,
                                                "text")
        video.n_repost = get_tag_attribute_int(tag, num_repost_config, "text")
        # Duration is taken verbatim from data-videoMlen (a string here,
        # unlike the int durations in sibling parsers) -- kept unchanged.
        video.duration = get_tag_attribute(tag, duration_config,
                                           "data-videoMlen")
        # BUG FIX: removed leftover Python 2 debug statement
        # ``print video.duration`` that spammed stdout on every item.
        videos.append(video)
        sleep(0.2)
    return videos
Example #16
0
def video_4399pk_parser(url):
    """Parse video entries from http://joke.4399pk.com/video/find.html.

    Each list item yields a detail URL, title and like count; the
    per-video comment count is fetched from a separate JSON endpoint.
    """

    def get_num_comment(video_id):
        # Comment count lives behind a dedicated JSON API.
        n_comment_url = "http://joke.4399pk.com/wap/funnycourse-num-id-%s" % video_id
        content = http.download_json(url=n_comment_url)
        n_comment = content["msg"]["vcomment"]
        return int(n_comment)

    def get_wap_detail(video_id):
        # BUG FIX: previously ignored its parameter and read the loop
        # variable ``vid`` from the enclosing scope; use the argument.
        # (Currently unused -- kept for parity with the WAP site.)
        meta = {}
        detail_wap = "http://joke.4399pk.com/wap/video-content-id-%s.html" % video_id
        content = http.download_json(detail_wap)
        soup = BeautifulSoup(content, "lxml")
        meta["name"] = get_tag_attribute(soup, publish_name_config, "text")
        meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
        return meta

    def get_video_inf(video_id):
        # Placeholder -- stream URL resolution not implemented yet.
        pass

    detail_url_config = {"params": {"selector": "a.img"}, "method": "select"}
    title_config = {"params": {"selector": "div.tit"}, "method": "select"}
    num_like_config = {
        "params": {
            "selector": "div.info > span.fr > em"
        },
        "method": "select"
    }
    publish_name_config = {
        "params": {
            "selector": "div.kind-user.cf > div.fl > p"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "div.kind-user.cf img"
        },
        "method": "select"
    }
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.piclist > ul > li")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        video.title = get_tag_attribute(tag, title_config, "text")
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        # The video id is embedded in the detail URL: .../<vid>.html
        vid = video.publish_ori_url.split("/")[-1].split(".")[0]
        video.n_comment = get_num_comment(vid)
        video.publish_ori_name = get_tag_attribute(soup, publish_name_config,
                                                   "text")
        video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config,
                                                   "src")
        # BUG FIX: removed ``print video.duration`` -- duration is never
        # set by this parser, so the debug print was at best noise and at
        # worst an AttributeError.
        videos.append(video)
        sleep(0.2)
    return videos
Example #17
0
def video_pearvideo_parser(url):
    """Parse a pearvideo.com listing page into a list of VideoFields.

    Listing items supply title, duration and like count; each item's detail
    page is then fetched for the mp4 source, author name/icon, cover image
    and publish time.

    :param url: listing page URL.
    :return: list of populated VideoFields objects.
    """

    def format_duration(d_text):
        # Convert a "[hh:]mm:ss" string to total seconds.  Folding
        # left-to-right with ``total * 60 + part`` preserves positional
        # value.  The previous implementation filtered out every zero
        # component, so e.g. "10:00" (600 s) evaluated to 10 s.
        total = 0
        for part in d_text.split(":"):
            total = total * 60 + int(part)
        return total

    def get_detail_info(detail_url):
        # Metadata that only exists on the per-video detail page.
        meta = {}
        content = http.download_html(url=detail_url)
        soup = BeautifulSoup(content, "lxml")
        # The mp4 address is embedded in an inline script, hence a regex
        # over the raw HTML rather than a CSS selector.
        meta["src"] = src_re.findall(content)[0]
        meta["name"] = get_tag_attribute(soup, publish_name_config, "alt")
        meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
        meta["time"] = get_tag_attribute(soup, publish_time_config, "text")
        meta["thumbnail"] = get_tag_attribute(soup, cover_config, "src")
        return meta

    # Selector configs consumed by get_tag_attribute().
    detail_url_config = {
        "params": {
            "selector": "a.vervideo-lilink"
        },
        "method": "select"
    }
    title_config = {
        "params": {
            "selector": "div.vervideo-title"
        },
        "method": "select"
    }
    duration_config = {
        "params": {
            "selector": "div.duration"
        },
        "method": "select"
    }
    num_like_config = {"params": {"selector": "span.fav"}, "method": "select"}
    publish_name_config = {
        "params": {
            "selector": "div.thiscat img"
        },
        "method": "select"
    }
    publish_icon_config = {
        "params": {
            "selector": "div.thiscat img"
        },
        "method": "select"
    }
    cover_config = {
        "params": {
            "selector": "div#poster img"
        },
        "method": "select"
    }
    publish_time_config = {
        "params": {
            "selector": "div.details-content div.date"
        },
        "method": "select"
    }
    src_re = re.compile('dUrl="(.*?)"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="li.categoryem ")
    videos = list()
    for tag in tags:
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config,
                                                  "href")
        # Listing hrefs are relative; anchor them to the listing URL.
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.duration = get_tag_attribute(tag, duration_config, "text")
        video.duration = format_duration(video.duration)
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        meta = get_detail_info(video.publish_ori_url)
        video.publish_ori_name = meta["name"]
        video.publish_ori_icon = meta["icon"]
        video.publish_time = format_datetime_string(meta["time"])
        video.thumbnail = meta["thumbnail"]
        video.src = meta["src"]
        videos.append(video)
        # Throttle detail-page requests so the site is not hammered.
        sleep(0.2)
    return videos
Exemple #18
0
def video_thepaper_parser(url):
    """Parse a thepaper.cn video listing page into a list of VideoFields.

    Each ``.video_news`` entry is followed to its detail page, where the mp4
    source and the author icon live; entries whose detail page cannot be
    fetched or lacks an mp4 source are skipped (best effort).

    :param url: listing page URL.
    :return: list of populated VideoFields objects.
    """
    body = http.download_html(url=url)
    # The mp4 address only appears in a <source> tag on the detail page.
    thepaper_video_url_re = re.compile(r'source src="(.*?)" type="video/mp4"')
    # Selector configs consumed by get_tag_attribute().
    detail_config = {"params": {"selector": "a"}, "method": "select"}
    title_config = {
        "params": {
            "selector": "div.video_title"
        },
        "method": "select"
    }
    user_name_config = {
        "params": {
            "selector": "div.t_source > a"
        },
        "method": "select"
    }
    thumbnail_config = {
        "params": {
            "selector": "div.video_list_pic > img"
        },
        "method": "select"
    }
    user_icon_config = {
        "params": {
            "selector": "div.video_txt_r_icon img"
        },
        "method": "select"
    }
    duration_config = {
        "params": {
            "selector": "div.video_list_pic > span.p_time"
        },
        "method": "select"
    }
    comment_config = {
        "params": {
            "selector": "div.t_source > span.reply"
        },
        "method": "select"
    }
    description_config = {"params": {"selector": "p"}, "method": "select"}
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector=".video_news")
    videos = list()
    for tag in tags:
        # Use a distinct name so the ``url`` parameter is not clobbered on
        # the first iteration (the original rebound the parameter here).
        detail_url = urljoin("http://www.thepaper.cn/",
                             get_tag_attribute(tag, detail_config, "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            video_url = unquote_plus(thepaper_video_url_re.findall(content)[0])
        except Exception:
            # Best effort: skip entries whose detail page is unreachable or
            # has no mp4 source.
            continue
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.src = video_url
        video.publish_ori_url = detail_url
        video.publish_ori_name = get_tag_attribute(tag, user_name_config,
                                                   "text")
        video.publish_ori_name = video.publish_ori_name.replace(
            u"@所有人", u"澎湃视频")
        video.thumbnail = get_tag_attribute(tag, thumbnail_config, "src")
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.description = get_tag_attribute(tag, description_config, "text")
        string = get_tag_attribute(tag, duration_config, "text")
        if string:
            try:
                m, s = string.split(":")
                second = int(m) * 60 + int(s)
            except Exception:
                # Malformed "mm:ss" string: leave the duration unset.
                pass
            else:
                video.duration = second
        # The author icon only appears on the detail page.
        detail = BeautifulSoup(content, "lxml")
        video.publish_ori_icon = get_tag_attribute(detail, user_icon_config,
                                                   "src")
        videos.append(video)
    return videos