def joke_khdx_parser(url):
    """Parse the khdx joke list page and return a list of JokeFields."""
    # CSS-selector configs consumed by get_tag_attribute()/get_tag_attribute_int().
    text_config = {"method": "select", "params": {"selector": "dd.content"}}
    user_config = {"method": "select", "params": {"selector": "p.user > a"}}
    user_icon_config = {"method": "select", "params": {"selector": "img"}}
    like_config = {"method": "select", "params": {"selector": "a.ding > div > i"}}
    dislike_config = {"method": "select", "params": {"selector": "a.cai > div > i"}}
    pb_time_config = {"method": "select", "params": {"selector": "span.fr"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="dl.main-list"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        # Icon src may be relative; resolve it against the listing URL.
        icon = get_tag_attribute(item, user_icon_config, "src")
        joke.publish_ori_icon = urljoin(url, icon)
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        published = get_tag_attribute(item, pb_time_config, "text")
        joke.publish_time = format_datetime_string(published)
        jokes.append(joke)
    return jokes
def joke_pengfu_parser(url):
    """Parse the pengfu joke list page.

    :param url: listing page URL
    :return: list of JokeFields

    Removed the unused ``id_config`` local and the dead commented-out
    ``code = get_tag_attribute(...)`` line from the original.
    """
    title_config = {"params": {"selector": "h1.dp-b > a"}, "method": "select"}
    text_config = {"params": {"selector": "div.content-img"}, "method": "select"}
    user_config = {"params": {"selector": "p.user_name_list > a"}, "method": "select"}
    user_icon_config = {"params": {"selector": "a.mem-header > img"}, "method": "select"}
    like_config = {"params": {"selector": "span.ding em"}, "method": "select"}
    dislike_config = {"params": {"selector": "span.cai em"}, "method": "select"}
    comment_config = {"params": {"selector": "span.commentClick em"}, "method": "select"}

    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    jokes = []
    for tag in soup.select(selector="div.list-item"):
        joke = JokeFields()
        joke.title = get_tag_attribute(tag, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        jokes.append(joke)
    return jokes
def joke_biedoul_parser(url):
    """Parse the biedoul joke list page and return a list of JokeFields."""
    title_config = {"method": "select", "params": {"selector": "div.dz-list-con > a > p"}}
    text_config = {"method": "select", "params": {"selector": "div.dz-list-con > p"}}
    user_config = {"method": "select", "params": {"selector": "div.dz-username > a"}}
    user_icon_config = {"method": "select", "params": {"selector": "div.user-portrait > img.avatar"}}
    like_config = {"method": "select", "params": {"selector": "a.zanUp"}}
    dislike_config = {"method": "select", "params": {"selector": "a.zanDown"}}
    pb_time_config = {"method": "select", "params": {"selector": "div.dz-username > span"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="div.lcommon.dz-bg > div"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        joke.publish_time = format_datetime_string(
            get_tag_attribute(item, pb_time_config, "text"))
        jokes.append(joke)
    return jokes
def joke_360wa_parser(url):
    """Parse the 360wa joke list page and return a list of JokeFields."""
    title_config = {"method": "select", "params": {"selector": "div.p_left > p.title1 > a"}}
    text_config = {"method": "select", "params": {"selector": "div.p_left > p:nth-of-type(2)"}}
    like_config = {"method": "select", "params": {"selector": "p.p_ding span"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="div#recent > div.p1"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        jokes.append(joke)
    return jokes
def joke_waduanzi_parser(url):
    """Parse the waduanzi joke list page.

    :param url: listing page URL
    :return: list of JokeFields

    Removed the commented-out ``user_icon_config`` / ``publish_ori_icon``
    dead code from the original.
    """
    title_config = {"params": {"selector": "h2.item-title > a"}, "method": "select"}
    text_config = {"params": {"selector": "div.item-content"}, "method": "select"}
    user_config = {"params": {"selector": "div.post-author > a"}, "method": "select"}
    like_config = {
        "params": {"selector": "div.item-toolbar > ul > li:nth-of-type(1) > a"},
        "method": "select",
    }
    dislike_config = {
        "params": {"selector": "div.item-toolbar > ul > li:nth-of-type(2) > a"},
        "method": "select",
    }

    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    jokes = []
    for tag in soup.select(selector="div.post-item"):
        joke = JokeFields()
        joke.title = get_tag_attribute(tag, title_config, "text")
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.text = get_tag_attribute(tag, text_config, "text")
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        jokes.append(joke)
    return jokes
def joke_duanzidao_parser(url):
    """Parse the duanzidao joke list page and return a list of JokeFields."""
    text_config = {"method": "select", "params": {"selector": "div.article"}}
    user_config = {"method": "select", "params": {"selector": "table.author td > ul > li > a"}}
    user_icon_config = {"method": "select", "params": {"selector": "td.avatar img"}}
    like_config = {"method": "select", "params": {"selector": "em.good-btn > span"}}
    dislike_config = {"method": "select", "params": {"selector": "em.bad-btn > span"}}
    # Publish time is parsed out of the whole author table's text.
    pb_time_config = {"method": "select", "params": {"selector": "table"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="div#main > div.panel"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        joke.publish_time = format_datetime_string(
            get_tag_attribute(item, pb_time_config, "text"))
        jokes.append(joke)
    return jokes
def joke_3jy_parser(url):
    """Parse the 3jy joke list page and return a list of JokeFields."""
    title_config = {"method": "select", "params": {"selector": "h2 > a"}}
    text_config = {"method": "select", "params": {"selector": "div.c"}}
    user_config = {"method": "select", "params": {"selector": "a.u_name"}}
    like_config = {"method": "select", "params": {"selector": "p.zan"}}
    dislike_config = {"method": "select", "params": {"selector": "p.bs"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="div#zb > div.xh"):
        joke = JokeFields()
        joke.title = get_tag_attribute(item, title_config, "text")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        jokes.append(joke)
    return jokes
def joke_caoegg_parser(url):
    """Parse the caoegg joke list page.

    :param url: listing page URL
    :return: list of JokeFields
    """
    text_config = {"params": {"selector": "div.c > a > span"}, "method": "select"}
    like_config = {"params": {"selector": "div#dateright span.voteyes > font"}, "method": "select"}
    dislike_config = {"params": {"selector": "div#dateright span.voteno > font"}, "method": "select"}
    pb_time_config = {"params": {"selector": "div#dateright"}, "method": "select"}
    # Site boilerplate that pads the joke body.
    boilerplate = "What a f*****g day!"

    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    jokes = []
    for tag in soup.select(selector="div#wrap_info > div.infobox"):
        joke = JokeFields()
        text = get_tag_attribute(tag, text_config, "text")
        # BUG FIX: the original called str.strip(boilerplate), which strips
        # any character from the set "Whatf*gdy! " off both ends rather than
        # removing the phrase itself. Remove the exact phrase instead.
        if text.startswith(boilerplate):
            text = text[len(boilerplate):]
        if text.endswith(boilerplate):
            text = text[:-len(boilerplate)]
        joke.text = text.strip()
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(tag, dislike_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def joke_nbsw_parser(url):
    """Parse the nbsw joke list page.

    :param url: listing page URL
    :return: list of JokeFields
    """
    text_config = {"params": {"selector": "div.ecae > p"}, "method": "select"}
    user_config = {"params": {"selector": "a.local-link"}, "method": "select"}
    user_icon_config = {"params": {"selector": "img.avatar"}, "method": "select"}
    like_config = {"params": {"selector": "div.count-box"}, "method": "select"}
    comment_config = {"params": {"selector": "span.wppviews"}, "method": "select"}
    pb_time_config = {"params": {"selector": "span.meta > abbr"}, "method": "select"}
    # Truncation marker appended by the site to excerpted texts.
    truncation_marker = "[...]"

    document = http.download_html(url=url)
    soup = BeautifulSoup(document, "lxml")
    jokes = []
    for tag in soup.select(selector="ul#postlist > li"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(tag, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(tag, user_icon_config, "src")
        text = get_tag_attribute(tag, text_config, "text")
        # BUG FIX: the original called str.strip("[...]"), which strips the
        # characters '[', '.', ']' from both ends instead of removing the
        # "[...]" truncation marker. Remove the exact marker instead.
        if text.endswith(truncation_marker):
            text = text[:-len(truncation_marker)]
        if text.startswith(truncation_marker):
            text = text[len(truncation_marker):]
        joke.text = text.strip()
        joke.n_like = get_tag_attribute_int(tag, like_config, "text")
        joke.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        pb_time = get_tag_attribute(tag, pb_time_config, "text")
        joke.publish_time = format_datetime_string(pb_time)
        jokes.append(joke)
    return jokes
def joke_helegehe_parser(url):
    """Parse the helegehe joke list page and return a list of JokeFields."""
    text_config = {"method": "select", "params": {"selector": "a.contentHerf"}}
    user_config = {"method": "select", "params": {"selector": "h2"}}
    user_icon_config = {"method": "select", "params": {"selector": "img"}}
    like_config = {"method": "select", "params": {"selector": "a.output-leftSupport"}}
    dislike_config = {"method": "select", "params": {"selector": "a.output-leftOpposition"}}
    pb_time_config = {"method": "select", "params": {"selector": "div.publishedIn"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="article.post"):
        joke = JokeFields()
        joke.publish_ori_name = get_tag_attribute(item, user_config, "text")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "src")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        joke.publish_time = format_datetime_string(
            get_tag_attribute(item, pb_time_config, "text"))
        jokes.append(joke)
    return jokes
def video_weibo_parser(url):
    """Parse the weibo TV listing, resolving each item's detail page.

    :param url: listing page URL
    :return: list of VideoFields (miaopai-hosted items only)
    """
    body = video_weibo_downloader(url)
    weibo_video_url_re = re.compile(r"video_src=(.*?)&playerType")
    title_config = {"params": {"selector": "div.txt_cut"}, "method": "select"}
    publish_name_config = {"params": {"selector": "div.item_a"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "img.face_pho"}, "method": "select"}
    thumbnail_config = {"params": {"selector": "img.piccut"}, "method": "select"}
    repost_config = {"params": {"selector": "li:nth-of-type(1) > a em:nth-of-type(2)"}, "method": "select"}
    comment_config = {"params": {"selector": "li:nth-of-type(2) > a em:nth-of-type(2)"}, "method": "select"}
    like_config = {"params": {"selector": "li:nth-of-type(3) > a em:nth-of-type(2)"}, "method": "select"}
    read_config = {"params": {"selector": "div.bot_number > em:nth-of-type(2)"}, "method": "select"}

    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.weibo_tv_frame > ul.li_list_1 > a")
    videos = []
    for tag in tags:
        video = VideoFields()
        # BUG FIX: the original rebound ``url`` on each iteration, so relative
        # hrefs after the first item resolved against the previous detail
        # page instead of the listing URL.
        detail_url = urljoin(url, extract_tag_attribute(tag, name="href"))
        try:
            content = video_weibo_downloader(detail_url)
            video_url = unquote_plus(weibo_video_url_re.findall(content)[0])
            detail_soup = BeautifulSoup(content, "lxml")
            root = detail_soup.select_one("div.WB_handle > ul")
            video.n_repost = get_tag_attribute_int(root, repost_config)
            video.n_comment = get_tag_attribute_int(root, comment_config)
            video.n_like = get_tag_attribute_int(root, like_config)
            video.n_read = get_tag_attribute_int(detail_soup, read_config)
        except Exception:
            continue
        # Only miaopai-hosted sources are kept.
        if "miaopai" not in video_url:
            continue
        video.src = remove_url_query_params(video_url)
        video.publish_ori_url = detail_url
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_ori_name = get_tag_attribute(tag, publish_name_config, "text")
        video.thumbnail = get_tag_attribute(tag, thumbnail_config, "src")
        video.publish_ori_icon = get_tag_attribute(tag, publish_icon_config, "src")
        video.duration = 0
        video.tags = g_tags(video.title)
        videos.append(video)
    return videos
def video_miaopai_parser(url):
    """Parse a miaopai user's video listing page into VideoFields."""
    body = http.download_html(url=url)
    video_url_template = "http://gslb.miaopai.com/stream/{id}.mp4"
    detail_url_template = "http://www.miaopai.com/show/{id}.htm"
    vid_re = re.compile('data-scid="(.*?)"')
    cover_re = re.compile('data-img="(.*?)"')
    title_config = {"method": "select", "params": {"selector": "div.viedoAbout > p"}}
    publish_name_config = {"method": "select", "params": {"selector": "p.personalDataN"}}
    publish_icon_config = {"method": "select", "params": {"selector": "a.pic > img"}}
    read_config = {"method": "select", "params": {"selector": "p.personalDataT > span.red"}}
    tag_config = {"method": "select", "params": {"selector": "div.viedoAbout > p.orange"}}
    num_like_config = {"method": "select", "params": {"selector": "ul.commentLike > li > a"}}
    num_comment_config = {"method": "select", "params": {"selector": "ul.commentLike a.commentIco"}}

    page = BeautifulSoup(body, "lxml")
    videos = []
    for item in page.select(selector="div.contentLeft > div.videoCont"):
        video = VideoFields()
        markup = str(item)
        # The scid embedded in the item markup identifies both the stream
        # and the detail page.
        vid = vid_re.findall(markup)[0]
        video.title = get_tag_attribute(item, title_config, "text")
        video.n_comment = get_tag_attribute_int(item, num_comment_config, "text")
        video.n_read = get_tag_attribute_int(item, read_config, "text")
        video.n_like = get_tag_attribute_int(item, num_like_config, "text")
        # Tags are "#"-separated; drop empty pieces after trimming.
        raw_tags = get_tag_attribute(item, tag_config, "text")
        pieces = [piece.strip() for piece in raw_tags.split("#")]
        video.tags = ";".join([piece for piece in pieces if piece != ""])
        # Publisher info lives in the page header, hence ``page`` not ``item``.
        video.publish_ori_name = get_tag_attribute(page, publish_name_config, "text")
        video.publish_ori_icon = get_tag_attribute(page, publish_icon_config, "src")
        video.src = video_url_template.format(id=vid)
        video.publish_ori_url = detail_url_template.format(id=vid)
        video.thumbnail = cover_re.findall(markup)[0]
        videos.append(video)
        sleep(0.2)
    return videos
def video_autohome_parser(url):
    """Parse the autohome video listing, resolving each item's play URL.

    :param url: listing page URL
    :return: list of VideoFields
    """
    body = http.download_html(url=url)
    autohome_vid_re = re.compile(r'vid=(.*?)&|vid: \"(.*?)\"')
    video_info_url_template = "http://p-vp.autohome.com.cn/api/gmi?mid={mid}&useragent=Android"
    title_config = {"params": {"selector": "div.video-item-tit > a"}, "method": "select"}
    detail_config = {"params": {"selector": "div.video-item-tit > a"}, "method": "select"}
    publish_time_config = {"params": {"selector": "div:nth-of-type(3) span:nth-of-type(3)"}, "method": "select"}
    publish_name_config = {"params": {"selector": "a#author_nickName"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "img#author_headimageurl"}, "method": "select"}
    comment_config = {"params": {"selector": "span.videocom"}, "method": "select"}
    read_config = {"params": {"selector": "span.count-eye"}, "method": "select"}

    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.video-item")
    videos = []
    for tag in tags:
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.publish_time = get_tag_attribute(tag, publish_time_config, "text")
        video.publish_time = format_datetime_string(video.publish_time)
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.n_read = get_tag_attribute_int(tag, read_config, "text")
        detail_url = urljoin(url, get_tag_attribute(tag, detail_config, "href"))
        response = None
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            # The regex has two alternatives; exactly one group matches.
            vid_one, vid_two = autohome_vid_re.findall(content)[0]
            vid = vid_one if vid_one else vid_two
            detail_soup = BeautifulSoup(content, "lxml")
            ts = detail_soup.select("div.card-label > a") or detail_soup.select("a.video-label")
            video.tags = ";".join([extract_tag_attribute(t, "text") for t in ts])
            kindnames = ";".join([
                extract_tag_attribute(t, "text")
                for t in detail_soup.select("a.kindname")
            ])
            if kindnames:
                video.tags += ";" + kindnames
            video.publish_ori_name = get_tag_attribute(detail_soup, publish_name_config, "text")
            video.publish_ori_icon = get_tag_attribute(detail_soup, publish_icon_config, "src")
            if video.publish_ori_icon:
                video.publish_ori_icon = remove_url_query_params(
                    urljoin(url, video.publish_ori_icon))
        except Exception:
            continue
        info_url = video_info_url_template.format(mid=vid)
        try:
            req = http.Request(url=info_url)
            response = http.download(req)
            # The API wraps JSON in an envelope; the original strips the
            # first 5 and last characters before parsing — kept as-is.
            info = json.loads(response.body[5:-1])
        except Exception:
            # BUG FIX: the original fallback used a bare ``except:`` (which
            # also swallows SystemExit/KeyboardInterrupt) and assumed
            # ``response`` was always bound; guard both.
            if response is None:
                continue
            try:
                info = json.loads(response.body)
            except Exception:
                continue
        if int(info["status"]) == 0:
            continue
        video.src = remove_url_query_params(info["copies"][-1]["playurl"])
        video.publish_ori_url = detail_url
        video.thumbnail = info["img"]
        video.duration = int(info["duration"])
        videos.append(video)
        sleep(0.2)
    return videos
def joke_budejie_parser(url):
    """Parse the budejie joke list page and return a list of JokeFields."""
    text_config = {"method": "select", "params": {"selector": "div.j-r-list-c-desc > a"}}
    user_config = {"method": "select", "params": {"selector": "img.u-logo"}}
    user_icon_config = {"method": "select", "params": {"selector": "img.u-logo"}}
    like_config = {"method": "select", "params": {"selector": "li.j-r-list-tool-l-up"}}
    dislike_config = {"method": "select", "params": {"selector": "li.j-r-list-tool-l-down"}}
    comment_config = {"method": "select", "params": {"selector": "li.j-comment"}}
    pb_time_config = {"method": "select", "params": {"selector": "span.u-time"}}
    repost_config = {"method": "select", "params": {"selector": "div.j-r-list-tool-ct-share-c"}}

    page = BeautifulSoup(http.download_html(url=url), "lxml")
    jokes = []
    for item in page.select(selector="div.j-r-list > ul > li"):
        joke = JokeFields()
        # User name comes from the avatar's alt text; the icon URL is
        # lazy-loaded and sits in the "data-original" attribute.
        joke.publish_ori_name = get_tag_attribute(item, user_config, "alt")
        joke.publish_ori_icon = get_tag_attribute(item, user_icon_config, "data-original")
        joke.text = get_tag_attribute(item, text_config, "text")
        joke.n_like = get_tag_attribute_int(item, like_config, "text")
        joke.n_dislike = get_tag_attribute_int(item, dislike_config, "text")
        joke.publish_time = format_datetime_string(
            get_tag_attribute(item, pb_time_config, "text"))
        joke.n_repost = get_tag_attribute_int(item, repost_config, "text")
        joke.n_comment = get_tag_attribute_int(item, comment_config, "text")
        jokes.append(joke)
    return jokes
def video_budejie_parser(url):
    """Parse the budejie video list page.

    :param url: listing page URL
    :return: list of VideoFields
    """
    detail_url_config = {"params": {"selector": "div.j-r-list-c-desc > a"}, "method": "select"}
    title_config = {"params": {"selector": "div.j-r-list-c-desc > a"}, "method": "select"}
    publish_name_config = {"params": {"selector": "div.u-txt > a"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "div.u-img img"}, "method": "select"}
    publish_time_config = {"params": {"selector": "div.u-txt > span"}, "method": "select"}
    src_config = {"params": {"selector": "div.j-video-c > div.j-video"}, "method": "select"}
    cover_config = {"params": {"selector": "div.j-video-c > div.j-video"}, "method": "select"}
    duration_config = {"params": {"selector": "div.j-r-list-c > div.j-video-c"}, "method": "select"}
    num_like_config = {"params": {"selector": "li.j-r-list-tool-l-up > span"}, "method": "select"}
    num_dislike_config = {"params": {"selector": "li.j-r-list-tool-l-down > span"}, "method": "select"}
    num_comment_config = {"params": {"selector": "span.comment-counts"}, "method": "select"}
    num_repost_config = {"params": {"selector": "div.j-r-list-tool-ct-share-c > span"}, "method": "select"}

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    videos = []
    for tag in soup.select(selector="div.j-r-list > ul > li"):
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config, "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        # NOTE(review): publisher fields are read from the whole page
        # (``soup``) rather than the list item — kept as-is; confirm intent.
        video.publish_ori_name = get_tag_attribute(soup, publish_name_config, "text")
        video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config, "src")
        video.publish_time = get_tag_attribute(soup, publish_time_config, "text")
        video.src = get_tag_attribute(tag, src_config, "data-mp4")
        video.thumbnail = get_tag_attribute(tag, cover_config, "data-poster")
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        video.n_dislike = get_tag_attribute_int(tag, num_dislike_config, "text")
        video.n_comment = get_tag_attribute_int(tag, num_comment_config, "text")
        video.n_repost = get_tag_attribute_int(tag, num_repost_config, "text")
        # BUG FIX: removed the leftover Python 2 debug ``print video.duration``
        # statement. The "data-videoMlen" value is kept verbatim (string).
        video.duration = get_tag_attribute(tag, duration_config, "data-videoMlen")
        videos.append(video)
        sleep(0.2)
    return videos
def video_4399pk_parser(url):
    """Parse the 4399pk video listing (http://joke.4399pk.com/video/find.html).

    :param url: listing page URL
    :return: list of VideoFields
    """

    def get_num_comment(vid):
        # Comment counts come from a separate JSON endpoint.
        n_comment_url = "http://joke.4399pk.com/wap/funnycourse-num-id-%s" % vid
        content = http.download_json(url=n_comment_url)
        return int(content["msg"]["vcomment"])

    def get_wap_detail(vid):
        # Fetch publisher name/icon from the WAP detail page.
        # BUG FIX: the original read the loop variable ``vid`` from the
        # enclosing scope instead of its own parameter (named ``id``,
        # which also shadowed the builtin).
        meta = {}
        detail_wap = "http://joke.4399pk.com/wap/video-content-id-%s.html" % vid
        content = http.download_json(detail_wap)
        soup = BeautifulSoup(content, "lxml")
        meta["name"] = get_tag_attribute(soup, publish_name_config, "text")
        meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
        return meta

    detail_url_config = {"params": {"selector": "a.img"}, "method": "select"}
    title_config = {"params": {"selector": "div.tit"}, "method": "select"}
    num_like_config = {"params": {"selector": "div.info > span.fr > em"}, "method": "select"}
    publish_name_config = {"params": {"selector": "div.kind-user.cf > div.fl > p"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "div.kind-user.cf img"}, "method": "select"}

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    videos = []
    for tag in soup.select(selector="div.piclist > ul > li"):
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config, "href")
        video.title = get_tag_attribute(tag, title_config, "text")
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        # The video id is the last path segment without its extension.
        vid = video.publish_ori_url.split("/")[-1].split(".")[0]
        video.n_comment = get_num_comment(vid)
        video.publish_ori_name = get_tag_attribute(soup, publish_name_config, "text")
        video.publish_ori_icon = get_tag_attribute(soup, publish_icon_config, "src")
        # BUG FIX: removed leftover debug ``print video.duration`` (duration
        # is never set here) and the empty ``get_video_inf`` stub.
        videos.append(video)
        sleep(0.2)
    return videos
def video_pearvideo_parser(url):
    """Parse the pearvideo listing, resolving each item's detail page.

    :param url: listing page URL
    :return: list of VideoFields
    """

    def format_duration(d_text):
        # "mm:ss" / "hh:mm:ss" -> total seconds.
        # BUG FIX: the original filtered zero segments out *before* applying
        # place values, so "1:00:30" became 90 (not 3630) and "05:00"
        # became 5 (not 300). Horner-style accumulation is always correct.
        total = 0
        for part in d_text.split(":"):
            total = total * 60 + int(part)
        return total

    def get_detail_info(detail_url):
        # Fetch the detail page for publisher, cover, publish time and mp4 src.
        meta = {}
        content = http.download_html(url=detail_url)
        detail_soup = BeautifulSoup(content, "lxml")
        meta["src"] = src_re.findall(content)[0]
        meta["name"] = get_tag_attribute(detail_soup, publish_name_config, "alt")
        meta["icon"] = get_tag_attribute(detail_soup, publish_icon_config, "src")
        meta["time"] = get_tag_attribute(detail_soup, publish_time_config, "text")
        meta["thumbnail"] = get_tag_attribute(detail_soup, cover_config, "src")
        return meta

    detail_url_config = {"params": {"selector": "a.vervideo-lilink"}, "method": "select"}
    title_config = {"params": {"selector": "div.vervideo-title"}, "method": "select"}
    duration_config = {"params": {"selector": "div.duration"}, "method": "select"}
    num_like_config = {"params": {"selector": "span.fav"}, "method": "select"}
    publish_name_config = {"params": {"selector": "div.thiscat img"}, "method": "select"}
    publish_icon_config = {"params": {"selector": "div.thiscat img"}, "method": "select"}
    cover_config = {"params": {"selector": "div#poster img"}, "method": "select"}
    publish_time_config = {"params": {"selector": "div.details-content div.date"}, "method": "select"}
    src_re = re.compile('dUrl="(.*?)"')

    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    videos = []
    for tag in soup.select(selector="li.categoryem "):
        video = VideoFields()
        video.publish_ori_url = get_tag_attribute(tag, detail_url_config, "href")
        video.publish_ori_url = urljoin(url, video.publish_ori_url)
        video.title = get_tag_attribute(tag, title_config, "text")
        video.duration = format_duration(
            get_tag_attribute(tag, duration_config, "text"))
        video.n_like = get_tag_attribute_int(tag, num_like_config, "text")
        meta = get_detail_info(video.publish_ori_url)
        video.publish_ori_name = meta["name"]
        video.publish_ori_icon = meta["icon"]
        video.publish_time = format_datetime_string(meta["time"])
        video.thumbnail = meta["thumbnail"]
        video.src = meta["src"]
        videos.append(video)
        sleep(0.2)
    return videos
def video_thepaper_parser(url):
    """Parse the thepaper.cn video news listing.

    :param url: listing page URL
    :return: list of VideoFields
    """
    body = http.download_html(url=url)
    thepaper_video_url_re = re.compile(r'source src="(.*?)" type="video/mp4"')
    detail_config = {"params": {"selector": "a"}, "method": "select"}
    title_config = {"params": {"selector": "div.video_title"}, "method": "select"}
    user_name_config = {"params": {"selector": "div.t_source > a"}, "method": "select"}
    thumbnail_config = {"params": {"selector": "div.video_list_pic > img"}, "method": "select"}
    user_icon_config = {"params": {"selector": "div.video_txt_r_icon img"}, "method": "select"}
    duration_config = {"params": {"selector": "div.video_list_pic > span.p_time"}, "method": "select"}
    comment_config = {"params": {"selector": "div.t_source > span.reply"}, "method": "select"}
    description_config = {"params": {"selector": "p"}, "method": "select"}

    soup = BeautifulSoup(body, "lxml")
    videos = []
    for tag in soup.select(selector=".video_news"):
        # BUG FIX: the original rebound the ``url`` parameter here; hrefs are
        # resolved against the fixed site root, so keep a distinct name.
        detail_url = urljoin("http://www.thepaper.cn/",
                             get_tag_attribute(tag, detail_config, "href"))
        try:
            req = http.Request(url=detail_url)
            response = http.download(req)
            _, content = http.response_url_content(response)
            video_url = unquote_plus(thepaper_video_url_re.findall(content)[0])
        except Exception:
            continue
        video = VideoFields()
        video.title = get_tag_attribute(tag, title_config, "text")
        video.src = video_url
        video.publish_ori_url = detail_url
        video.publish_ori_name = get_tag_attribute(tag, user_name_config, "text")
        video.publish_ori_name = video.publish_ori_name.replace(
            u"@所有人", u"澎湃视频")
        video.thumbnail = get_tag_attribute(tag, thumbnail_config, "src")
        video.n_comment = get_tag_attribute_int(tag, comment_config, "text")
        video.description = get_tag_attribute(tag, description_config, "text")
        # Duration is "mm:ss"; leave unset when absent or malformed.
        duration_text = get_tag_attribute(tag, duration_config, "text")
        if duration_text:
            try:
                minutes, seconds = duration_text.split(":")
                video.duration = int(minutes) * 60 + int(seconds)
            except Exception:
                pass
        detail = BeautifulSoup(content, "lxml")
        video.publish_ori_icon = get_tag_attribute(detail, user_icon_config, "src")
        videos.append(video)
    return videos