def parse_list_haowai(document, url=None):
    """Haowai (号外) news needs a dedicated parser: the list page itself
    triggers per-article downloads."""
    FIELDS = {
        "url", "title", "publish_time", "publish_site", "author",
        "abstract", "keywords", "comment_id"
    }

    class Fields(object):
        def __init__(self):
            for name in FIELDS:
                self.__dict__[name] = ""
            self.html = ""

        def to_dict(self):
            return dict(self.__dict__)

    def wrap_content(title, content):
        # Wrap the raw article content in a minimal HTML document.
        html = """
        <!DOCTYPE html>
        <html lang="en">
        <head><meta charset="UTF-8"><title>{}</title></head>
        <body><div id="content">{}</div></body></html>
        """
        html = html.format(title.encode("utf-8"), content.encode("utf-8"))
        return html

    import json
    url_format = "http://api.myhaowai.com/api/article/get_article_by_aid?aid={}&readFrom=app"
    data = json.loads(document)
    if not data or data["result"].get("code") == "1":
        return list()
    feeds = data.get("contentList", list())
    result = list()
    for feed in feeds:
        aid = feed.get("aid")
        if not aid:
            continue
        detail_url = url_format.format(aid)
        try:
            doc = http.download_json(url=detail_url)
            doc = doc["article_info"]
            content = http.download_json(doc["content_url"])
            content = content.get("content")
            title = doc["title"]
            if not (content and title):
                continue
            fields = Fields()
            fields.title = title
            fields.url = doc["content_url"]
            fields.publish_site = doc.get("nickname", "")
            fields.publish_time = clean_date_time(doc.get("pubtime", ""))
            fields.html = wrap_content(title=title, content=content)
        except Exception:
            raise
        else:
            result.append(fields.to_dict())
    return result
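
# Usage sketch (illustration only): parse_list_haowai expects the raw JSON body
# of a Haowai list page as a string, since it calls json.loads itself. The
# `list_url` parameter and this wrapper are assumptions, not part of the
# original crawler.
def fetch_haowai_list(list_url):
    raw_body = http.download_html(url=list_url)  # raw JSON text of the list page
    return parse_list_haowai(raw_body, url=list_url)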
def video_zaker_parser(url):
    document = http.download_json(url=url)
    data = document["data"].get("articles", [])
    videos = list()
    for item in data:
        detail_url = item["full_url"]
        req = http.Request(url=detail_url)
        try:
            response = http.download(req)
            doc = response.json()
        except Exception:
            continue
        detail = doc.get("data")
        if not detail:
            continue
        src = detail["video_info"]["url"]
        if src.endswith("m3u8"):
            src = src.replace("m3u8", "mp4")
        # video_label is a colon-separated duration ("mm:ss" or "hh:mm:ss");
        # once reversed, index n carries a weight of 60 ** n seconds.
        label = detail["video_info"]["video_label"].split(":")[::-1]
        duration = 0
        for n, i in enumerate(label):
            duration += pow(60, n) * int(i)
        video = VideoFields()
        video.title = item["title"]
        video.publish_ori_name = item["auther_name"]
        video.publish_ori_url = item["weburl"]
        video.publish_ori_icon = detail["article_group"]["logo"]["url"]
        video.thumbnail = detail["video_info"]["pic_url"]
        video.duration = duration
        video.src = src
        videos.append(video)
    return videos
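
# Illustration only: the duration loop above turns a colon-separated label
# ("03:25" or "1:02:45") into seconds. A minimal standalone sketch of the same
# arithmetic, assuming every part of the label is an integer:
def _label_to_seconds(label):
    """'03:25' -> 205, '1:02:45' -> 3765."""
    seconds = 0
    for n, part in enumerate(label.split(":")[::-1]):
        seconds += (60 ** n) * int(part)
    return seconds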
def get_wap_detail(vid):
    # The wap detail page is an HTML document, so fetch it as text and parse
    # it with BeautifulSoup (download_json would return a parsed dict).
    meta = {}
    detail_wap = "http://joke.4399pk.com/wap/video-content-id-%s.html" % vid
    content = http.download_html(detail_wap)
    soup = BeautifulSoup(content, "lxml")
    meta["name"] = get_tag_attribute(soup, publish_name_config, "text")
    meta["icon"] = get_tag_attribute(soup, publish_icon_config, "src")
    return meta
def get_metas(ids):
    url = "http://dg.xxhh.com/getcnums/?__jsonp__=fn&ids={ids}".format(
        ids=",".join(ids))
    # The endpoint answers with JSONP ("fn(...)"); skip=(3, -1) presumably
    # tells the helper to slice off the wrapper before parsing the JSON.
    document = http.download_json(url=url, skip=(3, -1))
    metas = dict()
    for i, meta in enumerate(document.get("d", [])):
        # Each entry maps back to ids[i] as (comment, like, dislike).
        metas[ids[i]] = (int(meta[0]), int(meta[1]), int(meta[2]))
    return metas
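
# Sketch only: this is roughly what the skip=(3, -1) argument above is assumed
# to mean inside the in-house http helper: slice the 'fn(' prefix and ')'
# suffix off the JSONP response before decoding it. Written with `requests`
# purely for illustration.
def _download_jsonp(url, skip=(3, -1)):
    import json
    import requests
    body = requests.get(url).text              # raw 'fn({...})' payload
    return json.loads(body[skip[0]:skip[1]])   # drop the JSONP wrapper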
def get_video_src(vid):
    # Resolve the playable video address. Incomplete: only the "zhuzhan"
    # source type is recognised and the actual src lookup is still a stub,
    # so the vid is returned unchanged for now.
    main_parse_url = "http://www.acfun.tv/video/getVideo.aspx?id=%s" % vid
    info = http.download_json(url=main_parse_url)
    sourceType = info['sourceType']
    if sourceType != 'zhuzhan':
        return []
    encode = info['encode']
    return vid
def joke_netease_parser(url):
    document = http.download_json(url=url)
    data = document[u"段子"]
    jokes = list()
    for g in data:
        # Keep text-only jokes; entries with images are skipped.
        if g.get("imgsum", 0) == 0:
            joke = JokeFields()
            joke.title = g["title"]
            joke.publish_ori_name = g["source"]
            joke.text = g["digest"]
            joke.n_comment = int(g["replyCount"])
            joke.n_like = int(g["upTimes"])
            joke.n_dislike = int(g["downTimes"])
            # _comment_need = g["docid"]  # this field is required to fetch comments
            jokes.append(joke)
    return jokes
def joke_neihan_parser(url):
    document = http.download_json(url=url)
    groups = document["data"]["data"]
    jokes = list()
    for g in groups:
        g = g["group"]
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["name"]
        joke.publish_ori_icon = g["user"]["avatar_url"]
        joke.publish_time = format_datetime_string(g["create_time"])
        joke.text = g["text"]
        joke.n_comment = int(g["comment_count"])
        joke.n_like = int(g["digg_count"])
        joke.n_dislike = int(g["bury_count"])
        # _comment_need = g["code"]  # this field is required to fetch comments
        jokes.append(joke)
    return jokes
def video_kuaishou_parser(url):
    documents = http.download_json(url=url)
    data = documents.get("feeds", [])
    videos = list()
    for item in data:
        urls = item.get("main_mv_urls")
        thumbs = item.get("cover_thumbnail_urls")
        avatars = item.get("headurls")
        if not (urls and thumbs and avatars):
            continue
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["timestamp"])
        video.publish_ori_name = item["user_name"]
        video.publish_ori_url = avatars[0]["url"]
        video.src = urls[0]["url"]
        video.thumbnail = thumbs[0]["url"]
        # ext_params["video"] appears to be a duration in milliseconds,
        # hence the division by 1000.
        video.duration = int(item["ext_params"].get("video", 0) / 1000.0)
        videos.append(video)
    return videos
def video_meipai_parser(url):
    documents = http.download_json(url=url)
    data = [doc["media"] for doc in documents if doc["type"] == "media"]
    videos = list()
    for item in data:
        video = VideoFields()
        video.title = item["caption"]
        video.publish_time = format_datetime_string(item["created_at"])
        video.publish_ori_url = item["url"]
        video.publish_ori_name = item["user"]["screen_name"]
        video.publish_ori_icon = item["user"]["avatar"]
        video.src = item["video"]
        video.thumbnail = item["cover_pic"]
        video.duration = int(item.get("time", 0))
        video.n_like = int(item.get("likes_count", 0))
        video.n_comment = int(item.get("comments_count", 0))
        video.n_repost = int(item.get("reposts_count", 0))
        video.tags = g_tags(video.title)
        videos.append(video)
    return videos
def video_duowan_parser(url):
    detail_info_template = "http://video.duowan.com/jsapi/playPageVideoInfo/?vids={vid}"
    detail_url_config = {
        "params": {"selector": "a.uiVideo__ori"},
        "method": "select"
    }
    video_src_re = re.compile('<video src="(.*?)" id="video"')
    body = http.download_html(url=url)
    soup = BeautifulSoup(body, "lxml")
    tags = soup.select(selector="div.uiVideo__item")
    videos = list()
    for tag in tags:
        video = VideoFields()
        detail_url = get_tag_attribute(tag, detail_url_config, "href")
        # Drop the ".html" suffix to get the vid (str.strip(".html") removes
        # characters, not a suffix, and can mangle ids).
        vid = detail_url.split("/")[-1].replace(".html", "")
        m_detail_url = detail_url.replace(".com/", ".cn/")
        detail_json_url = detail_info_template.format(vid=vid)
        jsond_data = http.download_json(url=detail_json_url)
        video_info = jsond_data[vid]
        video.title = video_info["video_title"]
        video.n_comment = int(video_info["video_raw_comment_num"])
        video.n_read = video_info["video_raw_play_num"]
        video.n_like = int(video_info["video_raw_support"])
        video.tags = ";".join(video_info["video_tags"])
        video.publish_ori_name = video_info["user_nickname"]
        video.publish_ori_icon = video_info["user_avatar"]
        video.publish_time = format_datetime_string(
            video_info["video_upload_time"])
        video.publish_ori_url = video_info["video_url"]
        video.thumbnail = video_info["video_big_cover"]
        video.duration = int(video_info["video_raw_duration"])
        # The mobile (.cn) page embeds the playable src in a <video> tag.
        m_detail_content = http.download_html(url=m_detail_url)
        video.src = video_src_re.findall(m_detail_content)[0]
        videos.append(video)
        sleep(0.2)
    return videos
def joke_qiushi_parser(url):
    headers = {
        "User-Agent": "qiushibalke_10.8.1_WIFI_auto_19",
        "Source": "android_10.8.1",
        "Model": "Xiaomi/hydrogen/hydrogen:6.0.1/MMB29M/V7.5.6.0.MBCCNDE:user/release-keys",
        "Uuid": "IMEI_8728c26518fa3ae795a7f787073d375f",
        "Deviceidinfo": '{"DEVICEID": "862535037295724","SIMNO": "89860112817005617959","IMSI": "460012225499106","ANDROID_ID": "27dafccd6e32bfb2","SDK_INT": 23,"SERIAL": "a882d7f9","MAC": "02:00:00:00:00:00","RANDOM": ""}',
    }
    req = http.Request(url=url, headers=headers)
    document = http.download_json(request=req)
    data = document["items"]
    jokes = list()
    for g in data:
        if not g.get("user"):
            continue
        joke = JokeFields()
        joke.publish_ori_name = g["user"]["login"]
        avatar = g["user"].get("thumb")
        if not avatar:
            continue
        if avatar.startswith("//"):
            avatar = "http:" + avatar
        joke.publish_ori_icon = avatar
        joke.publish_time = format_datetime_string(g["created_at"])
        joke.text = g["content"]
        joke.n_comment = int(g.get("comments_count", 0))
        if g.get("votes"):
            joke.n_like = int(g["votes"]["up"])
            joke.n_dislike = int(g["votes"]["down"])
        jokes.append(joke)
    return jokes
def video_acfun_parser(url):
    # List endpoint:
    # http://www.acfun.cn/list/getlist?channelId=134&sort=0&pageSize=20&pageNo=1
    def get_video_src(vid):
        # Resolve the playable video address. Incomplete: only the "zhuzhan"
        # source type is recognised and the actual src lookup is still a stub,
        # so the vid is returned unchanged for now.
        main_parse_url = "http://www.acfun.tv/video/getVideo.aspx?id=%s" % vid
        info = http.download_json(url=main_parse_url)
        sourceType = info['sourceType']
        if sourceType != 'zhuzhan':
            return []
        encode = info['encode']
        return vid

    json_data = http.download_json(url=url)
    item_list = json_data["data"]["data"]
    videos = list()
    for item in item_list:
        video = VideoFields()
        video.title = item["title"]
        video.n_comment = int(item["commentCount"])
        video.n_read = int(item["viewCountFormat"])
        video.n_like = None
        video.tags = None
        video.publish_ori_name = item["username"]
        video.publish_ori_icon = item["userAvatar"]
        video.publish_time = format_datetime_string(
            item["contributeTimeFormat"])
        video.publish_ori_url = urljoin(url, item["link"])
        video.thumbnail = item["coverImage"]
        video.duration = int(item["duration"])
        vid = item["videoId"]
        video.src = get_video_src(vid)
        videos.append(video)
        sleep(0.2)
    return videos
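
# Note: int(item["viewCountFormat"]) above assumes the field is a plain number.
# If the API actually returns a human-readable string such as "1.2万", a small
# defensive parser along these lines could be used instead; this is a sketch
# under that assumption, not part of the original parser.
def _parse_count(value):
    """2345 -> 2345, '1.2万' -> 12000, '3亿' -> 300000000."""
    if isinstance(value, (int, float)):
        return int(value)
    for suffix, factor in ((u"万", 10000), (u"亿", 100000000)):
        if value.endswith(suffix):
            return int(float(value[:-len(suffix)]) * factor)
    return int(float(value))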
def get_num_comment(id):
    n_comment_url = "http://joke.4399pk.com/wap/funnycourse-num-id-%s" % id
    content = http.download_json(url=n_comment_url)
    n_comment = content["msg"]["vcomment"]
    return int(n_comment)
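
# Usage sketch (illustration only): combining the two 4399 helpers above for a
# single video. The id below is hypothetical and both helpers rely on the
# module-level `http` client being configured.
if __name__ == "__main__":
    demo_id = "12345"  # hypothetical 4399 video id
    print(get_wap_detail(demo_id))   # -> {"name": ..., "icon": ...}
    print(get_num_comment(demo_id))  # -> integer comment count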