def clean_video(video):
    video = video['video']
    flvid = video.get('flvid')
    duration = video.get('duration')
    duration = map(int, duration.split(":"))
    if len(duration) == 2:
        duration = duration[0] * 60 + duration[1]
    elif len(duration) == 3:
        duration = duration[0] * 3600 + duration[1] * 60 + duration[2]
    video = VideoItemModel({
        "title": video.get('title'),
        "image": video.get('img'),
        "duration": duration,
        "description": video.get('introduce'),
        "url": "http://www.56.com/u13/v_%s.html" % flvid,
        "stream": ["http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % flvid],
        "stream_high": ["http://vxml.56.com/html5/%s/?src=3g&res=qvga" % flvid]
    })
    return video
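# The "mm:ss" / "hh:mm:ss" arithmetic above recurs in several of these crawlers.
# A minimal standalone sketch of the same conversion (hypothetical helper, not
# part of the original codebase):
def parse_hms(timestr):
    # '01:02:03' -> 3723, '02:03' -> 123, '45' -> 45
    seconds = 0
    for part in timestr.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds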
def crawl(self):
    videos = []
    mid = self.key
    url = DETAIL % mid
    detail = loadurl(url)
    description = detail.get('plots')
    description = ''.join(description.split())
    if self.data.get('channel') == u'电影':  # movies carry a single feature video
        dict_ = detail['pinfos']['mpurls']
        video = VideoItemModel({
            "title": self.data.get('title'),
            "url": MOVIE_PLAY % mid,  # web page URL
            "image": self.data.get('image'),
            "description": description,
            "stream": [{
                'url': dict_['tv'].get('url'),
                'size': dict_['tv'].get('bits'),
                'format': 'mp4'
            }]
        })
        videos.append(video)
    else:
        try:
            sort = detail['pinfos'].get('sort')[0]
            episodes = detail['pinfos']['content'][sort]['fsps']
        except Exception:
            episodes = detail['pinfos']['fsps']
        for episode in episodes:
            plots = episode.get('plots')
            plots = ''.join(plots.split())
            video = VideoItemModel({
                "title": episode.get('taskname'),
                "url": PLAY_URL % (mid, episode.get('number')),  # web page URL
                "image": episode.get('picurl'),
                "description": plots,
                "stream": getstream(episode.get('mpurls'))
            })
            videos.append(video)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": mid,  # source-site ID
        "title": self.data["title"],
        "url": detail.get('shareurl'),  # detail page URL
        "image": self.data.get('image'),  # image URL
        "categories": self.data.get('category'),
        "channel": self.data.get('channel'),
        "region": detail.get('country'),
        "videos": videos,  # episode list
        "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),  # release time
        "actors": detail.get('lactor'),
        "directors": detail.get('director'),
        "description": description,
    })
    # export the data
    export(model)
    self.data['to_album_id'] = model['to_album_id']
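# getstream() is used above but defined elsewhere. Judging from the movie branch,
# each entry of 'mpurls' carries 'url' and 'bits'; a plausible sketch under that
# assumption (the real helper may differ):
def getstream(mpurls):
    stream = []
    for quality, info in (mpurls or {}).items():
        stream.append({
            'url': info.get('url'),
            'size': info.get('bits'),
            'format': 'mp4'
        })
    return stream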
def crawl(self):
    catecode = self.data["catecode"]
    last_updated = self.data.get("updated", datetime.min)
    current_updated = datetime.max
    max_time = last_updated
    page = 1
    pagesize = 20
    while True:
        try:
            data = api_albums(catecode, page, pagesize)
            for item in data["videos"]:
                try:
                    sid = item.get('sid')
                    detail = api_album(sid) if sid else None
                    model = self.extract_model(item, detail)
                    if sid:
                        videos = self.get_videos(sid)
                        if model['channel'] in [u'综艺']:
                            # variety shows are listed newest-first; reverse to episode order
                            videos = list(reversed(videos))
                    else:
                        video = VideoItemModel({
                            "title": model["title"],
                            "image": model["image"],
                            "description": model["description"],
                            "time": model["time"],
                            "price": model["price"],
                            "duration": model["duration"],
                            "url": model["url"]
                        })
                        videos = [video]
                    model['videos'] = videos
                    export(model)
                    current_updated = model["time"]
                    if max_time < current_updated:
                        max_time = current_updated
                except Exception:
                    self.logger.warning(get_exception_info())
            if current_updated < last_updated:
                break
            if page * pagesize >= data["count"]:
                break
        except Exception:
            self.logger.warning(get_exception_info())
        page += 1
    self.data["updated"] = max_time
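# The loop above is an incremental crawl: walk pages newest-first, stop once
# items older than the stored watermark show up, then persist the newest
# timestamp seen. A minimal sketch of the same pattern (hypothetical fetch_page
# callable returning (items, total); assumes items are sorted newest-first):
def incremental_crawl(fetch_page, last_updated, pagesize=20):
    max_time = last_updated
    page = 1
    while True:
        items, total = fetch_page(page, pagesize)
        for item in items:
            if item['time'] > max_time:
                max_time = item['time']
        if items and items[-1]['time'] < last_updated:
            break  # everything past this point was crawled on a previous run
        if page * pagesize >= total:
            break
        page += 1
    return max_time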
def crawl(self):
    timestr = self.data.get('videoLength', '00:00')
    duration = gettime(timestr)
    videos = []
    video = VideoItemModel({
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),  # web page URL
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
        "stream": [{
            "url": self.data.get('videoURLMid'),  # video file playback URL
            "size": self.data.get('videoSizeMid'),
            "format": "mp4",  # video format (protocol)
            "duration": duration
        }],
        "stream_low": [{
            "url": self.data.get('videoURLLow'),
            "size": self.data.get('videoSizeLow'),
            "format": "mp4",
            "duration": duration
        }],
        "stream_high": [{
            "url": self.data.get('videoURLHigh'),
            "size": self.data.get('videoSizeHigh'),
            "format": "mp4",
            "duration": duration
        }]
    })
    videos.append(video)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),  # source-site ID
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),  # detail page URL
        "image": self.data.get('imgURL'),  # image URL
        "channel": CHANNEL,
        "videos": videos,
        "pubtime": parse_date(self.data.get('videoPublishTime')),  # release time
        "description": self.data.get('desc'),
    })
    # export the data
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def clean_video(video, play_url):
    url = "%s/%s.html" % (play_url, video.get("vid"))
    new_video = VideoItemModel({
        # guard against missing fields so the concatenation cannot raise
        "title": (video.get("tt") or "") + (video.get("secondtitle") or ""),
        "url": url,
        "stream": [{
            "url": "javascript:getUrl('tencent', '%s')" % url
        }],
        "image": video.get("screenshot"),
    })
    return new_video
def crawl(self):
    album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
    hxs = load_html(album_url)
    urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        words = m.group(1).split("|")
        size = int(words[0])
        # words[1] is the md5 checksum (unused)
        title = words[2].split(".")[0]
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    kv = {}
    for s in hxs.select("//div[@class='videoDetail']/p"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
    try:
        image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
    except Exception:
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u'影片主演:')),  # lead actors
        "directors": split(kv.get(u'影片导演:')),  # directors
        "pubtime": parse_date(kv.get(u'上映年份:')),  # release year
        "description": description,
        "completed": not kv.get(u'连载状态:'),  # no "serializing" status means completed
    })
    export(model)
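# The qvod:// links handled above pack "size|md5|filename" into the URL body.
# A small parser sketch of that payload (format inferred from the code above;
# hypothetical helper):
def parse_qvod_url(url):
    m = re.match("qvod://(.+)", url)
    if not m:
        return None
    size, md5, filename = m.group(1).split("|")[:3]
    return {"size": int(size), "md5": md5, "title": filename.split(".")[0]}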
def crawl_video(self, show_id):
    videos = []
    page = 1
    pagesize = 30
    while True:
        data = api_videos(show_id, page, pagesize)
        if not data['results']:
            break
        for item in data['results']:
            try:
                video = VideoItemModel({
                    "title": item['title'],
                    "source_id": item['videoid'],
                    "url": "http://v.youku.com/v_show/id_%s.html" % item['videoid'],
                })
                jsurl = "javascript: getUrl('youku', '%s')" % item["videoid"]
                video["stream"] = [{"url": jsurl}]
                # TODO:
                # ret = api_plays(item['videoid'])
                # results = ret.get('results', {})
                # for key, fmt in FORMATS_NORMAL:
                #     if results.get(key):
                #         video["stream_low"] = self.extract_stream(results[key], fmt)
                #         break
                # for key, fmt in FORMATS_HIGH:
                #     if results.get(key):
                #         video["stream"] = self.extract_stream(results[key], fmt)
                #         break
                # for key, fmt in FORMATS_HD:
                #     if results.get(key):
                #         video["stream_high"] = self.extract_stream(results[key], fmt)
                #         break
                videos.append(video)
            except Exception:
                self.logger.warning(get_exception_info())
        if pagesize * page >= data['total']:
            break
        page += 1
    return videos
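# The commented-out TODO above selects the first available format per quality
# tier. The selection it sketches boils down to this pattern (FORMATS_* assumed
# to be ordered (key, fmt) preference lists; hypothetical distillation):
def pick_stream(results, preferences, extract):
    for key, fmt in preferences:
        if results.get(key):
            return extract(results[key], fmt)
    return None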
def extract_video(self, item):
    item = item["map"]
    video = VideoItemModel({
        "title": item.get("tv_name"),
        "image": item.get("ver_big_pic"),
        "description": item.get("tv_desc"),
        "duration": int(item.get("time_length", "0")),
        "url": item.get("url_html5", item.get("tv_url")),  # out of date!
        "time": datetime.strptime(item.get("update_time", "1970-01-01 00:00:00")[:19],
                                  "%Y-%m-%d %H:%M:%S"),
    })
    playinfo = api_playinfo(item["tv_ver_id"])
    stream_nor = []
    stream_high = []
    stream_super = []
    stream_mobile = [{
        "url": playinfo.get("downloadurl", ""),
        "size": playinfo.get("file_size_mobile", 0),
        "format": "mp4",
    }]
    if playinfo.get("url_nor_mp4"):
        urls = playinfo["url_nor_mp4"].split(",")
        durations = playinfo["clipsDuration_nor"]
        sizes = playinfo["clipsBytes_nor"]
        stream_nor = self.extract_stream(urls, durations, sizes)
    if playinfo.get("url_high_mp4"):
        urls = playinfo["url_high_mp4"].split(",")
        durations = playinfo["clipsDuration_high"]
        sizes = playinfo["clipsBytes_high"]
        stream_high = self.extract_stream(urls, durations, sizes)
    if playinfo.get("url_super_mp4"):
        urls = playinfo["url_super_mp4"].split(",")
        durations = playinfo["clipsDuration_super"]
        sizes = playinfo["clipsBytes_super"]
        stream_super = self.extract_stream(urls, durations, sizes)
    video["stream_low"] = stream_nor
    video["stream_high"] = stream_super
    #video["stream"] = stream_high
    video["stream"] = stream_mobile
    return video
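# extract_stream() is not shown; each quality tier pairs a comma-separated clip
# URL list with parallel duration and byte-size lists, so a plausible sketch
# simply zips them (assumption, the real method may differ):
def extract_stream(urls, durations, sizes):
    return [
        {"url": url, "duration": duration, "size": size, "format": "mp4"}
        for url, duration, size in zip(urls, durations, sizes)
    ]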
def extract_video(video):
    # capture the vid from the last resource URL (ends in .m3u8 or .mp4)
    m = re.match(r".+/(.+)\.(?=m3u8|mp4)", video.res[-1].vid)
    if m:
        vid = m.group(1)
    else:
        raise Exception("No vid found.")
    item = VideoItemModel({
        'url': "http://m.iqiyi.com/play.html?tvid=%s&vid=%s" % (video._id, vid),
        'title': video._n,
        'duration': int(video._dn),
        'description': video.desc,
    })
    return item
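# A quick standalone check of the vid regex above:
import re
m = re.match(r".+/(.+)\.(?=m3u8|mp4)", "http://example.com/path/abc123.m3u8")
assert m and m.group(1) == "abc123"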
def get_videos(album_id, url_key):
    list_videos = api_video(album_id)
    list_videos = list_videos.get('items')
    videos = []
    for item in list_videos:
        video = VideoItemModel({
            "title": item['title'],
            "url": "http://www.tudou.com/albumplay/%s/%s.html" % (url_key, item['itemCode']),
            "image": item['item_img_hd'],
            "duration": int(item['duration']),
            "stream": [{
                "url": "javascript:getUrl('tudou', '%s')" % item['vcode']
            }]
        })
        videos.append(video)
    return videos
def crawl(self):
    album_id = self.key
    if self.data['channel'] in SHORT_VIDEO:
        url = "http://v.qq.com/page/%s/%s/%s/%s.html" % (
            album_id[0], album_id[1], album_id[-1], album_id)
        pubtime = datetime.strptime(self.data["pubtime"], "%Y-%m-%d %H:%M:%S")
        video = VideoItemModel({
            "title": self.data["title"],
            "url": url,
            "stream": [{
                "url": "javascript:getUrl('tencent', '%s')" % url
            }],
            "image": self.data["image"],
            "channel": self.data["channel"],
        })
        model = VideoSourceModel({
            "source": self.data["source"],
            "source_id": album_id,
            "title": self.data["title"],
            "url": url,
            "image": self.data["image"],
            "channel": self.data["channel"],
            "pubtime": pubtime,
            "videos": [video]
        })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
    else:
        album_url = "http://v.qq.com/detail/%s/%s.html" % (album_id[0], album_id)
        album_data = api_album(album_id[0], album_id)
        if album_data['trailer'] == 1:
            play_url = "http://v.qq.com/prev/%s/%s" % (album_id[0], album_id)
        else:
            play_url = "http://v.qq.com/cover/%s/%s" % (album_id[0], album_id)
        description = album_data.get("columndesc")
        if not description:
            description = album_data.get("desc")
        description = "".join(description.split())
        try:
            pubtime = datetime.strptime(self.data.get("pubtime"), "%Y")
        except Exception:
            pubtime = datetime.utcfromtimestamp(0)
        videos = []
        columnid = album_data.get('columnid')
        rely = album_data.get('rely')
        if columnid:  # columnid != 0: episodes are grouped by year and month
            for video_dict in rely:
                for year, months in video_dict.iteritems():
                    for month in months:
                        videolist_id = "%s_%s" % (year, month)
                        videos_data = api_video(columnid, videolist_id)
                        for video in videos_data['items']:
                            time = video.get('date')
                            time = datetime.strptime(time, "%Y-%m-%d")
                            url = "http://v.qq.com/cover/%s/%s.html" % (
                                video.get('coverid')[0], video.get('coverid'))
                            video = VideoItemModel({
                                "title": video.get('sectitle'),
                                "description": video.get('breif'),
                                "url": url,
                                "stream": [{
                                    "url": "javascript:getUrl('tencent', '%s')" % url
                                }],
                                "image": video.get('snapurl'),
                                "time": time
                            })
                            videos.append(video)
        if not columnid:  # columnid == 0: only one video
            for video in album_data['videos']:
                videos.append(clean_video(video, play_url))
        if self.data:
            # self.data is not None: export its metadata directly
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": album_data['columnname'] if album_data['columnname'] else self.data["title"],
                "image": self.data.get("image"),
                "url": album_url,
                "actors": self.data.get("actors"),
                "directors": self.data.get("directors"),
                "categories": self.data.get("categories"),
                "channel": self.data.get("channel"),
                "region": self.data.get("region"),
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        else:
            # self.data is None: crawl the web page first
            # (http://v.qq.com/cover/x/xxxxx.html), then export
            hxs = load_html(play_url)
            channel = hxs.select("//div[@class='mod_crumbs']/a[1]/text()").extract()[0]
            album_hxs = hxs.select("//div[@class='mod_video_intro mod_video_intro_rich']")
            image = album_hxs.select("a/img/@src").extract()[0]
            title = album_hxs.select("div[@class='video_title']/strong/a/text()").extract()[0]
            directors = []
            for director_hxs in album_hxs.select("//div[@itemprop='director']/a"):
                director = director_hxs.select("span/text()").extract()[0]
                directors.append(director)
            actors = []
            for actor_hxs in album_hxs.select("//div[@itemprop='actors']/a"):
                actor = actor_hxs.select("span/text()").extract()[0]
                actors.append(actor)
            region = album_hxs.select(
                "//div[@class='info_area']/span[@class='content']/a/text()").extract()[0]
            categories = []
            for category_hxs in album_hxs.select(
                    "//div[@class='info_category']/span[@class='content']/a"):
                category = category_hxs.select("text()").extract()[0]
                categories.append(category)
            pubtime = album_hxs.select(
                "//div[@class='info_years']/span[@class='content']/a/text()").extract()[0]
            if re.match(r"^\d+$", pubtime):
                pubtime = datetime.strptime(pubtime, "%Y")
            else:
                pubtime = datetime.utcfromtimestamp(0)
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": image,
                "url": album_url,
                "actors": actors,
                "directors": directors,
                "categories": categories,
                "channel": channel,
                "region": region,
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    type = 4
    album_id = self.key
    title = self.data['title'].encode('utf-8')
    channel = self.data.get('channel')
    if channel in LONG_VIDEO_CHANNELS:  # long-video channels get full album data
        album_data = api_album(type, album_id, title)
        album_data = album_data['data']
        pubtime = album_data.get("public_time")
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
        videos = []
        for video in album_data['data']:
            video = clean_video(video)
            videos.append(video)
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": album_data.get("bpic"),
            "image2": album_data.get("mpic"),
            "url": album_data.get("web_url"),
            "actors": album_data.get("actors"),
            "directors": album_data.get("director"),
            "categories": album_data.get("tname"),
            "tags": self.data.get("tags"),
            "channel": channel,
            "region": album_data.get("zname")[0],
            "description": album_data.get("introduce"),
            "pubtime": pubtime,
            "videos": videos,
        })
    else:
        video = VideoItemModel({
            "title": title,
            "description": self.data.get("description"),
            "url": "http://www.56.com/u13/v_%s.html" % album_id,
            "stream": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id
            }],
            "stream_high": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id
            }]
        })
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": self.data.get("bpic"),
            "image2": self.data.get("mpic"),
            "tags": self.data.get("tags"),
            "url": self.data.get("web_url"),
            "channel": channel,
            "description": self.data.get("introduce"),
            "videos": [video],
        })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    source_id = self.key
    album_data = api_album(source_id, pcode, version)
    album_data = album_data['body']
    title = album_data.get("nameCn")
    pubtime = album_data.get("releaseDate")
    if re.match(r"^\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y")
    elif re.match(r"^\d+-\d+-\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
    else:
        pubtime = datetime.utcfromtimestamp(0)
    directors = album_data.get("directory").split(" ")
    actors = album_data.get("starring").split(" ")
    desc = album_data.get("description")
    desc = "".join(desc.split())
    region = album_data.get("area")
    categories = album_data.get("subCategory").split(" ")
    tags = album_data.get("tag").split(" ")
    # keep the album URL separate so the per-episode url below cannot clobber it
    album_url = "http://so.letv.com/tv/%s.html" % source_id
    videos = []
    b = 1
    s = 60
    o = -1
    m = 0
    series_data = api_series(source_id, b, s, o, m, pcode, version)
    for series in series_data['body']['videoInfo']:
        id = series['id']
        mid = series['mid']
        url = "http://www.letv.com/ptv/vplay/%s.html" % id
        vurl = ("http://dynamic.app.m.letv.com/android/dynamic.php"
                "?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s"
                % (mid, pcode, version))
        jsurl = "javascript:getUrl('letv', '%s')" % vurl
        video = VideoItemModel({
            "title": series.get("nameCn"),
            "url": url,
            "stream": [{
                "url": jsurl
            }],
            "image": series.get("picAll"),
            "duration": series.get("duration")
        })
        videos.append(video)
    model = VideoSourceModel({
        "source_id": source_id,
        "source": self.data.get('source'),
        "url": album_url,
        "channel": self.data['channel'],
        "title": title,
        "image": self.data['image'],
        "pubtime": pubtime,
        "directors": directors,
        "actors": actors,
        "desc": desc,
        "region": region,
        "categories": categories,
        "tags": tags,
        "videos": videos
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
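# Several crawlers above (youku, tudou, tencent, letv) store a
# "javascript:getUrl(site, url)" pseudo-URL instead of a direct stream link,
# deferring resolution to play time. A minimal sketch of how a consumer might
# dispatch on them (hypothetical RESOLVERS registry, not part of this codebase):
import re

RESOLVERS = {}  # site name -> callable(url) -> playable stream URL

def resolve_stream(stream_url):
    m = re.match(r"javascript:\s*getUrl\('(\w+)',\s*'(.+)'\)", stream_url)
    if not m:
        return stream_url  # already a direct link
    site, url = m.groups()
    return RESOLVERS[site](url)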
def crawl(self):
    album_url = "http://www.265zy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    urls = hxs.select(
        "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        words = m.group(1).split("|")
        size = int(words[0])
        # words[1] is the md5 checksum (unused)
        title = words[2].split(".")[0]
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{
                "url": url,
                "format": "qvod",
                "size": size
            }],
        }))
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(
        hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = urlparse.urljoin(
            "http://www.265zy.com/",
            hxs.select("//div[@class='img']/img/@src").extract()[0])
    except Exception:
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "image": image,
        "url": album_url,
        "time": self.data.get('time'),
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u"影片演员:")),  # actors
        "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,  # completed unless still serializing
        "description": description,
    })
    export(model)
def process_album(self, item):
    sites = {}
    fangying_id = re.findall(r"f_(.+)\.html", item['link'])[0]
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        if site not in sites:
            sites[site] = []
        sites[site].append(dict(video))
    model = None
    for site, videos in sites.iteritems():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)
    if model:
        Scheduler.schedule(RelationCrawler.type, key=fangying_id, data={
            'title': model['title'],
            'url': model['url']
        })
def crawl(self):
    # self.key is the album's source-site ID
    album_id = self.key
    album_url = "http://bdzy.cc/detail/?%s.html" % album_id
    hxs = load_html(album_url)
    urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("bdhd://(.+)", url)
        if not m:
            continue
        words = m.group(1).split("|")
        size = int(words[0])
        # words[1] is the md5 checksum (unused)
        title = words[2].split(".")[0]
        # one episode of the album
        video = VideoItemModel({
            "title": title,
            "url": url,  # web page URL (none available, so the playback URL is used)
            "stream": [{
                "url": url,  # video file playback URL
                "size": size,
                "format": "bdhd"  # video format (protocol)
            }],
        })
        videos.append(video)
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()[0]
    except Exception:
        image = None
    # the data model exported for this album
    model = VideoSourceModel({
        "source": self.data['source'],  # video source
        "source_id": album_id,  # source-site ID
        "title": self.data["title"],
        "url": album_url,  # web page URL
        "image": image,  # image URL
        "time": self.data.get('time'),  # source-site update time
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,  # list of episodes
        "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
        "actors": split(kv.get(u"影片演员:")),  # actors
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,  # completed unless still serializing
        "description": description,
    })
    # export the data
    export(model)