def crawl(self):
    """Crawl one album via the "getAlbum" API and export its videos.

    Albums may be split into several "blocks"; every block after the
    first requires a follow-up API call.
    """
    album_id = self.key
    params = dict(PARAMS_PROTOTYPE)
    params.update(PARAMS_ALBUM)
    ret = call_api("getAlbum", params, [album_id, None, None, album_id, None, '1', '0'])
    # Defensive check: bail out if the API answered for a different album.
    if ret._id != album_id:
        return
    model = extract_album(ret, self.data['source'])
    videos = []
    if ret.tv['block']:
        # Snapshot the block list: `ret` is rebound inside the loop.
        block = ret.tv['block']
        for block_index in xrange(len(ret.tv['block'])):
            if block_index != 0:
                block_now = block[block_index]
                # NOTE(review): rebinds `ret`, so the `if not ret.tv['block']`
                # check below sees the LAST response, not the first one.
                ret = call_api(
                    "getAlbum", params,
                    [album_id, None, None, None, '1', '1', block_now])
            # Block 0 uses the initial response; later blocks use their own.
            videos.extend(self.extract_videos(album_id, ret.tv['other']))
    if not ret.tv['block']:
        # Album without blocks: videos come from the initial response.
        videos.extend(self.extract_videos(album_id, ret.tv['other']))
    model['videos'] = videos
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Build and export a deduplicated top-video rank per TOP_SPEC entry.

    Each spec aggregates several sources; TOP_COUNT is split evenly
    between them.  Titles are deduplicated preserving first-seen order.
    """
    for spec in TOP_SPEC:
        video_set = set()
        videos = []
        count = TOP_COUNT / len(spec['sources'])
        for src, param in spec['sources']:
            # BUGFIX: getattr() without a default raises AttributeError for
            # an unknown source, which made the `if func` guard unreachable.
            func = getattr(self, "crawl_%s" % src, None)
            if func:
                try:
                    titles = func(param, count)
                    for title in titles:
                        if title not in video_set:
                            video_set.add(title)
                            videos.append(title)
                except:
                    # Best effort: one failing source must not kill the rank.
                    self.logger.warning(get_exception_info())
        rank = VideoRankModel({
            "source": self.data['source'],
            "title": spec["title"],
            "type": spec["type"],
            "videos": videos,
        })
        export(rank)
def crawl(self):
    """Fetch one album's details from the API and export a source model.

    Bails out early when no album id (and therefore no detail) exists.
    """
    album_id = self.key
    channel = self.data["channel"]
    detail = api_album(album_id) if album_id else None
    # BUGFIX: with a falsy album id, `detail` stayed None and the field
    # accesses below raised TypeError; return early instead.
    if detail is None:
        return
    title = detail["tv_name"]
    directors = detail["director"].split(";")
    actors = detail["actor"].split(";")
    region = detail["area"]
    categories = detail["tv_cont_cats"].split(";")
    ver_image = detail["ver_high_pic"]
    hor_image = detail["hor_high_pic"]
    url = detail["s_url"]
    description = detail["tv_desc"]
    # Data model for the exported video source.
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": album_id,
        "title": title,
        "url": url,
        "directors": directors,
        "actors": actors,
        "region": region,
        "categories": categories,
        "channel": channel,
        "description": description,
        "image": ver_image,
        "image2": hor_image,
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
    return
def crawl(self):
    """Read album details from the API and export a VideoSourceModel."""
    album_id = self.key
    channel = self.data["channel"]
    detail = api_album(album_id) if album_id else None
    # Map the raw API fields straight onto the export model.
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": album_id,
        "title": detail["tv_name"],
        "url": detail["s_url"],
        "directors": detail["director"].split(";"),
        "actors": detail["actor"].split(";"),
        "region": detail["area"],
        "categories": detail["tv_cont_cats"].split(";"),
        "channel": channel,
        "description": detail["tv_desc"],
        "image": detail["ver_high_pic"],      # vertical poster
        "image2": detail["hor_high_pic"],     # horizontal poster
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
    return
def crawl(self):
    """Incrementally page through one category and export each album.

    Stops when items older than the previous crawl are reached or the
    page range is exhausted; saves the newest item time for next run.
    """
    catecode = self.data["catecode"]
    # Timestamp of the previous crawl; datetime.min on the first run.
    last_updated = self.data.get("updated", datetime.min)
    current_updated = datetime.max
    max_time = last_updated
    page = 1
    pagesize = 20
    while True:
        try:
            data = api_albums(catecode, page, pagesize)
            for item in data["videos"]:
                try:
                    sid = item.get('sid')
                    detail = api_album(sid) if sid else None
                    model = self.extract_model(item, detail)
                    if sid:
                        videos = self.get_videos(sid)
                        if model['channel'] in [ u'综艺' ]:
                            #reverse order for zongyi (listed newest-first)
                            videos = [video for video in reversed(videos)]
                    else:
                        # No album id: wrap the item itself as one video.
                        video = VideoItemModel({
                            "title": model["title"],
                            "image": model["image"],
                            "description": model["description"],
                            "time": model["time"],
                            "price": model["price"],
                            "duration": model["duration"],
                            "url": model["url"]
                        })
                        videos = [video]
                    model['videos'] = videos
                    export(model)
                    current_updated = model["time"]
                    if max_time < current_updated:
                        max_time = current_updated
                except:
                    # One bad item must not abort the whole page.
                    self.logger.warning(get_exception_info())
            if current_updated < last_updated:
                # Reached items older than the previous crawl; stop.
                break
            if page * pagesize >= data["count"]:
                break
        except:
            self.logger.warning(get_exception_info())
        page += 1
    # Persist the newest item time for the next incremental run.
    self.data["updated"] = max_time
def crawl(self):
    """Fetch an album (and all of its extra blocks) and export the result."""
    album_id = self.key
    params = dict(PARAMS_PROTOTYPE)
    params.update(PARAMS_ALBUM)
    ret = call_api("getAlbum", params,
                   [album_id, None, None, album_id, None, '1', '0'])
    if ret._id != album_id:
        # API answered for a different album; nothing to export.
        return
    model = extract_album(ret, self.data['source'])
    videos = []
    blocks = ret.tv['block']
    if blocks:
        for index, current_block in enumerate(blocks):
            if index != 0:
                # Every block after the first needs its own API call.
                ret = call_api("getAlbum", params,
                               [album_id, None, None, None, '1', '1',
                                current_block])
            videos.extend(self.extract_videos(album_id, ret.tv['other']))
    if not ret.tv['block']:
        # Block-less album: the latest response already holds the videos.
        videos.extend(self.extract_videos(album_id, ret.tv['other']))
    model['videos'] = videos
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Aggregate top titles from every source of each TOP_SPEC and export.

    Deduplicates titles while preserving first-seen order; each source
    contributes an equal share of TOP_COUNT.
    """
    for spec in TOP_SPEC:
        video_set = set()
        videos = []
        count = TOP_COUNT / len(spec['sources'])
        for src, param in spec['sources']:
            # BUGFIX: supply a default so a missing crawl_<src> method is
            # skipped instead of raising AttributeError outside the try.
            func = getattr(self, "crawl_%s" % src, None)
            if func:
                try:
                    titles = func(param, count)
                    for title in titles:
                        if title not in video_set:
                            video_set.add(title)
                            videos.append(title)
                except:
                    # One failing source must not abort the whole rank.
                    self.logger.warning(get_exception_info())
        rank = VideoRankModel({
            "source": self.data['source'],
            "title": spec["title"],
            "type": spec["type"],
            "videos": videos,
        })
        export(rank)
def crawl(self):
    """Crawl one media id: collect its playable videos and export the album."""
    videos = []
    mid = self.key
    url = DETAIL % mid
    detail = loadurl(url)
    # Strip all whitespace from the synopsis.
    description = detail.get('plots')
    description = ''.join(description.split())
    if self.data.get('channel') == u'鐢靛奖':
        # Movie channel: a single video with one mp4 stream.
        stream_info = detail['pinfos']['mpurls']
        videos.append(VideoItemModel({
            "title": self.data.get('title'),
            "url": MOVIE_PLAY % mid,  # web page address
            "image": self.data.get('image'),
            "description": description,
            "stream": [{
                'url': stream_info['tv'].get('url'),
                'size': stream_info['tv'].get('bits'),
                'format': 'mp4'
            }]
        }))
    else:
        # Episodic content: locate the episode list (new layout first,
        # fall back to the flat layout).
        try:
            sort = detail['pinfos'].get('sort')[0]
            episodes = detail['pinfos']['content'][sort]['fsps']
        except:
            episodes = detail['pinfos']['fsps']
        for episode in episodes:
            plots = episode.get('plots')
            plots = ''.join(plots.split())
            videos.append(VideoItemModel({
                "title": episode.get('taskname'),
                "url": PLAY_URL % (mid, episode.get('number')),  # page url
                "image": episode.get('picurl'),
                "description": plots,
                "stream": getstream(episode.get('mpurls'))
            }))
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": mid,                  # source-site id
        "title": self.data["title"],
        "url": detail.get('shareurl'),     # detail page url
        "image": self.data.get('image'),   # picture url
        "categories": self.data.get('category'),
        "channel": self.data.get('channel'),
        "region": detail.get('country'),
        "videos": videos,
        "pubtime": parse_date(detail.get('rinfo').split(' ')[0]),  # release date
        "actors": detail.get('lactor'),
        "directors": detail.get('director'),
        "description": description,
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export a single short video with low/mid/high quality mp4 streams."""
    duration = gettime(self.data.get('videoLength', '00:00'))

    def _stream(quality):
        # One stream entry per quality level; all share the same duration.
        return [{
            "url": self.data.get('videoURL%s' % quality),   # playable file url
            "size": self.data.get('videoSize%s' % quality),
            "format": "mp4",
            "duration": duration
        }]

    video = VideoItemModel({
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),  # page url
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
        "stream": _stream('Mid'),
        "stream_low": _stream('Low'),
        "stream_high": _stream('High')
    })
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),     # source-site id
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),     # detail page url
        "image": self.data.get('imgURL'),     # picture url
        "channel": CHANNEL,
        "videos": [video],
        "pubtime": parse_date(self.data.get('videoPublishTime')),
        "description": self.data.get('desc'),
    })
    # Export the assembled data.
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a 56.com album (long-video channels) or a single short clip."""
    type = 4
    album_id = self.key
    title = self.data['title'].encode('utf-8')
    channel = self.data.get('channel')
    # BUGFIX: `channel` is a string but .items() yields (key, value)
    # tuples, so the long-video branch could never be taken; compare
    # against the channel names instead (cf. the sibling crawler that
    # tests `channel in CHANNELS.values()`).
    if channel in LONG_VIDEO_CHANNELS.values():
        album_data = api_album(type, album_id, title)
        album_data = album_data['data']
        pubtime = album_data.get("public_time")
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
        videos = [clean_video(video) for video in album_data['data']]
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": album_data.get("bpic"),
            "image2": album_data.get("mpic"),
            "url": album_data.get("web_url"),
            "actors": album_data.get("actors"),
            "directors": album_data.get("director"),
            "categories": album_data.get("tname"),
            "tags": self.data.get("tags"),
            "channel": channel,
            "region": album_data.get("zname")[0],
            "description": album_data.get("introduce"),
            "pubtime": pubtime,
            "videos": videos,
        })
    else:
        # Short video: synthesize one item with the html5 stream urls.
        video = VideoItemModel({
            "title": title,
            "description": self.data.get("description"),
            "url": "http://www.56.com/u13/v_%s.html" % album_id,
            "stream": [{"url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id}],
            "stream_high": [{"url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id}]
        })
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": self.data.get("bpic"),
            "image2": self.data.get("mpic"),
            "tags": self.data.get("tags"),
            "url": self.data.get("web_url"),
            "channel": channel,
            "description": self.data.get("introduce"),
            "videos": [video],
        })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export one VideoRankModel per configured TOP_SPECS entry."""
    for spec in TOP_SPECS:
        export(VideoRankModel({
            "source": self.data['source'],
            "type": spec["type"],
            "title": spec["title"],
            # Ranked titles scraped from the spec's page.
            "videos": crawl_top(spec["url"]),
        }))
def crawl(self):
    """Crawl every configured top list and export its rank model."""
    source = self.data['source']
    for spec in TOP_SPECS:
        # Scrape the ranked titles for this list.
        titles = crawl_top(spec["url"])
        model = VideoRankModel({
            "source": source,
            "type": spec["type"],
            "title": spec["title"],
            "videos": titles,
        })
        export(model)
def crawl(self):
    """Export the titles recommended for one album url."""
    items = api_recommend(self.data['url'], 10)
    # Keep only the recommendation titles.
    related = [entry['title'] for entry in items]
    export(VideoSourceModel({
        'source': self.data['source'],
        'title': self.data['title'],
        'related': related,
    }))
def crawl(self):
    """Fetch up to 10 recommendations and export them as `related`."""
    url = self.data['url']
    title = self.data['title']
    related = []
    for item in api_recommend(url, 10):
        related.append(item['title'])
    album = VideoSourceModel({
        'source': self.data['source'],
        'title': title,
        'related': related,
    })
    export(album)
def crawl(self):
    """Crawl a Tudou album's detail payload and export the full model."""
    album_id = self.key
    detail = api_detail(album_id).get('detail')
    channel = detail.get('cats')
    title = "".join(detail.get('title').split())
    image = detail.get('img')
    play_url = detail.get('play_url')
    # The play url embeds the album key used to build the cover page url.
    url_key = re.findall(
        "http://www.tudou.com/albumplay/(.+)/.+\.html", play_url)[0]
    album_url = "http://www.tudou.com/albumcover/%s.html" % url_key
    # Anime credits voice actors; variety shows credit hosts.
    actors = detail.get('seiyuu') if channel == u"动漫" else detail.get('performer')
    directors = detail.get('host') if channel == u"综艺" else detail.get('director')
    categories = detail.get('genre')
    region = detail.get('area')[0]
    description = "".join(detail.get('desc').split())
    pubtime = detail.get('showdate')
    if pubtime:
        # Known release year.
        pubtime = datetime.strptime(str(pubtime), "%Y")
    else:
        # Unknown release time (showdate == 0) -> epoch sentinel.
        pubtime = datetime.utcfromtimestamp(0)
    videos = get_videos(album_id, url_key)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": album_id,
        "title": title,
        "image": image,
        "url": album_url,
        "actors": actors,
        "directors": directors,
        "categories": categories,
        "channel": channel,
        "region": region,
        "description": description,
        "pubtime": pubtime,
        "videos": videos,
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export one Tudou album: details, derived urls, and video list."""
    album_id = self.key
    detail_data = api_detail(album_id)
    detail_data = detail_data.get('detail')
    channel = detail_data.get('cats')
    raw_title = detail_data.get('title')
    title = "".join(raw_title.split())
    image = detail_data.get('img')
    play_url = detail_data.get('play_url')
    # Pull the album key out of the play url.
    keys = re.findall("http://www.tudou.com/albumplay/(.+)/.+\.html", play_url)
    url_key = keys[0]
    album_url = "http://www.tudou.com/albumcover/%s.html" % url_key
    if channel == u"动漫":
        # Anime credits voice actors.
        actors = detail_data.get('seiyuu')
    else:
        actors = detail_data.get('performer')
    if channel == u"综艺":
        # Variety shows credit hosts instead of directors.
        directors = detail_data.get('host')
    else:
        directors = detail_data.get('director')
    categories = detail_data.get('genre')
    region = detail_data.get('area')[0]
    description = "".join(detail_data.get('desc').split())
    showdate = detail_data.get('showdate')
    if showdate:
        pubtime = datetime.strptime(str(showdate), "%Y")
    else:
        # Unknown release date (showdate == 0) -> epoch sentinel.
        pubtime = datetime.utcfromtimestamp(0)
    videos = get_videos(album_id, url_key)
    model = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": album_id,
        "title": title,
        "image": image,
        "url": album_url,
        "actors": actors,
        "directors": directors,
        "categories": categories,
        "channel": channel,
        "region": region,
        "description": description,
        "pubtime": pubtime,
        "videos": videos,
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Scrape one zyqvod album page and export it with parsed metadata."""
    album_url = "http://zyqvod.com/vod/index.asp?id=%s" % self.key
    hxs = load_html(album_url)
    urls = hxs.select("//div[@class='movievod']/li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("qvod://(.+)", url)
        if not m:
            continue
        # qvod link payload layout: size|md5|filename
        words = m.group(1).split("|")
        size = int(words[0])
        #md5 = words[1]
        title = words[2].split(".")[0]
        videos.append(VideoItemModel({
            "title": title,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Key/value metadata rows from the detail section.
    kv = {}
    for s in hxs.select("//div[@class='videoDetail']/p"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='movievod']/p[2]/text()").extract())
    try:
        image = hxs.select("//div[@class='videoPic']/img/@src").extract()[0]
    except:
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u'影片主演:')),
        "directors": split(kv.get(u'影片导演:')),
        "pubtime": parse_date(kv.get(u'上映年份:')),
        "description": description,
        # BUGFIX: "completed" appeared twice in this literal; the first
        # entry (self.data.get('completed')) was silently discarded, so
        # only the serialization-status value is kept here.
        "completed": not kv.get(u'连载状态:'),
    })
    export(model)
def crawl(self):
    """Crawl each Sohu top list (50 entries) and export a VideoTopModel."""
    for spec in TOP_sohu:
        entries = crawl_top(spec.get('url'), spec.get('channel'), 50)
        export(VideoTopModel({
            'source': spec['source'],
            'channel': spec['channel'],
            'priority': spec['priority'],
            'type': spec['type'],
            # Timestamp taken per list, at export time.
            'updatetime': datetime.now().isoformat(),
            'list': entries
        }))
def crawl(self):
    """Scrape a hakuzy.com detail page into a VideoSourceModel and export it."""
    album_url = "http://hakuzy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    videos = []
    for url in hxs.select("//td[@class='bt']/.//input[@id='copy_yah']/@value").extract():
        match = re.match("qvod://(.+)", url)
        if match is None:
            continue
        # qvod payload layout: size|md5|filename
        parts = match.group(1).split("|")
        size = int(parts[0])
        name = parts[2].split(".")[0]
        videos.append(VideoItemModel({
            "title": name,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        }))
    # Key/value metadata rows from the detail table.
    kv = {}
    for row in hxs.select("/html/body/table[4]/tbody/tr[1]/td[2]/table/tbody/tr"):
        texts = row.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = hxs.select("//div[@class='img']/img/@src").extract()[0]
    except:
        image = None
    model = VideoSourceModel({
        "source": SOURCE,
        "source_id": self.key,
        "title": self.data["title"],
        "time": self.data.get('time'),
        "url": album_url,
        "image": image,
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "pubtime": parse_date(kv.get(u"上映日期:")),
        "actors": split(kv.get(u"影片演员:")),
        "directors": split(kv.get(u"影片导演:")),
        # "连载" absent from the status text means the series is complete.
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)
def crawl(self):
    """Page through a category and export every album, tracking update time."""
    catecode = self.data["catecode"]
    last_updated = self.data.get("updated", datetime.min)
    current_updated = datetime.max
    max_time = last_updated
    page, pagesize = 1, 20
    while True:
        try:
            data = api_albums(catecode, page, pagesize)
            for item in data["videos"]:
                try:
                    sid = item.get('sid')
                    detail = api_album(sid) if sid else None
                    model = self.extract_model(item, detail)
                    if not sid:
                        # Stand-alone item: wrap it as its own single video.
                        videos = [VideoItemModel({
                            "title": model["title"],
                            "image": model["image"],
                            "description": model["description"],
                            "time": model["time"],
                            "price": model["price"],
                            "duration": model["duration"],
                            "url": model["url"]
                        })]
                    else:
                        videos = self.get_videos(sid)
                        if model['channel'] in [u'综艺']:
                            # Variety shows arrive newest-first; reverse.
                            videos = list(reversed(videos))
                    model['videos'] = videos
                    export(model)
                    current_updated = model["time"]
                    max_time = max(max_time, current_updated)
                except:
                    # One bad item must not abort the whole page.
                    self.logger.warning(get_exception_info())
            if current_updated < last_updated:
                # Older than the previous crawl: incremental run is done.
                break
            if page * pagesize >= data["count"]:
                break
        except:
            self.logger.warning(get_exception_info())
        page += 1
    self.data["updated"] = max_time
def process_album(self, item):
    """Group an item's play links by site and export one model per site.

    Schedules a RelationCrawler follow-up when at least one model was
    exported.
    """
    sites = {}
    fangying_id = re.findall("f_(.+)\.html", item['link'])[0]
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            # Links back to fangying itself carry no playable stream.
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        # IDIOM: dict.has_key() is deprecated; setdefault both creates
        # the bucket and returns it in one step.
        sites.setdefault(site, []).append(dict(video))
    model = None
    for site, videos in sites.iteritems():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)
    if model:
        # At least one site exported: queue the relation crawl.
        Scheduler.schedule(RelationCrawler.type, key=fangying_id,
                           data={'title': model['title'], 'url': model['url']})
def crawl(self):
    """Crawl one show album: metadata, images, and its video list."""
    album_id = self.key
    channel = self.data.get('channel')
    if channel in CHANNELS.values():
        model = self.crawl_show(album_id)
        model['image'] = self.data.get('image')
        model['image2'] = self.data.get('image2')
        if channel:
            model['channel'] = channel
        episodes = self.crawl_video(album_id)
        if channel == u'综艺':
            # Variety shows arrive newest-first; restore natural order.
            episodes = list(reversed(episodes))
        model['videos'] = episodes
        export(model)
        self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Export a show album when its channel is one we recognize."""
    album_id = self.key
    channel = self.data.get('channel')
    # Guard clause: unknown channels are silently skipped.
    if channel not in CHANNELS.values():
        return
    model = self.crawl_show(album_id)
    model['image'] = self.data.get('image')
    model['image2'] = self.data.get('image2')
    if channel:
        model['channel'] = channel
    videos = self.crawl_video(album_id)
    if channel == u'综艺':
        # Variety shows are listed newest-first; reverse them.
        videos = [video for video in reversed(videos)]
    model['videos'] = videos
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Build one VideoItemModel with three quality streams and export it."""
    time_text = self.data.get('videoLength', '00:00')
    duration = gettime(time_text)
    item = VideoItemModel({
        "title": self.data.get('title'),
        "url": self.data.get('videoURLMid'),       # page url
        "image": self.data.get('imgURL'),
        "description": self.data.get('desc'),
        # Playable file urls per quality level; all mp4.
        "stream": [{
            "url": self.data.get('videoURLMid'),
            "size": self.data.get('videoSizeMid'),
            "format": "mp4",
            "duration": duration
        }],
        "stream_low": [{
            "url": self.data.get('videoURLLow'),
            "size": self.data.get('videoSizeLow'),
            "format": "mp4",
            "duration": duration
        }],
        "stream_high": [{
            "url": self.data.get('videoURLHigh'),
            "size": self.data.get('videoSizeHigh'),
            "format": "mp4",
            "duration": duration
        }]
    })
    album = VideoSourceModel({
        "source": self.data.get('source'),
        "source_id": self.data.get('id'),          # source-site id
        "title": self.data.get("title"),
        "url": self.data.get('shareurl'),          # detail page url
        "image": self.data.get('imgURL'),          # picture url
        "channel": CHANNEL,
        "videos": [item],
        "pubtime": parse_date(self.data.get('videoPublishTime')),
        "description": self.data.get('desc'),
    })
    # Export the assembled data.
    export(album)
    self.data['to_album_id'] = album['to_album_id']
def update_region(): conn = Connection() db = conn.content_video2 count = 1 source_videos = db.video.source.find() for source_video in source_videos: model = VideoSourceModel({ "videos": source_video['videos'], "image": source_video['image'], "related": source_video['related'], "duration": source_video['duration'], "title": source_video['title'], "comments": source_video['comments'], "source": source_video['source'], "score": source_video['score'], "actors": source_video['actors'], "price": source_video['price'], "channel": source_video['channel'], "description": source_video['description'], "tags": source_video['tags'], "deleted": source_video['deleted'], "completed": source_video['completed'], "visits": source_video['visits'], "favorites": source_video['favorites'], "authorities": source_video['authorities'], "categories": source_video['categories'], "created": source_video['created'], "url": source_video['url'], "region": source_video['region'], "directors": source_video['directors'], "pubtime": source_video['pubtime'], "time": source_video['time'], "source_id": source_video['source_id'] }) export(model) count += 1 print "count = %s" % count print "count = %s" % count print "map complete."
def update_region(): conn = Connection() db = conn.content_video2 count = 1 source_videos = db.video.source.find() for source_video in source_videos: model = VideoSourceModel({ "videos": source_video['videos'], "image": source_video['image'], "related": source_video['related'], "duration": source_video['duration'], "title": source_video['title'], "comments": source_video['comments'], "source": source_video['source'], "score": source_video['score'], "actors": source_video['actors'], "price": source_video['price'], "channel": source_video['channel'], "description": source_video['description'], "tags": source_video['tags'], "deleted": source_video['deleted'], "completed": source_video['completed'], "visits": source_video['visits'], "favorites": source_video['favorites'], "authorities": source_video['authorities'], "categories": source_video['categories'], "created": source_video['created'], "url": source_video['url'], "region": source_video['region'], "directors": source_video['directors'], "pubtime": source_video['pubtime'], "time": source_video['time'], "source_id": source_video['source_id'] }) export(model) count += 1 print "count = %s" % count print "count = %s" % count print "map complete."
def crawl(self):
    """Crawl a 56.com album for long-video channels, or one short clip."""
    type = 4
    album_id = self.key
    title = self.data['title'].encode('utf-8')
    channel = self.data.get('channel')
    # BUGFIX: .items() yields (key, value) tuples, which a channel string
    # can never equal, so the long-video branch was dead code; test
    # membership against the channel names instead.
    if channel in LONG_VIDEO_CHANNELS.values():
        album_data = api_album(type, album_id, title)
        album_data = album_data['data']
        pubtime = album_data.get("public_time")
        pubtime = datetime.strptime(pubtime, "%Y%m%d")
        videos = []
        for video in album_data['data']:
            video = clean_video(video)
            videos.append(video)
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": album_data.get("bpic"),
            "image2": album_data.get("mpic"),
            "url": album_data.get("web_url"),
            "actors": album_data.get("actors"),
            "directors": album_data.get("director"),
            "categories": album_data.get("tname"),
            "tags": self.data.get("tags"),
            "channel": channel,
            "region": album_data.get("zname")[0],
            "description": album_data.get("introduce"),
            "pubtime": pubtime,
            "videos": videos,
        })
    else:
        # Short clip: one item with low/high html5 stream urls.
        video = VideoItemModel({
            "title": title,
            "description": self.data.get("description"),
            "url": "http://www.56.com/u13/v_%s.html" % album_id,
            "stream": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qqvga" % album_id
            }],
            "stream_high": [{
                "url": "http://vxml.56.com/html5/%s/?src=3g&res=qvga" % album_id
            }]
        })
        model = VideoSourceModel({
            "source": self.data.get('source'),
            "source_id": album_id,
            "title": title,
            "image": self.data.get("bpic"),
            "image2": self.data.get("mpic"),
            "tags": self.data.get("tags"),
            "url": self.data.get("web_url"),
            "channel": channel,
            "description": self.data.get("introduce"),
            "videos": [video],
        })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Scrape a 265zy detail page and export the album model."""
    album_url = "http://www.265zy.com/detail/?%s.html" % self.key
    hxs = load_html(album_url)
    qvod_urls = hxs.select(
        "//td[@class='bt']/.//input[@id='copy_yah']/@value").extract()
    videos = []
    for url in qvod_urls:
        match = re.match("qvod://(.+)", url)
        if not match:
            continue
        # qvod payload layout: size|md5|filename
        fields = match.group(1).split("|")
        size = int(fields[0])
        name = fields[2].split(".")[0]
        item = VideoItemModel({
            "title": name,
            "url": url,
            "stream": [{"url": url, "format": "qvod", "size": size}],
        })
        videos.append(item)
    # Key/value metadata rows from the detail table.
    kv = {}
    for row in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = row.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()
    description = "\n".join(
        hxs.select("//div[@class='intro']/.//text()").extract())
    try:
        image = urlparse.urljoin(
            "http://www.265zy.com/",
            hxs.select("//div[@class='img']/img/@src").extract()[0])
    except:
        image = None
    model = VideoSourceModel({
        "source": self.data['source'],
        "source_id": self.key,
        "title": self.data["title"],
        "image": image,
        "url": album_url,
        "time": self.data.get('time'),
        "categories": [self.data.get('category')],
        "channel": self.data.get('category'),
        "region": self.data.get('region'),
        "videos": videos,
        "actors": split(kv.get(u"影片演员:")),
        "pubtime": parse_date(kv.get(u"上映日期:")),
        # "连载" absent from the status text means the series is complete.
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    export(model)
def crawl(self):
    """Crawl a LeTV album plus its episode list and export the model.

    BUGFIX: the episode loop used to rebind `url`, so the exported
    model's "url" pointed at the LAST episode's play page instead of the
    album search page; episodes now use their own variable.
    """
    source_id = self.key
    album_data = api_album(source_id, pcode, version)
    album_data = album_data['body']
    title = album_data.get("nameCn")
    pubtime = album_data.get("releaseDate")
    # Release date arrives as "YYYY" or "YYYY-MM-DD"; otherwise epoch.
    if re.match("^\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y")
    elif re.match("^\d+-\d+-\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
    else:
        pubtime = datetime.utcfromtimestamp(0)
    directors = album_data.get("directory").split(" ")
    actors = album_data.get("starring").split(" ")
    desc = album_data.get("description")
    desc = "".join(desc.split())
    region = album_data.get("area")
    categories = album_data.get("subCategory").split(" ")
    tags = album_data.get("tag").split(" ")
    url = "http://so.letv.com/tv/%s.html" % source_id
    videos = []
    # Series paging parameters (begin, size, order, media flag).
    b = 1
    s = 60
    o = -1
    m = 0
    series_data = api_series(source_id, b, s, o, m, pcode, version)
    for series in series_data['body']['videoInfo']:
        id = series['id']
        mid = series['mid']
        episode_url = "http://www.letv.com/ptv/vplay/%s.html" % id
        vurl = "http://dynamic.app.m.letv.com/android/dynamic.php?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s" % (
            mid, pcode, version)
        jsurl = "javascript:getUrl('letv', '%s')" % vurl
        video = VideoItemModel({
            "title": series.get("nameCn"),
            "url": episode_url,
            "stream": [{
                "url": jsurl
            }],
            "image": series.get("picAll"),
            "duration": series.get("duration")
        })
        videos.append(video)
    model = VideoSourceModel({
        "source_id": source_id,
        "source": self.data.get('source'),
        "url": url,  # album page (see BUGFIX note above)
        "channel": self.data['channel'],
        'title': title,
        "image": self.data['image'],
        "pubtime": pubtime,
        "directors": directors,
        "actors": actors,
        "desc": desc,
        "region": region,
        "categories": categories,
        "tags": tags,
        "videos": videos
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a Tencent (v.qq.com) video or album and export the model.

    Short-video channels export a single-item model built from self.data;
    everything else fetches album data, gathers the episode list (column
    albums page by year/month) and exports the full album.
    """
    album_id = self.key
    if self.data['channel'] in SHORT_VIDEO:
        # Short video: page url is derived from characters of the id.
        url = "http://v.qq.com/page/%s/%s/%s/%s.html" % (
            album_id[0], album_id[1], album_id[-1], album_id)
        pubtime = datetime.strptime(
            self.data["pubtime"], "%Y-%m-%d %H:%M:%S")
        video = VideoItemModel({
            "title": self.data["title"],
            "url": url,
            "stream": [{
                # Stream url is resolved client-side via javascript hook.
                "url": "javascript:getUrl('tencent', '%s')" % url
            }],
            "image": self.data["image"],
            "channel": self.data["channel"],
        })
        model = VideoSourceModel({
            "source": self.data["source"],
            "source_id": album_id,
            "title": self.data["title"],
            "url": url,
            "image": self.data["image"],
            "channel": self.data["channel"],
            "pubtime": pubtime,
            "videos": [video]
        })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
    else:
        album_url = "http://v.qq.com/detail/%s/%s.html" % (
            album_id[0], album_id)
        album_data = api_album(album_id[0], album_id)
        # Trailers live under /prev/, full albums under /cover/.
        if album_data['trailer'] == 1:
            play_url = "http://v.qq.com/prev/%s/%s" % (
                album_id[0], album_id)
        else:
            play_url = "http://v.qq.com/cover/%s/%s" % (
                album_id[0], album_id)
        # Prefer the column description; fall back to the plain one.
        description = album_data.get("columndesc")
        if not description:
            description = album_data.get("desc")
        description = "".join(description.split())
        try:
            pubtime = datetime.strptime(self.data.get("pubtime"), "%Y")
        except:
            # Unparseable/missing year -> epoch sentinel.
            pubtime = datetime.utcfromtimestamp(0)
        videos = []
        columnid = album_data.get('columnid')
        rely = album_data.get('rely')
        if columnid:
            # columnid != 0: column album; episode lists are paged by
            # {year: [months]} buckets in `rely`.
            for video_dict in rely:
                for year, months in video_dict.iteritems():
                    for month in months:
                        videolist_id = "%s_%s" % (year, month)
                        videos_data = api_video(columnid, videolist_id)
                        for video in videos_data['items']:
                            time = video.get('date')
                            time = datetime.strptime(time, "%Y-%m-%d")
                            url = "http://v.qq.com/cover/%s/%s.html" % (
                                video.get('coverid')[0], video.get('coverid'))
                            video = VideoItemModel({
                                "title": video.get('sectitle'),
                                "description": video.get('breif'),
                                "url": url,
                                "stream": [{
                                    "url": "javascript:getUrl('tencent', '%s')" % url
                                }],
                                "image": video.get('snapurl'),
                                "time": time
                            })
                            videos.append(video)
        if not columnid:
            # columnid == 0, only one video
            for video in album_data['videos']:
                videos.append(clean_video(video, play_url))
        # self.data is not None: export(data)
        if self.data:
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": album_data['columnname'] if album_data['columnname'] else self.data["title"],
                "image": self.data.get("image"),
                "url": album_url,
                "actors": self.data.get("actors"),
                "directors": self.data.get("directors"),
                "categories": self.data.get("categories"),
                "channel": self.data.get("channel"),
                "region": self.data.get("region"),
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        # self.data is None: crawl web data first
        # (http://v.qq.com/cover/x/xxxxx.html), and export(data)
        else:
            hxs = load_html(play_url)
            channel = hxs.select(
                "//div[@class='mod_crumbs']/a[1]/text()").extract()[0]
            album_hxs = hxs.select(
                "//div[@class='mod_video_intro mod_video_intro_rich']")
            image = album_hxs.select("a/img/@src").extract()[0]
            title = album_hxs.select(
                "div[@class='video_title']/strong/a/text()").extract()[0]
            directors = []
            for director_hxs in album_hxs.select("//div[@itemprop='director']/a"):
                director = director_hxs.select("span/text()").extract()[0]
                directors.append(director)
            actors = []
            for actor_hxs in album_hxs.select("//div[@itemprop='actors']/a"):
                actor = actor_hxs.select("span/text()").extract()[0]
                actors.append(actor)
            region = album_hxs.select(
                "//div[@class='info_area']/span[@class='content']/a/text()").extract()[0]
            categories = []
            for categorie_hxs in album_hxs.select("//div[@class='info_category']/span[@class='content']/a"):
                categorie = categorie_hxs.select("text()").extract()[0]
                categories.append(categorie)
            pubtime = album_hxs.select(
                "//div[@class='info_years']/span[@class='content']/a/text()").extract()[0]
            # Page shows a bare year or nothing usable.
            if re.match("^\d+$", pubtime):
                pubtime = datetime.strptime(pubtime, "%Y")
            else:
                pubtime = datetime.utcfromtimestamp(0)
            # NOTE(review): this branch runs when self.data is falsy, yet
            # it still reads self.data.get('source') — confirm intent.
            model = VideoSourceModel({
                "source": self.data.get('source'),
                "source_id": album_id,
                "title": title,
                "image": image,
                "url": album_url,
                "actors": actors,
                "directors": directors,
                "categories": categories,
                "channel": channel,
                "region": region,
                "description": description,
                "pubtime": pubtime,
                "videos": videos,
            })
        export(model)
        self.data['to_album_id'] = model['to_album_id']
def crawl(self):
    """Crawl a LeTV album (self.key) and its episode list, then export it.

    Fetches album metadata via api_album and the episode list via api_series,
    builds a VideoSourceModel and exports it; finally propagates the exported
    album id back into self.data['to_album_id'].
    """
    source_id = self.key
    album_data = api_album(source_id, pcode, version)['body']

    title = album_data.get("nameCn")

    # releaseDate may be "YYYY" or "YYYY-MM-DD"; anything else (including a
    # missing value, which previously crashed re.match with TypeError) falls
    # back to the Unix epoch.
    pubtime = album_data.get("releaseDate") or ""
    if re.match(r"^\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y")
    elif re.match(r"^\d+-\d+-\d+$", pubtime):
        pubtime = datetime.strptime(pubtime, "%Y-%m-%d")
    else:
        pubtime = datetime.utcfromtimestamp(0)

    directors = album_data.get("directory").split(" ")
    actors = album_data.get("starring").split(" ")
    # collapse all whitespace runs in the description
    desc = album_data.get("description")
    desc = "".join(desc.split())
    region = album_data.get("area")
    categories = album_data.get("subCategory").split(" ")
    tags = album_data.get("tag").split(" ")

    album_url = "http://so.letv.com/tv/%s.html" % source_id

    videos = []
    # api_series paging parameters: page b=1, page size s=60, order o=-1, m=0
    b = 1
    s = 60
    o = -1
    m = 0
    series_data = api_series(source_id, b, s, o, m, pcode, version)
    for series in series_data['body']['videoInfo']:
        vid = series['id']
        mid = series['mid']
        # BUG FIX: use a dedicated variable for the per-episode page url so it
        # no longer clobbers the album url exported in the model below.
        video_url = "http://www.letv.com/ptv/vplay/%s.html" % vid
        vurl = "http://dynamic.app.m.letv.com/android/dynamic.php?mod=minfo&ctl=videofile&act=index&mmsid=%s&pcode=%s&version=%s" % (
            mid, pcode, version)
        # the player resolves the real stream url lazily through this js hook
        jsurl = "javascript:getUrl('letv', '%s')" % vurl
        video = VideoItemModel({
            "title": series.get("nameCn"),
            "url": video_url,
            "stream": [{
                "url": jsurl
            }],
            "image": series.get("picAll"),
            "duration": series.get("duration")
        })
        videos.append(video)

    model = VideoSourceModel({
        "source_id": source_id,
        "source": self.data.get('source'),
        "url": album_url,  # album page url (was overwritten by the last episode url)
        "channel": self.data['channel'],
        'title': title,
        "image": self.data['image'],
        "pubtime": pubtime,
        "directors": directors,
        "actors": actors,
        "desc": desc,
        "region": region,
        "categories": categories,
        "tags": tags,
        "videos": videos
    })
    export(model)
    self.data['to_album_id'] = model['to_album_id']
def process_album(self, item):
    """Group an item's play links by hosting site and export one
    VideoSourceModel per site; then schedule a RelationCrawler for the album.

    item is a parsed fangying.com album dict (link, plays, title, directors,
    performers, description, genres, countries, duration, avatar_middle,
    douban_rating, release_time).
    """
    # album id embedded in the link, e.g. ".../f_<id>.html"
    fangying_id = re.findall(r"f_(.+)\.html", item['link'])[0]

    sites = {}
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            # fangying's own pages carry no directly playable stream
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        # setdefault replaces the deprecated dict.has_key() check-then-append
        sites.setdefault(site, []).append(dict(video))

    model = None
    for site, videos in sites.items():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)

    # only schedule the follow-up crawl if at least one site model was exported
    if model:
        Scheduler.schedule(RelationCrawler.type, key=fangying_id, data={
            'title': model['title'],
            'url': model['url']
        })
def crawl(self):
    """Crawl a bdzy.cc album detail page and export it as a VideoSourceModel.

    self.key is the album's id on the source site; self.data supplies
    source/title/category/region metadata collected by an earlier stage.
    """
    # key is the album id on the source site
    album_id = self.key
    album_url = "http://bdzy.cc/detail/?%s.html" % album_id
    hxs = load_html(album_url)

    # Each <input> value is a bdhd:// link of the form
    # "bdhd://<size>|<md5>|<filename>|..." — one per episode.
    urls = hxs.select("//td[@class='bt']/.//li/input/@value").extract()
    videos = []
    for url in urls:
        m = re.match("bdhd://(.+)", url)
        if not m:
            continue
        words = m.group(1).split("|")
        size = int(words[0])
        # words[1] is an md5 checksum (unused)
        title = words[2].split(".")[0]
        # episode of the album
        video = VideoItemModel({
            "title": title,
            "url": url,  # no separate page url here, so reuse the play url
            "stream": [{
                "url": url,       # playable file url
                "size": size,
                "format": "bdhd"  # stream format (protocol)
            }],
        })
        videos.append(video)

    # Metadata table parsed into key/value pairs, e.g. release date, actors.
    kv = {}
    for s in hxs.select("/html/body/table[2]/tr[1]/td[2]/table/tr"):
        texts = s.select(".//text()").extract()
        if len(texts) >= 2:
            kv[texts[0].strip()] = texts[1].strip()

    description = "\n".join(hxs.select("//div[@class='intro']/.//text()").extract())

    # Poster image may be absent; an explicit emptiness check replaces the
    # original bare except (which silently swallowed every exception).
    image_urls = hxs.select("/html/body/table[2]/tr[1]/td[1]/img/@src").extract()
    image = image_urls[0] if image_urls else None

    # data model to export
    model = VideoSourceModel({
        "source": self.data['source'],              # video source
        "source_id": album_id,                      # id on the source site
        "title": self.data["title"],
        "url": album_url,                           # album page url
        "image": image,                             # poster image url
        "time": self.data.get('time'),              # last update time on source
        "categories": [self.data.get('category')],  # category list
        "channel": self.data.get('category'),       # channel
        "region": self.data.get('region'),          # region
        "videos": videos,                           # episode array
        "pubtime": parse_date(kv.get(u"上映日期:")),  # release date
        "actors": split(kv.get(u"影片演员:")),        # actors
        # completed unless the status field says "ongoing" (u"连载")
        "completed": kv.get(u"影片状态:", "").find(u"连载") == -1,
        "description": description,
    })
    # export the data
    export(model)