def text_infos_resolve(labels, infos, mediaItem):
    """Map one scraped label/values pair onto the matching mediaItem field.

    labels: list of raw label strings (only the first is inspected,
            e.g. u'\u5bfc\u6f14: ' -> director).
    infos: list of value strings belonging to that label.
    mediaItem: dict-like item mutated in place; returns nothing.
    """
    def _digits(values):
        # Concatenate every digit run found in the joined values,
        # e.g. [u'2014-01-02'] -> '20140102'.
        return ''.join(re.findall(r'[\d]+', ''.join(values)))

    try:
        if mediaItem is None:
            return
        if not labels or not infos:
            return
        # Normalize the label: join wrapped lines, drop all spaces.
        label = ''.join(str(labels[0]).splitlines()).replace(' ', '')
        if label.startswith(u'导演'):
            mediaItem['director'] = Util.join_list_safely(infos)
        elif label.startswith(u'主演'):
            mediaItem['actor'] = Util.join_list_safely(infos)
        elif label.startswith(u'类型'):
            mediaItem['type'] = Util.join_list_safely(infos)
        elif label.startswith(u'地区'):
            mediaItem['district'] = Util.join_list_safely(infos)
        elif label.startswith(u'上映'):
            mediaItem['release_date'] = Util.str2date(_digits(infos))
        elif label.startswith(u'片长'):
            mediaItem['duration'] = _digits(infos)
        elif label.startswith(u'人气'):
            mediaItem['score'] = _digits(infos)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def api_media_info(self, mediaItem):
    """Fill mediaItem from the album intro JSON API (self.other_album_api).

    Fetches body.intro.desc from the API response and maps its fields onto
    mediaItem in place. Silent on network/JSON failure (error is logged).
    """
    try:
        api_url = self.other_album_api % (mediaItem['cont_id'], 1)
        result = Util.get_url_content(api_url)
        if not result:
            return
        json_result = json.loads(result)
        desc = json_result['body']['intro']['desc']
        mediaItem['title'] = desc['nameCn']
        if 'directory' in desc:  # the site API spells "director" as "directory"
            director_list = desc['directory'].split(",")
            mediaItem['director'] = Util.join_list_safely(director_list)
        if 'starring' in desc:
            actor_list = desc['starring'].split(",")
            mediaItem['actor'] = Util.join_list_safely(actor_list)
        if 'subCategory' in desc:
            type_list = desc['subCategory'].split(",")
            # BUG FIX: was written back into desc['type'] instead of mediaItem,
            # so the scraped type never reached the item.
            mediaItem['type'] = Util.join_list_safely(type_list)
        if 'area' in desc:
            district_list = desc['area'].split(",")
            mediaItem['district'] = Util.join_list_safely(district_list)
        if 'releaseDate' in desc:
            mediaItem['release_date'] = Util.str2date(
                str(desc['releaseDate']))
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def api_episode_info(self, mvItem=None, playlistId='', cat_id=''):
    """Fill mvItem['media'] from the album JSON API, then attach its videos.

    Original note (translated): the caller should guarantee mvItem and
    playlistId are non-empty, and that mvItem carries mid or sid/untrack_id
    as well as channel_id and site_id.

    Returns [mvItem] when a video list was found and self.check_url accepted
    it; otherwise an empty list.
    """
    items = []
    try:
        mvitem = mvItem
        ep_item = mvitem["media"]
        url = self.album_api % (playlistId, 1)
        logging.log(logging.INFO, 'api_episode_info, info url %s' % url)
        info = self.httpdownload.get_data(url)
        # Response is GBK-encoded; re-encode to UTF-8 before JSON parsing.
        info = info.decode('gbk').encode('utf-8')
        info_json = json.loads(info)
        actor_list = info_json.get("mainActors")
        director_list = info_json.get("directors")
        type_list = info_json.get("categories")
        # Only fill fields the caller has not already provided.
        if "actor" not in ep_item and actor_list:
            ep_item["actor"] = Util.join_list_safely(actor_list)
        if "director" not in ep_item and director_list:
            ep_item["director"] = Util.join_list_safely(director_list)
        if "type" not in ep_item and type_list:
            ep_item["type"] = Util.join_list_safely(type_list)
        if "title" not in ep_item:
            ep_item["title"] = info_json.get("albumName")
        if "district" not in ep_item:
            ep_item["district"] = info_json.get("area")
        if "release_date" not in ep_item and info_json.get("publishYear"):
            ep_item["release_date"] = Util.str2date(
                str(info_json.get("publishYear")))
        if "intro" not in ep_item:
            ep_item["intro"] = info_json.get("albumDesc")
        # Also replace a poster_url that is present but blank/whitespace.
        if "poster_url" not in ep_item or not str.strip(
                str(ep_item["poster_url"])):
            ep_item["poster_url"] = info_json.get("pic240_330")
        if "cont_id" not in ep_item:
            ep_item["cont_id"] = playlistId
        ttvitem = []
        if ep_item['title']:
            mvitem['media'] = ep_item
            ttvitem = self.parse_video_item(cat_id, playlistId)
        if ttvitem:
            mvitem['video'] = ttvitem
            # Fall back to the first episode's url as the media url.
            if "url" not in mvitem["media"]:
                mvitem["media"]["url"] = ttvitem[0]['url']
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def api_media_info(self, mediaVideoItem, vid, prefix_video_url):
    """Fetch media detail JSON for vid and populate mediaVideoItem.

    mediaVideoItem: dict, may already carry a 'media' item to extend.
    vid: site collection/video id, formatted into self.media_info_url.
    prefix_video_url: forwarded to api_video_list for episode url building.
    On any failure the traceback and vid are logged and the item is left
    partially filled (same behavior as before).
    """
    mediaItem = mediaVideoItem[
        'media'] if 'media' in mediaVideoItem else MediaItem()
    try:
        miu = self.media_info_url % vid
        jdata = self.httpdownload.get_data(miu)
        if jdata:  # was "if not jdata: pass else:" — inverted for clarity
            ddata = json.loads(jdata)
            assert int(ddata.get('code', 202)) == 200, "接口获取媒体信息失败"
            detail = ddata.get('data').get('detail')
            assert isinstance(detail, dict)
            mediaItem['cont_id'] = str(detail.get('collectionId'))
            mediaItem['title'] = detail.get('collectionName')
            mediaItem['director'] = Util.join_list_safely(
                detail.get('director').split('/'))
            mediaItem['actor'] = Util.join_list_safely(
                detail.get('player').split('/'))
            mediaItem['release_date'] = Util.str2date(
                detail.get('publishTime'))
            mediaItem['vcount'] = int(detail.get('totalvideocount'))
            # lastseries is free text, e.g. u'第12集'; keep the digit run.
            latest = detail.get('lastseries')
            m = re.compile(r'\D*(\d+)\D*').match(latest)
            if m:
                mediaItem['latest'] = m.group(1)
            if mediaItem['vcount'] == 1:
                mediaItem['latest'] = 1
            mediaItem['paid'] = detail.get('isvip')
            mediaItem['intro'] = detail.get('desc')
            mediaItem['poster_url'] = detail.get('image')
            mediaItem['site_id'] = self.site_id
            # channel_id arrives as a channel *name*; map to numeric id.
            mediaItem['channel_id'] = self.channels_name_id[
                mediaItem['channel_id']]
            mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
            vcount = mediaItem['vcount']
            vcount = int(vcount) if vcount else 1
            video_list = self.api_video_list(vid, vcount, prefix_video_url,
                                             mediaItem['channel_id'])
            if video_list:
                Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['video'] = video_list
                mediaVideoItem['media'] = mediaItem
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.ERROR, vid)
def resolve_media_info(self, xpara, mediaItem, ismovie=False):
    """Populate mediaItem from an XML parameter element (ElementTree node).

    xpara: element with optional children title/catalog/director/act/area/
           year/content/imgurl/mark/vsTitle (+ duration/pay for movies).
    mediaItem: dict-like item mutated in place; returns nothing.
    ismovie: movies additionally carry duration and pay flags.
    """
    try:
        title = xpara.find('title')
        if title is not None:
            mediaItem['title'] = title.text
        tag = xpara.find('catalog')
        if tag is not None and tag.text is not None:
            mediaItem['type'] = Util.join_list_safely(tag.text.split(','))
        director = xpara.find('director')
        if director is not None and director.text is not None:
            mediaItem['director'] = Util.join_list_safely(
                director.text.split(','))
        actor = xpara.find('act')
        if actor is not None and actor.text is not None:
            mediaItem['actor'] = Util.join_list_safely(
                actor.text.split(','))
        district = xpara.find('area')
        if district is not None and district.text is not None:
            mediaItem['district'] = district.text
        release_date = xpara.find('year')
        if release_date is not None and release_date.text is not None:
            # <year>0</year> can occur and yields an empty release_date.
            mediaItem['release_date'] = Util.str2date(release_date.text)
        if ismovie:
            duration = xpara.find('duration')
            if duration is not None and duration.text is not None:
                mediaItem['duration'] = int(float(duration.text))
            paid = xpara.find('pay')
            if paid is not None and paid.text is not None:
                mediaItem['paid'] = int(float(paid.text))
        intro = xpara.find('content')
        if intro is not None and intro.text is not None:
            mediaItem['intro'] = intro.text
        poster_url = xpara.find('imgurl')
        if poster_url is not None and poster_url.text is not None:
            mediaItem['poster_url'] = poster_url.text
        score = xpara.find('mark')
        if score is not None and score.text is not None:
            mediaItem['score'] = float(score.text)
        latest = xpara.find('vsTitle')
        if latest is not None and latest.text is not None:
            # BUG FIX: was r'[\d+]', a character class that also matched a
            # literal '+'; use r'[\d]+' (digit runs) as everywhere else.
            l = re.findall(r'[\d]+', latest.text)
            if l:
                mediaItem['latest'] = ''.join(l)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def text_infos_resolve(labels, infos, mediaItem):
    """Dispatch one scraped label (e.g. u'更新', u'导演') to a mediaItem field.

    labels: raw label strings; only the first is inspected.
    infos: value strings associated with the label.
    mediaItem: dict-like item mutated in place; returns nothing.
    """
    try:
        if mediaItem is None:
            return
        if not labels or not infos:
            return
        # Normalize the label: join wrapped lines, drop all spaces.
        label = ''.join(str(labels[0]).splitlines()).replace(' ', '')
        if label.startswith(u'更新'):
            latest = infos[0]
            latests = latest.split('/')  # format: 共N集/更新至N集
            if len(latests) > 1:
                latest = latests[1]
            else:
                latest = latests[0]
            mediaItem['latest'] = ''.join(re.findall(r'[\d]+', latest))
        elif label.startswith(u'导演'):
            mediaItem['director'] = Util.join_list_safely(infos)
        elif label.startswith((u'作者', u'编剧')):
            mediaItem['writer'] = Util.join_list_safely(infos)
        elif label.startswith((u'主演', u'配音', u'主持')):
            mediaItem['actor'] = Util.join_list_safely(infos)
        elif label.startswith(u'地区'):
            mediaItem['district'] = Util.join_list_safely(infos)
        elif label.startswith(u'类型'):
            mediaItem['type'] = Util.join_list_safely(infos)
        elif label.startswith((u'年份', u'上映')):
            release_dates = re.findall(r'[\d]+', infos[0])
            if release_dates:
                mediaItem['release_date'] = Util.str2date(
                    ''.join(release_dates))
        elif label.startswith(u'片长'):
            durations = re.findall(r'[\d]+', infos[0])
            if durations:
                mediaItem['duration'] = durations[0]
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def text_infos_resolve(label_sels, info_sels, mediaItem, ignore=False):
    """Resolve a label selector / info selector pair into mediaItem fields.

    label_sels/info_sels: scrapy Selector objects for the label node and the
    value nodes. ignore=True suppresses director/writer/actor extraction.
    Mutates mediaItem in place; returns nothing.
    """
    try:
        if mediaItem == None:
            return
        if not label_sels or not info_sels:
            return
        labels = label_sels.xpath('./text()').extract()
        infos = info_sels.xpath('./text()').extract()
        if labels and infos:
            # Normalize the label: join wrapped lines, drop all spaces.
            labels = str(labels[0]).splitlines()
            label = ''.join(labels)
            label = label.replace(' ', '')
            if label.startswith(u'导演') and ignore == False:
                mediaItem['director'] = Util.join_list_safely(infos)
            elif label.startswith(u'编剧') and ignore == False:
                mediaItem['writer'] = Util.join_list_safely(infos)
            elif label.startswith(u'主演') and ignore == False:
                mediaItem['actor'] = Util.join_list_safely(infos)
            elif label.startswith(u'类型'):
                # Type and district share one line; split them by the link
                # target (mtype-N vs country-X list urls).
                type_infos = info_sels.xpath(
                    './@href[re:test(., "/mdb/film/list/mtype-[\d]+.*")]/../text()'
                ).extract()
                district_infos = info_sels.xpath(
                    './@href[re:test(., "/mdb/film/list/country-[\w]+.*")]/../text()'
                ).extract()
                mediaItem['type'] = Util.join_list_safely(type_infos)
                mediaItem['district'] = Util.join_list_safely(
                    district_infos)
            elif label.startswith(u'上映'):
                # Keep only the digits of the release date string.
                info = ''.join(infos)
                release_dates = re.findall(r'[\d]+', info)
                if release_dates:
                    release_date = ''.join(release_dates)
                    release_date = Util.str2date(release_date)
                    mediaItem['release_date'] = release_date
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
def media_info_extract(response, mediaItem):
    """Extract media metadata from an iqiyi page (script blocks + info panel).

    response: scrapy Selector/Response; mediaItem: item to fill (created if
    None). Handles three layouts: inline JS vars, an albumInfo JSON blob,
    and the plain HTML info panel. Mutates mediaItem in place.
    """
    try:
        if mediaItem == None:
            mediaItem = MediaItem()
        # Regular media page: release year stashed in a data attribute.
        release_dates = response.xpath('./@data-qitancomment-tvyear').extract()
        if release_dates:
            release_dates = re.findall(r'[\d]+', release_dates[0])
            if release_dates:
                release_date = ''.join(release_dates)
                release_date = Util.str2date(release_date)
                mediaItem['release_date'] = release_date
        class_names = response.xpath('./@type').extract()
        if class_names and 'text/javascript' == class_names[0]:
            # Video type: 'video' = feature, 'trailer' = clip; skip non-features.
            regex_express = "vType[ ]?:[ ]?[']?(\w+)[']"
            match_result = response.re(regex_express)
            if match_result:
                vType = match_result[0]
                if vType.strip() != 'video':
                    return
            # sourceId is the default content id.
            regex_express = 'sourceId[ ]?:[ ]?["]?(\d+)'
            cont_id = '0'
            regex_express = 'sourceId[ ]?:[ ]?["]?(\d+)'
            match_result = response.re(regex_express)
            if match_result:
                cont_id = match_result[0]
            if cont_id == '0':
                # Fall back to albumId when sourceId is absent/zero.
                regex_express = 'albumId[ ]?:[ ]?["]?(\d+)'
                match_result = response.re(regex_express)
                # NOTE(review): mangled source — the album-id assignment may
                # belong one level up; reconstructed as match-guarded.
                if match_result:
                    cont_id = match_result[0]
                    mediaItem['cont_id'] = '%s|album_id' % (cont_id)
            else:
                mediaItem['cont_id'] = '%s|source_id' % (cont_id)
            regex_express = 'cid[ ]?:[ ]?(\d+)'
            match_result = response.re(regex_express)
            if match_result:
                cid = match_result[0]
                mediaItem['channel_id'] = cid
            regex_express = 'title[ ]?:[ ]?\"(.*)\"'
            match_result = response.re(regex_express)
            if match_result:
                title = match_result[0]
                mediaItem['title'] = title
            # Special series page, e.g.
            # http://www.iqiyi.com/dianshiju/18jbj.html#vfrm=2-4-0-1
            regex_express = 'albumInfo[ ]?=[ ]?(\{.*\})'
            match_result = response.re(regex_express)
            if match_result:
                json_content = match_result[0]
                try:
                    json_data = json.loads(json_content)
                    cont_ids = '0'
                    cont_ids = json_data['sourceId']
                    if cont_ids != '0':
                        cont_ids = '%s|source_id' % (cont_ids)
                        mediaItem['cont_id'] = cont_ids
                    else:
                        cont_ids = json_data['albumId']
                        cont_ids = '%s|album_id' % (cont_ids)
                        mediaItem['cont_id'] = cont_ids
                    districts = json_data['areas']
                    types = json_data['types']
                    directors = json_data['directors']
                    actors = json_data['mainActors']
                    writers = json_data['writer']
                    titles = json_data['tvName']
                    poster_urls = json_data['tvPictureUrl']
                    vcounts = json_data['episodeCounts']
                    latests = json_data['currentMaxEpisode']
                    release_dates = json_data['issueTime']
                    intros = json_data['tvDesc']
                    # areas/types are JSON-encoded dicts; use their values.
                    if districts:
                        districts_json = json.loads(districts)
                        districts = districts_json.values()
                        mediaItem['district'] = Util.join_list_safely(districts)
                    if types:
                        types_json = json.loads(types)
                        types = types_json.values()
                        mediaItem['type'] = Util.join_list_safely(types)
                    mediaItem['director'] = Util.join_list_safely(directors)
                    mediaItem['actor'] = Util.join_list_safely(actors)
                    mediaItem['writer'] = Util.join_list_safely(writers)
                    mediaItem['title'] = titles
                    mediaItem['poster_url'] = poster_urls
                    mediaItem['vcount'] = vcounts
                    mediaItem['latest'] = latests
                    release_dates = str(release_dates)
                    release_date = Util.str2date(release_dates)
                    mediaItem['release_date'] = release_date
                    mediaItem['intro'] = intros
                except Exception, e:
                    logging.log(logging.ERROR, traceback.format_exc())
                    logging.log(logging.INFO, '=================json_content=================')
                    logging.log(logging.INFO, json_content)
        # Regular media page - info panel.
        # (1) http://www.iqiyi.com/a_19rrgjaiqh.html#vfrm=2-4-0-1
        # Episode counting is too messy to handle here (translated note).
        sels = response.xpath('.//div[@class="result_pic pr"]')
        if sels:
            poster_urls = sels.xpath('.//a/img/@src').extract()
            if poster_urls:
                mediaItem['poster_url'] = poster_urls[0]
        sels = response.xpath('.//div[@class="result_detail"]')
        if sels:
            titles = sels.xpath('.//h1[@class="main_title"]//a/text()').extract()
            scores = sels.xpath('.//div[@class="topic_item topic_item-rt"]//span[@class="score_font"]//span/text()').extract()
            scores = ''.join(scores)
            scores = re.findall(r'[\d.]+', scores)
            if titles:
                mediaItem['title'] = titles[0]
            if scores:
                try:
                    mediaItem['score'] = float(scores[0])
                except Exception, e:
                    pass
            # Label/value rows, e.g. 导演: X Y — delegated to the shared
            # label dispatcher.
            msg_sels = sels.xpath('.//div[@class="topic_item clearfix"]')
            for msg_sel in msg_sels:
                msg_more_sels = msg_sel.xpath('./div')
                for sel in msg_more_sels:
                    labels = sel.xpath('.//em/text()').extract()
                    infos = sel.xpath('.//em/a/text()').extract()
                    iqiyi_extract.text_infos_resolve(labels, infos, mediaItem)
            intros = sels.xpath('.//div[@class="topic_item clearfix"]//span[@data-moreorless="moreinfo"]/span/text()').extract()
            if not intros:
                intros = sels.xpath('.//div[@class="topic_item clearfix"]//span[@data-moreorless="lessinfo"]/span/text()').extract()
            if intros:
                mediaItem['intro'] = intros[0]
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode_info(self, response):
    """Parse a baofeng episode page into a MediaVideoItem.

    Returns a list with at most one MediaVideoItem (empty on failure, on a
    trailer page, or when check_url rejects the item).
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        year_list = []
        lyears = []
        title_list = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'主演:').extract()
        type_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'类型:').extract()
        district_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'地区:').extract()
        # The district row's text nodes also carry the year (second node).
        year_info = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/text()'
            % u'地区:').extract()
        year = None
        if len(year_info) >= 2:
            year = self.get_year(year_info[1])
        #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        districts = Util.join_list_safely(district_list)
        #text
        text = response.xpath(
            '//div[@class="juqing briefTab"]/div/text()').extract()
        #score
        score = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[class="score-num"]/strong/text()'
        ).extract()
        play_url = ""
        tplay_url = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
        ).extract()
        if tplay_url:
            play_url = self.url_prefix + tplay_url[0].strip()
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
            # Titles containing u'预:' are trailer pages; skip them.
            if ep_item["title"].find(u'预:') >= 0:
                print "预告片,url", response.request.url
                return items
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = districts
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "baofeng")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        vurl = ""
        videoid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                #if self.check_url(mvitem):
                if res:
                    items.append(mvitem)
                pass
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """Parse a tudou album page plus its playlist JSON into a MediaVideoItem.

    Meta must carry cat_id/poster_url/title/actor (the latter three may be
    empty and are then scraped from the page). Returns a list with at most
    one MediaVideoItem.
    """
    # BUG FIX: items must exist before the try block — an early exception
    # used to make the final `return items` raise NameError (the other
    # parse_episode_info variants already initialize it first).
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        title = response.request.meta['title']
        actor = response.request.meta['actor']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # Fall back to page content for anything the meta did not provide.
        if not poster_url:
            poster_url_list = response.xpath(
                '//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src'
            ).extract()
            if poster_url_list:
                poster_url = poster_url_list[0]
        if not title:
            title_list = response.xpath(
                '//div[@class="cover_info"]/h2/strong/@title').extract()
            if title_list:
                title = title_list[0]
        if not actor:
            actor_list = response.xpath(
                '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
                % u' 主演:').extract()
            if actor_list:
                actor = Util.join_list_safely(actor_list)
        pers = actor
        type_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
            % u'类型:\n').extract()
        district_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
            % u'地区:').extract()
        release_date_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
            % u'年代:').extract()
        types = None
        if type_list:
            types = Util.join_list_safely(type_list)
        # The site labels directors either 编导 or 导演.
        director_list = response.xpath(
            '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
            % u'编导:').extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'
                % u'导演:').extract()
        dirs = Util.join_list_safely(director_list)
        text = response.xpath(
            '//div[@class="cover_info"]/div[@class="desc"]/p/text()'
        ).extract()
        sourceid = self.get_tudou_showid(response.request.url)
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if release_date_list:
            ep_item["release_date"] = Util.str2date(release_date_list[0])
        ep_item["cont_id"] = sourceid
        ep_item["site_id"] = self.site_id
        ep_item["url"] = response.request.url
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        # Episode list comes from a separate JSON endpoint.
        lurl = ("http://www.tudou.com/crp/getAlbumvoInfo.action"
                "?charset=utf-8&areaCode=110000&acode=") + str(sourceid)
        info = self.httpdownload.get_data(lurl)
        jinfo = json.loads(info)
        if "items" in jinfo:
            for sitem in jinfo["items"]:
                vitem = VideoItem()
                vitem["title"] = sitem["itemTitle"]
                vitem["vnum"] = sitem["episode"]
                vitem["os_id"] = self.os_id
                trailer = sitem['trailer']
                if not sitem["itemPlayUrl"]:
                    continue
                if trailer:  # skip trailers
                    continue
                turl = Util.normalize_url(sitem["itemPlayUrl"], "tudou")
                vitem["url"] = turl
                vitem["site_id"] = self.site_id
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_tudou_showid(turl)
                mvitem["video"].append(vitem)
        if len(mvitem["video"]) > 0:
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
# NOTE(review): fragment of a larger itemprop/microdata extractor — its
# enclosing `def` is outside this chunk, so `response`, `mediaItem`,
# `titles`, `release_dates` and `directors` are bound earlier in that
# function; confirm against the full file.
types = response.xpath('./meta[@itemprop="genre"]/@content').extract()
actors = response.xpath('./item[@itemprop="actor"]//meta[@itemprop="name"]/@content').extract()
writers = response.xpath('./meta[@itemprop="author"]/@content').extract()
intros = response.xpath('./meta[@itemprop="description"]/@content').extract()
poster_urls = response.xpath('./item/meta[@itemprop="image"]/@content').extract()
durations = response.xpath('./item/meta[@itemprop="duration"]/@content').extract()
vcounts = response.xpath('./meta[@itemprop="numberOfEpisodes"]/@content').extract()
latests = response.xpath('./meta[@itemprop="newestEpisode"]/@content').extract()
districts = response.xpath('./item/meta[@itemprop="contentLocation"]/@content').extract()
scores = response.xpath('./item/div[@itemprop="aggregateRating"]/meta[@itemprop="ratingValue"]/@content').extract()
if titles:
    mediaItem['title'] = titles[0]
if release_dates:
    # Keep only the digits of the raw date string.
    release_dates = re.findall(r'[\d]+', release_dates[0])
    release_date = ''.join(release_dates)
    release_date = Util.str2date(release_date)
    mediaItem['release_date'] = release_date
if directors:
    mediaItem['director'] = Util.join_list_safely(directors)
if types:
    mediaItem['type'] = Util.join_list_safely(types)
if actors:
    mediaItem['actor'] = Util.join_list_safely(actors)
if writers:
    mediaItem['writer'] = Util.join_list_safely(writers)
if intros:
    mediaItem['intro'] = intros[0]
if poster_urls:
    mediaItem['poster_url'] = poster_urls[0]
if durations:
    # NOTE(review): chunk ends here — the extracted digit list is never
    # written to mediaItem within this view.
    durations = re.findall(r'[\d]+', durations[0])
def parse_episode_info(self, response):
    """Parse a youku album page into a MediaVideoItem with its video list.

    Returns a list with at most one MediaVideoItem; empty when the page id
    cannot be derived or an exception occurs.
    """
    # BUG FIX: items must exist before the try block — an early exception
    # used to make the final `return items` raise NameError.
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        page_id = self.get_youku_pageid(response.request.url)
        if not page_id:
            log.error('miss content id: %s' % response.request.url)
            # FIX: was bare `return` (None); return the list type callers
            # get on every other path.
            return items
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        year_list = []
        title = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'地区:').extract()
        type_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'类型:').extract()
        play_date = self.parse_play_date(response)
        total_num = self.parse_total_num(response)
        year_list = response.xpath(
            '//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()'
        ).extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        text = response.xpath('//div[@class="detail"]/span/text()').extract()
        videoitems = []
        ep_item = MediaItem()
        if title:
            ep_item["title"] = title[0].strip()
        if pers:
            ep_item["actor"] = pers
        if dirs:  # FIX: was `if dirs > 0:` — a str/int comparison
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        if total_num:
            ep_item["vcount"] = total_num
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "youku")
        if text:
            ep_item["intro"] = text[0].strip()
        ep_item["cont_id"] = page_id
        ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        video_list = self.parse_video_item(response, cat_id,
                                           ep_item["title"], page_id)
        mvitem['video'] = video_list
        Util.set_ext_id(mvitem["media"], mvitem["video"])
        items.append(mvitem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def list_json_parse(self, response):
    """Parse one page of the letv list JSON API into follow-up Requests.

    Emits one Request per video (-> self.video_parse) plus one Request for
    the next list page while results keep coming back, up to
    self.max_update_page.
    """
    items = []
    try:
        origin_url = response.request.meta['url']
        request_url = response.request.url
        logging.log(logging.INFO, 'json api url: %s' % request_url)
        page = response.request.meta[
            'page'] if 'page' in response.request.meta else 1
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta[
            'id'] if 'id' in response.request.meta else None
        list_json_postfix_url = response.request.meta[
            'postfix_url'] if 'postfix_url' in response.request.meta else None
        json_datas = json.loads(response.body)
        videos = []
        if json_datas:
            videos = json_datas[
                'data_list'] if 'data_list' in json_datas else []
        if videos:
            # A non-empty page implies there may be a next page (translated).
            video_url = 'http://www.letv.com/ptv/vplay/%s.html'
            for item in videos:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                if 'rating' in item and item['rating']:
                    mediaItem['score'] = item['rating']
                subCategoryName = item['subCategoryName']
                mediaItem['type'] = subCategoryName.replace(',', ';')
                mediaVideoItem['media'] = mediaItem
                # releaseDate is epoch milliseconds; convert to Y-M-D.
                release_date = item['releaseDate']
                if release_date:
                    release_date = float(release_date)
                    if release_date > 0:
                        release_date = release_date / 1000
                        release_date = time.localtime(release_date)
                        release_date = '%s-%s-%s' % (release_date.tm_year,
                                                     release_date.tm_mon,
                                                     release_date.tm_mday)
                        mediaItem['release_date'] = Util.str2date(
                            release_date)
                vid = ''
                if 'vids' in item:
                    vids = item['vids']
                    vids = vids.split(',')
                    vid = vids[0]
                elif 'vid' in item:
                    vid = item['vid']
                if vid:
                    url = video_url % vid
                    items.append(
                        Request(url=url, callback=self.video_parse,
                                meta={'item': mediaVideoItem}))
            # Schedule the next page.
            page = page + 1
            url = self.list_json_prefix_url + list_json_postfix_url + 'p=%s' % page
            items.append(
                Request(url=url, callback=self.list_json_parse,
                        meta={
                            'page': page,
                            'id': channel_id,
                            'postfix_url': list_json_postfix_url,
                            'url': url
                        }))
    except Exception, e:
        # NOTE(review): request_url/origin_url are assigned first inside the
        # try; if meta['url'] itself is missing this logging raises NameError.
        # Also no final `return items` is visible in this chunk — confirm
        # against the full file.
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json api url: %s' % request_url)
        logging.log(logging.INFO, 'origin url: %s' % origin_url)
def parse_episode_info(self, response):
    """Parse a qq (v.qq.com) media page into a MediaVideoItem.

    Tries several page layouts for each field (rich intro page, plain info
    page, banner page). Returns a list with at most one MediaVideoItem.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # Title: try four layouts in order until one yields a value.
        title = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
        ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
            ).extract()
        # Performers: rich layout first, then labelled info-line layout.
        performer_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
        ).extract()
        if not performer_list:
            performer_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'主演:').extract()
        # Directors: same two layouts.
        director_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
        ).extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'导演:').extract()
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        if not text:
            # NOTE(review): result discarded — looks like a missing
            # `text =` on the fallback; confirm intent before changing.
            response.xpath(
                '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
            ).extract()
        type_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
        ).extract()
        if not type_list:
            type_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'类型:').extract()
        year_info = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
        ).extract()
        if not year_info:
            year_info = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'年份:').extract()
        play_date = None
        if year_info:
            play_date = self.get_year(year_info[0])
        #
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        pers = Util.join_list_safely(performer_list)
        #sourceid
        sourceid = ""
        sourceid_list = response.xpath(
            '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
        if sourceid_list:
            sourceid = sourceid_list[0]
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "qq")
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        vurl = ""
        # Playlist JSON(P) endpoint keyed by sourceid.
        url_pre = "http://s.video.qq.com/loadplaylist?vkey="
        url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"
        videoid = self.get_qq_showid(response.request.url)
        #videoid = self.get_vid(response.body,response.request.url)
        mvitem["media"]["cont_id"] = videoid
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        vurl = url_pre + str(sourceid) + url_tail
        tflag = "jQuery191048201349820010364_1425370006500"
        tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
        # No sourceid (e.g. topic pages): fall back to the topic playlist.
        if not tpitem:
            tpitem = self.parse_topic_play_list(response)
            videoids = response.xpath(
                '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
            ).extract()
            if videoids:
                mvitem["media"]["cont_id"] = videoids[0]
        if tpitem:
            mvitem["video"] = tpitem
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_info_extract(response, mediaItem):
    """Populate *mediaItem* with metadata scraped from a selector.

    response: a selector positioned either on a <script> node (play page)
    or on a media-page fragment.  mediaItem: MediaItem to fill (a fresh
    one is created when None).  Mutates mediaItem in place; returns None.
    Any scraping error is logged and swallowed (best-effort extraction).
    """
    try:
        if mediaItem is None:  # fixed: was `mediaItem == None`
            mediaItem = MediaItem()
        # 播放页 (play page): metadata embedded in <script type="text/javascript">
        class_names = response.xpath('./@type').extract()
        if class_names and 'text/javascript' == class_names[0]:
            # 媒体页 (media page) title
            titles = response.re('title[ ]?:[ ]?\"(.*)\"')
            if titles:
                mediaItem['title'] = titles[0]
            # 播放页 numeric fields
            pids = response.re('pid[ ]?:[ ]?(\d+)')
            totals = response.re('totalcount[ ]?:[ ]?(\d+)')
            trylooks = response.re('trylook[ ]?:[ ]?(\d*)')
            pPics = response.re('pPic[ ]?:[ ]?\"(.*)\"')
            if pids:
                mediaItem['cont_id'] = pids[0]
            if totals:
                mediaItem['vcount'] = totals[0]
            # NOTE(review): paid-flag extraction deliberately disabled below.
            """
            if trylooks:
                trylook = str(trylooks[0])
                if trylook == '0':
                    mediaItem['paid'] = 0
                else:
                    mediaItem['paid'] = 1
            """
            if pPics:
                poster_url = pPics[0]
                mediaItem['poster_url'] = poster_url
            if u'电影' == mediaItem['channel_id']:
                # 电影,获取时长: duration arrives as "H:M:S" or "M:S";
                # 3 parts -> minutes = H*60 + M, 2 parts -> keep first part
                durations = response.re('duration[ ]?:[ ]?\"(.*)\"')
                if durations:
                    duration = durations[0]
                    durations = duration.split(':')
                    length = len(durations)
                    if length == 3:
                        duration = int(durations[0]) * 60 + int(durations[1])
                        duration = str(duration)
                    elif length == 2:
                        duration = durations[0]
                    mediaItem['duration'] = duration
        # 媒体页: latest episode number (digits only) from the "s-t" span
        results = response.xpath('.//span[@class="s-t"]/text()').extract()
        if results:
            latests = re.findall(r'[\d]+', results[0])
            if latests:
                latest = ''.join(latests)
                mediaItem['latest'] = latest
        # 媒体页-综艺、动漫、电视剧 (variety/anime/TV info block)
        sels = response.xpath('.//dd[@data-statectn="n_textInfo"]')
        if sels:
            results = sels.xpath('.//p[@class="p1"]//a/text()').extract()
            if results:
                # variety (综艺) pages list hosts in p1 -> treated as actors;
                # other channels list directors there
                if u'综艺' == mediaItem['channel_id']:
                    mediaItem['actor'] = Util.join_list_safely(results)
                else:
                    mediaItem['director'] = Util.join_list_safely(results)
            results = sels.xpath('.//p[@class="p2"]//a/text()').extract()
            if results:
                if u'综艺' != mediaItem['channel_id']:
                    mediaItem['actor'] = Util.join_list_safely(results)
            results = sels.xpath('.//p[@class="p3"]//a/text()').extract()
            if results:
                mediaItem['district'] = Util.join_list_safely(results)
            if 'release_date' not in mediaItem:
                results = sels.xpath('.//p[@class="p4"]//a/text()').extract()
                if results:
                    release_date = results[0]
                    release_date = Util.str2date(release_date)
                    mediaItem['release_date'] = release_date
            results = sels.xpath('.//p[@class="p5"]//a/text()').extract()
            if results:
                mediaItem['type'] = Util.join_list_safely(results)
            results = sels.xpath('.//p[@class="p7"]/text()').extract()
            if results:
                intro = results[0].strip()
                mediaItem['intro'] = intro
        else:
            # 媒体页-电影 (movie info block)
            sels = response.xpath('.//dd[@data-statectn="n_w150_dd"]')
            if sels:
                results = sels.xpath('.//p[@class="p2"]//a/text()').extract()
                if results:
                    mediaItem['director'] = Util.join_list_safely(results)
                results = sels.xpath('.//p[@class="p3"]//a/text()').extract()
                if results:
                    mediaItem['actor'] = Util.join_list_safely(results)
                results = sels.xpath('.//span[@class="s4"]//a/text()').extract()
                if results:
                    mediaItem['district'] = Util.join_list_safely(results)
                results = sels.xpath('.//span[@class="s5"]//a/text()').extract()
                if results:
                    # the s5 span mixes a date-like item (digits/dashes) with
                    # genre tags: pull out the first date, the rest is 'type'
                    for item in results:
                        if re.match(r"^[\d-]+$", item):
                            if 'release_date' not in mediaItem:
                                release_date = Util.str2date(item)
                                mediaItem['release_date'] = release_date
                            results.remove(item)
                            break
                    mediaItem['type'] = Util.join_list_safely(results)
                results = sels.xpath('.//p[@class="p6"]/text()').extract()
                if results:
                    intro = results[0].strip()
                    mediaItem['intro'] = intro
    except Exception as e:  # fixed: was Py2-only `except Exception, e:`
        logging.log(logging.ERROR, traceback.format_exc())
def parse_media_info(self, response):
    """Scrape a "v-details" media page into a plain dict.

    Extracts title, poster, actor/director/district/type, release year,
    intro, episode counters and score.  Returns the dict (or None if an
    unexpected error occurred — the error is logged, not raised).
    """
    try:
        result = {}
        # details: the page-level container everything else hangs off
        details = response.xpath('//div[@class="v-details"]')
        title = details.xpath(
            './div[@class="v-title clearfix"]/span[@id="film_name"]/text()'
        ).extract()
        poster = details.xpath(
            './div[@class="v-poster"]/descendant-or-self::*/img/@src'
        ).extract()
        #movieid = details.xpath('./div[@class="v-poster"]/a/@movieid').extract()
        # base info: credits are pulled with regexes over the raw HTML
        # because the label (<i>导演:</i> etc.) and value share one <p>
        base_info = details.xpath('./div[@class="v-main-info clearfix"]')
        actor = base_info.xpath(
            './div[1]/p[@id="actors"]/a/text()').extract()
        director = base_info.re(
            re.compile(r'<i>%s</i>(.*?)</p>' % u'导演:', re.S))
        area = base_info.re(
            re.compile(r'<i>%s</i>(.*?)</p>' % u'地区:', re.S))
        category = base_info.re(
            re.compile(r'<i>%s</i>(.*?)</p>' % u'类型:', re.S))
        year = base_info.re(
            re.compile(r'<i>%s</i>(.*?)</p>' % u'年代:', re.S))
        # intro: prefer the full text, fall back to partial, then plain text
        intro = base_info.xpath(
            './p[@class="intro"]/span[@id="full-intro"]/text()').extract()
        if not intro:
            intro = base_info.xpath(
                './p[@class="intro"]/span[@id="part-intro"]/text()'
            ).extract()
        if not intro:
            intro = base_info.xpath(
                './p[@class="intro"]/span[@class="text"]/text()').extract(
                )
        episode_num = base_info.xpath(
            './p[@class="episode clearfix"]/text()').extract()
        # score is split on the page: integer part in <em>, decimals outside
        score_int = base_info.xpath(
            './div[@class="aggregate-rating"]/div[1]/p/span/em/text()'
        ).extract()
        score_dec = base_info.xpath(
            './div[@class="aggregate-rating"]/div[1]/p/span/text()'
        ).extract()
        # side info (currently unused beyond selection)
        side_info = details.xpath('./div[@id="left_info"]')
        if title:
            result['title'] = title[0]
        if poster:
            result['poster_url'] = poster[0]
        #if movieid:
        #    result['cont_id'] = movieid[0]
        # content id is derived from the URL, not the page markup
        result['cont_id'] = get_cluster_id(response.request.url)
        if actor:
            result['actor'] = V360Formatter.join(actor)
        if director:
            result['director'] = V360Formatter.rejoin(director[0])
        if area:
            result['district'] = V360Formatter.rejoin(area[0])
        if category:
            result['type'] = V360Formatter.rejoin(category[0])
        if year:
            result['release_date'] = Util.str2date(year[0])
        if intro:
            result['intro'] = ''.join(intro)
        if episode_num:
            # episode text like "更新至N集" -> vcount / latest fields
            vc = V360Formatter.episode_num(episode_num[0].strip())
            if 'vcount' in vc:
                result['vcount'] = vc['vcount']
            if 'latest' in vc:
                # keep digits only
                result['latest'] = re.sub(r"[^\d]", "", vc['latest'])
        if score_int and score_dec:
            result['score'] = V360Formatter.score(score_int[0],
                                                  score_dec[0])
        elif score_int:
            result['score'] = V360Formatter.score(score_int[0], None)
        return result
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_media_info(self, response):
    """Scrape a 360kan media page (div#main layout) into a plain dict.

    Supplements the HTML scrape with a coverpage API call to obtain the
    media type.  Returns the dict, or None on unexpected error (logged).
    """
    try:
        result = {}
        #main
        main = response.xpath('//div[@id="main"]')
        #base info
        base_info = main.xpath('./div[1]/div[2]')
        title = base_info.xpath(
            './div[1]/h1[@id="film_name"]/text()').extract()
        director = base_info.xpath(
            './div[1]/dl[@id="director"]/dd/a[not(@id)]/text()').extract()
        actor = base_info.xpath(
            './div[1]/dl[@id="actor"]/dd/a/text()').extract()
        #tag = base_info.xpath('./div[1]/dl[@id="genre"]/dd/a/text()').extract()
        # type comes from the coverpage API below, not from the page
        type = None
        area = base_info.xpath(
            './div[1]/div[@class="text"]/dl[@class="area"]/dd/text()'
        ).extract()
        year = base_info.xpath(
            './div[1]/div[@class="text"]/dl[@class="year"]/dd/text()'
        ).extract()
        duration = base_info.xpath(
            './div[1]/div[@class="text"]/dl[@class="duration"]/dd/text()'
        ).extract()
        # score split across <em> (integer) and surrounding span (decimal)
        score_int = base_info.xpath(
            './div[1]/div[@class="aggregate-rating"]/div[1]/p/span/em/text()'
        ).extract()
        score_dec = base_info.xpath(
            './div[1]/div[@class="aggregate-rating"]/div[1]/p/span/text()'
        ).extract()
        #side info
        side = main.xpath('./div[1]/div[1]')
        poster = side.xpath(
            './div/descendant-or-self::*/img/@src').extract()
        #movieid = side.xpath('./div/a/@movieid').extract()
        # presence of an <em> badge marks paid content — see `pay` use below
        pay = side.xpath('./div/a/em').extract()
        #intro
        desc = main.xpath('./div[2]/div[2]')
        intro = desc.xpath('//p[@class="more"]/text()').extract()
        if not intro:
            intro = desc.xpath('//p[@class="less"]/text()').extract()
        # best-effort type lookup via the Android coverpage API;
        # any failure here is ignored and `type` stays None
        try:
            contid = get_cluster_id(response.request.url)
            tag_url = "http://android.api.360kan.com/coverpage/?id=" + contid + "&cat=1&method=coverpage.data&refm=selffull&ss=4&token=2bf65a903d03167e48d38694f8aa4f1a&ver=71&ch=360sjzs"
            taginfo = self.httpdownload.get_data(tag_url)
            if taginfo and len(taginfo) > 32:
                # NOTE(review): the first 32 bytes are skipped before JSON
                # parsing — presumably a fixed header/checksum; confirm
                # against the API response format
                subtaginfo = taginfo[32:]
                jtaginfo = json.loads(subtaginfo)
                if "data" in jtaginfo and "data" in jtaginfo[
                        "data"] and "type" in jtaginfo["data"]["data"]:
                    type = jtaginfo["data"]["data"]["type"]
        except Exception as e:
            pass
        if title:
            result['title'] = title[0]
        if director:
            result['director'] = V360Formatter.join(director)
        if actor:
            result['actor'] = V360Formatter.join(actor)
        if type:
            result['type'] = V360Formatter.join(type)
        if area:
            result['district'] = V360Formatter.rejoin(area[0])
        if year:
            result['release_date'] = Util.str2date(year[0])
        if duration:
            result['duration'] = V360Formatter.duration(duration[0])
        if score_int and score_dec:
            result['score'] = V360Formatter.score(score_int[0],
                                                  score_dec[0])
        elif score_int:
            result['score'] = V360Formatter.score(score_int[0], None)
        if poster:
            result['poster_url'] = poster[0]
        #if movieid:
        #    result['cont_id'] = movieid[0]
        # content id derived from the URL, not the markup
        result['cont_id'] = get_cluster_id(response.request.url)
        if pay:
            result['paid'] = 1
        if intro:
            result['intro'] = ''.join(intro)
        return result
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode_info(self, response):
    """Parse a wasu episode/detail page into MediaVideoItem list.

    Reads routing data (cat_id, poster_url, optional untrack_id/sid/mid)
    from request.meta, scrapes credits from either the "right_fl" or the
    "tele_txts" page layout, and delegates item assembly to
    self.compose_mvitem.  Returns a list with at most one item.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'parse_episode_info: %s' % request_url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        #此处因考虑不想过多改变原来的程序结构,其实这些属性可以通过接口获得
        # (kept HTML scraping to avoid restructuring; an API also exists:)
        #http://clientapi.wasu.cn/Phone/vodinfo/id/6786984
        # Each field is tried in the "cloudotm1"/"right_fl" layout first,
        # then falls back to the "tele_txts" layout.
        title_list = response.xpath(
            '//div[@class="cloudotm1"]/p[1]/a/text()').extract()
        if not title_list:
            title_list = response.xpath(
                '//div[@class="tele_txts"]/h4[1]/a/text()').extract()
        director_list = response.xpath(
            '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
            % u'导演').extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                % u'导演').extract()
        performer_list = response.xpath(
            '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
            % u'演员').extract()
        if not performer_list:
            performer_list = response.xpath(
                '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                % u'演员').extract()
        area_list = response.xpath(
            '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
            % u'地区').extract()
        if not area_list:
            area_list = response.xpath(
                '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                % u'地区').extract()
        # tags: label may be 标签 or 类型, in either layout
        tag_list = response.xpath(
            '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
            % u'标签').extract()
        if not tag_list:
            tag_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
                % u'类型').extract()
        if not tag_list:
            tag_list = response.xpath(
                '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                % u'标签').extract()
        if not tag_list:
            tag_list = response.xpath(
                '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                % u'类型').extract()
        year_list = response.xpath(
            '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
            % u'年份').extract()
        if not year_list:
            year_list = response.xpath(
                '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                % u'年份').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        areas = Util.join_list_safely(area_list)
        tags = Util.join_list_safely(tag_list)
        #text
        text = response.xpath(
            '//div[@class="right_fl"]/p/span[@id="infoS"]/text()').extract(
            )
        # NOTE(review): this fallback fires when `text` IS found, unlike
        # every other fallback above which uses `if not ...` — looks like
        # it may have been intended as `if not text:`; confirm against
        # live pages before changing.
        if text:
            text = response.xpath(
                '//div[@class="tele_b_otm"]/p/span[@id="infoS"]/text()'
            ).extract()
        play_url = ""
        mvitem = self.compose_mvitem(response, title_list, pers, dirs,
                                     response.request.url, cat_id,
                                     poster_url, text)
        if mid:
            mvitem['mid'] = mid
        # only emit the item when at least one playable video URL exists
        if mvitem and 'video' in mvitem and 'url' in mvitem['video'][
                0] and mvitem['video'][0]['url']:
            mvitem['media']['type'] = tags
            mvitem['media']['district'] = areas
            if year_list:
                mvitem['media']['release_date'] = Util.str2date(
                    year_list[0])
            tlen = len(mvitem['video'])
            logging.log(
                logging.INFO, "++++url: %s video len: %d " %
                (response.request.url, tlen))
            items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_info_extract(response, mediaItem):
    """Populate *mediaItem* from a kankan list / play / media page selector.

    response: a selector positioned on the relevant page fragment.
    mediaItem: MediaItem to fill (a fresh one is created when None).
    Label/value pairs are delegated to kankan_extract.text_infos_resolve.
    Mutates mediaItem in place; returns None.  Errors are logged only.
    """
    try:
        if mediaItem is None:  # fixed: was `mediaItem == None`
            mediaItem = MediaItem()
        # list列表页 (listing page): score / latest-episode / poster
        sels = response.xpath('.//p[@class="movielist_tt"]')
        if sels:
            scores = sels.xpath('.//em[@class="score"]/text()').extract()
            latests = sels.xpath('.//em[@class="update"]/text()').extract()
            if scores:
                mediaItem['score'] = scores[0]
            if latests:
                latests = re.findall(r'[\d]+', latests[0])
                if latests:
                    mediaItem['latest'] = latests[0]
        sels = response.xpath('.//a[@class="pic"]')
        if sels:
            poster_urls = sels.xpath('./img/@_src').extract()
            if poster_urls:
                mediaItem['poster_url'] = poster_urls[0]
        # 播放页 - 普通版面 (play page, standard layout)
        sels = response.xpath('..//ul[@class="movieinfo"]/li')
        for sel in sels:
            labels = sel.xpath('./text()').extract()
            infos = sel.xpath('./*/text()').extract()
            kankan_extract.text_infos_resolve(labels, infos, mediaItem)
        intros = response.xpath(
            '..//p[@id="movie_info_intro_l"]/text()').extract()
        if intros:
            mediaItem['intro'] = intros[0]
        # 播放页 - vip
        # (1) http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01#7927921
        sels = response.xpath('..//div[@class="intro"]')
        if sels:
            # canonical movie URL hidden in the <dt> link
            url_sels = sels.xpath('.//dt/a/@href')
            if url_sels:
                regex_express = '(http://movie\.kankan\.com/movie/[\d]+).*'
                match_result = url_sels.re(regex_express)
                if match_result:
                    mediaItem['url'] = match_result[0]
            sels = sels.xpath('.//dd')
            for sel in sels:
                labels = sel.xpath('./text()').extract()
                infos = sel.xpath('./a/text()').extract()
                kankan_extract.text_infos_resolve(labels, infos, mediaItem)
            intros = sels.xpath('./dd[@class="intro_p"]/p').extract()
            mediaItem['intro'] = ''.join(intros)
        # (2) http://vip.kankan.com/vod/88365.html#7306075
        sels = response.xpath('..//div[@class="movie_info"]')
        if sels:
            url_sels = sels.xpath('.//dd')
            for sel in url_sels:
                # labels may live in a <span> or directly in the <dd>
                labels = sel.xpath('./span/text()').extract()
                infos = sel.xpath('./span/a/text()').extract()
                if not labels:
                    labels = sel.xpath('./text()').extract()
                    infos = sel.xpath('./a/text()').extract()
                kankan_extract.text_infos_resolve(labels, infos, mediaItem)
            intros = sels.xpath('./dd/p[@class="intro_p"]').extract()
            mediaItem['intro'] = ''.join(intros)
        # 媒体页 (media page): fields embedded in <head> scripts
        sels = response.xpath('//head//script')
        if sels:
            regex_express = 'movieInfo\.movieid[ ]?=[ ]?(\d+)'
            match_result = sels.re(regex_express)
            if match_result:
                mediaItem['cont_id'] = match_result[0]
            regex_express = 'movieInfo\.movie_title[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                mediaItem['title'] = match_result[0]
            regex_express = 'movieInfo\.poster[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                mediaItem['poster_url'] = match_result[0]
            # movie_classify is a JSON object; its 'year' gives release_date
            regex_express = 'movieInfo\.movie_classify[ ]?=[ ]?(\{.*\})'
            match_result = sels.re(regex_express)
            if match_result:
                content = match_result[0]
                json_data = json.loads(content)
                release_date = json_data[
                    'year'] if 'year' in json_data else ''
                release_dates = re.findall(r'[\d]+', str(release_date))
                release_date = ''.join(release_dates)
                if release_date:
                    release_date = Util.str2date(release_date)
                    mediaItem['release_date'] = release_date
            regex_express = 'movieInfo\.episode[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                latests = match_result[0]
                latests = latests.split('/')
                #共N集/更新至N集 — take "updated to N" when both are present
                if len(latests) > 1:
                    latests = latests[1]
                else:
                    latests = latests[0]
                latests = re.findall(r'[\d]+', latests)
                mediaItem['latest'] = ''.join(latests)
            regex_express = 'movieInfo\.total_number[ ]?=[ ]?(\d+)'
            match_result = sels.re(regex_express)
            if match_result:
                if int(match_result[0]) > 0:
                    mediaItem['vcount'] = match_result[0]
        # info_list layout: label in text or <em>, value in <a> or <span>
        sels = response.xpath('..//div[@class="info_list"]//li')
        for sel in sels:
            labels = sel.xpath('./text()').extract()
            if not labels:
                labels = sel.xpath('./em/text()').extract()
            infos = sel.xpath('./a/text()').extract()
            if not infos:
                infos = sel.xpath('./span/text()').extract()
            kankan_extract.text_infos_resolve(labels, infos, mediaItem)
        # detail_ul layout
        sels = response.xpath('..//ul[@class="detail_ul"]//li')
        for sel in sels:
            labels = sel.xpath('./text()').extract()
            infos = sel.xpath('./*/text()').extract()
            kankan_extract.text_infos_resolve(labels, infos, mediaItem)
    except Exception as e:  # fixed: was Py2-only `except Exception, e:`
        logging.log(logging.ERROR, traceback.format_exc())
def parse_media_info(self, response):
    """Scrape a div#info media page into a plain dict.

    Credits are extracted with regexes over the raw HTML because label
    (<em>主角:</em> etc.) and value share markup.  Returns the dict, or
    None on unexpected error (logged).
    """
    try:
        result = {}
        #base info
        base_info = response.xpath('//div[@id="info"]')
        title = base_info.xpath(
            './div[1]/h1[@id="film_name"]/text()').extract()
        actor = base_info.re(
            re.compile(r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % u'主角:',
                       re.S))
        director = base_info.re(
            re.compile(r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % u'导演:',
                       re.S))
        area = base_info.re(
            re.compile(r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % u'地区:',
                       re.S))
        category = base_info.re(
            re.compile(r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % u'类型:',
                       re.S))
        year = base_info.re(
            re.compile(r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % u'年代:',
                       re.S))
        # intro: full text preferred, then partial, then plain span
        intro = base_info.xpath(
            './div[2]/p/em[text()="%s"]/../span[@id="full-intro"]/span/text()'
            % u'简介:').extract()
        if not intro:
            intro = base_info.xpath(
                './div[2]/p/em[text()="%s"]/../span[@id="part-intro"]/span/text()'
                % u'简介:').extract()
        if not intro:
            intro = base_info.xpath(
                './div[2]/p/em[text()="%s"]/../span[@class="text"]/text()'
                % u'简介:').extract()
        episode_num = base_info.re(
            re.compile(r'<em.*?>%s</em>.*?<span.*?>(.*?)</span>' % u'剧集:',
                       re.S))
        #poster
        poster = response.xpath(
            '//div[@id="left_info"]/div[@id="poster"]/descendant-or-self::*/img/@src'
        ).extract()
        #movieid = response.xpath('//div[@id="left_info"]/div[@id="poster"]/a/@movieid').extract()
        if title:
            result['title'] = title[0]
        if actor:
            result['actor'] = V360Formatter.rejoin(actor[0])
        if director:
            result['director'] = V360Formatter.rejoin(director[0])
        if area:
            result['district'] = V360Formatter.rejoin(area[0])
        if category:
            result['type'] = V360Formatter.rejoin(category[0])
        if year:
            result['release_date'] = Util.str2date(year[0])
        if intro:
            result['intro'] = ''.join(intro)
        if episode_num:
            # episode text -> vcount / latest (digits only)
            vc = V360Formatter.episode_num(episode_num[0].strip())
            if 'vcount' in vc:
                result['vcount'] = vc['vcount']
            if 'latest' in vc:
                result['latest'] = re.sub(r"[^\d]", "", vc['latest'])
        if poster:
            result['poster_url'] = poster[0]
        #if movieid:
        #    result['cont_id'] = movieid[0]
        # content id derived from the URL, not the markup
        result['cont_id'] = get_cluster_id(response.request.url)
        return result
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode_info(self, response):
    """Parse a sohu episode/detail page into MediaVideoItem list.

    Locates the playlist id embedded in page scripts (several variable
    spellings are tried), scrapes credits via self.parse_* helpers,
    assembles a MediaVideoItem, and falls back to the API path
    (self.api_episode_info) when HTML parsing yields no items.
    Returns a list of items (possibly empty).
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        year_list = []
        lyears = []
        # playlist id may appear as `var playlistId`, `var PLAYLIST_ID`
        # or `= playlistId` depending on page template
        playlistId = ""
        playlistId_list = response.selector.re(
            re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(
                re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        if not playlistId:
            # without a playlist id nothing else can be resolved
            logging.log(
                logging.INFO,
                "parse_episode_info error,not find playlistid,url:%s " %
                response.request.url)
            return items
        title_list = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = self.parse_district(response)
        type_list = self.parse_type_list(response)
        #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
        year_list = self.parse_year(response)
        year = None
        if year_list:
            year = year_list[0]
        #pers = "|".join([t.strip() for t in performer_list])
        #dirs = "|".join([t.strip() for t in director_list])
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        district = Util.join_list_safely(district_list)
        #text
        text = response.xpath(
            '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()'
        ).extract()
        play_url = ""
        play_url = response.xpath(
            '//div[@class="cfix movie-info"]/div[2]/div[@class="cfix bot"]/a[@class="btn-playFea"]/@href'
        ).extract()
        videoitems = []
        # assemble the media item
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district:
            ep_item["district"] = district
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        playlistId = str(playlistId)
        ep_item["cont_id"] = playlistId
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        # tracking ids are only attached when both are present
        if untrack_id and sid:
            mvitem["untrack_id"] = untrack_id
            mvitem["sid"] = sid
        mvitem["media"] = ep_item
        vurl = ""
        ttvitem = []
        if title_list:
            ttvitem = self.parse_video_item(cat_id, playlistId)
        if ttvitem:
            mvitem['video'] = ttvitem
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if self.check_url(mvitem):
                items.append(mvitem)
        # fall back to the API when HTML parsing produced nothing
        if not items and playlistId:
            items += self.api_episode_info(mvitem, playlistId,
                                           cat_id=cat_id)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_info_extract(response, mediaItem):
    """Populate *mediaItem* from an album-info media / play / list selector.

    response: a selector positioned on the relevant page fragment.
    mediaItem: MediaItem to fill (a fresh one is created when None).
    Label text is matched against Chinese field names (导演/主演/类型/...)
    via loose regexes.  Mutates mediaItem in place; returns None.
    Errors are logged and swallowed (best-effort extraction).
    """
    try:
        if mediaItem is None:  # fixed: was `mediaItem == None`
            mediaItem = MediaItem()
        # 媒体页 (media page): <em class="label">名称</em> + value nodes
        sels = response.xpath('.//dt[@class="album-info"]//p')
        for sel in sels:
            labels = sel.xpath('./em[@class="label"]/text()').extract()
            if not labels:
                continue
            label = labels[0]
            # 简介 (intro): value is plain span text
            text_regex = re.compile('(%s.*%s.*)' % (u'简', u'介'))
            match_results = text_regex.search(label)
            if match_results:
                infos = sel.xpath('./span/text()').extract()
                if not infos:
                    continue
                mediaItem['intro'] = infos[0]
            else:
                # credit fields: values are anchor texts
                infos = sel.xpath('./a/text()').extract()
                if not infos:
                    continue
                text_regex = re.compile('(%s.*%s.*)' % (u'导', u'演'))
                match_results = text_regex.search(label)
                if match_results:
                    mediaItem['director'] = Util.join_list_safely(infos)
                else:
                    text_regex = re.compile('(%s.*%s.*)' % (u'主', u'演'))
                    match_results = text_regex.search(label)
                    if match_results:
                        mediaItem['actor'] = Util.join_list_safely(infos)
                    else:
                        # 主持人 (host) also maps onto 'actor'
                        text_regex = re.compile(
                            '(%s.*%s.*%s.*)' % (u'主', u'持', u'人'))
                        match_results = text_regex.search(label)
                        if match_results:
                            mediaItem['actor'] = Util.join_list_safely(infos)
                        else:
                            text_regex = re.compile(
                                '(%s.*%s.*)' % (u'类', u'型'))
                            match_results = text_regex.search(label)
                            if match_results:
                                mediaItem['type'] = Util.join_list_safely(
                                    infos)
                            else:
                                text_regex = re.compile(
                                    '(%s.*%s.*)' % (u'地', u'区'))
                                match_results = text_regex.search(label)
                                if match_results:
                                    mediaItem[
                                        'district'] = Util.join_list_safely(
                                            infos)
        sels = response.xpath('.//div[@class="mod-album-1-intro-til"]')
        if sels:
            titles = sels.xpath(
                './/span[@class="dd-pic-til"]/text()').extract()
            vcounts = sels.xpath(
                './/span[@class="update-info-series"]//em/text()').extract()
            if titles:
                mediaItem['title'] = titles[0]
            if vcounts:
                vcount = vcounts[0]
                mediaItem['vcount'] = int(vcount)
        # 媒体页: metadata embedded in <script language="javascript">
        class_names = response.xpath('./@language').extract()
        if class_names and 'javascript' == class_names[0]:
            cids = response.re('cid[ ]?:[ ]?(\d+)')
            is_fulls = response.re('\"isfull\"[ ]?:[ ]?(\d+)')
            latest = mediaItem['latest'] if 'latest' in mediaItem else None
            if cids:
                cid = cids[0]
                mediaItem['cont_id'] = cid
            if is_fulls:
                is_full = is_fulls[0]
                if is_full == '0':
                    latest = '0'
                else:
                    latests = response.re(
                        '\"lastseries\"[ ]?:[ ]?\"([\d-]+)\"')
                    if latests:
                        latest = latests[0]
                # keep only alphanumeric characters
                # NOTE(review): Py2 `filter` on a str returns a str; under
                # Py3 this would yield an iterator — revisit on migration
                latest = filter(str.isalnum, str(latest))
                mediaItem['latest'] = latest
        # 播放页(电影) (movie play page): px-l label / px-r value pairs
        sels = response.xpath('..//div[@class="play-xxmes clearfix"]//p')
        for sel in sels:
            labels = sel.xpath('./span[@class="px-l"]/text()').extract()
            if not labels:
                continue
            label = labels[0]
            text_regex = re.compile('(%s.*%s.*)' % (u'简', u'介'))
            match_results = text_regex.search(label)
            if match_results:
                infos = sel.xpath('./span[@class="px-r"]/text()').extract()
                if not infos:
                    continue
                mediaItem['intro'] = infos[0]
            else:
                infos = sel.xpath(
                    './span[@class="px-r"]/a/text()').extract()
                if not infos:
                    continue
                text_regex = re.compile('(%s.*%s.*)' % (u'导', u'演'))
                match_results = text_regex.search(label)
                if match_results:
                    mediaItem['director'] = Util.join_list_safely(infos)
                else:
                    text_regex = re.compile('(%s.*%s.*)' % (u'主', u'演'))
                    match_results = text_regex.search(label)
                    if match_results:
                        mediaItem['actor'] = Util.join_list_safely(infos)
                    else:
                        text_regex = re.compile('(%s.*%s.*)' % (u'类', u'型'))
                        match_results = text_regex.search(label)
                        if match_results:
                            mediaItem['type'] = Util.join_list_safely(infos)
                        else:
                            text_regex = re.compile(
                                '(%s.*%s.*)' % (u'地', u'区'))
                            match_results = text_regex.search(label)
                            if match_results:
                                mediaItem[
                                    'district'] = Util.join_list_safely(
                                        infos)
        # 播放页 (play page): <script type="text/javascript"> fields
        class_names = response.xpath('./@type').extract()
        if class_names and 'text/javascript' == class_names[0]:
            titles = response.re('title[ ]?:[ ]?\"(.*)\"')
            cids = response.re('cid[ ]?:[ ]?(\d+)')
            release_dates = response.re(
                'release_date[ ]?:[ ]?\"([\d-]+)\"')
            if titles:
                mediaItem['title'] = titles[0]
            if cids:
                mediaItem['cont_id'] = cids[0]
            if release_dates:
                release_date = release_dates[0]
                release_date = Util.str2date(release_date)
                mediaItem['release_date'] = release_date
        # list列表页 (listing page): lazy-loaded poster + member (paid) badge
        poster_urls = response.xpath(
            './/img[@class="lazy"]/@data-original').extract()
        members = response.xpath('.//span[@class="member-ico"]').extract()
        if poster_urls:
            mediaItem['poster_url'] = poster_urls[0]
        if members:
            mediaItem['paid'] = '1'
    except Exception as e:  # fixed: was Py2-only `except Exception, e:`
        logging.log(logging.ERROR, traceback.format_exc())