def album_tag_json_resolve(self, text, meta):
    items = []
    try:
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        # Extract the JSON array embedded in the response text
        video_express = '(\[\{.*\}\])'
        video_regex = re.compile(video_express)
        match_results = video_regex.search(text)
        if match_results:
            video_content = match_results.groups()[0]
            videos = json.loads(video_content)
            for video in videos:
                videoItem = VideoItem()
                ext_id = video['id']
                title = video['title']
                vnum = video['stitle']
                img = video['img']
                url = video['url']
                videoItem['cont_id'] = ext_id
                videoItem['title'] = title
                vnum = str(vnum)
                # Keep only the alphanumeric characters of the episode number
                videoItem['vnum'] = filter(str.isalnum, vnum)
                videoItem['thumb_url'] = img
                videoItem['url'] = Util.get_absolute_url(url, prefix_url)
                self.set_video_info(videoItem)
                items.append(videoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
    return items
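# Illustrative sketch (not part of the original codebase): the regex + json.loads step used
# in album_tag_json_resolve above, run against an assumed payload whose keys ('id', 'title',
# 'stitle', 'img', 'url') match the ones the resolver reads; the sample values and the
# "var data = ..." wrapper are made up for illustration.
import json
import re

_sample_text = ('var data = [{"id": "1043648", "title": "Episode 1", "stitle": "1", '
                '"img": "/images/1.jpg", "url": "/v/7/102831/f/1043648.html"}];')
_match = re.search('(\[\{.*\}\])', _sample_text)
if _match:
    for _video in json.loads(_match.group(1)):
        print(_video['id'], _video['stitle'], _video['url'])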
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        sels = response.xpath('//div[@class="retrieval"]//dl[@class="retrieval-dl"]')
        if self.max_mark_depth > 0:
            size = self.max_mark_depth if self.max_mark_depth < len(sels) else len(sels)
        else:
            size = len(sels)
        if level <= size:
            sel = sels[level - 1]
            level = level + 1
            urls = sel.xpath('.//ul[@class="retrieval-list"]//a/@href').extract()
            for url in urls:
                url = Util.get_absolute_url(url, prefix_url)
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'level': level, 'id': channel_id}))
        # Fetch all list data for the current level,
        # subdividing once more by sort order
        urls = response.xpath('//div[@class="filter"]//ul[@class="tab-sya"]//li/a/@href').extract()
        for url in urls:
            url = Util.get_absolute_url(url, prefix_url)
            items.append(Request(url=url,
                                 callback=self.list_html_parse,
                                 meta={'page': 1, 'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
def video_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else None
        # Pull detailed media info from the play page
        sels = response.xpath('//script[@type="text/javascript"]')
        hunantv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="play-xxmes clearfix"]')
        hunantv_extract.media_info_extract(sels, mediaItem)
        # Derive the media (album) page URL from the play page URL
        url_express = '(http://www\.hunantv\.com/v/[\d]+/[\d]+)/[a-zA-Z]/[\d]+\.html'
        url_regex = re.compile(url_express)
        match_results = url_regex.search(request_url)
        if match_results:
            url_content = match_results.groups()[0]
            mediaItem['url'] = url_content
        # Locate the full-episode (album) page address
        url_exist = False
        sels = response.xpath('//div[@class="play-index-con-til clearfix"]//*[@class="mppl-til"]')
        for sel in sels:
            results = hunantv_extract.album_extract(sel)
            if results:
                item = results[0]
                url = item['url']
                url = Util.get_absolute_url(url, prefix_url)
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.album_parse,
                                     meta={'url': request_url, 'item': mediaVideoItem}))
                url_exist = True
                break
        # No "now playing" album link exists (e.g. "芒果捞星闻"); fall back to the year JSON API
        if 'url' in mediaItem and not url_exist:
            year_api = mediaItem['url'] + '/s/json.year.js'
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=year_api,
                                 callback=self.album_json_parse,
                                 meta={'item': mediaVideoItem, 'url': year_api}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    return items
def video_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        # Detail layout: TV series, variety shows, cartoons
        sels = response.xpath('//div[@data-statectn="play_info"]//ul[@class="intro_box"]')
        if not sels:
            # Info layout: ordinary films, cartoons
            sels = response.xpath('//div[@data-statectn="newplay_info"]//ul[@class="info_list"]')
        if not sels:
            # Paid films
            sels = response.xpath('//div[@class="Player"]//span[@class="video_info"]')
        if sels:
            results = letv_extract.media_extract(sels)
            if results:
                item = results[0]
                url = Util.get_absolute_url(item['url'], prefix_url)
                mediaItem['url'] = url
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.media_parse,
                                     meta={'item': mediaVideoItem}))
        if not items:
            # The play page carries no media page URL; try crawling through the API directly
            if 'cont_id' in mediaItem:
                self.api_parse(mediaVideoItem)
            else:
                logging.log(logging.INFO,
                            'no media page URL on this play page, and the API cannot be used directly: %s' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    return items
def album_tag_resolve(self, text, meta):
    items = []
    try:
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'album tag url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        try:
            response = Selector(text=text)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'text to be parsed is not xml or html')
            return items
        sels = response.xpath('//div[@class="play-index-con-box"]//ul[@class="clearfix ullist-ele"]/li')
        video_url_express = 'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html'
        video_url_regex = re.compile(video_url_express)
        for sel in sels:
            videoItem = VideoItem()
            hunantv_extract.video_info_extract(sel, videoItem)
            url = videoItem['url']
            url = Util.get_absolute_url(url, prefix_url)
            videoItem['url'] = url
            # Extract the video id from the play URL
            match_results = video_url_regex.search(url)
            if match_results:
                id = match_results.groups()[0]
                videoItem['cont_id'] = id
            self.set_video_info(videoItem)
            items.append(videoItem)
        # Next page: resolve it recursively and append its episodes
        results = hunantv_extract.next_page_extract(response)
        if results:
            url = results[0]
            url = Util.get_absolute_url(url, prefix_url)
            result = Util.get_url_content(url)
            items = items + self.album_tag_resolve(text=result, meta={'url': url})
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album tag url: %s' % request_url)
    return items
def list_html_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'list html url: %s' % request_url)
        page = response.request.meta['page'] if 'page' in response.request.meta else 1
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        results = pps_extract.media_extract(response)
        for item in results:
            mediaVideoItem = MediaVideoItem()
            item['channel_id'] = channel_id
            url = item['url']
            url = Util.get_absolute_url(url, request_url)
            item['url'] = url
            mediaVideoItem['media'] = item
            items.append(Request(url=url,
                                 callback=self.video_parse,
                                 meta={'item': mediaVideoItem}))
        # Next page
        page = page + 1
        results = pps_extract.next_page_extract(response)
        for item in results:
            url = Util.get_absolute_url(item, request_url)
            items.append(Request(url=url,
                                 callback=self.list_html_parse,
                                 meta={'page': page, 'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def video_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Play page - ordinary film
        sels = response.xpath('//div[@class="film-info clearfix"]//span[@class="summary"]/a/@href')
        if not sels:
            # Play page - VIP film
            sels = response.xpath('//div[@class="f_song inner_resumeCon intro"]//div[@class="con"]/a/@href')
        if not sels:
            # Play page - trailer
            sels = response.xpath('//div[@class="related-film clear"]//a[@class="rel-film-img"]/@href')
        if sels:
            url = sels.extract()[0]
            url = Util.get_absolute_url(url, prefix_url)
            mediaItem['url'] = url
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=url,
                                 callback=self.media_parse,
                                 meta={'item': mediaVideoItem}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        if level == 0:
            # At the top channel level: only pick up the film, TV series, cartoon and variety channels
            sels = response.xpath('.//div[normalize-space(@class)="column_menu"]/*[@class="channel_tit"]//li[@data-channel]')
            level = level + 1
            for list_channel in letv_extract.list_channels:
                urls = sels.xpath('.//a[text()="%s"]/@href' % list_channel).extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'level': level, 'id': list_channel}))
        else:
            # Subdivide the current level further
            sels = response.xpath('.//div[normalize-space(@class)="column_menu"]/ul[@data-statectn="left-biaoqian"]/li')
            if self.max_mark_depth > 0:
                size = self.max_mark_depth if self.max_mark_depth < len(sels) else len(sels)
            else:
                size = len(sels)
            if level <= size:
                sel = sels[level - 1]
                level = level + 1
                url_sels = sel.xpath('.//dd/a')
                for url_sel in url_sels:
                    labels = url_sel.xpath('./b/text()').extract()
                    if not labels:
                        continue
                    label = labels[0]
                    if label in letv_extract.ignore_channels:
                        continue
                    urls = url_sel.xpath('./@href').extract()
                    if not urls:
                        continue
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'level': level, 'id': channel_id}))
            # Fetch all list data for the current level:
            # recover the filter conditions already selected from the URL
            regex_pattern = re.compile('http://list.letv.com/listn/(.*)\.html')
            match_result = regex_pattern.search(request_url)
            filter_str = ''
            if match_result:
                filter_str = match_result.groups()[0]
            list_json_postfix_url = '?'
            regex_pattern = re.compile('([a-zA-Z]+)([\d,]+)')
            filters = regex_pattern.findall(filter_str)
            for item in filters:
                if item[0] != 'o':
                    list_json_postfix_url = list_json_postfix_url + '%s=%s&' % (item[0], item[1])
            # Subdivide once more by sort order
            page = response.request.meta['page'] if 'page' in response.request.meta else 1
            if self.max_update_page == self.max_number:
                sels = response.xpath('//div[@class="sort_navy"]//a/@data-order')
            else:
                sels = response.xpath('//div[@class="sort_navy"]//a[normalize-space(text())="%s"]/@data-order' % u'最新更新')
            for sel in sels:
                list_json_postfix_url_temp = list_json_postfix_url
                filters = regex_pattern.findall(sel.extract())
                for item in filters:
                    list_json_postfix_url_temp = list_json_postfix_url_temp + '%s=%s&' % (item[0], item[1])
                if list_json_postfix_url_temp != '?':
                    url = self.list_json_prefix_url + list_json_postfix_url_temp + 'p=%s' % page
                    items.append(Request(url=url,
                                         callback=self.list_json_parse,
                                         meta={'page': page,
                                               'id': channel_id,
                                               'postfix_url': list_json_postfix_url_temp,
                                               'url': url}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
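# Illustrative sketch (the sample filter string is an assumption): how the letv listn filter
# segment is split into key/value pairs by the '([a-zA-Z]+)([\d,]+)' regex above before the
# JSON list API query string is rebuilt; entries whose key is 'o' (the sort order) are skipped.
import re

_filter_str = 'c1pt1o9'  # assumed example captured from a http://list.letv.com/listn/<filters>.html URL
for _key, _value in re.findall('([a-zA-Z]+)([\d,]+)', _filter_str):
    if _key != 'o':
        print('%s=%s' % (_key, _value))  # -> c=1, pt=1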
def media_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Filter out media whose play type is in skip_types
        sels = response.xpath('//head//script')
        if sels:
            regex_express = 'movieInfo\.play_type[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                play_type = match_result[0]
                if play_type in self.skip_types:
                    return items
        # Some URLs redirect, so store the real URL, e.g.
        # http://movie.kankan.com/movie/88365 -> http://data.movie.kankan.com/movie/88365
        mediaItem['url'] = request_url
        sels = response.xpath('//head')
        kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="info_list"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//ul[@class="detail_ul"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        # Collect the episode list of this media item
        videoItems = []
        if u'综艺' == mediaItem['channel_id']:
            # Variety shows
            sels = response.xpath('//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem, mediaItem['channel_id'])
                        videoItems.append(videoItem)
        elif u'电影' == mediaItem['channel_id']:
            # Films: take the play URL from the "watch now" block
            videoItem = VideoItem()
            Util.copy_media_to_video(mediaItem, videoItem)
            sels = response.xpath('//div[@class="section clearfix s2"]')
            if sels:
                urls = sels.xpath('.//a[starts-with(@class, "foc")]/@href').extract()
                thumb_urls = sels.xpath('.//a[@class="foc"]/img/@src').extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    videoItem['url'] = url
                    if thumb_urls:
                        videoItem['thumb_url'] = thumb_urls[0]
                    self.set_video_info(videoItem, mediaItem['channel_id'])
                    videoItems.append(videoItem)
        else:
            # TV series
            sels = response.xpath('//div[@id[re:test(., "fenji_[\d]+_asc")]]')
            if not sels:
                # Cartoons and some TV series use a ul container instead
                sels = response.xpath('//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem, mediaItem['channel_id'])
                        videoItems.append(videoItem)
        if videoItems:
            # Set ext_id from the collected episodes
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
            #self.count = self.count + 1
            #logging.log(logging.INFO, 'count: %s' % str(self.count))
        else:
            logging.log(logging.INFO, '%s: no videos' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        first = response.request.meta['first'] if 'first' in response.request.meta else False
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        if first:
            # First visit: follow the "最新" (latest) tab of this channel
            sels = response.xpath('//div[@class="tab_box"]//a')
            for sel in sels:
                texts = sel.xpath('.//span/text()').extract()
                if texts:
                    text = texts[0].replace(' ', '')
                    if text == u'最新':
                        urls = sel.xpath('./@href').extract()
                        url = urls[0]
                        items.append(Request(url=url,
                                             callback=self.list_parse,
                                             meta={'id': channel_id}))
                        break
        else:
            page = response.request.meta['page'] if 'page' in response.request.meta else 1
            if page > self.max_update_page:
                return items
            # List entries
            sels = response.xpath('//ul[@class="movielist"]/li')
            for sel in sels:
                results = kankan_extract.video_extract(sel)
                for item in results:
                    mediaVideoItem = MediaVideoItem()
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = channel_id
                    kankan_extract.media_info_extract(sel, mediaItem)
                    mediaVideoItem['media'] = mediaItem
                    items.append(Request(url=item['url'],
                                         callback=self.video_parse,
                                         meta={'item': mediaVideoItem}))
                    break
            # Next page
            sels = response.xpath('//p[@class="list-pager-v2"]')
            results = kankan_extract.next_page_extract(sels)
            page = page + 1
            for item in results:
                url = Util.get_absolute_url(item, prefix_url)
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'page': page, 'id': channel_id}))
                break
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        if level == 0:
            # At the top channel level: only pick up the film, TV series, cartoon and variety channels
            sels = response.xpath('//div[@class="detail_menu"]//li')
            level = level + 1
            for list_channel in pptv_extract.list_channels:
                urls = sels.xpath('.//a[text()="%s"]/@href' % list_channel).extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    url = pptv_extract.normalize_url(url)
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'level': level, 'id': list_channel}))
        else:
            # Subdivide the current level further
            sels = response.xpath('//div[@class="sear-menu"]//dl')
            if self.max_mark_depth > 0:
                size = self.max_mark_depth if self.max_mark_depth < len(sels) else len(sels)
            else:
                size = len(sels)
            if level <= size:
                sel = sels[level - 1]
                level = level + 1
                url_sels = sel.xpath('.//dd/a')
                for url_sel in url_sels:
                    labels = url_sel.xpath('./text()').extract()
                    if not labels:
                        continue
                    label = labels[0]
                    if label in pptv_extract.ignore_channels:
                        continue
                    urls = url_sel.xpath('./@href').extract()
                    if not urls:
                        continue
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    url = pptv_extract.normalize_url(url)
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'level': level, 'id': channel_id}))
            # Fetch all list data for the current level,
            # subdividing once more by sort order
            sels = response.xpath('//div[@class="sort-result-container"]//li/a/@href')
            regex_express = 'http://list\.pptv\.com\?(.*)'
            page = response.request.meta['page'] if 'page' in response.request.meta else 1
            for sel in sels:
                match_result = sel.re(regex_express)
                if match_result:
                    postfix_url = match_result[0]
                    list_postfix_url = postfix_url + '&page=%s' % page
                    url = self.list_prefix_url + '?' + list_postfix_url
                    items.append(Request(url=url,
                                         callback=self.list_html_parse,
                                         meta={'page': page,
                                               'id': channel_id,
                                               'postfix_url': postfix_url}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
def album_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'album url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        video_url = response.request.meta['url'] if 'url' in response.request.meta else None
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        videoItems = []
        sels = response.xpath('//div[@class="page-videolist-tag-main"]//p[@class="pa1-nav"]')
        if sels:
            # Tag pages exist, e.g. http://list.hunantv.com/album/56.html
            results = hunantv_extract.album_tag_extract(sels)
            for item in results:
                url = Util.get_absolute_url(item['url'], prefix_url)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_tag_resolve(text=result, meta={'url': url})
        else:
            # No tag pages, e.g. http://list.hunantv.com/album/2905.html
            video_sels = response.xpath('//div[@class="page-videolist clearfix"]')
            if video_sels:
                result = video_sels.extract()[0]
                videoItems = videoItems + self.album_tag_resolve(text=result, meta={'url': request_url})
            else:
                # No episode-list page address, e.g. http://www.hunantv.com/v/7/102831/f/1043648.html:
                # an episode-list URL exists but is invalid, so fall back to the play URL itself
                if video_url:
                    videoItem = VideoItem()
                    Util.copy_media_to_video(mediaItem, videoItem)
                    videoItem['url'] = video_url
                    video_url_express = 'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html'
                    video_url_regex = re.compile(video_url_express)
                    # Extract the video id from the play URL
                    match_results = video_url_regex.search(video_url)
                    if match_results:
                        id = match_results.groups()[0]
                        videoItem['cont_id'] = id
                    self.set_video_info(videoItem)
                    videoItems.append(videoItem)
        if videoItems:
            # Set ext_id from the collected episodes
            Util.set_ext_id(mediaItem, videoItems)
            # Visit the media page to pick up its metadata
            result = Util.get_url_content(mediaItem['url'])
            if result:
                mediaItem = self.media_resolve(text=result,
                                               meta={'item': mediaItem, 'url': mediaItem['url']})
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album url: %s' % request_url)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        if level == 0:
            # First visit to the list page: follow each configured channel
            sels = response.xpath('//div[@id="hony-searchtag-condition"]/p')
            for list_channel in hunantv_extract.list_channels:
                list_postfix_urls = sels.xpath('.//a[normalize-space(text())="%s"]/@href' % list_channel).extract()
                if list_postfix_urls:
                    list_postfix_url = list_postfix_urls[0]
                    url = Util.get_absolute_url(list_postfix_url, prefix_url)
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'id': list_channel}))
        else:
            page = response.request.meta['page'] if 'page' in response.request.meta else 1
            channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
            if page > self.max_update_page:
                return items
            # Collect play page URLs
            sels = response.xpath('//div[@class="play-index-con-box"]')
            results = hunantv_extract.video_extract(sels)
            for item in results:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                video_sels = sels.xpath('.//a[@href="%s"]/..' % item['url'])
                hunantv_extract.media_info_extract(video_sels, mediaItem)
                mediaItem['poster_url'] = Util.get_absolute_url(mediaItem['poster_url'], prefix_url)
                url = Util.get_absolute_url(item['url'], prefix_url)
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.video_parse,
                                     meta={'item': mediaVideoItem}))
            # Next page
            results = hunantv_extract.next_page_extract(response)
            if results:
                result = results[0]
                result = Util.get_absolute_url(result, prefix_url)
                page = page + 1
                items.append(Request(url=result,
                                     callback=self.list_parse,
                                     meta={'page': page, 'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
def media_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Collect the play URL
        videoItems = []
        videoItem = VideoItem()
        Util.copy_media_to_video(mediaItem, videoItem)
        sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.video_info_extract(sels, videoItem)
        if 'url' not in videoItem:
            # No play URL means this page only carries film metadata; drop it
            logging.log(logging.INFO, 'no play URL found for this film: %s' % request_url)
            return items
        url = videoItem['url']
        url = Util.get_absolute_url(url, prefix_url)
        videoItem['url'] = url
        self.set_video_info(videoItem)
        videoItems.append(videoItem)
        # Media attributes: set the paid flag from the play URL prefix
        video_prefix_url = Util.prefix_url_parse(url)
        if video_prefix_url in self.vip_prefix_urls:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="laMovMAIN fl"]')
        dy1905_extract.media_info_extract(sels, mediaItem)
        # Plot summary (剧情) and cast/crew (演职人员) pages
        nav_sels = response.xpath('//ul[@class="navSMb"]//li[@class="mdbpLeft2"]//div[@class="nowDefLine DefBOttom"]//a')
        if nav_sels:
            for sel in nav_sels:
                labels = sel.xpath('./text()').extract()
                urls = sel.xpath('./@href').extract()
                if labels and urls:
                    label = labels[0].strip()
                    if label.startswith(u'剧情') or label.startswith(u'演职人员'):
                        url = urls[0]
                        url = Util.get_absolute_url(url, prefix_url)
                        result = Util.get_url_content(url)
                        dy1905_extract.media_more_info_resolve(result, mediaItem)
        # Make the media URL absolute
        url = mediaItem['url']
        url = Util.get_absolute_url(url, prefix_url)
        mediaItem['url'] = url
        if videoItems:
            # Set ext_id from the collected episodes
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        if level == 0:
            urls = response.xpath('//div[@class="rightArea"]//dl[@class="srhGroup srhGroup85 clear"]//dd/a/@href').extract()
            for url in urls:
                url = Util.get_absolute_url(url, prefix_url)
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'level': 1, 'id': channel_id}))
        elif level == 1:
            # Keep only titles that can be played online ("可点播")
            urls = response.xpath('//div[@class="termsBox"]//*[starts-with(@class, "selectLine")]//a[text()="%s"]/@href' % u'可点播').extract()
            if urls:
                url = urls[0]
                url = Util.get_absolute_url(url, prefix_url)
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'pre_url': url, 'id': channel_id}))
        else:
            page = response.request.meta['page'] if 'page' in response.request.meta else 1
            if page > self.max_update_page:
                return items
            # List entries
            has_more = False
            sels = response.xpath('//ul[@class="inqList pt18"]')
            results = dy1905_extract.media_extract(sels)
            if results:
                has_more = True
            for item in results:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                mediaItem['poster_url'] = item['poster_url']
                url = item['url']
                url = Util.get_absolute_url(url, prefix_url)
                mediaItem['url'] = url
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.media_parse,
                                     meta={'item': mediaVideoItem}))
            # Follow the next page if this one had results
            if has_more:
                page = page + 1
                post_url = '/p%s.html'
                post_url = post_url % str(page)
                pre_url = response.request.meta['pre_url'] if 'pre_url' in response.request.meta else ''
                url = pre_url + post_url
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'page': page, 'pre_url': pre_url, 'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
def list_parse(self, response):
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        level = response.request.meta['level'] if 'level' in response.request.meta else -1
        if level == 0:
            # At the top channel level: only pick up the film, TV series, cartoon and variety channels
            level = level + 1
            sels = response.xpath('//div[@class="mod_sear_menu mt20 mb30"]//div[@class="mod_sear_list"]//li')
            for list_channel in iqiyi_extract.list_channels:
                urls = sels.xpath('.//a[text()="%s"]/@href' % list_channel).extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    items.append(Request(url=url,
                                         callback=self.list_parse,
                                         meta={'level': level, 'id': list_channel}))
        else:
            page = response.request.meta['page'] if 'page' in response.request.meta else 1
            if page > self.max_update_page:
                return items
            channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
            if page == 1:
                # First entry into this level (page 1): decide whether further subdivision is needed;
                # otherwise the pages can simply be crawled one by one
                max_broswe_pages = response.xpath('//div[@class="mod-page"]//a[@data-key="%s"]' % self.max_broswe_page)
                if max_broswe_pages:
                    # The page count still exceeds max_broswe_page, so subdivide once more
                    sels = response.xpath('//div[@class="mod_sear_menu mt20 mb30"]//div[starts-with(@class, "mod_sear_list")]')
                    size = len(sels)
                    if level < size - 1:
                        sel = sels[level]
                        urls = sel.xpath('.//ul[@class="mod_category_item"]//li[not(@class="selected")]//a/@href').extract()
                        level = level + 1
                        for url in urls:
                            url = Util.get_absolute_url(url, prefix_url)
                            items.append(Request(url=url,
                                                 callback=self.list_parse,
                                                 meta={'level': level, 'id': channel_id}))
                    elif level == size - 1:
                        # Already at the last subdivision level but still above max_broswe_page,
                        # so traverse once more per sort order; for incremental updates,
                        # sorting by latest update time is enough
                        if self.max_update_page == self.max_number:
                            urls = response.xpath('//div[@class="sort-result-container"]//div[starts-with(@class, "sort-result-l")]//a/@href').extract()
                        else:
                            urls = response.xpath('//div[@class="sort-result-container"]//div[starts-with(@class, "sort-result-l")]//a[contains(@title, "%s")]/@href' % u'更新时间').extract()
                        level = level + 1
                        for url in urls:
                            url = Util.get_absolute_url(url, prefix_url)
                            if url == request_url:
                                # Skip the default sort order
                                continue
                            items.append(Request(url=url,
                                                 callback=self.list_parse,
                                                 meta={'level': level, 'id': channel_id}))
            # Walk the list entries
            sels = response.xpath('//div[@class="wrapper-piclist"]//li')
            for sel in sels:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                # Some entries link straight to a media page, others to a play page.
                # To keep things simple, extract them all as media and inspect the URL
                # to decide which callback the request should go to.
                results = iqiyi_extract.media_extract(sel)
                for item in results:
                    url = item['url']
                    url = Util.get_absolute_url(url, prefix_url)
                    mediaItem['url'] = url
                    mediaItem['poster_url'] = item['poster_url'] if 'poster_url' in item else None
                    mediaVideoItem['media'] = mediaItem
                    url_type = iqiyi_extract.url_type_resolve(url)
                    if url_type == URL_TYPE_MEDIA:
                        items.append(Request(url=url,
                                             callback=self.media_parse,
                                             meta={'item': mediaVideoItem}))
                    elif url_type == URL_TYPE_PLAY:
                        items.append(Request(url=url,
                                             callback=self.video_parse,
                                             meta={'item': mediaVideoItem}))
                    break
            # Next page
            sels = response.xpath('//div[@class="mod-page"]')
            results = iqiyi_extract.next_page_extract(sels)
            if results:
                page = page + 1
                url = results[0]
                url = Util.get_absolute_url(url, prefix_url)
                items.append(Request(url=url,
                                     callback=self.list_parse,
                                     meta={'page': page, 'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'url: %s' % request_url)
    return items
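# Illustrative sketch only: the Util helpers used throughout these parsers are not part of
# this listing. Assuming Util.prefix_url_parse returns the scheme and host of a page URL and
# Util.get_absolute_url resolves a possibly relative href against it, they behave roughly
# like the urlparse-based stand-ins below; the real implementations may differ.
from urlparse import urljoin, urlparse  # Python 2 module, matching the codebase


def _prefix_url_parse_sketch(url):
    parsed = urlparse(url)
    return '%s://%s' % (parsed.scheme, parsed.netloc)


def _get_absolute_url_sketch(href, prefix_url):
    return urljoin(prefix_url, href)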