def parse_episode(self, response):
    try:
        logging.log(logging.INFO, 'episode:%s' % response.request.url)
        order = response.request.meta['order']
        items = []
        #video info
        #tags = response.xpath('//p[@class="info_tags"]//a/@title').extract()
        #descriptions = response.xpath('//div[@class="info_summary cf"]/span/text()').extract()
        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_qq_showid(response.request.url)
        #if tags:
        #    ep_item['tag'] = Util.unquote(tags[0]).rstrip('|')
        #if descriptions:
        #    ep_item['description'] = descriptions[0]
        for k, v in order.items():
            if k == 'user':
                ep_item['category'] = v
            elif k == 'show_id':
                ep_item['owner_show_id'] = v
            else:
                ep_item[k] = v
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        ep_item['format_id'] = self.format_id
        items.append(ep_item)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        cat_name = response.request.meta['cat_name']
        thumb_url = response.request.meta['thumb']
        audit = response.request.meta['audit']
        lens = response.request.meta['lens']
        priority = response.request.meta['priority']
        items = []
        #show_id
        show_id = Util.get_letv_showid(response.request.url)
        albumid = response.selector.re(re.compile(r'pid: ?(\d+)'))
        #video info
        title = response.xpath('//meta[@name="irTitle"]/@content').extract()
        upload_time = response.xpath(
            '//ul[@class="info_list"]//em[@id="video_time"]/text()').extract()
        tag_sel = response.xpath('//meta[@name="keywords"]/@content').extract()
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0]
        if show_id:
            ep_item['show_id'] = show_id
        if tag_sel:
            #the keywords meta begins with the title, so drop "<title> " before splitting
            tag_str = tag_sel[0][len(title[0]) + 1:] if title else tag_sel[0]
            if tag_str:
                tag_list = []
                split_space = tag_str.split(' ')
                for item_space in split_space:
                    split_comma = item_space.split(',')
                    for item_comma in split_comma:
                        tag_list.append(item_comma)
                ep_item['tag'] = "|".join([t.strip() for t in tag_list])
        if upload_time:
            ep_item['upload_time'] = upload_time[0].strip()
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        ep_item['category'] = cat_name
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        ep_item['duration'] = lens
        items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        #cat_id = response.request.meta['cat_id']
        cat_name = response.request.meta['cat_name']
        thumb_url = response.request.meta['thumb']
        audit = response.request.meta['audit']
        lens = response.request.meta['lens']
        priority = response.request.meta['priority']
        items = []
        #a space may or may not follow the colon: "albumId:326754200" or "albumId: 326754200"
        #albumid = response.selector.re(re.compile(r'pid: ?(\d+)'))
        #show_id
        show_id = Util.get_sohu_showid(response.request.url)
        #tag
        tag = response.xpath('//meta[@name="keywords"]/@content').extract()
        #video info
        title = response.xpath(
            '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()').extract()
        #played = response.xpath('//em[@id="video_playcount"]').extract()
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0].strip()
        if show_id:
            ep_item['show_id'] = show_id
        if tag:
            ep_item['tag'] = tag[0].strip()
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        #ep_item['cat_id'] = cat_id
        ep_item['category'] = cat_name
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        ep_item['duration'] = lens
        #if played:
        #    ep_item['played'] = played
        #if albumid:
        #    items.append(Request(url=self.playlength_url+albumid[0], callback=self.parse_playlength, meta={'item':ep_item,'albumid':albumid[0]}))
        #else:
        items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_media(self, response):
    items = []
    try:
        cat_id = response.request.meta['cat_id']
        title = response.request.meta['title']
        thumb_url = response.request.meta['img']
        url = response.request.url
        query = urlparse.urlparse(url).query
        query_dict = urlparse.parse_qs(query)
        show_id = query_dict['id'][0]
        #get tags
        sels = response.xpath('//span[@class="c_org1"]/a/text()').extract()
        tag = ''
        if sels:
            tag = "|".join(sels).encode("UTF-8")
        #get release time
        upload_time = ''
        sels = response.xpath('//p[@class="c_gray0 lh3"]/span/text()').extract()
        if sels:
            time_times = sels[0].encode("UTF-8")
            upload_time = time_times[0:16]
        #get play times
        played = 0
        sels = response.xpath('//p[@class="c_gray0 lh3"]/span/a/text()').extract()
        if sels:
            played = sels[0].strip()
        ep_item = EpisodeItem()
        ep_item['title'] = title
        ep_item['show_id'] = show_id
        ep_item['tag'] = tag
        ep_item['upload_time'] = upload_time
        ep_item['category'] = self._category
        ep_item['thumb_url'] = thumb_url
        ep_item['spider_id'] = self._spider_id
        ep_item['site_id'] = self._site_id
        ep_item['url'] = url
        ep_item['played'] = played
        ep_item['cat_id'] = cat_id
        items.append(ep_item)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    finally:
        return items
def parse_second(self, response):
    try:
        #log.msg('lev2: %s' % response.request.url)
        kw_id = response.request.meta['kw_id']
        items = []
        sel = Selector(response)
        #info
        jinfo = json.loads(response.body)
        title = jinfo['data']['t']
        show_id = response.request.meta['show_id']
        tags = jinfo['data']['tag']
        tag = tags.replace(' ', '|').replace(',', '|').strip('|')
        tuploadtime = jinfo['data']['uploadtime']
        upload_time = Util.timestamp2datetime(tuploadtime)
        description = jinfo['data']['desc']
        thumb_url = jinfo['data']['picpath']
        tduration = str(jinfo['data']['vtime'])
        tduration1 = tduration.split(',')
        duration = tduration1[0]
        ep_item = EpisodeItem()
        if len(title) != 0:
            ep_item["title"] = title
        ep_item['show_id'] = response.request.meta['show_id']
        turl = "http://v.ku6.com/show/" + show_id + ".html"
        if len(tag) != 0:
            ep_item["tag"] = tag
        if len(upload_time) != 0:
            ep_item["upload_time"] = upload_time
        if len(turl) != 0:
            ep_item["url"] = turl
        if len(thumb_url) != 0:
            ep_item['thumb_url'] = thumb_url
        if len(duration) != 0:
            ep_item["duration"] = duration
        ep_item['kw_id'] = kw_id
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        items.append(Request(url=turl, callback=self.parse_episode,
                             meta={'item': ep_item}))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        #cat_id = response.request.meta['cat_id']
        cat_name = response.request.meta['cat_name']
        thumb_url = response.request.meta['thumb']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        lens = response.request.meta['lens']
        items = []
        show_id = response.xpath('//div[@id="block-data-view"]/@data-aid').extract()
        title = response.xpath('//div[@id="block-data-view"]/@data-title').extract()
        tags = response.xpath('//div[@id="block-data-view"]/@data-tags').extract()
        if lens == 0:
            data_from = response.xpath('//div[@id="area-part-view"]/div/a/@data-from').extract()
            data_sid = response.xpath('//div[@id="area-part-view"]/div/a/@data-sid').extract()
            if data_sid:
                second_request = "http://www.acfun.tv/video/getVideo.aspx?id=" + data_sid[0].strip()
                items.append(Request(url=second_request,
                                     callback=self.parse_duration,
                                     meta={'cat_name': cat_name,
                                           'thumb': thumb_url,
                                           'audit': audit,
                                           'priority': priority,
                                           'show_id': show_id,
                                           'title': title,
                                           'tags': tags,
                                           'url': response.request.url}))
            return items
        else:
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id[0].strip()
            if tags:
                ep_item['tag'] = tags[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            #ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            items.append(ep_item)
            return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        #cat_id = response.request.meta['cat_id']
        cat_name = response.request.meta['cat_name']
        thumb_url = response.request.meta['thumb']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        lens = response.request.meta['lens']
        items = []
        show_id = Util.get_ifeng_showid(response.request.url)
        title = response.xpath(
            '//head/meta[@property="og:title"]/@content').extract()
        tags = response.xpath('//div[@class="protag"]/a/text()').extract()
        upload_time = response.xpath(
            '//div[@class="vTit_wrap"]/div/p/span[@class="data"]/text()').extract()
        #video info
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0].strip()
        if show_id:
            ep_item['show_id'] = show_id
        if tags:
            ep_item['tag'] = '|'.join(tags)
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        if upload_time:
            ep_item['upload_time'] = upload_time[0]
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        #ep_item['cat_id'] = cat_id
        ep_item['category'] = cat_name
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        ep_item['duration'] = lens
        items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        #cat_id = response.request.meta['cat_id']
        cat_name = response.request.meta['cat_name']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        thumb = response.request.meta['thumb']
        items = []
        show_id = Util.get_v1_showid(response.request.url)
        title = response.xpath('//meta[@name="title"]/@content').extract()
        tags = response.xpath('//meta[@name="keywords"]/@content').extract()
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0].strip()
        if show_id:
            ep_item['show_id'] = show_id
        if tags:
            ep_item['tag'] = tags[0].strip()
        if thumb:
            ep_item['thumb_url'] = thumb[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        #ep_item['cat_id'] = cat_id
        ep_item['category'] = cat_name
        #ep_item['description'] = item.get("description")
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        #ep_item['played'] = item.get('play')
        #ep_item['upload_time'] = item.get('create')
        #duration = item.get('duration')
        #if duration:
        #    a, b = duration.split(':')
        #    duration = int(a) * 60 + int(b)
        #else:
        #    duration = 0
        #ep_item['duration'] = duration
        items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_media(self, response, **kwargs):
    items = []
    try:
        title = response.xpath(
            '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()').extract()
        tag = response.xpath('//meta[@name="keywords"]/@content').extract()
        #show_id = Util.get_sohu_showid(response.request.url)
        thumb = response.xpath('//script').re(',sCover: \'(.*)\'')
        upload = response.xpath('//script').re(',uploadTime: \'(.*)\'')
        description = response.xpath('//p[@class="rel cfix"]/@title').extract()
        played = response.xpath(
            '//span[@class="vbtn vbtn-play"]/em/i/text()').extract()
        print played, upload
        video_id = response.xpath('//script').re('vid = \'(\d+)\'')
        ep_item = EpisodeItem()
        if video_id:
            ep_item['video_id'] = video_id[0]
            ep_item['show_id'] = video_id[0]
        if title:
            ep_item['title'] = title[0]
        if tag:
            ep_item['tag'] = tag[0].strip().replace(',', '|')
        if upload:
            ep_item['upload_time'] = upload[0] + ":00"
        if description:
            ep_item['description'] = description[0].strip()
        if thumb:
            ep_item['thumb_url'] = thumb[0]
        if played:
            ep_item['played'] = Util.normalize_played(played[0])
        ep_item['category'] = u"搞笑"
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        items.append(ep_item)
        log.msg("spider success, title:%s" % (ep_item['title']), level=log.INFO)
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
    finally:
        return items
def parse_second(self, response):
    try:
        #log.msg('lev2: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        items = []
        sel = Selector(response)
        #category: the JSON payload is wrapped in "try{window(...);}catch(e)"
        begin = response.body.find("try{window(")
        begin += len("try{window(")
        end = response.body.find(");}catch(e)")
        msg = response.body[begin:end]
        jmsg = json.loads(msg)
        num = len(jmsg["data"])
        for i in range(num):
            title = jmsg["data"][i]["aName"]
            play_num = "0"
            play_num = str(jmsg["data"][i]["disCnt"])
            upload_time = jmsg["data"][i]["tvYear"]
            turl = jmsg["data"][i]["vUrl"]
            timelength = str(jmsg["data"][i]["timeLength"])
            ep_item = EpisodeItem()
            if len(title) != 0:
                ep_item["title"] = title
            ep_item["played"] = play_num
            if len(upload_time) != 0:
                ep_item["upload_time"] = upload_time
            if len(turl) != 0:
                ep_item["url"] = turl
            if len(timelength) != 0:
                ep_item["duration"] = timelength
            ep_item['subject_id'] = cat_id
            items.append(Request(url=turl, callback=self.parse_episode,
                                 meta={'item': ep_item}))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_duration(self, response):
    try:
        items = []
        cat_name = response.request.meta['cat_name']
        thumb_url = response.request.meta['thumb']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        title = response.request.meta['title']
        show_id = response.request.meta['show_id']
        tags = response.request.meta['tags']
        url = response.request.meta['url']
        data = json.loads(response.body)
        success = data.get('success')
        if not success or success == 'false':
            return items
        duration = data.get('time')
        if not duration:
            return items
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0].strip()
        if show_id:
            ep_item['show_id'] = show_id[0].strip()
        if tags:
            ep_item['tag'] = tags[0].strip()
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = url
        #ep_item['cat_id'] = cat_id
        ep_item['category'] = cat_name
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        ep_item['duration'] = int(duration)
        items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        #cat_id = response.request.meta['cat_id']
        cat_name = response.request.meta['cat_name']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        items = []
        data = json.loads(response.body)
        #avoid shadowing the built-in list(); guard against a missing field
        video_list = data.get('list') or []
        for item in video_list:
            ep_item = EpisodeItem()
            ep_item['title'] = item.get('title')
            ep_item['show_id'] = item.get('aid')
            #ep_item['tag'] = item.get()
            ep_item['thumb_url'] = item.get('pic')
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = "http://www.bilibili.com/video/av%s/" % item.get('aid')
            #ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['description'] = item.get("description")
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['played'] = item.get('play')
            #ep_item['upload_time'] = item.get('create')
            duration = item.get('duration')
            if duration:
                #duration comes as "mm:ss"
                a, b = duration.split(':')
                duration = int(a) * 60 + int(b)
            else:
                duration = 0
            ep_item['duration'] = duration
            items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        #cat_id = response.request.meta['cat_id']
        cat_name = response.request.meta['cat_name']
        thumb_url = response.request.meta['thumb']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        items = []
        show_id = Util.get_tucao_showid(response.request.url)
        title = response.xpath('//h1[@class="show_title"]/text()').extract()
        tags = response.xpath('//meta[@name="keywords"]/@content').extract()
        #video info
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = title[0].strip()
        if show_id:
            ep_item['show_id'] = show_id
        if tags:
            ep_item['tag'] = tags[0].strip()
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        #ep_item['cat_id'] = cat_id
        ep_item['category'] = cat_name
        ep_item['format_id'] = '2'
        ep_item['audit'] = audit
        ep_item['priority'] = priority
        #ep_item['duration'] = lens
        items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_played(self, response):
    items = []
    try:
        log.msg(response.request.url, level=log.INFO)
        body = response.xpath('//body/p/text()')
        play_num = body.re('"browse":(\d*)}')[0]
        _item = response.meta
        ep_item = EpisodeItem()
        ep_item['show_id'] = _item['show_id']
        ep_item['title'] = _item['title']
        ep_item['tag'] = _item['tag']
        ep_item['category'] = _item['category']
        ep_item['upload_time'] = _item['upload_time']
        ep_item['spider_id'] = _item['spider_id']
        ep_item['site_id'] = _item['site_id']
        ep_item['url'] = _item['url']
        ep_item['description'] = _item['description']
        ep_item['played'] = int(play_num)
        items.append(ep_item)
    except Exception as err:
        log.msg(traceback.format_exc(), level=log.ERROR)
    return items
def parse_first(self, response):
    try:
        items = []
        user_item = UserItem()
        data = json.loads(response.body)
        #debugging output and early return: everything below is currently unreachable
        print data
        return items
        has_more = data.get("has_more")
        message = data.get("message")
        max_behot_time = data.get("max_behot_time")
        data = data.get("data")
        if data:
            for it in data:
                ep_item = EpisodeItem()
                ep_item['title'] = it["title"]
                #NOTE: show_id, tag, upload_time, category, thumb_url, cat_name,
                #audit and priority are never defined in this method; this block
                #is incomplete as written
                ep_item['show_id'] = show_id
                ep_item['tag'] = "|".join([t.strip() for t in tag])
                ep_item['upload_time'] = upload_time[0].strip()
                if category:
                    ep_item['category'] = category[0].strip()
                if thumb_url:
                    ep_item['thumb_url'] = thumb_url[0].strip()
                ep_item['spider_id'] = self.spider_id
                ep_item['site_id'] = self.site_id
                ep_item['url'] = response.request.url
                #ep_item['cat_id'] = cat_id
                ep_item['category'] = cat_name
                ep_item['format_id'] = '2'
                ep_item['audit'] = audit
                ep_item['priority'] = priority
        print type(data)
        #items.append(Request(url=urls, callback=self.parse_page))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode_youku(self, response):
    try:
        logging.log(logging.INFO, "episode_youku:%s" % response.request.url)
        pg_id = response.request.meta['pg_id']
        cat_name = response.request.meta['cat_name']
        site_id = response.request.meta['site_id']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        items = []
        #owner
        owner = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
        #video info
        title = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
        #category = response.xpath('//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
        scripts = response.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re('videoId = \'(\d+)\'')
        tag = scripts.re('tags="(.+)"')
        upload = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
        description = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
        vp_url = response.xpath('//span[@id="videoTotalPV"]/../../@href').extract()
        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_showid(response.request.url)
        if video_id:
            ep_item['video_id'] = video_id[0]
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            t = "".join(title)
            t = t.strip("\n").strip()
            #ep_item['title'] = Util.strip_title("".join(title))
            ep_item['title'] = Util.strip_title(t)
        if tag:
            ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        #if category:
        #    ep_item['category'] = category[0].replace(u'频道', '')
        ep_item['category'] = cat_name
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            ep_item['description'] = description[0]
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = site_id
        ep_item['url'] = response.request.url
        ep_item['pg_id'] = pg_id
        ep_item['audit'] = audit
        ep_item['format_id'] = self.format_id
        ep_item['priority'] = priority
        if vp_url:
            items.append(Request(url=vp_url[0], callback=self.parse_vpaction,
                                 meta={'item': ep_item}))
        else:
            items.append(ep_item)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode_iqiyi(self, response):
    try:
        logging.log(logging.INFO,
                    "parse_episode_iqiyi:%s" % response.request.url)
        pg_id = response.request.meta['pg_id']
        cat_name = response.request.meta['cat_name']
        site_id = response.request.meta['site_id']
        audit = response.request.meta['audit']
        priority = response.request.meta['priority']
        items = []
        #show_id
        show_id = Util.get_iqiyi_showid(response.request.url)
        albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))
        #video info
        title = response.xpath(
            '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract()
        if not title:
            title = response.xpath(
                '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract()
        if not title:
            title = response.xpath(
                '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract()
        if not title:
            title = response.xpath(
                '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract()
        #category = response.xpath('//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract()
        #if not category:
        #    category = response.xpath('//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()').extract()
        #if not category:
        #    category = response.xpath('//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
        #if not category:
        #    category = response.xpath('//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()').extract()
        upload_time = response.xpath(
            '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
        if not upload_time:
            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[2]/span/text()').extract()
        tag = response.xpath(
            '//span[@id="widget-videotag"]/descendant::*/text()').extract()
        if not tag:
            tag = response.xpath(
                '//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract()
        if not tag:
            tag = response.xpath(
                '//div[@class="crumb_bar"]/span[2]/a/text()').extract()
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = "".join([t.strip() for t in title])
        if show_id:
            ep_item['show_id'] = show_id
        if tag:
            ep_item['tag'] = "|".join([t.strip() for t in tag])
        if upload_time:
            ep_item['upload_time'] = upload_time[0].strip()
        #if category:
        #    ep_item['category'] = category[0].strip()
        ep_item['category'] = cat_name
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = site_id
        ep_item['pg_id'] = pg_id
        ep_item['audit'] = audit
        ep_item['url'] = response.request.url
        ep_item['format_id'] = self.format_id
        ep_item['priority'] = priority
        if albumid:
            items.append(Request(url=self.playlength_url + albumid[0],
                                 callback=self.parse_playlength,
                                 meta={'item': ep_item, 'albumid': albumid[0]}))
        else:
            items.append(ep_item)
        return items
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_episode(self, response):
    try:
        cust_para = response.request.meta['cust_para']
        log.msg('%s: %s' % (response.request.url, cust_para))
        items = []
        #owner
        owner = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
        #video info
        title = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::*/text()').extract()
        category = response.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
        scripts = response.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re('videoId = \'(\d+)\'')
        tag = scripts.re('tags="(.+)"')
        upload = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
        description = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_showid(response.request.url)
        if video_id:
            ep_item['video_id'] = video_id[0]
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = Util.strip_title("".join(title))
            if 'need_check' in cust_para:
                if self.content_is_forbidden(ep_item['title']):
                    log.msg('video [ %s ] is in blacklist!' % ep_item['show_id'])
                    return items
        if tag:
            ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        if 'category' in cust_para:
            ep_item['category'] = cust_para['category']
        elif category:
            ep_item['category'] = category[0].replace(u'频道', '')
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            ep_item['description'] = description[0]
        if 'priority' in cust_para:
            ep_item['priority'] = cust_para['priority']
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        if video_id:
            items.append(Request(url=self.vpaction_url + video_id[0],
                                 callback=self.parse_vpaction,
                                 meta={'item': ep_item}))
        else:
            items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('%s' % response.request.url)
        thumb_url = response.request.meta['thumb_url']
        upload_time = response.request.meta['upload_time']
        category = response.request.meta['category']
        kw_id = response.request.meta['kw_id'] if 'kw_id' in response.request.meta else 1
        items = []
        #owner
        owner = response.xpath('//div[@class="yt-user-info"]/a/@data-ytid').extract()
        owner_url = response.xpath('//div[@class="yt-user-info"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = owner[0]
            items.append(Request(url=self.url_prefix + owner_url[0] + "/about",
                                 callback=self.parse_about))
        #video info
        title = response.xpath('//span[@id="eow-title"]/text()').extract()
        #category = response.xpath('//p[@id="eow-category"]/a/text()').extract()
        tag = response.xpath('./head/meta[@name="keywords"]/@content').extract()
        #upload = response.xpath('//p[@id="watch-uploader-info"]/strong/text()').extract()
        description = response.xpath(
            '//p[@id="eow-description"]/descendant-or-self::*/text()').extract()
        played = response.xpath('//div[@class="watch-view-count"]/text()').extract()
        #other info
        sts = re.search(r'\"sts\": ?(\d+)', response.body)
        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = title[0].strip()
        if tag:
            ep_item['tag'] = tag[0].replace(', ', '|')
        if category:
            #ep_item['category'] = category[0].replace('&', '|')
            ep_item['category'] = category
        '''
        if upload:
            ptime = Util.get_youtube_publish(upload[0])
            if ptime:
                ep_item['upload_time'] = ptime
        '''
        if upload_time:
            t = Util.get_youtube_upload_time(upload_time[0].strip())
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            ep_item['description'] = "\n".join(description)
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0]
        if played:
            pld = Util.normalize_played(played[0])
            if pld:
                ep_item['played'] = Util.normalize_played(played[0])
            else:
                ep_item['played'] = '0'
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = Util.normalize_youtube_url(response.request.url)
        ep_item['kw_id'] = kw_id
        query = Util.encode({'video_id': ep_item['show_id'],
                             'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                             'sts': sts.groups()[0] if sts else ''})
        items.append(Request(url='http://www.youtube.com/get_video_info?' + query,
                             callback=self.parse_other_info,
                             meta={'item': ep_item}))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    try:
        log.msg('parse_episode %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        thumb_url = response.request.meta['thumb']
        items = []
        #show_id
        show_id = Util.get_iqiyi_showid(response.request.url)
        #a space may or may not follow the colon: "albumId:326754200" or "albumId: 326754200"
        albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))
        #video info
        title = response.xpath(
            '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract()
        if not title:
            title = response.xpath(
                '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract()
        if not title:
            title = response.xpath(
                '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract()
        if not title:
            title = response.xpath(
                '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract()
        category = response.xpath(
            '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract()
        if not category:
            category = response.xpath(
                '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()').extract()
        if not category:
            category = response.xpath(
                '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
        if not category:
            category = response.xpath(
                '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()').extract()
        upload_time = response.xpath(
            '//div[@class="crumb_bar"]/span[3]/span/text()').extract()
        if not upload_time:
            upload_time = response.xpath(
                '//div[@class="crumb_bar"]/span[2]/span/text()').extract()
        tag = response.xpath(
            '//span[@id="widget-videotag"]/descendant::*/text()').extract()
        if not tag:
            tag = response.xpath(
                '//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract()
        if not tag:
            tag = response.xpath(
                '//div[@class="crumb_bar"]/span[2]/a/text()').extract()
        ep_item = EpisodeItem()
        if title:
            ep_item['title'] = "".join([t.strip() for t in title])
        if show_id:
            ep_item['show_id'] = show_id
        if tag:
            ep_item['tag'] = "|".join([t.strip() for t in tag])
        if upload_time:
            ep_item['upload_time'] = upload_time[0].strip()
        if category:
            ep_item['category'] = category[0].strip()
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0].strip()
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        ep_item['cat_id'] = cat_id
        if albumid:
            items.append(Request(url=self.playlength_url + albumid[0],
                                 callback=self.parse_playlength,
                                 meta={'item': ep_item, 'albumid': albumid[0]}))
        else:
            items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def parse_episode(self, response):
    items = []
    try:
        log.msg(response.request.url, level=log.INFO)
        title = response.xpath('//head/meta[@property="og:title"]/@content')
        title = title.extract()[0].strip() if title else ""
        category = response.xpath('//head/meta[@property="og:category"]/@content')
        category = category.extract()[0].strip() if category else u"\u5a31\u4e50"
        description = response.xpath('//head/meta[@property="og:description"]/@content')
        description = description.extract()[0].strip() if description else ""
        upload_time = response.xpath('//div[@class="playerinfo"]/p/text()')
        upload_time = upload_time.re(
            u'\u53d1\u5e03:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})') if upload_time else ""
        upload_time = upload_time[0] if upload_time else ""
        upload_time = upload_time if upload_time else response.meta['time']
        play_num = response.xpath(
            '//div[@class="playerinfo"]/p/span[@id="numPlay"]/text()')
        play_num = play_num.re(u'\u64ad\u653e\u6570:(\d+)') if play_num else ""
        request_played = False if play_num else True
        play_num = play_num[0] if play_num else "0"
        tags = response.xpath('//li[@class="vtags"]/a/text()')
        tags = tags.extract() if tags else []
        tag = ''
        for a in tags:
            tag = (tag + '|' + a) if tag else a
        tag = tag if tag else u"\u5a31\u4e50"
        video_id = response.request.url.split("/")[-1]
        video_id = video_id.split('.')[0]
        ep_item = EpisodeItem()
        ep_item['show_id'] = video_id.replace("-", "")
        #ep_item['video_id'] = video_id
        ep_item['title'] = title
        ep_item['tag'] = tag
        ep_item['category'] = category
        ep_item['played'] = int(play_num)
        ep_item['upload_time'] = datetime.strptime(upload_time, '%Y-%m-%d %H:%M:%S')
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        ep_item['description'] = description
        if request_played:
            items.append(Request(url=self.url_num % video_id,
                                 callback=self.parse_played,
                                 meta=ep_item))
        else:
            items.append(ep_item)
    except Exception as err:
        log.msg(traceback.format_exc(), level=log.ERROR)
    return items
def video_parse(self, response):
    items = []
    try:
        kw_id = response.request.meta['kw_id'] if 'kw_id' in response.request.meta else None
        pg_id = response.request.meta['pg_id'] if 'pg_id' in response.request.meta else None
        cat_id = response.request.meta['cat_id'] if 'cat_id' in response.request.meta else None
        subject_id = response.request.meta['subject_id'] if 'subject_id' in response.request.meta else None
        show_id = Util.get_youtube_showid(response.request.url)
        if not show_id:
            return items
        #owner
        owner = response.xpath('//div[@class="yt-user-info"]/a/@data-ytid').extract()
        owner_url = response.xpath('//div[@class="yt-user-info"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = owner[0]
            items.append(Request(url=self.youtube_url_prefix + owner_url[0] + "/about",
                                 callback=self.video_about_parse))
        #video info
        title = response.xpath('//span[@id="eow-title"]/text()').extract()
        tag = response.xpath('./head/meta[@name="keywords"]/@content').extract()
        description = response.xpath(
            '//p[@id="eow-description"]/descendant-or-self::*/text()').extract()
        played = response.xpath('//div[@class="watch-view-count"]/text()').extract()
        category = response.xpath(
            '//div[@id="watch-description"]//ul[@class="content watch-info-tag-list"]/li/a/text()').extract()
        upload = response.xpath('//meta[@itemprop="datePublished"]/@content').extract()
        #thumbnail taken directly from the page
        thumb_url = response.xpath('//link[@itemprop="thumbnailUrl"]/@href').extract()
        #other info
        sts = re.search(r'\"sts\": ?(\d+)', response.body)
        ep_item = EpisodeItem()
        ep_item['show_id'] = show_id
        #the thumbnail could also be composed from the show_id and one of
        #['default', 'mqdefault', 'hqdefault', 'sddefault', 'maxresdefault']
        #ep_item['thumb_url'] = self.thumb_url_prefix + '/' + show_id + '/default.jpg'
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = title[0].strip()
        if tag:
            ep_item['tag'] = tag[0].replace(', ', '|')
        if description:
            ep_item['description'] = "\n".join(description)
        if played:
            pld = Util.normalize_played(played[0])
            if pld:
                ep_item['played'] = Util.normalize_played(played[0])
            else:
                ep_item['played'] = '0'
        if kw_id:
            ep_item['kw_id'] = kw_id
        if pg_id:
            ep_item['pg_id'] = pg_id
        if cat_id:
            ep_item['cat_id'] = cat_id
        if subject_id:
            ep_item['subject_id'] = subject_id
        if thumb_url:
            ep_item['thumb_url'] = thumb_url[0]
        if category:
            category = category[0].strip()
            #https://www.youtube.com/watch?v=lwy4qwaByVQ
            ep_item['category'] = category.replace('&', '|')
        if upload:
            upload = upload[0].strip()
            #datePublished may be in English ('%b %d, %Y') or Chinese ('%Y年%m月%d日') form
            struct_time = None
            try:
                struct_time = time.strptime(upload, '%b %d, %Y')
            except ValueError:
                try:
                    struct_time = time.strptime(upload, '%Y年%m月%d日')
                except ValueError:
                    pass
            if struct_time:
                time_str = time.strftime('%Y-%m-%d %H:%M:%S', struct_time)
                #time_str = "%s-%s-%s %s" % (struct_time.tm_year, struct_time.tm_mon, struct_time.tm_mday, time_str)
                ep_item['upload_time'] = time_str
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = Util.normalize_youtube_url(response.request.url)
        query = Util.encode({'video_id': ep_item['show_id'],
                             'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                             'sts': sts.groups()[0] if sts else ''})
        items.append(Request(url='http://www.youtube.com/get_video_info?' + query,
                             callback=self.video_other_info_parse,
                             meta={'item': ep_item}))
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
    return items
def parse_episode(self, response):
    try:
        log.msg('%s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        items = []
        sel = Selector(response)
        #owner
        owner = sel.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
            if owner_show_id in self.channel_exclude:
                log.msg("video owner excluded: %s" % owner_show_id)
                return
            items.append(Request(url=owner[0], callback=self.parse_owner))
        #video info
        #title = sel.xpath('//div[@class="base_info"]/h1/descendant-or-self::*/text()').extract()
        title = sel.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
        category = sel.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
        scripts = sel.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re('videoId = \'(\d+)\'')
        tag = scripts.re('tags="(.+)"')
        upload = sel.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
        description = sel.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
        vp_url = sel.xpath('//span[@id="videoTotalPV"]/../../@href').extract()
        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_showid(response.request.url)
        if video_id:
            ep_item['video_id'] = video_id[0]
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            t = "".join(title)
            t = t.strip("\n").strip()
            #ep_item['title'] = Util.strip_title("".join(title))
            ep_item['title'] = Util.strip_title(t)
        if tag:
            ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        if category:
            ep_item['category'] = category[0].replace(u'频道', '')
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            ep_item['description'] = description[0]
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        ep_item['cat_id'] = cat_id
        #if video_id:
        #    items.append(Request(url=self.vpaction_url+video_id[0], callback=self.parse_vpaction, meta={'item':ep_item}))
        if vp_url:
            items.append(Request(url=vp_url[0], callback=self.parse_vpaction,
                                 meta={'item': ep_item}))
        else:
            items.append(ep_item)
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
def video_parse(self, response):
    items = []
    try:
        kw_id = response.request.meta['kw_id'] if 'kw_id' in response.request.meta else None
        pg_id = response.request.meta['pg_id'] if 'pg_id' in response.request.meta else None
        cat_id = response.request.meta['cat_id'] if 'cat_id' in response.request.meta else None
        subject_id = response.request.meta['subject_id'] if 'subject_id' in response.request.meta else None
        #check video's category
        category_str = response.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
        category = None
        if category_str:
            category = category_str[0].replace(u'频道', '')
        if category:
            if category in self.category_exclude:
                log.msg("video category excluded: %s" % category)
                return
        owner = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
            if owner_show_id in self.channel_exclude:
                log.msg("video owner excluded: %s" % owner_show_id)
                return
        #episode info
        show_id = Util.get_showid(response.request.url)
        title = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::*/text()').extract()
        upload = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
        description = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
        scripts = response.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re('videoId = \'(\d+)\'')
        tag = scripts.re('tags="(.+)"')
        episode_item = EpisodeItem()
        if show_id:
            episode_item['show_id'] = show_id
        else:
            return
        if video_id:
            episode_item['video_id'] = video_id[0]
        if owner_show_id:
            episode_item['owner_show_id'] = owner_show_id
        if title:
            episode_item['title'] = Util.strip_title("".join(title))
        if tag:
            episode_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        if category:
            episode_item['category'] = category
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                episode_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            episode_item['description'] = description[0]
        episode_item['spider_id'] = self.spider_id
        episode_item['site_id'] = self.site_id
        episode_item['url'] = response.request.url
        episode_item['kw_id'] = kw_id
        episode_item['pg_id'] = pg_id
        episode_item['cat_id'] = cat_id
        episode_item['subject_id'] = subject_id
        if video_id:
            items.append(Request(url=self.vpaction_url + video_id[0],
                                 callback=self.vpaction_parse,
                                 meta={'episode_item': episode_item}))
        else:
            items.append(episode_item)
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)
    return items
def parse_episode(self, response):
    try:
        recommend = response.request.meta['recommend']
        log.msg('%s|recommend: %s' % (response.request.url, recommend))
        items = []
        #owner
        owner = response.xpath(
            '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
        owner_show_id = None
        if owner:
            owner_show_id = Util.get_owner(owner[0])
            if owner_show_id in self.channel_exclude:
                log.msg("video owner excluded: %s" % owner_show_id)
                return
        #check recommended video's category
        category = response.xpath(
            '//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
        cat = None
        if category:
            cat = category[0].replace(u'频道', '')
        if recommend and cat:
            if cat in self.cat_exclude:
                log.msg("video category excluded: %s" % cat)
                return
        #video info
        title = response.xpath(
            '//div[@class="base_info"]/h1/descendant-or-self::*/text()').extract()
        scripts = response.xpath('//script[@type="text/javascript"]')
        video_id = scripts.re('videoId = \'(\d+)\'')
        tag = scripts.re('tags="(.+)"')
        upload = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
        description = response.xpath(
            '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
        ep_item = EpisodeItem()
        ep_item['show_id'] = Util.get_showid(response.request.url)
        if video_id:
            ep_item['video_id'] = video_id[0]
        if owner_show_id:
            ep_item['owner_show_id'] = owner_show_id
        if title:
            ep_item['title'] = Util.strip_title("".join(title))
        if tag:
            ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
        if cat:
            ep_item['category'] = cat
        if upload:
            t = Util.get_upload_time(upload[0])
            if t:
                ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
        if description:
            ep_item['description'] = description[0]
        ep_item['spider_id'] = self.spider_id
        ep_item['site_id'] = self.site_id
        ep_item['url'] = response.request.url
        if video_id:
            items.append(Request(url=self.vpaction_url + video_id[0],
                                 callback=self.parse_vpaction,
                                 meta={'item': ep_item}))
        else:
            items.append(ep_item)
        #recommendation; video_id may be empty, so guard the lookup
        if not recommend and video_id:
            items.append(Request(url=self.ykrec_url + video_id[0],
                                 callback=self.parse_recommendation))
        return items
    except Exception as e:
        log.msg(traceback.format_exc(), level=log.ERROR)