def __init__(self):
    self.__db_mgr = DbManager.instance()
    # remap source categories onto the site's canonical category names
    self.__cdict = {
        u'电影': u'电影片花',
        u'电视剧': u'电视片花',
        u'综艺': u'综艺片花',
        u'动漫': u'动漫片花',
        u'自拍': u'其他',
        u'创意视频': u'其他',
        u'网剧': u'搞笑',
        u'拍客': u'搞笑',
        u'亲子': u'母婴',
        u'教育': u'公开课',
        u'原创': u'其他',
    }
    #self.__ddict = {}
    #self.__ddict['资讯'] = u'del'
    #self.__ddict['微电影'] = u'del'
    # categories to drop outright
    self.__dlist = [u'资讯', u'微电影']
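# -*- coding: utf-8 -*-
# Illustrative sketch (not project code): how the remap table above would be
# applied to an incoming category. Names here are hypothetical; the actual
# lookup site is elsewhere in the pipeline. Categories in the drop list are
# discarded, mapped ones are renamed, anything else passes through unchanged.
def map_category(cdict, dlist, cat):
    if cat in dlist:
        return None              # drop the video entirely
    return cdict.get(cat, cat)   # remap if known, else keep as-is

print map_category({u'电影': u'电影片花'}, [u'资讯'], u'电影')  # -> 电影片花
print map_category({u'电影': u'电影片花'}, [u'资讯'], u'资讯')  # -> None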
class iqiyi_search_video(Spider):
    name = 'iqiyi_search_video'
    pipelines = ['MysqlStorePipeline']
    spider_id = '32768'
    site_id = '5'
    allowed_domains = ["so.iqiyi.com", "www.iqiyi.com"]  # was misspelled 'allowed_domain'
    url_prefix = 'http://so.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    hottest_played_threshold = get_project_settings().get('HOTTEST_PLAYED_THRESHOLD')
    mgr = DbManager.instance()
    channel_exclude = mgr.get_channel_exclude()

    def __init__(self, cat_ids=None, keywords=None, *args, **kwargs):
        super(iqiyi_search_video, self).__init__(*args, **kwargs)
        if keywords:
            keywords = json.loads(keywords)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            keywords = self.mgr.get_keywords(st='video', site_name='iqiyi')
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._keywords = keywords if keywords else []

    def start_requests(self):
        try:
            items = []
            run_time = {'10min': 2, '30min': 3, '60min': 4, 'plus': 5, 'default': 0}
            pub_time = {'day': 1, 'week': 2, 'month': 3, 'default': 0}
            quality = {'high': 3, '720P': 4, 'super': 6, '1080P': 7, 'default': ''}
            sort = {'composite': 1, 'new': 4, 'played': 11}
            for kw in self._keywords:
                url = "%s/so/q_%s_ctg__t_%s_page_%s_p_%s_qc_%s_rd_%s_site_%s_m_%s_bitrate_%s" % \
                    (self.url_prefix, urllib2.quote(kw['keyword'].encode('utf8')),
                     run_time['default'], 1, 1, 0, pub_time['default'],
                     "iqiyi", sort['composite'], quality['default'])
                items.append(Request(url=url, callback=self.parse,
                                     meta={'page': 1, 'kw_id': kw['id']}))
            return items
        except Exception as e:  # was the legacy 'except Exception, e:' form
            log.msg(traceback.format_exc(), level=log.ERROR)
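# -*- coding: utf-8 -*-
# For reference, a minimal trace of one search URL coming out of the template
# in start_requests() above, with the same defaults (no run-time filter,
# page 1, composite sort). The keyword is illustrative.
import urllib2

url_prefix = 'http://so.iqiyi.com'
template = "%s/so/q_%s_ctg__t_%s_page_%s_p_%s_qc_%s_rd_%s_site_%s_m_%s_bitrate_%s"
url = template % (url_prefix, urllib2.quote(u'搞笑'.encode('utf8')),
                  0, 1, 1, 0, 0, "iqiyi", 1, '')
print url
# -> http://so.iqiyi.com/so/q_%E6%90%9E%E7%AC%91_ctg__t_0_page_1_p_1_qc_0_rd_0_site_iqiyi_m_1_bitrate_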
class QqCatSpider(Spider):
    name = "qq_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "4194304"
    site_id = "16"
    format_id = 2
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(QqCatSpider, self).__init__(*args, **kwargs)
        cat_urls = kwargs.get('cats')
        if cat_urls:
            cat_urls = json.loads(cat_urls)
        else:
            cat_urls = self.mgr.get_cat_url('qq')
        self._cat_urls = cat_urls if cat_urls else []

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                # parse_page() expects the whole category dict under
                # meta['cat'], so one request per category suffices (the
                # original also queued a redundant second request whose meta
                # parse_page never read)
                url = cat.pop('url')
                r = Request(url=url, callback=self.parse_page)
                r.meta.update({'cat': cat})
                items.append(r)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_page(self, response):
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            cat = response.request.meta['cat']
            items = []
            qq_v = response.xpath('//div[@class="mod_cont"]/ul/li')
            for v in qq_v:
                urls = v.xpath('./h6/a/@href').extract()
                titles = v.xpath('./h6/a/text()').extract()  # was '@text', which never matches
                thumb_urls = v.xpath('./a/img/@src').extract()
                durations = v.xpath('./a/div/span[@class="mod_version"]/text()').extract()
                playeds = v.xpath('./p/span/text()').extract()
                title = titles[0] if titles else None
                thumb_url = thumb_urls[0] if thumb_urls else None
                duration = Util.get_qq_duration(durations[0]) if durations else None
                played = Util.normalize_played(Util.normalize_vp(playeds[0])) if playeds else None
                if urls:
                    r = Request(url=urls[0], callback=self.parse_episode)
                    d = {
                        'title': title,
                        'thumb_url': thumb_url,
                        'duration': duration,
                        'played': played
                    }
                    d.update(cat)  # was 'd.update(order)', an undefined name
                    r.meta.update({'order': d})
                    items.append(r)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            order = response.request.meta['order']
            items = []
            #video info
            #tags = response.xpath('//p[@class="info_tags"]//a/@title').extract()
            #descriptions = response.xpath('//div[@class="info_summary cf"]/span/text()').extract()
            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_qq_showid(response.request.url)
            #if tags:
            #    ep_item['tag'] = Util.unquote(tags[0]).rstrip('|')
            #if descriptions:
            #    ep_item['description'] = descriptions[0]
            for k, v in order.items():
                if k == 'user':
                    ep_item['category'] = v
                elif k == 'show_id':
                    ep_item['owner_show_id'] = v
                else:
                    ep_item[k] = v
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['format_id'] = self.format_id
            items.append(ep_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
class tucao_cat(Spider):
    name = "tucao_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "7"
    site_id = "14"
    max_search_page = 1
    #request_url = "http://www.acfun.tv/dynamic/channel/1.aspx?channelId=%s&orderBy=0&pageSize=16"
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(tucao_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('tucao')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                items.append(Request(url=cat['url'], callback=self.parse_page,
                                     meta={'cat_name': cat['cat_name'],
                                           'audit': cat['audit'],
                                           'priority': cat['priority']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            #video items
            qy_v = response.xpath('//div[@class="list"]/ul/li')
            for v in qy_v:
                thumb = v.xpath('./div/a[@class="pic"]/img/@src').extract()
                url = v.xpath('./div/a[@class="pic"]/@href').extract()
                if url:
                    items.append(Request(url=url[0].strip(), callback=self.parse_episode,
                                         meta={'cat_name': cat_name, 'thumb': thumb,
                                               'audit': audit, 'priority': priority}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            show_id = Util.get_tucao_showid(response.request.url)
            title = response.xpath('//h1[@class="show_title"]/text()').extract()
            tags = response.xpath('//meta[@name="keywords"]/@content').extract()
            #video info
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tags:
                ep_item['tag'] = tags[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
class YoukuOrderSpider(Spider):
    name = "youku_order_all"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "256"
    site_id = "1"
    allowed_domains = ["i.youku.com", "www.youku.com", "v.youku.com"]
    url_prefix = 'http://i.youku.com'
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS/"
    forbidden_author_list = set()
    mgr = DbManager.instance()

    def __init__(self, orders=None, *args, **kwargs):
        super(YoukuOrderSpider, self).__init__(*args, **kwargs)
        self._orders = [
            {
                'url': 'http://i.youku.com/u/UMjk3OTcyMTM2/',
                'cust_para': {
                    'category': u'音乐',
                    'priority': '3',
                    'need_check': '1'
                }
            },
        ]
        with open('./crawler/data/blacklist', 'r') as f:
            for line in f.readlines():
                self.forbidden_author_list.add(line.strip().decode('utf-8'))

    def start_requests(self):
        try:
            items = []
            for order in self._orders:
                cust_para = order['cust_para'] if 'cust_para' in order else {}
                items.append(Request(url=order['url'], callback=self.parse,
                                     meta={'cust_para': cust_para}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        try:
            log.msg(response.request.url, level=log.INFO)
            cust_para = response.request.meta['cust_para']
            items = []
            user_item = UserItem()
            #owner id
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                return
            #user profile
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
                played = up.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
                fans = up.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()
                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_vp(played[0])
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])
            #youku profile
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
                if intro:
                    user_item['intro'] = ''.join(intro)
            #count
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = '0'
            if yh:
                video_count = yh.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]
            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            items.append(user_item)
            #videos
            items.append(Request(url=response.request.url + "videos",
                                 callback=self.parse_video_page,
                                 meta={'page': 1, 'cust_para': cust_para}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_video_page(self, response):
        try:
            page = response.request.meta['page']
            cust_para = response.request.meta['cust_para']
            log.msg('%s: %s' % (response.request.url, page))
            items = []
            #get videos
            yk_v = response.xpath('//div[@class="yk-col4"]/div')
            for v in yk_v:
                url = v.xpath('./div[@class="v-link"]/a/@href').extract()
                if url:
                    items.append(Request(url=url[0], callback=self.parse_episode,
                                         meta={'cust_para': cust_para}))
            #get last_str and ajax_url
            last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
            ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')
            #request sibling pages
            if ajax_url:
                sibling_page = (3 * page - 1, 3 * page)
                for p in sibling_page:
                    s = last_str[0] if last_str else u''
                    para = {
                        "v_page": str(page),
                        "page_num": str(p),
                        "page_order": "1",
                        "last_str": s
                    }
                    items.append(FormRequest(url=self.url_prefix + ajax_url[0] + "fun_ajaxload/",
                                             formdata=para, method='GET',
                                             callback=self.parse_video_page,
                                             meta={'page': page, 'cust_para': cust_para}))
            #request next page
            next_page = response.xpath('//ul[@class="YK-pages"]/li[@class="next"]/a/@href').extract()
            if next_page:
                items.append(Request(url=self.url_prefix + next_page[0],
                                     callback=self.parse_video_page,
                                     meta={'page': page + 1, 'cust_para': cust_para}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def content_is_forbidden(self, content):
        # True if any blacklisted author keyword appears in the content
        return any(keyword in content for keyword in self.forbidden_author_list)

    def parse_episode(self, response):
        try:
            cust_para = response.request.meta['cust_para']
            log.msg('%s: %s' % (response.request.url, cust_para))
            items = []
            #owner
            owner = response.xpath('//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
            #video info
            title = response.xpath('//div[@class="base_info"]/h1/descendant-or-self::*/text()').extract()
            category = response.xpath('//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath('//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
            description = response.xpath('//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = Util.strip_title("".join(title))
                if 'need_check' in cust_para:
                    if self.content_is_forbidden(ep_item['title']):
                        log.msg('video [ %s ] is in blacklist!' % ep_item['show_id'])
                        return items
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if 'category' in cust_para:
                ep_item['category'] = cust_para['category']
            elif category:
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = description[0]
            if 'priority' in cust_para:
                ep_item['priority'] = cust_para['priority']
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            if video_id:
                items.append(Request(url=self.vpaction_url + video_id[0],
                                     callback=self.parse_vpaction,
                                     meta={'item': ep_item}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_vpaction(self, response):
        try:
            item = response.request.meta['item']
            vp = response.xpath('//div[@id="videodetailInfo"]/ul/li').re(u'<label>总播放数:</label><span.*>(.+)</span>')
            if vp:
                item['played'] = Util.normalize_vp(vp[0])
            show_id = item['show_id']
            return Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        try:
            item = response.request.meta['item']
            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"][0]["seconds"])))
            if playlength:
                item['duration'] = playlength
            return item
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
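# Illustrative note (not project code): Youku backs each visible list page
# with three ajax sub-pages. parse_video_page() above parses the HTML page
# itself, then fetches the two sibling sub-pages via the fun_ajaxload
# endpoint. A standalone trace of the sibling computation:
def sibling_pages(page):
    return (3 * page - 1, 3 * page)

print sibling_pages(1)  # (2, 3) -- sub-page 1 is the HTML page itself
print sibling_pages(2)  # (5, 6)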
class iqiyi_xiaomi(Spider):
    name = "iqiyi_xiaomi"
    pipelines = ['MysqlStorePipeline']
    spider_id = "65536"
    site_id = "5"  #iqiyi
    allowed_domains = ["list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com"]
    url_prefix = 'http://list.iqiyi.com'
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    max_search_page = 1
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(iqiyi_xiaomi, self).__init__(*args, **kwargs)
        self._cat_urls = [
            {'url': 'http://list.iqiyi.com/www/25/20031-------------4-1-2-iqiyi-1-.html',
             'id': '10000', 'name': u'热点'},
            {'url': 'http://list.iqiyi.com/www/25/21314-------------4-1-2-iqiyi-1-.html',
             'id': '10000', 'name': u'新闻'},
            {'url': 'http://list.iqiyi.com/www/25/21739-------------4-1-2-iqiyi-1-.html',
             'id': '10000', 'name': u'新闻'},
            {'url': 'http://list.iqiyi.com/www/25/21740-------------4-1-2-iqiyi-1-.html',
             'id': '10000', 'name': u'新闻'},
        ]

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                items.append(Request(url=cat['url'], callback=self.parse_page,
                                     meta={'page': 1, 'cat_id': cat['id'],
                                           'cat_name': cat['name']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            page = response.request.meta['page']
            cat_id = response.request.meta['cat_id']
            cat_name = response.request.meta['cat_name']
            if int(page) > int(self.max_search_page):
                return
            items = []
            #video items
            qy_v = response.xpath('//div[@class="wrapper-piclist"]/ul/li/div[1]')
            for v in qy_v:
                thumb = v.xpath('./a/img/@src').extract()
                url = v.xpath('./a/@href').extract()
                if url:  # guard: some tiles carry no link
                    items.append(Request(url=url[0].strip(), callback=self.parse_episode,
                                         meta={'cat_id': cat_id, 'cat_name': cat_name,
                                               'thumb': thumb}))
            #pages
            next_page = response.xpath("//div[@class='mod-page']/a[text()='%s']/@href" % u'下一页').extract()
            if next_page:
                items.append(Request(url=self.url_prefix + next_page[0],
                                     callback=self.parse_page,
                                     meta={'page': page + 1, 'cat_id': cat_id,
                                           'cat_name': cat_name}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            items = []
            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            #a space may or may not follow the colon: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))
            #video info: the page markup varies, so fall through several known
            #layouts until one of them matches
            title = response.xpath('//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract()
            category = response.xpath('//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()').extract()
            upload_time = response.xpath('//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath('//div[@class="crumb_bar"]/span[2]/span/text()').extract()
            tag = response.xpath('//span[@id="widget-videotag"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//div[@class="crumb_bar"]/span[2]/a/text()').extract()
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            #if category:
            #    ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = '0'
            ep_item['priority'] = '8'
            if albumid:
                items.append(Request(url=self.playlength_url + albumid[0],
                                     callback=self.parse_playlength,
                                     meta={'item': ep_item, 'albumid': albumid[0]}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        try:
            log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']
            items = []
            #the response is a JS assignment: strip everything before "AlbumInfo="
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            jinfo = json.loads(msg[index:])
            playlength = jinfo["data"]["playLength"]
            #only keep short clips (under ten minutes)
            if playlength and int(playlength) < 600:
                item['duration'] = str(playlength)
                items.append(Request(url=self.playnum_url + albumid + "/?qyid=",
                                     callback=self.parse_playnum,
                                     meta={'item': item}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        try:
            item = response.request.meta['item']
            items = []
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            if tplaynum:
                item['played'] = str(tplaynum[0])
                items.append(item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
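# Illustrative sketch (not project code): parse_playlength() above strips a
# JS assignment prefix before handing the rest to json.loads. The payload
# below is made up, but has the shape the parser expects.
import json

body = 'var tvInfoJs={}; var AlbumInfo={"data": {"playLength": 123}}'
index = body.find("AlbumInfo=") + len("AlbumInfo=")
info = json.loads(body[index:])
print info["data"]["playLength"]  # -> 123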
class ku6_search_video(Spider):
    name = "ku6_search_video"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "16384"  #ku6_search_video
    site_id = "6"  #ku6
    allowed_domains = ["so.ku6.com", "v.ku6.com", "v3.stat.ku6.com"]
    url_prefix = 'http://v.ku6.com/fetchVideo4Player/'
    url_playnum = 'http://v3.stat.ku6.com/dostatv.do?method=getVideoPlayCount&n=gotPlayCounts&v='
    #hottest_played_threshold = get_project_settings().get('ORDERED_PLAYED_THRESHOLD')
    mgr = DbManager.instance()

    def __init__(self, cat_ids=None, keywords=None, *args, **kwargs):
        super(ku6_search_video, self).__init__(*args, **kwargs)
        if keywords:
            keywords = json.loads(keywords)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            keywords = self.mgr.get_keywords(st='video', site_name='ku6')
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._keywords = keywords if keywords else []

    def start_requests(self):
        try:
            items = []
            for kw in self._keywords:
                kw_id = kw['id']
                # URL-encode the keyword: it is typically non-ASCII and the
                # original concatenated it into the URL raw
                word = urllib2.quote(kw['keyword'].encode('utf8'))
                cat_id = kw['ext_cat_id']
                turl = 'http://so.ku6.com/search?q=' + word + '&categoryid=' + str(cat_id)
                items.append(Request(url=turl, callback=self.parse_page,
                                     meta={'page': 1, 'kw_id': kw_id}))
                # the same search, re-sorted by upload time and by view count
                items.append(Request(url=turl + '&sort=uploadtime', callback=self.parse_page,
                                     meta={'page': 1, 'kw_id': kw_id}))
                items.append(Request(url=turl + '&sort=viewcount', callback=self.parse_page,
                                     meta={'page': 1, 'kw_id': kw_id}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            log.msg('parse_page: %s' % response.request.url)
            page = response.request.meta['page']
            kw_id = response.request.meta['kw_id']
            if int(page) > int(self.max_search_page):
                return
            items = []
            #video items
            titems = response.xpath('//div[@id="search_list"]/div[2]/div[2]/ul[1]/li')
            for item in titems:
                turl = item.xpath('./h3[1]/a/@href').extract()
                if turl:
                    show_id = Util.get_ku6_showid(turl[0])
                    items.append(Request(url=turl[0].strip(), callback=self.parse,
                                         meta={'kw_id': kw_id, 'show_id': show_id}))
            #pages
            next_page = response.xpath("//div[@id='search_list']/div[2]/div[2]/div/a[text()='%s']/@href" % u'下一页').extract()
            if next_page:
                items.append(Request(url=next_page[0], callback=self.parse_page,
                                     meta={'page': page + 1, 'kw_id': kw_id}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        #fetch the player-facing JSON for this show id
        try:
            kw_id = response.request.meta['kw_id']
            show_id = response.request.meta['show_id']
            url1 = self.url_prefix + str(show_id) + ".html"
            return [Request(url=url1, callback=self.parse_second,
                            meta={'show_id': show_id, 'kw_id': kw_id})]
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_second(self, response):
        try:
            kw_id = response.request.meta['kw_id']
            show_id = response.request.meta['show_id']
            items = []
            #info
            jinfo = json.loads(response.body)
            title = jinfo['data']['t']
            tag = jinfo['data']['tag'].replace(' ', '|').replace(',', '|').strip('|')
            upload_time = Util.timestamp2datetime(jinfo['data']['uploadtime'])
            description = jinfo['data']['desc']
            thumb_url = jinfo['data']['picpath']
            # 'vtime' may carry a comma-separated pair; keep the first field
            duration = str(jinfo['data']['vtime']).split(',')[0]
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title
            ep_item['show_id'] = show_id
            turl = "http://v.ku6.com/show/" + show_id + ".html"
            ep_item['url'] = turl
            if tag:
                ep_item['tag'] = tag
            if upload_time:
                ep_item['upload_time'] = upload_time
            if thumb_url:
                ep_item['thumb_url'] = thumb_url
            if duration:
                ep_item['duration'] = duration
            ep_item['kw_id'] = kw_id
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            items.append(Request(url=turl, callback=self.parse_episode,
                                 meta={'item': ep_item}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            item = response.request.meta['item']
            items = []
            #category
            tcategory = response.xpath('//div[@class="ckl_conleftop"]/div[1]/span[1]/a[1]/text()').extract()
            item['category'] = tcategory[0].strip() if tcategory else ""
            turl = self.url_playnum + item['show_id']
            items.append(Request(url=turl, callback=self.parse_playnum,
                                 meta={'item': item}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        try:
            log.msg('parse_playnum %s' % response.request.url)
            items = []
            item = response.request.meta['item']
            # the stat response embeds ...,count:"12345",...
            # (the original pattern ',count:"(\d+)?' had a stray '?' that made
            # the capture group optional and dropped the closing quote)
            m = re.search(r',count:"(\d+)"', response.body)
            if m:
                item['played'] = m.group(1)
                items.append(item)
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
        return items
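# Illustrative sketch (not project code): the play-count extraction used by
# parse_playnum() above, run against a made-up ku6 stat response of the
# expected shape.
import re

body = 'gotPlayCounts({v:"abcd",count:"67890"})'
m = re.search(r',count:"(\d+)"', body)
if m:
    print m.group(1)  # -> 67890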
class YoutubeHottestSpider(Spider):
    name = "youtube_hottest"
    pipelines = ['MysqlStorePipeline']
    spider_id = "4"
    site_id = "2"
    allowed_domains = ["www.youtube.com"]
    url_prefix = 'https://www.youtube.com'
    hottest_played_threshold = get_project_settings().get('HOTTEST_PLAYED_THRESHOLD')
    hottest_time_threshold = get_project_settings().get('HOTTEST_TIME_THRESHOLD')
    mgr = DbManager.instance()

    def __init__(self, orders=None, *args, **kwargs):
        super(YoutubeHottestSpider, self).__init__(*args, **kwargs)
        if orders:
            orders = json.loads(orders)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            orders = self.mgr.get_ordered_url(site_name='youtube')
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._orders = orders if orders else []

    def start_requests(self):
        try:
            items = []
            for i in self._orders:
                items.append(Request(url=i['url'], callback=self.parse,
                                     meta={'audit': i['audit'], 'priority': i['priority']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        try:
            log.msg(response.request.url, level=log.INFO)
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            category = [r['user'] for r in self._orders if r['url'] == response.request.url]
            if not category:
                category = ['other']
            items = []
            items.append(Request(url=response.request.url + "/videos",
                                 callback=self.parse_video,
                                 meta={'category': category[0], 'audit': audit,
                                       'priority': priority}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_video(self, response):
        try:
            log.msg(response.request.url, level=log.INFO)
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            category = response.request.meta['category']
            items = []
            #content
            #content = response.xpath('//div[@id="video-page-content"]/ul/li')
            content = response.xpath('//ul[@id="channels-browse-content-grid"]/li')
            self.parse_page_content(items, content, category, audit, priority)
            #next page
            #next_page = response.xpath('//div[@id="video-page-content"]/button/@data-uix-load-more-href').extract()
            next_page = response.xpath('//button/@data-uix-load-more-href').extract()
            if next_page:
                items.append(Request(url=self.url_prefix + next_page[0],
                                     callback=self.parse_more_video,
                                     meta={'page': 2, 'category': category,
                                           'audit': audit, 'priority': priority}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_more_video(self, response):
        try:
            log.msg(response.request.url, level=log.INFO)
            page = response.request.meta['page']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            category = response.request.meta['category']
            if page > self.max_search_page:
                return
            items = []
            #the "load more" endpoint answers with JSON-wrapped HTML fragments
            body = json.loads(response.body)
            self.parse_page_content(items,
                                    Selector(text=body['content_html']).xpath('./body/li'),
                                    category, audit, priority)
            #next page
            next_page = Selector(text=body['load_more_widget_html']).xpath('//button/@data-uix-load-more-href').extract()
            if next_page:
                items.append(Request(url=self.url_prefix + next_page[0],
                                     callback=self.parse_more_video,
                                     meta={'page': page + 1, 'category': category,
                                           'audit': audit, 'priority': priority}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page_content(self, items, content, category, audit, priority):
        try:
            for s in content:
                url = s.xpath('./div/div/div[@class="yt-lockup-thumbnail"]/span/a/@href').extract()
                thumb_url = s.xpath('./div/div/div[@class="yt-lockup-thumbnail"]/span/a/span/span/span/img/@src').extract()
                views = s.xpath('./div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li/text()').re('([\d|,]*) views')
                upload_time = s.xpath('./div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li[@class="yt-lockup-deemphasized-text"]/text()').extract()
                '''
                if not views or int(Util.normalize_played(views[0])) < int(self.hottest_played_threshold):
                    #log.msg('discard played: %s' % url[0])
                    continue
                if not upload_time or Util.get_youtube_upload_time(upload_time[0].strip()) >= int(self.hottest_time_threshold):
                    #log.msg('discard upload_time: %s' % url[0])
                    continue
                '''
                if url:
                    items.append(Request(url=self.url_prefix + url[0],
                                         callback=self.parse_episode,
                                         meta={'thumb_url': thumb_url,
                                               'upload_time': upload_time,
                                               'category': category,
                                               'audit': audit,
                                               'priority': priority}))
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('%s' % response.request.url)
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            thumb_url = response.request.meta['thumb_url']
            upload_time = response.request.meta['upload_time']
            category = response.request.meta['category']
            items = []
            #owner
            owner = response.xpath('//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath('//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner and owner_url:  # guard: both must be present
                owner_show_id = owner[0]
                items.append(Request(url=self.url_prefix + owner_url[0] + "/about",
                                     callback=self.parse_about))
            #video info
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            #category = response.xpath('//p[@id="eow-category"]/a/text()').extract()
            tag = response.xpath('./head/meta[@name="keywords"]/@content').extract()
            #upload = response.xpath('//p[@id="watch-uploader-info"]/strong/text()').extract()
            description = response.xpath('//p[@id="eow-description"]/descendant-or-self::*/text()').extract()
            played = response.xpath('//div[@class="watch-view-count"]/text()').extract()
            #other info
            sts = re.search(r'\"sts\": ?(\d+)', response.body)
            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                #ep_item['category'] = category[0].replace('&', '|')
                ep_item['category'] = category
            '''
            if upload:
                ptime = Util.get_youtube_publish(upload[0])
                if ptime:
                    ep_item['upload_time'] = ptime
            '''
            if upload_time:
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                ep_item['played'] = Util.normalize_played(played[0])
            if audit:
                ep_item['audit'] = audit
            if priority:
                ep_item['priority'] = priority
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['format_id'] = 2
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)
            query = Util.encode({'video_id': ep_item['show_id'],
                                 'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                                 'sts': sts.groups()[0] if sts else ''})
            items.append(Request(url='http://www.youtube.com/get_video_info?' + query,
                                 callback=self.parse_other_info,
                                 meta={'item': ep_item}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_other_info(self, response):
        try:
            log.msg('%s' % response.request.url)
            item = response.request.meta['item']
            items = []
            #duration
            duration = re.search(r'length_seconds=(\d+)', response.body)
            if duration:
                item['duration'] = duration.groups()[0]
            items.append(item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_about(self, response):
        try:
            log.msg(response.request.url, level=log.INFO)
            items = []
            show_id = response.xpath('//meta[@itemprop="channelId"]/@content').extract()
            user_name = response.xpath('//span[@class="qualified-channel-title-text"]/a/text()').extract()
            fans = response.xpath('//span[@class="about-stat"]').re(
                re.compile(r'<span.*>.*<b>([\d|,]*)</b>.*subscribers.*</span>', re.S))
            played = response.xpath('//span[@class="about-stat"]').re(
                re.compile(r'<span.*>.*<b>([\d|,]*)</b>.*views.*</span>', re.S))
            intro = response.xpath('//div[@class="about-description branded-page-box-padding"]/descendant-or-self::*/text()').extract()
            if show_id:
                user_item = UserItem()
                user_item['show_id'] = show_id[0]
                if user_name:
                    user_item['user_name'] = user_name[0]
                if fans:
                    user_item['fans'] = Util.normalize_played(fans[0])
                if played:
                    user_item['played'] = Util.normalize_played(played[0])
                if intro:
                    user_item['intro'] = "".join(intro).strip()
                user_item['spider_id'] = self.spider_id
                user_item['site_id'] = self.site_id
                user_item['url'] = response.request.url[:-len('/about')]
                items.append(user_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
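# Illustrative sketch (not project code): Util.encode() is project code, but
# the stdlib urllib.urlencode produces the same kind of query string for the
# get_video_info request built in parse_episode() above. The id and sts
# values here are made up.
import urllib

query = urllib.urlencode({
    'video_id': 'dQw4w9WgXcQ',  # hypothetical video id
    'eurl': 'https://youtube.googleapis.com/v/' + 'dQw4w9WgXcQ',
    'sts': '16804',             # hypothetical sts scraped from the page
})
print 'http://www.youtube.com/get_video_info?' + query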
class YoukuOrderSpider(Spider):
    name = "youku_order"
    pipelines = ['MysqlStorePipeline']
    spider_id = "256"
    site_id = "1"
    format_id = 2
    #allowed_domains = ["www.youku.com", "v.youku.com", "i.youku.com", "index.youku.com", "play.youku.com"]
    url_prefix = 'http://i.youku.com'
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(YoukuOrderSpider, self).__init__(*args, **kwargs)
        orders = kwargs.get('orders')
        if orders:
            orders = json.loads(orders)
        else:
            orders = self.mgr.get_ordered_url(site_name='youku')
        self._orders = orders if orders else []

    def start_requests(self):
        try:
            items = []
            logging.log(logging.DEBUG, self._orders)  # was a bare debug print
            for order in self._orders:
                items.append(Request(url=order['url'], callback=self.parse,
                                     meta={'audit': order['audit'],
                                           'cat_name': order['user'],
                                           'show_id': order['show_id'],
                                           'priority': order['priority']}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse(self, response):
        try:
            logging.log(logging.INFO, "parse:%s" % response.request.url)
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']
            items = []
            #items.extend(self.parse_owner(response))
            v_url = response.request.url
            if not v_url.endswith('/videos'):
                v_url = v_url + "videos" if v_url.endswith('/') else v_url + "/videos"
            items.append(Request(url=v_url, callback=self.parse_video_page,
                                 meta={'audit': audit, 'cat_name': cat_name,
                                       'show_id': show_id, 'priority': priority}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_video_page(self, response):
        try:
            # default ordering is "most recently published"
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']
            page = 1
            items = []
            #get videos
            yk_v = response.xpath('//div[@class="yk-col4"]/div')
            for v in yk_v:
                url = v.xpath('./div[@class="v-link"]/a/@href').extract()
                thumb_urls = v.xpath('./div/div[@class="v-thumb"]/img/@src').extract()
                thumb_url = None
                if thumb_urls:
                    thumb_url = thumb_urls[0]
                    if thumb_url == 'http://g1.ykimg.com/':  # placeholder image
                        thumb_url = None
                pl = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-entry"]/span[@class="v-num"]/text()').extract()
                played = int(Util.normalize_played(pl[0])) if pl else None
                if url:
                    items.append(Request(url=url[0], callback=self.parse_episode,
                                         meta={'audit': audit, 'thumb_url': thumb_url,
                                               'played': played, 'cat_name': cat_name,
                                               'show_id': show_id, 'priority': priority}))
            #get last_str and ajax_url
            last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
            ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')
            #request sibling pages
            if ajax_url:
                sibling_page = (3 * page - 1, 3 * page)
                for p in sibling_page:
                    s = last_str[0] if last_str else u''
                    para = {
                        "v_page": str(page),
                        "page_num": str(p),
                        "page_order": "1",
                        "last_str": s
                    }
                    items.append(FormRequest(url=self.url_prefix + ajax_url[0] + "fun_ajaxload/",
                                             formdata=para, method='GET',
                                             callback=self.parse_video_page,
                                             meta={'audit': audit, 'cat_name': cat_name,
                                                   'show_id': show_id, 'priority': priority}))
            #request next page
            '''
            next_page = response.xpath('//ul[@class="YK-pages"]/li[@class="next"]/a/@href').extract()
            if next_page:
                items.append(Request(url=self.url_prefix + next_page[0],
                                     callback=self.parse_video_page,
                                     meta={'page': page + 1}))
            '''
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            audit = response.request.meta['audit']
            thumb_url = response.request.meta['thumb_url']
            played = response.request.meta['played']
            cat_name = response.request.meta['cat_name']
            owner_show_id = response.request.meta['show_id']
            priority = response.request.meta['priority']
            items = []
            #owner
            owner = response.xpath('//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                #items.append(Request(url=owner[0], callback=self.parse_owner))
            #video info
            title = response.xpath('//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
            #category = response.xpath('//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath('//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
            description = response.xpath('//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
            vp_url = response.xpath('//span[@id="videoTotalPV"]/../../@href').extract()
            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title).strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            #if category:
            #    ep_item['category'] = category[0].replace(u'频道', '')
            ep_item['category'] = cat_name
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = description[0]
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['thumb_url'] = thumb_url
            ep_item['played'] = played
            ep_item['priority'] = priority
            if vp_url:
                items.append(Request(url=vp_url[0], callback=self.parse_vpaction,
                                     meta={'item': ep_item}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        try:
            logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
            item = response.request.meta['item']
            vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                item['played'] = Util.normalize_played(Util.normalize_vp(vp[0].replace('总播放:', '')))
            show_id = item['show_id']
            return Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength, meta={'item': item})
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        try:
            logging.log(logging.INFO, 'playlength:%s' % response.request.url)
            item = response.request.meta['item']
            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if playlength:
                item['duration'] = playlength
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_owner(self, response):
        try:
            logging.log(logging.INFO, "owner:%s" % response.request.url)
            show_id = response.request.meta['show_id']
            items = []
            user_item = UserItem()
            #owner id
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            #show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            user_item['show_id'] = show_id
            #user profile
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
                played = up.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
                fans = up.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()
                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_played(Util.normalize_vp(played[0]))
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])
            #youku profile
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
                if intro:
                    user_item['intro'] = ''.join(intro)
            #count
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = None
            if yh:
                video_count = yh.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]
            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url
            items.append(user_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
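# -*- coding: utf-8 -*-
# Illustrative only: Scrapy passes -a arguments to the spider as keyword
# arguments, so `scrapy crawl youku_order -a orders='[...]'` reaches the
# constructor above roughly like this. Field names match what
# start_requests() reads; the values are made up.
import json

orders = json.dumps([{
    'url': 'http://i.youku.com/u/UMjk3OTcyMTM2/',  # hypothetical channel
    'audit': '1',
    'user': u'音乐',
    'show_id': 'UMjk3OTcyMTM2',
    'priority': '3',
}])
spider = YoukuOrderSpider(orders=orders)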
class letv_cat(Spider):
    name = "letv_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "524288"
    site_id = "15"
    max_search_page = 1
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(letv_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        self._page_api = "http://list.le.com/apin/chandata.json?c=%s&d=%s&md=%s&o=%s&p=%s&t=%s"
        self._le_url = "http://www.le.com/ptv/vplay/%s.html"
        self._max_page = 5
        try:
            self._cat_urls = self.mgr.get_cat_url('letv')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                items.append(Request(url=cat['url'], callback=self.parse_page,
                                     meta={'cat_name': cat['cat_name'],
                                           'audit': cat['audit'],
                                           'priority': cat['priority']}))
                # also walk the JSON page API for the first _max_page pages,
                # reusing the key/value pairs embedded in the list URL
                ret = self.parse_info_from_url(cat['url'])
                for p in range(self._max_page):
                    url = self._page_api % (ret['c'], ret['d'], ret['md'],
                                            ret['o'], str(p + 1), ret['t'])
                    items.append(Request(url=url, callback=self.parse_page_json,
                                         meta={'cat_name': cat['cat_name'],
                                               'audit': cat['audit'],
                                               'priority': cat['priority']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page_json(self, response):
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            json_data = json.loads(response.body)
            for item in json_data['data_list']:
                vid = item['vid']
                url = self._le_url % (vid)
                lens = item['duration']
                images = item['images']
                # wrapped in a list (and defaulted to None) so parse_episode()
                # can index it the same way as the HTML path's extract() result
                thumb = [images['180*135']] if '180*135' in images else None
                items.append(Request(url=url, callback=self.parse_episode,
                                     meta={'cat_name': cat_name, 'thumb': thumb,
                                           'audit': audit, 'lens': lens,
                                           'priority': priority}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            #video items
            qy_v = response.xpath('//div[@class="layout"]/dl')
            for v in qy_v:
                try:
                    thumb = v.xpath('./dt/a/img/@src').extract()
                    url = v.xpath('./dt/a/@href').extract()
                    # the original indexed extract()[0] before this try, so a
                    # tile without a duration badge aborted the whole page
                    lens = v.xpath('./dt/a/span[@class="number_bg"]/text()').extract()
                    if not lens:
                        lens = 0
                    else:
                        # badge is "mm:ss"; convert to seconds
                        a, b = lens[0].split(':')
                        lens = int(a) * 60 + int(b)
                    if url:
                        items.append(Request(url=url[0].strip(), callback=self.parse_episode,
                                             meta={'cat_name': cat_name, 'thumb': thumb,
                                                   'audit': audit, 'lens': lens,
                                                   'priority': priority}))
                except Exception as e:
                    continue
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            lens = response.request.meta['lens']
            priority = response.request.meta['priority']
            items = []
            #show_id
            show_id = Util.get_letv_showid(response.request.url)
            albumid = response.selector.re(re.compile(r'pid: ?(\d+)'))
            #video info
            title = response.xpath('//meta[@name="irTitle"]/@content').extract()
            upload_time = response.xpath('//ul[@class="info_list"]//em[@id="video_time"]/text()').extract()
            tag_sel = response.xpath('//meta[@name="keywords"]/@content').extract()
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0]
            if show_id:
                ep_item['show_id'] = show_id
            # the keywords meta starts with the title; slice it off, then
            # split the remainder on spaces and commas
            if tag_sel and title:
                tag_str = tag_sel[0][len(title[0]) + 1:]
                if tag_str:
                    tag_list = []
                    for item_space in tag_str.split(' '):
                        for item_comma in item_space.split(','):
                            tag_list.append(item_comma)
                    ep_item['tag'] = "|".join([t.strip() for t in tag_list])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_info_from_url(self, url):
        result = {}
        keys = ['c', 't', 'md', 'o', 'd', 'p']
        info_str = url.split('/')[-1].split('.')[0]
        for item in info_str.split('_'):
            for key in keys:
                if item[0:len(key)] == key:
                    result[key] = item[len(key):]
        return result
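# Standalone trace (not project code) of the parse_info_from_url() logic
# above, run against a made-up letv list URL of the shape it expects. The
# extracted keys are the ones the chandata.json page API needs.
def parse_info_from_url(url):
    result = {}
    keys = ['c', 't', 'md', 'o', 'd', 'p']
    info_str = url.split('/')[-1].split('.')[0]
    for item in info_str.split('_'):
        for key in keys:
            if item[0:len(key)] == key:
                result[key] = item[len(key):]
    return result

info = parse_info_from_url('http://list.le.com/listn/c11_d2_md_o1_p1_t_.html')
print info['c'], info['d'], info['o'], info['p']  # -> 11 2 1 1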
class toutiao_video(Spider):
    name = "toutiao_video"
    pipelines = ['MysqlStorePipeline']
    spider_id = "123456"
    site_id = "101"
    #allowed_domains = ["list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com"]
    #url_prefix = 'http://list.iqiyi.com'
    # restored from the commented-out header: parse_episode() and
    # parse_playlength() below reference these two endpoints
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    url_first = "http://toutiao.com/api/article/recent/?source=2&category=video&as=A165771AD802ED5&cp=57A8D2FE6D658E1&_=%s"
    url_second = "http://toutiao.com/api/article/recent/?source=2&count=20&category=video&max_behot_time=%s&utm_source=toutiao&offset=0&as=A1A5A75A8882EDC&cp=57A852EE9D5C6E1&max_create_time=%s&_=%s"
    max_search_page = 1000
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(toutiao_video, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = [""]
            #self._cat_urls = self.mgr.get_ordered_url(site_name='iqiyi')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        try:
            items = []
            url = self.url_first % int(time.time())
            items.append(Request(url=url, callback=self.parse_first))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_first(self, response):
        try:
            items = []
            user_item = UserItem()
            data = json.loads(response.body)
            print data
            # NOTE: debug short-circuit. Everything after this return is dead
            # code that still references names which are never extracted
            # (show_id, tag, upload_time, cat_name, ...); this parser was
            # left unfinished.
            return items
            has_more = data.get("has_more")
            message = data.get("message")
            max_behot_time = data.get("max_behot_time")
            data = data.get("data")
            if data:
                for it in data:
                    ep_item = EpisodeItem()
                    ep_item['title'] = it["title"]
                    ep_item['show_id'] = show_id
                    ep_item['tag'] = "|".join([t.strip() for t in tag])
                    ep_item['upload_time'] = upload_time[0].strip()
                    if category:
                        ep_item['category'] = category[0].strip()
                    if thumb_url:
                        ep_item['thumb_url'] = thumb_url[0].strip()
                    ep_item['spider_id'] = self.spider_id
                    ep_item['site_id'] = self.site_id
                    ep_item['url'] = response.request.url
                    #ep_item['cat_id'] = cat_id
                    ep_item['category'] = cat_name
                    ep_item['format_id'] = '2'
                    ep_item['audit'] = audit
                    ep_item['priority'] = priority
            print type(data)
            #items.append(Request(url=urls, callback=self.parse_page))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            items = []
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            #video items
            qy_v = response.xpath('//div[@class="wrap-customAuto-ht "]/ul/li/div[1]')
            for v in qy_v:
                thumb = v.xpath('./a/img/@src').extract()
                url = v.xpath('./a/@href').extract()
                if url:  # guard: some tiles carry no link
                    items.append(Request(url=url[0].strip(), callback=self.parse_episode,
                                         meta={'thumb': thumb, 'cat_name': cat_name,
                                               'audit': audit, 'priority': priority}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            thumb_url = response.request.meta['thumb']
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            #a space may or may not follow the colon: "albumId:326754200" or "albumId: 326754200"
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))
            #video info: the page markup varies, so fall through several known
            #layouts until one of them matches
            title = response.xpath('//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract()
            category = response.xpath('//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract()
            if not category:
                category = response.xpath('//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()').extract()
            upload_time = response.xpath('//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath('//div[@class="crumb_bar"]/span[2]/span/text()').extract()
            tag = response.xpath('//span[@id="widget-videotag"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//div[@class="crumb_bar"]/span[2]/a/text()').extract()
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            #if category:
            #    ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            #ep_item['cat_id'] = cat_id
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            if albumid:
                items.append(Request(url=self.playlength_url + albumid[0],
                                     callback=self.parse_playlength,
                                     meta={'item': ep_item, 'albumid': albumid[0]}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        try:
            log.msg('parse_playlength ,%s' % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']
            items = []
            #the response is a JS assignment: strip everything before "AlbumInfo="
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            jinfo = json.loads(msg[index:])
            playlength = jinfo["data"]["playLength"]
            #unlike iqiyi_xiaomi, no duration ceiling is applied here
            #if playlength:
            #    if int(playlength) < 600:
            #        item['duration'] = str(playlength)
            #        items.append(Request(url=self.playnum_url + albumid + "/?qyid=", callback=self.parse_playnum, meta={'item': item}))
            item['duration'] = str(playlength)
            items.append(Request(url=self.playnum_url + albumid + "/?qyid=",
                                 callback=self.parse_playnum,
                                 meta={'item': item}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playnum(self, response):
        try:
            item = response.request.meta['item']
            items = []
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            if tplaynum:
                item['played'] = str(tplaynum[0])
                items.append(item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
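# Illustrative sketch (not project code): once parse_first() is finished, the
# follow-up page would be requested through url_second, filling in the
# max_behot_time / max_create_time fields read from the first JSON reply.
# The timestamp values below are made up.
import time

url_second = ("http://toutiao.com/api/article/recent/?source=2&count=20&category=video"
              "&max_behot_time=%s&utm_source=toutiao&offset=0&as=A1A5A75A8882EDC"
              "&cp=57A852EE9D5C6E1&max_create_time=%s&_=%s")
print url_second % (1470000000, 1470000000, int(time.time()))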
class YoukuOrderSpider(Spider): name = "youku_order_history" pipelines = ['MysqlStorePipeline'] spider_id = "256" site_id = "1" allowed_domains = [ "i.youku.com", "www.youku.com", "v.youku.com", "ykrec.youku.com" ] url_prefix = 'http://i.youku.com' vpaction_url = "http://v.youku.com/v_vpactionInfo/id/" playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS/" ykrec_url = "http://ykrec.youku.com/video/packed/list.json?site=1&pg=1&module=2&pl=20&vid=" mgr = DbManager.instance() channel_exclude = mgr.get_channel_exclude() cat_exclude = mgr.get_cat_exclude() def __init__(self, orders=None, *args, **kwargs): super(YoukuOrderSpider, self).__init__(*args, **kwargs) if orders: orders = json.loads(orders) else: orders = self.mgr.get_ordered_url(site_name='youku') if orders: self._orders = orders else: self._orders = [] start_urls = [r['url'] for r in self._orders] def parse(self, response): try: log.msg(response.request.url, level=log.INFO) items = [] user_item = UserItem() #owner id script = response.xpath('/html/head/script') owner_id = script.re('ownerId = \"(\d+)\"') show_id = script.re('ownerEncodeid = \'(.+)\'') if owner_id: user_item['owner_id'] = owner_id[0] if show_id: user_item['show_id'] = show_id[0] else: return #user profile up = response.xpath('//div[@class="profile"]') if up: user_name = up.xpath( './div[@class="info"]/div[@class="username"]/a[1]/@title' ).extract() played = up.xpath( './div[@class="state"]/ul/li[@class="vnum"]/em/text()' ).extract() fans = up.xpath( './div[@class="state"]/ul/li[@class="snum"]/em/text()' ).extract() if user_name: user_item['user_name'] = user_name[0] if played: user_item['played'] = Util.normalize_vp(played[0]) if fans: user_item['fans'] = Util.normalize_vp(fans[0]) #youku profile yp = response.xpath('//div[@class="YK-profile"]') if yp: intro = yp.xpath( './div[@class="userintro"]/div[@class="desc"]/p[2]/text()' ).extract() if intro: user_item['intro'] = ''.join(intro) #count yh = response.xpath('//div[@class="YK-home"]') vcount = '0' if yh: video_count = yh.xpath( 'div[1]/div/div/div/div[@class="title"]/span/a/text()').re( u'\((\d+)\)') if video_count: vcount = video_count[0] user_item['vcount'] = vcount user_item['spider_id'] = self.spider_id user_item['site_id'] = self.site_id items.append(user_item) #videos items.append( Request(url=response.request.url + "/videos", callback=self.parse_video_page, meta={'page': 1})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_video_page(self, response): try: page = response.request.meta['page'] log.msg('%s: %s' % (response.request.url, page)) items = [] #get videos yk_v = response.xpath('//div[@class="yk-col4"]/div') for v in yk_v: url = v.xpath('./div[@class="v-link"]/a/@href').extract() #pl = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-entry"]/span[@class="v-num"]/text()').extract() if url: ''' if pl: pld = Util.normalize_played(pl[0]) if int(pld) < int(self.hottest_played_threshold): log.msg('discard: %s' % url[0]) continue ''' items.append( Request(url=url[0], callback=self.parse_episode, meta={'recommend': False})) #get last_str and ajax_url last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'') ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'') #reqest sibling page if ajax_url: sibling_page = (3 * page - 1, 3 * page) for p in sibling_page: s = last_str[0] if last_str else u'' para = { "v_page": str(page), "page_num": str(p), "page_order": "1", "last_str": s } items.append( FormRequest(url=self.url_prefix + ajax_url[0] + 
"fun_ajaxload/", formdata=para, method='GET', callback=self.parse_video_page, meta={'page': page})) #request next page next_page = response.xpath( '//ul[@class="YK-pages"]/li[@class="next"]/a/@href').extract() if next_page: items.append( Request(url=self.url_prefix + next_page[0], callback=self.parse_video_page, meta={'page': page + 1})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: recommend = response.request.meta['recommend'] log.msg('%s|recommend: %s' % (response.request.url, recommend)) items = [] #owner owner = response.xpath( '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href' ).extract() owner_show_id = None if owner: owner_show_id = Util.get_owner(owner[0]) if owner_show_id in self.channel_exclude: log.msg("video owner excluded: %s" % owner_show_id) return #check recommended video's category category = response.xpath( '//div[@class="base_info"]/div[@class="guide"]/div/a/text()' ).extract() cat = None if category: cat = category[0].replace(u'频道', '') if recommend and cat: if cat in self.cat_exclude: log.msg("video category excluded: %s" % cat) return #video info title = response.xpath( '//div[@class="base_info"]/h1/descendant-or-self::*/text()' ).extract() scripts = response.xpath('//script[@type="text/javascript"]') video_id = scripts.re('videoId = \'(\d+)\'') tag = scripts.re('tags="(.+)"') upload = response.xpath( '//div[@class="yk-videoinfo"]/div[@class="time"]/text()' ).extract() description = response.xpath( '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()' ).extract() ep_item = EpisodeItem() ep_item['show_id'] = Util.get_showid(response.request.url) if video_id: ep_item['video_id'] = video_id[0] if owner_show_id: ep_item['owner_show_id'] = owner_show_id if title: ep_item['title'] = Util.strip_title("".join(title)) if tag: ep_item['tag'] = Util.unquote(tag[0]).rstrip('|') if cat: ep_item['category'] = cat if upload: t = Util.get_upload_time(upload[0]) if t: ep_item['upload_time'] = Util.get_datetime_delta( datetime.now(), t) if description: ep_item['description'] = description[0] ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url if video_id: items.append( Request(url=self.vpaction_url + video_id[0], callback=self.parse_vpaction, meta={'item': ep_item})) else: items.append(ep_item) #recommendation if not recommend: items.append( Request(url=self.ykrec_url + video_id[0], callback=self.parse_recommendation)) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_vpaction(self, response): try: #log.msg('%s' % response.request.url) item = response.request.meta['item'] vp = response.xpath('//div[@id="videodetailInfo"]/ul/li').re( u'<label>总播放数:</label><span.*>(.+)</span>') if vp: item['played'] = Util.normalize_vp(vp[0]) show_id = item['show_id'] item = Request(url=self.playlength_url + show_id, callback=self.parse_playlength, meta={'item': item}) return item except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_playlength(self, response): try: #log.msg('parse_playlength ,%s' % response.request.url) item = response.request.meta['item'] showid = item["show_id"] msg = response.body jinfo = json.loads(msg) plsylength = str(int(float(jinfo["data"][0]["seconds"]))) if plsylength: item['duration'] = str(plsylength) return item except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_recommendation(self, response): try: log.msg('%s' % 
response.request.url) items = [] rec_data = json.loads(response.body) for v in rec_data['data']: items.append( Request(url='http://v.youku.com/v_show/id_%s.html' % v['codeId'], callback=self.parse_episode, meta={'recommend': True})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
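# The spiders in this file lean on Util.normalize_vp and Util.normalize_played,
# whose sources are not part of this file. A minimal sketch of what such a
# normalizer plausibly does -- strip thousands separators and expand the
# Chinese u'万' (10,000) suffix -- assuming those are the only formats the
# sites emit; normalize_played_sketch is a hypothetical name, not the
# project helper itself:
def normalize_played_sketch(text):
    """Turn play counts like u'1,234' or u'3.5万' into an int."""
    text = text.strip().replace(u',', u'')
    if text.endswith(u'万'):
        return int(float(text[:-1]) * 10000)
    return int(float(text))

assert normalize_played_sketch(u'1,234') == 1234
assert normalize_played_sketch(u'3.5万') == 35000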
class YoukuCatNewestSpider(Spider): name = "youku_cat_newest" pipelines = ['MysqlStorePipeline'] spider_id = "1" site_id = "1" format_id = 2 #allowed_domains = ["www.youku.com", "v.youku.com", "i.youku.com", "index.youku.com", "play.youku.com"] url_prefix = 'http://www.youku.com' playlength_url = "http://play.youku.com/play/get.json?ct=10&vid=" mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(YoukuCatNewestSpider, self).__init__(*args, **kwargs) cat_urls = kwargs.get('cat_urls') if cat_urls: cat_urls = json.loads(cat_urls) else: cat_urls = self.mgr.get_cat_url("youku") if cat_urls: self._cat_urls = cat_urls else: self._cat_urls = [] def start_requests(self): try: items = [] for cat in self._cat_urls: print cat items.append( Request(url=cat['url'], callback=self.parse_page, meta={ 'cat_id': cat['id'], 'cat_name': cat['cat_name'], 'audit': cat['audit'], 'priority': cat['priority'] })) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_page(self, response): try: logging.log(logging.INFO, 'page:%s' % response.request.url) cat_id = response.request.meta['cat_id'] audit = response.request.meta['audit'] cat_name = response.request.meta['cat_name'] priority = response.request.meta['priority'] #page = response.request.meta['page'] #if int(page) > int(self.max_search_page): # return items = [] #video items #yk_v = response.xpath('//div[@class="yk-col4"]') ''' yk_v = response.xpath('//div[@id="getVideoList"]/div[@class="yk-row yk-v-90u"]/div[@class="yk-col4"]') for v in yk_v: url = v.xpath('./div/div[@class="v-link"]/a/@href').extract() thumb_urls = v.xpath('./div/div[@class="v-thumb"]/img/@src').extract() ''' # layout used by the game, lifestyle, travel and comedy channels yk_v = response.xpath( '//div[@class="vaule_main"]/div[@class="box-video"]/div[@class="yk-row"]/div' ) for v in yk_v: url = v.xpath('./div/div/a/@href').extract() thumb_urls = v.xpath('./div/div/img/@src').extract() if thumb_urls: thumb_url = thumb_urls[0] if thumb_url == 'http://g1.ykimg.com/': thumb_url = None else: thumb_url = None if url: items.append( Request(url=url[0], callback=self.parse_episode, meta={ 'cat_id': cat_id, 'cat_name': cat_name, 'audit': audit, 'thumb_url': thumb_url, 'priority': priority })) # layout used by the news, parenting and military channels yk_v2 = response.xpath( '//div[@class="yk-box"]/div[@class="yk-body"]/div[@class="yk-row"]/div[@class="yk-col4"]' ) for v in yk_v2: url = v.xpath('./div/div[@class="v-link"]/a/@href').extract() thumb_urls = v.xpath( './div/div[@class="v-thumb"]/img/@src').extract() if thumb_urls: thumb_url = thumb_urls[0] if thumb_url == 'http://g1.ykimg.com/': thumb_url = None else: thumb_url = None if url: items.append( Request(url=url[0], callback=self.parse_episode, meta={ 'cat_id': cat_id, 'cat_name': cat_name, 'audit': audit, 'thumb_url': thumb_url, 'priority': priority })) ''' #pages next_page = response.xpath('//div[@class="yk-pager"]/ul/li[@class="next"]/a/@href').extract() if next_page: items.append(Request(url=self.url_prefix+next_page[0], callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id})) ''' return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_episode(self, response): try: logging.log(logging.INFO, "episode:%s" % response.request.url) cat_id = response.request.meta['cat_id'] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] thumb_url = response.request.meta['thumb_url'] priority = response.request.meta['priority'] items = [] #owner owner = response.xpath(
'//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href' ).extract() owner_show_id = None if owner: owner_show_id = Util.get_owner(owner[0]) items.append(Request(url=owner[0], callback=self.parse_owner)) #video info title = response.xpath( '//div[@class="base_info"]/h1/descendant-or-self::text()' ).extract() #category = response.xpath('//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract() scripts = response.xpath('//script[@type="text/javascript"]') video_id = scripts.re('videoId = \'(\d+)\'') tag = scripts.re('tags="(.+)"') upload = response.xpath( '//div[@class="yk-videoinfo"]/div[@class="time"]/text()' ).extract() description = response.xpath( '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()' ).extract() vp_url = response.xpath( '//span[@id="videoTotalPV"]/../../@href').extract() ep_item = EpisodeItem() ep_item['show_id'] = Util.get_showid(response.request.url) if video_id: ep_item['video_id'] = video_id[0] if owner_show_id: ep_item['owner_show_id'] = owner_show_id if title: t = "".join(title) t = t.strip("\n").strip() ep_item['title'] = Util.strip_title(t) if tag: ep_item['tag'] = Util.unquote(tag[0]).rstrip('|') #if category: # ep_item['category'] = category[0].replace(u'频道', '') ep_item['category'] = cat_name if upload: t = Util.get_upload_time(upload[0]) if t: ep_item['upload_time'] = Util.get_datetime_delta( datetime.now(), t) if description: ep_item['description'] = description[0] ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url ep_item['cat_id'] = cat_id ep_item['audit'] = audit ep_item['format_id'] = self.format_id ep_item['thumb_url'] = thumb_url ep_item['priority'] = priority if vp_url: items.append( Request(url=vp_url[0], callback=self.parse_vpaction, meta={'item': ep_item})) else: if ep_item['show_id']: items.append(ep_item) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_vpaction(self, response): try: logging.log(logging.INFO, response.request.url) item = response.request.meta['item'] vp = response.xpath( '//ul[@class="player_info"]/li[@class="sum"]/text()').extract( ) if vp: item['played'] = Util.normalize_played( Util.normalize_vp(vp[0].replace('总播放:', ''))) show_id = item['show_id'] item = Request(url=self.playlength_url + show_id, callback=self.parse_playlength, meta={'item': item}) return item except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_playlength(self, response): try: logging.log(logging.INFO, response.request.url) item = response.request.meta['item'] showid = item["show_id"] msg = response.body jinfo = json.loads(msg) plsylength = str(int(float(jinfo["data"]["video"]["seconds"]))) if plsylength: item['duration'] = int(plsylength) return item except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_owner(self, response): try: logging.log(logging.INFO, response.request.url) items = [] user_item = UserItem() #owner id script = response.xpath('/html/head/script') owner_id = script.re('ownerId = \"(\d+)\"') show_id = script.re('ownerEncodeid = \'(.+)\'') if owner_id: user_item['owner_id'] = owner_id[0] if show_id: user_item['show_id'] = show_id[0] else: return #user profile up = response.xpath('//div[@class="profile"]') if up: user_name = up.xpath( './div[@class="info"]/div[@class="username"]/a[1]/@title' ).extract() played = up.xpath( './div[@class="state"]/ul/li[@class="vnum"]/em/text()' ).extract() fans = up.xpath( 
'./div[@class="state"]/ul/li[@class="snum"]/em/text()' ).extract() if user_name: user_item['user_name'] = user_name[0] if played: #user_item['played'] = Util.normalize_vp(played[0]) user_item['played'] = Util.normalize_played( Util.normalize_vp(played[0])) if fans: user_item['fans'] = Util.normalize_vp(fans[0]) #youku profile yp = response.xpath('//div[@class="YK-profile"]') if yp: intro = yp.xpath( './div[@class="userintro"]/div[@class="desc"]/p[2]/text()' ).extract() if intro: user_item['intro'] = ''.join(intro) #count yh = response.xpath('//div[@class="YK-home"]') vcount = None if yh: video_count = yh.xpath( 'div[1]/div/div/div/div[@class="title"]/span/a/text()').re( u'\((\d+)\)') if video_count: vcount = video_count[0] user_item['vcount'] = vcount user_item['spider_id'] = self.spider_id user_item['site_id'] = self.site_id user_item['url'] = response.request.url items.append(user_item) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc())
class GameFySpider(Spider): name = "gamefy_athletic_mobile" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(GameFySpider, self).__init__(*args, **kwargs) self._host_name = "http://www.gamefy.cn/" self._category = "游戏" self._site_id = '11' self._spider_id = '131072' self._cat_urls = self.mgr.get_cat_url("gamefy") def start_requests(self): items = [] try: for cat in self._cat_urls: items.append( Request(url=cat['url'], callback=self.parse_list, meta={'cat_id': cat['id']})) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) finally: return items def parse_list(self, response): items = [] try: cat_id = response.request.meta['cat_id'] sels = response.xpath( '//div[@class="con"]//div[@class="area-col min"]//div[@class="area-block"]//a' ) if sels: for sel in sels: urls = sel.xpath('./@href').extract() titles = sel.xpath('./@title').extract() imgs = sel.xpath('.//img/@src').extract() url = urls[0] title = titles[0].encode("UTF-8") img = imgs[0] items.append( Request(url=url, callback=self.parse_media, meta={ 'title': title, 'img': img, 'cat_id': cat_id })) #get next page next_page_sel = response.xpath( '//div[@class="viciao"]/a[text()=">"]/@href').extract() if next_page_sel: next_page = next_page_sel[0] next_page = self._host_name + next_page items.append( Request(url=next_page, callback=self.parse_list, meta={'cat_id': cat_id})) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) finally: return items def parse_media(self, response): items = [] try: cat_id = response.request.meta['cat_id'] title = response.request.meta['title'] thumb_url = response.request.meta['img'] url = response.request.url query = urlparse.urlparse(url).query query_dict = urlparse.parse_qs(query) show_id = query_dict['id'][0] #get tags sels = response.xpath('//span[@class="c_org1"]/a/text()').extract() tag = '' if sels: tag = "|".join(sels).encode("UTF-8") #get release time upload_time = '' sels = response.xpath( '//p[@class="c_gray0 lh3"]/span/text()').extract() if sels: time_times = sels[0].encode("UTF-8") upload_time = time_times[0:16] #get play times played = 0 sels = response.xpath( '//p[@class="c_gray0 lh3"]/span/a/text()').extract() if sels: played = sels[0].strip() ep_item = EpisodeItem() ep_item['title'] = title ep_item['show_id'] = show_id ep_item['tag'] = tag ep_item['upload_time'] = upload_time ep_item['category'] = self._category ep_item['thumb_url'] = thumb_url ep_item['spider_id'] = self._spider_id ep_item['site_id'] = self._site_id ep_item['url'] = url ep_item['played'] = played ep_item['cat_id'] = cat_id items.append(ep_item) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) finally: return items
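# parse_media above derives show_id from the page URL's query string. The
# same steps in isolation, using Python 2's urlparse module (urllib.parse on
# Python 3); the sample URL is illustrative only:
import urlparse

def gamefy_show_id(url):
    query = urlparse.urlparse(url).query
    # parse_qs maps each key to a list of values; like the spider, this
    # assumes 'id' is present and raises KeyError when it is not.
    return urlparse.parse_qs(query)['id'][0]

assert gamefy_show_id('http://www.gamefy.cn/play.php?id=123&p=2') == '123'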
class bilibili_cat(Spider): name = "bilibili_cat" pipelines = ['MysqlStorePipeline'] spider_id = "5" site_id = "13" max_search_page = 1 url_prefix = "http://www.bilibili.com/index/tag/%s/default/1/%s.json" mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(bilibili_cat, self).__init__(*args, **kwargs) self._cat_urls = [] try: self._cat_urls = self.mgr.get_cat_url('bilibili') except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def start_requests(self): try: items = [] for cat in self._cat_urls: items.extend([ Request(url=cat['url'], callback=self.parse_page, meta={ 'cat_name': cat['cat_name'], 'audit': cat['audit'], 'priority': cat['priority'] }) ]) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_page(self, response): try: #log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page'])) cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] items = [] #video items tid = response.xpath( '//div[@class="fcname"]/ul/li[@class="on"]/@tid').extract() tag = response.xpath( '//div[@class="fcname"]/ul/li[@class="on"]/a/text()').extract() if tag and tag[0].strip() == u'全部': tid = response.xpath( '//div[@class="menu-wrapper"]/ul/li[@class="m-i on"]/@data-tid' ).extract() tag = response.xpath( '//div[@class="menu-wrapper"]/ul/li[@class="m-i on"]/a/em/text()' ).extract() if tid and tag: url = self.url_prefix % (tid[0].strip(), tag[0].strip()) print url items.append( Request(url=url, callback=self.parse_episode, meta={ 'cat_name': cat_name, 'audit': audit, 'priority': priority })) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) #cat_id = response.request.meta['cat_id'] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] items = [] data = json.loads(response.body) vlist = data.get('list') or [] for item in vlist: ep_item = EpisodeItem() ep_item['title'] = item.get('title') ep_item['show_id'] = item.get('aid') #ep_item['tag'] = item.get() ep_item['thumb_url'] = item.get('pic') ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = "http://www.bilibili.com/video/av%s/" % item.get('aid') #ep_item['cat_id'] = cat_id ep_item['category'] = cat_name ep_item['description'] = item.get("description") ep_item['format_id'] = '2' ep_item['audit'] = audit ep_item['priority'] = priority ep_item['played'] = item.get('play') #ep_item['upload_time'] = item.get('create') duration = item.get('duration') if duration: a, b = duration.split(':') duration = int(a) * 60 + int(b) else: duration = 0 ep_item['duration'] = duration items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
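# bilibili_cat converts a "mm:ss" duration inline with a two-way split. A
# small sketch of the same conversion that also tolerates "hh:mm:ss" (an
# assumption -- the JSON seen by the spider apparently only uses "mm:ss"):
def bili_duration_to_seconds(duration):
    if not duration:
        return 0
    seconds = 0
    for part in duration.split(':'):
        seconds = seconds * 60 + int(part)
    return seconds

assert bili_duration_to_seconds('3:25') == 205
assert bili_duration_to_seconds('1:02:03') == 3723
assert bili_duration_to_seconds(None) == 0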
class iqiyi_subject_history(Spider): name = "iqiyi_subject_history" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] spider_id = "4096" #iqiyi_order_history site_id = "5" #iqiyi allowed_domains = [ "list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com", "cache.video.qiyi.com" ] url_prefix = 'http://list.iqiyi.com' playnum_url = 'http://cache.video.iqiyi.com/jp/pc/' playlength_url = "http://cache.video.iqiyi.com/a/" hottest_played_threshold = get_project_settings().get( 'ORDERED_PLAYED_THRESHOLD') mgr = DbManager.instance() def __init__(self, cat_urls=None, *args, **kwargs): super(iqiyi_subject_history, self).__init__(*args, **kwargs) if cat_urls: cat_urls = json.loads(cat_urls) self.max_search_page = get_project_settings().get( 'MAX_MANUAL_SEARCH_PAGE') else: cat_urls = self.mgr.get_subjects("iqiyi") self.max_search_page = get_project_settings().get( 'MAX_SEARCH_PAGE') if cat_urls: self._cat_urls = cat_urls else: self._cat_urls = [] def start_requests(self): try: items = [] for cat in self._cat_urls: items.append( Request(url=cat['url'], callback=self.parse, meta={'cat_id': cat['id']})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) #start_urls = ["http://www.iqiyi.com/a_19rrgubpsd.html"] #start_urls = ["http://www.iqiyi.com/a_19rrgiavst.html#vfrm=2-3-0-1"] #start_urls = mgr.get_cat_url("iqiyi") #for each category parse all its sub-categories or types def parse(self, response): try: #log.msg('lev1: %s' % response.request.url) cat_id = response.request.meta['cat_id'] items = [] sel = Selector(response) #category albumId = response.selector.re(re.compile(r'albumId: ?(\d+)'))[0] sourceid = response.selector.re(re.compile(r'sourceId: ?(\d+)'))[0] cid = response.selector.re(re.compile(r'cid: ?(\d+)'))[0] years = [] subs = sel.xpath( '//div[@id="block-J"]/div[1]/div[1]/div[1]/div[2]/ul/li/a/@data-year' ).extract() i = 0 for year in subs: sxpath = '//div[@id="block-J"]/div[1]/div[' + str( i + 2) + ']/a/@data-month' subs1 = sel.xpath(sxpath).extract() #subs1 = sel.xpath('//div[@id="block-J"]/div[1]/div[2]/a/@data-month').extract() for month in subs1: y_month = str(year) + str(month) url1 = "http://cache.video.qiyi.com/jp/sdvlst/" + cid + "/" + sourceid + "/" + y_month + "/?categoryId=" + cid + "&sourceId=" + sourceid + "&tvYear=" + y_month + "&callback=window" items.extend([ Request(url=url1, callback=self.parse_second, meta={'cat_id': cat_id}) ]) i = i + 1 return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_second(self, response): try: #log.msg('lev2: %s' % response.request.url) cat_id = response.request.meta['cat_id'] items = [] sel = Selector(response) #category begin = response.body.find("try{window(") begin += len("try{window(") end = response.body.find(");}catch(e)") msg = response.body[begin:end] jmsg = json.loads(msg) num = len(jmsg["data"]) for i in range(num): title = jmsg["data"][i]["aName"] play_num = "0" play_num = str(jmsg["data"][i]["disCnt"]) upload_time = jmsg["data"][i]["tvYear"] turl = jmsg["data"][i]["vUrl"] timelength = str(jmsg["data"][i]["timeLength"]) ep_item = EpisodeItem() if len(title) != 0: ep_item["title"] = title ep_item["played"] = play_num if len(upload_time) != 0: ep_item["upload_time"] = upload_time if len(turl) != 0: ep_item["url"] = turl if len(timelength) != 0: ep_item["duration"] = timelength ep_item['subject_id'] = cat_id items.append( Request(url=turl, callback=self.parse_episode, meta={'item': ep_item})) return items except Exception as e: 
log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) items = [] item = response.request.meta['item'] sel = Selector(response) #video info ttitle = sel.xpath( '//div[@class="play-tit-l"]/h2/span/text()').extract() title = "" if len(ttitle) > 0: title = ttitle[0] if len(title) == 0: ttitle = sel.xpath( '//div[@class="play-tit-l"]/h1/text()').extract() if len(ttitle) > 0: title = ttitle[0] if len(title) == 0 and "title" in item: title = item["title"] index = response.request.url.rfind("#") surl = response.request.url[:index] show_id = "" r = re.compile(r'[vw]_[0-9a-zA-Z]*') m = r.search(surl) if m: show_id = m.group() category = None tcategory = sel.xpath( '//div[@id="block-E"]/div[1]/div[1]/div[2]/div[1]/span[1]/a[2]/text()' ).extract() if len(tcategory) > 0: category = tcategory[0].strip() tcategory = sel.xpath( '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract( ) #"channel" if len(tcategory) > 0 and not category: category = tcategory[0].strip() else: tcategory = sel.xpath( '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()' ).extract() if len(tcategory) > 0 and not category: category = tcategory[0].strip() else: tcategory = sel.xpath( '//div[@class="crumb_bar"]/span[1]/a[2]/text()' ).extract() #"channel" if len(tcategory) > 0 and not category: category = tcategory[0].strip() #else: # log.msg("not find category,url is %s" % response.request.url, level=log.ERROR) #get upload time upload_time = "" tag = "" tupload_time = sel.xpath( '//div[@class="crumb_bar"]/span[3]/span/text()').extract() if len(tupload_time) > 0: upload_time = tupload_time[0].strip() tupload_time = sel.xpath( '//div[@class="crumb_bar"]/span[2]/span/text()').extract() if len(tupload_time) > 0: upload_time = tupload_time[0].strip() if len(upload_time) == 0: tupload_time = sel.xpath( '//div[@class="movieMsg"]/div/p/text()').extract() if len(tupload_time) > 0: r = re.compile(r'(\d+)[.-](\d+)[\d+].*') m = r.search(tupload_time[0]) if m: ttupload_time = m.group() upload_time = ttupload_time.replace(".", "-") #ttupload_time = tupload_time.re(r'(\d+)[.-](\d+)[\d+].*') #get tags,two ways to get tags taglist = sel.xpath( '//div[@class="crumb_bar"]/span[2]/a/text()').extract() if len(taglist) > 0: tag = "|".join(taglist) if not tag or len(tag) == 0: taglist = sel.xpath( '//div[@class="crumb_bar"]/span[1]/a/text()').extract() if len(taglist) > 0: tag = "|".join(taglist) ep_item = response.request.meta['item'] if title: ep_item['title'] = title if show_id: ep_item['show_id'] = show_id if tag: ep_item['tag'] = tag if upload_time: ep_item['upload_time'] = upload_time if category: ep_item['category'] = category if not title or not show_id or not category: #log.msg("title ,show_id,category is null ,url is %s" % response.request.url, level=log.ERROR) return if len(title) == 0 or len(show_id) == 0 or len(category) == 0: #log.msg("title ,show_id,category is null ,url is %s" % response.request.url, level=log.ERROR) return ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
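# parse_second above unwraps a JSONP-style payload of the form
# "try{window(<json>);}catch(e){...}" by searching for two fixed markers.
# The same unwrapping as a guarded helper, markers copied from the spider:
import json

def unwrap_qiyi_jsonp(body):
    begin = body.find("try{window(")
    if begin < 0:
        return None
    begin += len("try{window(")
    end = body.find(");}catch(e)", begin)
    if end < 0:
        return None
    return json.loads(body[begin:end])

payload = 'try{window({"data": [{"aName": "x"}]});}catch(e){}'
assert unwrap_qiyi_jsonp(payload)["data"][0]["aName"] == "x"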
class youku_spider(Spider): name = 'youku_spider' site_id = '1' subscribe_ids = { 'all': '1', 'channel': '2', 'keyword': '4', 'page': '8', 'category': '16', 'subject': '32', 'manual': '64' } #prefix urls youku_url_prefix = "http://www.youku.com" soku_url_prefix = "http://www.soku.com" vpaction_url = "http://v.youku.com/v_vpactionInfo/id" playlength_url = "http://v.youku.com/player/getPlayList/VideoIDS" #global variable mgr = DbManager.instance() channel_exclude = mgr.get_channel_exclude() category_exclude = mgr.get_cat_exclude() ordered_played_threshold = get_project_settings().get( 'ORDERED_PLAYED_THRESHOLD') hottest_played_threshold = get_project_settings().get( 'HOTTEST_PLAYED_THRESHOLD') newest_time_threshold = get_project_settings().get('NEWEST_TIME_THRESHOLD') #default value max_search_page = "0" def __init__(self, *args, **kwargs): super(youku_spider, self).__init__(*args, **kwargs) self.spider_parses = {'channel':self.channel_parse,\ 'video_set':self.video_set_parse,\ 'search':self.search_parse,\ 'video':self.video_parse,\ 'page':self.page_parse,\ 'user':self.user_parse,\ 'category':self.category_parse} # #new-style interface # #automatic subscription interface # self.subscribe_type = kwargs['type'] if 'type' in kwargs.keys() else None # #manual subscription interface # self.subscribe_url = kwargs['url'] if 'url' in kwargs.keys() else None # self.subscribe_id = kwargs['id'] if 'id' in kwargs.keys() else None # #the old-style interface follows #automatic subscription interface self.subscribe_type = kwargs['type'] if 'type' in kwargs.keys() else None #manual subscription interface # channel subscription urls subscribe_channel_urls = kwargs['channel_urls'] if 'channel_urls' in kwargs.keys() else None # category subscription urls subscribe_cat_urls = kwargs['cat_urls'] if 'cat_urls' in kwargs.keys() else None # page subscription urls subscribe_page_urls = kwargs['page_urls'] if 'page_urls' in kwargs.keys() else None # subject subscription urls subscribe_subject_urls = kwargs['subject_urls'] if 'subject_urls' in kwargs.keys() else None # keyword subscription urls self.subscribe_keywords = kwargs['keywords'] if 'keywords' in kwargs.keys() else None self.subscribe_cat_ids = kwargs['cat_ids'] if 'cat_ids' in kwargs.keys() else [] self.subscribe_url = None self.subscribe_id_key = None self.subscribe_id_value = None url = None key = None try: if subscribe_channel_urls: log.msg("subscribe_channel_url:", level=log.DEBUG) subscribe_channel_urls = json.loads(subscribe_channel_urls) key = None url = subscribe_channel_urls[0] if subscribe_cat_urls: log.msg("subscribe_cat_url:", level=log.DEBUG) subscribe_cat_urls = json.loads(subscribe_cat_urls) key = 'cat_id' url = subscribe_cat_urls[0] if subscribe_page_urls: log.msg("subscribe_page_url:", level=log.DEBUG) subscribe_page_urls = json.loads(subscribe_page_urls) key = 'pg_id' url = subscribe_page_urls[0] if subscribe_subject_urls: log.msg("subscribe_subject_url:", level=log.DEBUG) subscribe_subject_urls = json.loads(subscribe_subject_urls) key = None url = subscribe_subject_urls[0] except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) return if url: log.msg(url, level=log.DEBUG) self.subscribe_url = url['url'] if 'url' in url.keys() else None self.subscribe_id_value = url['id'] if 'id' in url.keys() else None if key: self.subscribe_id_key = key if self.subscribe_keywords: log.msg("subscribe_keywords:", level=log.DEBUG) self.subscribe_keywords = json.loads(self.subscribe_keywords) self.subscribe_id_key = 'kw_id' for url in self.subscribe_keywords: log.msg(url, level=log.DEBUG) if self.subscribe_cat_ids: log.msg("subscribe_cat_ids:", level=log.DEBUG) self.subscribe_cat_ids = json.loads(self.subscribe_cat_ids) for url in
self.subscribe_cat_ids: log.msg(url, level=log.DEBUG) #resolve the spider_type from the subscription arguments self.spider_type_resolve()
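# youku_spider routes each subscription type to one of its parse methods
# through the self.spider_parses dict built in __init__. A stripped-down
# sketch of that dispatch-table pattern; the handler names here are
# illustrative, not the spider's own:
class DispatchSketch(object):
    def __init__(self):
        self.handlers = {'channel': self.parse_channel,
                         'keyword': self.parse_keyword}

    def handle(self, spider_type, payload):
        handler = self.handlers.get(spider_type)
        return handler(payload) if handler is not None else None

    def parse_channel(self, payload):
        return ('channel', payload)

    def parse_keyword(self, payload):
        return ('keyword', payload)

d = DispatchSketch()
assert d.handle('channel', 1) == ('channel', 1)
assert d.handle('unknown', 1) is None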
class iqiyi_order(Spider): name = "iqiyi_order" pipelines = ['MysqlStorePipeline'] spider_id = "131072" site_id = "5" #iqiyi allowed_domains = [ "list.iqiyi.com", "www.iqiyi.com", "cache.video.iqiyi.com" ] url_prefix = 'http://list.iqiyi.com' playnum_url = 'http://cache.video.iqiyi.com/jp/pc/' playlength_url = "http://cache.video.iqiyi.com/a/" max_search_page = 1 mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(iqiyi_order, self).__init__(*args, **kwargs) self._cat_urls = [] try: self._cat_urls = self.mgr.get_ordered_url(site_name='iqiyi') except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def start_requests(self): try: items = [] #items.append(Request(url="http://www.iqiyi.com/u/1061614233", callback=self.parse_first,meta={'cat_name': u'生活','audit':1,'show_id':'1061614233'})) #''' for cat in self._cat_urls: #items.append(Request(url="http://www.iqiyi.com/u/1211677213", callback=self.parse_first)) items.append( Request(url=cat['url'], callback=self.parse_first, meta={ 'cat_name': cat['user'], 'audit': cat['audit'], 'show_id': cat['show_id'], 'priority': cat['priority'] })) #''' return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_first(self, response): try: items = [] user_item = UserItem() cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] show_id = response.request.meta['show_id'] priority = response.request.meta['priority'] #owner_id = response.xpath('//div[@class="top-yc_userCare fl"]/a/@data-userid') fans = response.xpath( '//div[@class="info_connect"]//em/a[@data-fans="fans"]/text()') played = response.xpath( '//div[@class="info_connect"]/span[@class="conn_type S_line1"]/em/a/text()' ) ''' if owner_id: owner_id = owner_id.extract()[0].strip() #user_item['owner_id']=owner_id user_item['show_id']=owner_id else: owner_id = response.xpath('//span[@class="pc-btn pc-care-large pc-btn-reset"]/a[@class="btn-care btn-care-tocare"]/@data-userid') if owner_id: owner_id = owner_id.extract()[0].strip() #user_item['owner_id']=owner_id user_item['show_id']=owner_id ''' user_item['show_id'] = show_id if fans: fans = fans.extract()[0].strip() fans = fans.replace(',', '') if u'万' in fans: fans = float(fans[:fans.find(u'万')]) fans = fans * 10000 user_item['fans'] = int(fans) else: user_item['fans'] = int(fans) if played: played = played.extract()[0].strip() played = played.replace(',', '') if u'万' in played: played = float(played[:played.find(u'万')]) played = played * 10000 user_item['played'] = int(played) else: user_item['played'] = int(played) username = response.xpath( '//div[@class="pf_username"]/span/text()') userinfo = response.xpath('//div[@class="pf_intro"]/a/text()') if username: username = username.extract()[0].strip() user_item['user_name'] = username if userinfo: userinfo = userinfo.extract()[0].strip() user_item['intro'] = userinfo user_item['spider_id'] = self.spider_id user_item['site_id'] = self.site_id user_item['url'] = response.request.url items.append(user_item) title = u'视频' urls = '' u = response.xpath( '//div[@class="qiyiSet-nav"]/ul[@class="qiyiNav-normal"]/li/a[@title="%s"]/@href' % title) if u: urls = u.extract()[0] else: u = response.xpath( '//div[@class="pc-nav-title pc-item-box"]/ul[@class="pc-user-nav pc-user-nav-4 clearfix"]/li[@data-ugcguide-target ="2"]/a/@href' ) urls = u.extract()[0] items.append( Request(url=urls, callback=self.parse_page, meta={ 'cat_name': cat_name, 'audit': audit, 'priority': priority })) return items except Exception as e:
log.msg(traceback.format_exc(), level=log.ERROR) def parse_page(self, response): try: items = [] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] #video items qy_v = response.xpath( '//div[@class="wrap-customAuto-ht "]/ul/li/div[1]') for v in qy_v: thumb = v.xpath('./a/img/@src').extract() url = v.xpath('./a/@href').extract() items.append( Request(url=url[0].strip(), callback=self.parse_episode, meta={ 'thumb': thumb, "cat_name": cat_name, 'audit': audit, 'priority': priority })) #pages #next_page = response.xpath("//div[@class='mod-page']/a[text()='%s']/@href" % u'下一页').extract() #if next_page: # items.append(Request(url=self.url_prefix+next_page[0], callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id, 'cat_name': cat_name})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) thumb_url = response.request.meta['thumb'] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] items = [] #show_id show_id = Util.get_iqiyi_showid(response.request.url) #print "show_id: %s" % show_id #space maybe exist: "albumId:326754200" or "albumId: 326754200" albumid = response.selector.re(re.compile(r'albumId: ?(\d+)')) #video info title = response.xpath( '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()' ).extract() if not title: title = response.xpath( '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()' ).extract() if not title: title = response.xpath( '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()' ).extract() if not title: title = response.xpath( '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()' ).extract() category = response.xpath( '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract( ) if not category: category = response.xpath( '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()' ).extract() if not category: category = response.xpath( '//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract() if not category: category = response.xpath( '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()' ).extract() upload_time = response.xpath( '//div[@class="crumb_bar"]/span[3]/span/text()').extract() if not upload_time: upload_time = response.xpath( '//div[@class="crumb_bar"]/span[2]/span/text()').extract() tag = response.xpath( '//span[@id="widget-videotag"]/descendant::*/text()').extract( ) if not tag: tag = response.xpath( '//span[@class="mod-tags_item vl-block"]/descendant::*/text()' ).extract() if not tag: tag = response.xpath( '//div[@class="crumb_bar"]/span[2]/a/text()').extract() ep_item = EpisodeItem() if title: ep_item['title'] = "".join([t.strip() for t in title]) if show_id: ep_item['show_id'] = show_id if tag: ep_item['tag'] = "|".join([t.strip() for t in tag]) if upload_time: ep_item['upload_time'] = upload_time[0].strip() #if category: # ep_item['category'] = category[0].strip() if thumb_url: ep_item['thumb_url'] = thumb_url[0].strip() ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url #ep_item['cat_id'] = cat_id ep_item['category'] = cat_name ep_item['format_id'] = '2' ep_item['audit'] = audit ep_item['priority'] = priority if albumid: items.append( Request(url=self.playlength_url + albumid[0], callback=self.parse_playlength, meta={ 'item': ep_item, 
'albumid': albumid[0] })) else: items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_playlength(self, response): try: log.msg('parse_playlength ,%s' % response.request.url) item = response.request.meta['item'] albumid = response.request.meta['albumid'] items = [] #sel = Selector(response) msg = response.body index = msg.find("AlbumInfo=") + len("AlbumInfo=") info = msg[index:] jinfo = json.loads(info) plsylength = jinfo["data"]["playLength"] #if plsylength: #if int(plsylength) < 600: #item['duration'] = str(plsylength) #items.append(Request(url=self.playnum_url+albumid+"/?qyid=", callback=self.parse_playnum, meta={'item':item})) item['duration'] = str(plsylength) items.append( Request(url=self.playnum_url + albumid + "/?qyid=", callback=self.parse_playnum, meta={'item': item})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_playnum(self, response): try: #log.msg('parse_playnum ,%s' % response.request.url) item = response.request.meta['item'] items = [] #sel = Selector(response) tplaynum = response.selector.re(re.compile(r':(\d+)')) #log.msg('play: %s, %s' % (tplaynum[0], response.request.url)) if tplaynum: playnum = tplaynum[0] item['played'] = str(playnum) items.append(item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
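# The iqiyi spiders' parse_playlength slices everything after "AlbumInfo="
# and feeds it to json.loads. Note that str.find returns -1 when the marker
# is absent, which would make the slice start at a meaningless offset; a
# guarded sketch of the same step, assuming (as the spiders do) that nothing
# trails the JSON:
import json

def extract_album_info(body):
    marker = "AlbumInfo="
    index = body.find(marker)
    if index < 0:
        return None
    return json.loads(body[index + len(marker):])

info = extract_album_info('AlbumInfo={"data": {"playLength": 613}}')
assert info["data"]["playLength"] == 613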
class sohu_cat(Spider): name = "sohu_cat" pipelines = ['MysqlStorePipeline'] spider_id = "1048576" site_id = "3" #iqiyi max_search_page = 1 mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(sohu_cat, self).__init__(*args, **kwargs) self._cat_urls = [] try: self._cat_urls = self.mgr.get_cat_url('sohu') except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def start_requests(self): try: items = [] for cat in self._cat_urls: items.append( Request(url=cat['url'], callback=self.parse_page, meta={ 'page': 1, 'cat_name': cat['cat_name'], 'audit': cat['audit'], 'priority': cat['priority'] })) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_page(self, response): try: log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page'])) page = response.request.meta['page'] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] #if int(page) > int(self.max_search_page): # return items = [] #video items qy_v = response.xpath('//div[@class="column-bd wemd cfix"]/ul/li') if not qy_v: qy_v = response.xpath('//div[@class="column-bd cfix"]/ul/li') print len(qy_v) for v in qy_v: thumb = v.xpath('./div[@class="st-pic"]/a/img/@src').extract() url = v.xpath('./div[@class="st-pic"]/a/@href').extract() lens = v.xpath( './div[@class="st-pic"]/span[@class="maskTx"]/text()' ).extract() if not lens: lens = v.xpath( './div[@class="st-pic"]/a/span[@class="maskTx"]/text()' ).extract() try: lens = lens[0].strip() if not lens: lens = 0 else: a, b = lens.split(':') lens = int(a) * 60 + int(b) items.append( Request(url=url[0].strip(), callback=self.parse_episode, meta={ 'cat_name': cat_name, 'thumb': thumb, 'audit': audit, 'lens': lens, 'priority': priority })) except Exception as e: continue return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) #cat_id = response.request.meta['cat_id'] cat_name = response.request.meta['cat_name'] thumb_url = response.request.meta['thumb'] audit = response.request.meta['audit'] lens = response.request.meta['lens'] priority = response.request.meta['priority'] items = [] #space maybe exist: "albumId:326754200" or "albumId: 326754200" #albumid = response.selector.re(re.compile(r'pid: ?(\d+)')) #show_id show_id = Util.get_sohu_showid(response.request.url) #tag tag = response.xpath('//meta[@name="keywords"]/@content').extract() #video info title = response.xpath( '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()' ).extract() #played = response.xpath('//em[@id="video_playcount"]').extract() ep_item = EpisodeItem() if title: ep_item['title'] = title[0].strip() if show_id: ep_item['show_id'] = show_id if tag: ep_item['tag'] = tag[0].strip() if thumb_url: ep_item['thumb_url'] = thumb_url[0].strip() ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url #ep_item['cat_id'] = cat_id ep_item['category'] = cat_name ep_item['format_id'] = '2' ep_item['audit'] = audit ep_item['priority'] = priority ep_item['duration'] = lens #if played: # ep_item['played']=played #if albumid: # items.append(Request(url=self.playlength_url+albumid[0], callback=self.parse_playlength, meta={'item':ep_item,'albumid':albumid[0]})) #else: items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
class QqOrderSpider(Spider): name = "qq_order" pipelines = ['MysqlStorePipeline'] spider_id = "2097152" site_id = "16" format_id = 2 mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(QqOrderSpider, self).__init__(*args, **kwargs) orders = kwargs.get('orders') if orders: orders = json.loads(orders) else: orders = self.mgr.get_ordered_url(site_name='qq') if orders: self._orders = orders else: self._orders = [] def start_requests(self): try: items = [] for order in self._orders: url = order.pop('url') if not url.endswith('/videos'): if url.endswith('/'): url = url + 'videos' else: url = url + '/videos' r = Request(url=url, callback=self.parse_page) r.meta.update({'order': order}) items.append(r) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_page(self, response): try: logging.log(logging.INFO, 'page:%s' % response.request.url) order = response.request.meta['order'] items = [] qq_v = response.xpath( '//ul[@id="videolst_cont"]/li[@class="list_item"]') for v in qq_v: urls = v.xpath('./strong/a/@href').extract() titles = v.xpath('./strong/a/text()').extract() thumb_urls = v.xpath('./a/img/@src').extract() durations = v.xpath('./a/span/em/text()').extract() playeds = v.xpath( './div/span[@class="figure_info_play"]/span/text()' ).extract() upload_times = v.xpath( './div/span[@class="figure_info_time"]/text()').extract() title = titles[0] if titles else None thumb_url = thumb_urls[0] if thumb_urls else None duration = Util.get_qq_duration( durations[0]) if durations else None played = Util.normalize_played(Util.normalize_vp( playeds[0])) if playeds else None upload_time = Util.get_qq_upload_time( upload_times[0]) if upload_times else None if urls: r = Request(url=urls[0], callback=self.parse_episode) d = { 'title': title, 'thumb_url': thumb_url, 'duration': duration, 'played': played, 'upload_time': upload_time } d.update(order) r.meta.update({'order': d}) items.append(r) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) def parse_episode(self, response): try: logging.log(logging.INFO, 'episode:%s' % response.request.url) order = response.request.meta['order'] items = [] #video info #tags = response.xpath('//p[@class="info_tags"]//a/@title').extract() #descriptions = response.xpath('//div[@class="info_summary cf"]/span/text()').extract() ep_item = EpisodeItem() ep_item['show_id'] = Util.get_qq_showid(response.request.url) #if tags: # ep_item['tag'] = Util.unquote(tags[0]).rstrip('|') #if descriptions: # ep_item['description'] = descriptions[0] for k, v in order.items(): if k == 'user': ep_item['category'] = v elif k == 'show_id': ep_item['owner_show_id'] = v else: ep_item[k] = v ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url ep_item['format_id'] = self.format_id items.append(ep_item) return items except Exception as e: logging.log(logging.ERROR, traceback.format_exc())
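# start_requests above normalizes every ordered URL so it ends in "/videos",
# handling both the bare and the trailing-slash forms. The same rule as a
# standalone helper (the sample URLs are illustrative):
def ensure_videos_suffix(url):
    if url.endswith('/videos'):
        return url
    if url.endswith('/'):
        return url + 'videos'
    return url + '/videos'

assert ensure_videos_suffix('http://v.qq.com/u/x') == 'http://v.qq.com/u/x/videos'
assert ensure_videos_suffix('http://v.qq.com/u/x/') == 'http://v.qq.com/u/x/videos'
assert ensure_videos_suffix('http://v.qq.com/u/x/videos') == 'http://v.qq.com/u/x/videos'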
class iqiyi_military_hottest(Spider): name = "iqiyi_military_hottest" pipelines = ['CategoryPipeline', 'MysqlStorePipeline'] spider_id = "512" #iqiyi_military_hottest site_id = "5" #iqiyi allowed_domains = ["list.iqiyi.com","www.iqiyi.com","cache.video.iqiyi.com"] url_prefix = 'http://list.iqiyi.com' playnum_url = 'http://cache.video.iqiyi.com/jp/pc/' playlength_url = "http://cache.video.iqiyi.com/a/" hottest_played_threshold = get_project_settings().get('ORDERED_PLAYED_THRESHOLD') mgr = DbManager.instance() def __init__(self, cat_urls=None, *args, **kwargs): super(iqiyi_military_hottest, self).__init__(*args, **kwargs) if cat_urls: cat_urls = json.loads(cat_urls) self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE') else: cat_urls = self.mgr.get_cat_url("iqiyi") self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE') if cat_urls: self._cat_urls = cat_urls else: self._cat_urls = [] def start_requests(self): try: items = [] for cat in self._cat_urls: items.append(Request(url=cat['url'], callback=self.parse, meta={'cat_id': cat['id']})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) #for each category parse all its sub-categories or types def parse(self, response): try: #log.msg('lev1: %s' % response.request.url) cat_id = response.request.meta['cat_id'] items = [] #category subs = response.xpath('//div[@class="mod_sear_menu mt20 mb30"]/div[2]/ul/li/a/@href').extract() for turl in subs: if turl != "#": url = self.url_prefix+turl items.extend([Request(url=url, callback=self.parse_second, meta={'cat_id': cat_id})]) else: items.extend([Request(url=response.request.url, callback=self.parse_most_played, meta={'cat_id': cat_id})]) inh_item = self.parse_second(response) if inh_item: items.extend(inh_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_second(self,response): try: #log.msg('lev2: %s' % response.request.url) cat_id = response.request.meta['cat_id'] items = [] #category subs = response.xpath('//div[@class="mod_sear_menu mt20 mb30"]/div[3]/ul/li/a/@href').extract() for turl in subs: if turl != "#": url = self.url_prefix+turl items.extend([Request(url=url, callback=self.parse_most_played, meta={'cat_id': cat_id})]) else: items.extend([Request(url=response.request.url, callback=self.parse_most_played, meta={'cat_id': cat_id})]) inh_item = self.parse_most_played(response) if inh_item: items.extend(inh_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) #for each sub-category we get the most played def parse_most_played(self, response): try: #log.msg('lev3: %s' % response.request.url) cat_id = response.request.meta['cat_id'] items = [] url = response.request.url suburl = "------------" index = url.rfind(suburl) #combine all sort types if index > 0: headurl = url[0:index] url11 = headurl + suburl + "10-1-2--1-.html" items.extend([Request(url=url11, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})]) url12 = headurl + suburl + "10-1-2--2-.html" items.extend([Request(url=url12, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})]) url21 = headurl + suburl + "4-1-2--1-.html" items.extend([Request(url=url21, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})]) url22 = headurl + suburl + "4-1-2--2-.html" items.extend([Request(url=url22, callback=self.parse_page,meta={'page': 1, 'cat_id': cat_id})]) #donnot forget parse current reponse's page response.request.meta.update({'page': 1}) 
items.extend(self.parse_page(response)) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_page(self, response): try: #log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page'])) page = response.request.meta['page'] cat_id = response.request.meta['cat_id'] if int(page) > int(self.max_search_page): return items = [] #video items qy_v = response.xpath('//div[@class="wrapper-piclist"]/ul/li/div[1]') for v in qy_v: thumb = v.xpath('./a/img/@src').extract() url = v.xpath('./a/@href').extract() items.append(Request(url=url[0].strip(), callback=self.parse_episode, meta={'cat_id': cat_id, 'thumb': thumb})) #pages next_page = response.xpath("//div[@class='mod-page']/a[text()='%s']/@href" % u'下一页').extract() if next_page: items.append(Request(url=self.url_prefix+next_page[0], callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) cat_id = response.request.meta['cat_id'] thumb_url = response.request.meta['thumb'] items = [] #show_id show_id = Util.get_iqiyi_showid(response.request.url) #space maybe exist: "albumId:326754200" or "albumId: 326754200" albumid = response.selector.re(re.compile(r'albumId: ?(\d+)')) #video info title = response.xpath('//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract() if not title: title = response.xpath('//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract() if not title: title = response.xpath('//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract() if not title: title = response.xpath('//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract() category = response.xpath('//div[@class="crumb_bar"]/span[1]/span/a[2]/text()').extract() if not category: category = response.xpath('//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()').extract() if not category: category = response.xpath('//div[@class="crumb_bar"]/span[1]/a[2]/text()').extract() if not category: category = response.xpath('//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()').extract() upload_time = response.xpath('//div[@class="crumb_bar"]/span[3]/span/text()').extract() if not upload_time: upload_time = response.xpath('//div[@class="crumb_bar"]/span[2]/span/text()').extract() tag = response.xpath('//span[@id="widget-videotag"]/descendant::*/text()').extract() if not tag: tag = response.xpath('//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract() if not tag: tag = response.xpath('//div[@class="crumb_bar"]/span[2]/a/text()').extract() ep_item = EpisodeItem() if title: ep_item['title'] = "".join([t.strip() for t in title]) if show_id: ep_item['show_id'] = show_id if tag: ep_item['tag'] = "|".join([t.strip() for t in tag]) if upload_time: ep_item['upload_time'] = upload_time[0].strip() if category: ep_item['category'] = category[0].strip() if thumb_url: ep_item['thumb_url'] = thumb_url[0].strip() ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url ep_item['cat_id'] = cat_id if albumid: items.append(Request(url=self.playlength_url+albumid[0], callback=self.parse_playlength, meta={'item':ep_item,'albumid':albumid[0]})) else: items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_playlength(self,response): try: 
log.msg('parse_playlength ,%s' % response.request.url) item = response.request.meta['item'] albumid = response.request.meta['albumid'] items = [] #sel = Selector(response) msg = response.body index = msg.find("AlbumInfo=") + len("AlbumInfo=") info = msg[index:] jinfo = json.loads(info) plsylength = jinfo["data"]["playLength"] if plsylength: item['duration'] = str(plsylength) items.append(Request(url=self.playnum_url+albumid+"/?qyid=", callback=self.parse_playnum, meta={'item':item})) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_playnum(self, response): try: #log.msg('parse_playnum ,%s' % response.request.url) item = response.request.meta['item'] items = [] #sel = Selector(response) tplaynum = response.selector.re(re.compile(r':(\d+)')) #log.msg('play: %s, %s' % (tplaynum[0], response.request.url)) if tplaynum: playnum = tplaynum[0] if int(playnum) > int(self.hottest_played_threshold): item['played'] = str(playnum) items.append(item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
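# parse_most_played above rewrites the list URL at the "------------" marker
# to enumerate four sort/filter variants. The same URL surgery in isolation,
# with the suffixes taken verbatim from the spider:
def iqiyi_sort_variants(url):
    suburl = "------------"
    index = url.rfind(suburl)
    if index <= 0:
        return []
    headurl = url[0:index]
    tails = ["10-1-2--1-.html", "10-1-2--2-.html", "4-1-2--1-.html", "4-1-2--2-.html"]
    return [headurl + suburl + tail for tail in tails]

variants = iqiyi_sort_variants("http://list.iqiyi.com/www/6/------------4-1-1--1-.html")
assert len(variants) == 4
assert variants[0].endswith("------------10-1-2--1-.html")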
class v1_cat(Spider): name = "v1_cat" pipelines = ['MysqlStorePipeline'] spider_id = "10" site_id = "17" max_search_page = 1 mgr = DbManager.instance() def __init__(self, *args, **kwargs): super(v1_cat, self).__init__(*args, **kwargs) self._cat_urls = [] try: self._cat_urls = self.mgr.get_cat_url('v1') except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def start_requests(self): try: items = [] for cat in self._cat_urls: items.extend([ Request(url=cat['url'], callback=self.parse_page, meta={ 'cat_name': cat['cat_name'], 'audit': cat['audit'], 'priority': cat['priority'] }) ]) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_page(self, response): try: #log.msg('parse page %s: %s' % (response.request.url, response.request.meta['page'])) cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] items = [] qy_v = response.xpath('//div[@id="addMore"]/ul/li') print len(qy_v) for v in qy_v: thumb = v.xpath('./div[@class="lists"]/a/img/@src').extract() url = v.xpath('./div[@class="lists"]/a/@href').extract() if url: items.append( Request(url=url[0].strip(), callback=self.parse_episode, meta={ 'cat_name': cat_name, 'audit': audit, 'priority': priority, 'thumb': thumb })) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR) def parse_episode(self, response): try: log.msg('parse_episode %s' % response.request.url) #cat_id = response.request.meta['cat_id'] cat_name = response.request.meta['cat_name'] audit = response.request.meta['audit'] priority = response.request.meta['priority'] thumb = response.request.meta['thumb'] items = [] show_id = Util.get_v1_showid(response.request.url) title = response.xpath('//meta[@name="title"]/@content').extract() tags = response.xpath( '//meta[@name="keywords"]/@content').extract() ep_item = EpisodeItem() if title: ep_item['title'] = title[0].strip() if show_id: ep_item['show_id'] = show_id if tags: ep_item['tag'] = tags[0].strip() if thumb: ep_item['thumb_url'] = thumb[0].strip() ep_item['spider_id'] = self.spider_id ep_item['site_id'] = self.site_id ep_item['url'] = response.request.url #ep_item['cat_id'] = cat_id ep_item['category'] = cat_name #ep_item['description'] = item.get("description") ep_item['format_id'] = '2' ep_item['audit'] = audit ep_item['priority'] = priority #ep_item['played'] = item.get('play') #ep_item['upload_time'] = item.get('create') #duration = item.get('duration') #if duration: # a,b=duration.split(':') # duration = int(a)*60+int(b) #else: # duration = 0 #ep_item['duration'] = duration items.append(ep_item) return items except Exception as e: log.msg(traceback.format_exc(), level=log.ERROR)
class YoutubeSearchVideoSpider(Spider):
    name = "youtube_search_video"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "64"
    site_id = "2"
    allowed_domains = ["www.youtube.com"]
    url_prefix = 'https://www.youtube.com'
    mgr = DbManager.instance()

    def __init__(self, keywords=None, *args, **kwargs):
        super(YoutubeSearchVideoSpider, self).__init__(*args, **kwargs)
        if keywords:
            # manual run: keywords arrive as a JSON-encoded list
            keywords = json.loads(keywords)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            # scheduled run: keywords come from the database
            keywords = self.mgr.get_keywords(st='video', site_name='youtube')
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._keywords = keywords if keywords else []

    def start_requests(self):
        try:
            items = []
            for page in xrange(int(self.max_search_page)):
                items.extend([
                    Request(url='https://www.youtube.com/results?filters=video%%2C+week'
                                '&search_sort=video_view_count&search_query=%s&page=%s'
                                % (k['keyword'], page + 1),
                            callback=self.parse,
                            meta={'category': k['user'], 'kw_id': k['id']})
                    for k in self._keywords
                ])
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse(self, response):
        try:
            category = response.request.meta.get('category', 'other')
            kw_id = response.request.meta.get('kw_id', 1)
            log.msg('%s: %s' % (response.request.url, category), level=log.INFO)
            items = []
            # video list
            videos = response.xpath('//ol[@class="item-section"]/li')
            for v in videos:
                url = v.xpath('./div/div/div[@class="yt-lockup-thumbnail"]/a/@href').extract()
                thumb_url = v.xpath('./div/div/div[@class="yt-lockup-thumbnail"]/a/div/img/@src').extract()
                # extracted but not currently used
                views = v.xpath('./div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li/text()').re('([\d|,]*) views')
                upload_time = v.xpath('./div/div/div[@class="yt-lockup-content"]/div[@class="yt-lockup-meta"]/ul/li[2]/text()').extract()
                if url:
                    items.append(
                        Request(url=self.url_prefix + url[0],
                                callback=self.parse_episode,
                                meta={'thumb_url': thumb_url,
                                      'upload_time': upload_time,
                                      'category': category,
                                      'kw_id': kw_id}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('%s' % response.request.url)
            thumb_url = response.request.meta['thumb_url']
            upload_time = response.request.meta['upload_time']
            category = response.request.meta['category']
            kw_id = response.request.meta.get('kw_id', 1)
            items = []
            # owner
            owner = response.xpath('//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath('//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                if owner_url:
                    items.append(Request(url=self.url_prefix + owner_url[0] + "/about",
                                         callback=self.parse_about))
            # video info
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath('./head/meta[@name="keywords"]/@content').extract()
            description = response.xpath('//p[@id="eow-description"]/descendant-or-self::*/text()').extract()
            played = response.xpath('//div[@class="watch-view-count"]/text()').extract()
            # signature timestamp, needed by the get_video_info endpoint
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                ep_item['category'] = category
            if upload_time:
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                pld = Util.normalize_played(played[0])
                ep_item['played'] = pld if pld else '0'
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)
            ep_item['kw_id'] = kw_id
            query = Util.encode({'video_id': ep_item['show_id'],
                                 'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                                 'sts': sts.groups()[0] if sts else ''})
            items.append(Request(url='http://www.youtube.com/get_video_info?' + query,
                                 callback=self.parse_other_info,
                                 meta={'item': ep_item}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_other_info(self, response):
        try:
            log.msg('%s' % response.request.url)
            item = response.request.meta['item']
            items = []
            # duration in seconds, from the get_video_info payload
            duration = re.search(r'length_seconds=(\d+)', response.body)
            if duration:
                item['duration'] = duration.groups()[0]
            items.append(item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_about(self, response):
        try:
            log.msg(response.request.url, level=log.INFO)
            items = []
            show_id = response.xpath('//meta[@itemprop="channelId"]/@content').extract()
            user_name = response.xpath('//span[@class="qualified-channel-title-text"]/a/text()').extract()
            fans = response.xpath('//ul[@class="about-stats"]/li').re(
                re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*subscribers.*</li>', re.S))
            played = response.xpath('//ul[@class="about-stats"]/li').re(
                re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*views.*</li>', re.S))
            intro = response.xpath('//div[@class="about-description branded-page-box-padding"]/descendant-or-self::*/text()').extract()
            if show_id:
                user_item = UserItem()
                user_item['show_id'] = show_id[0]
                if user_name:
                    user_item['user_name'] = user_name[0]
                if fans:
                    user_item['fans'] = Util.normalize_played(fans[0])
                if played:
                    user_item['played'] = Util.normalize_played(played[0])
                if intro:
                    user_item['intro'] = "".join(intro).strip()
                user_item['spider_id'] = self.spider_id
                user_item['site_id'] = self.site_id
                user_item['url'] = response.request.url[:-len('/about')]
                items.append(user_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
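# A hypothetical manual run of the spider above (a sketch, assuming a standard
# Scrapy project layout; the keyword dicts need the 'keyword', 'user' and 'id'
# keys that start_requests() and parse() read):
#
#   scrapy crawl youtube_search_video \
#       -a keywords='[{"id": 1, "keyword": "lego", "user": "toys"}]'
#
# Without -a keywords=..., the list comes from DbManager.get_keywords() and
# MAX_SEARCH_PAGE applies instead of MAX_MANUAL_SEARCH_PAGE.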
class YoukuCatHottestSpider(Spider):
    name = "youku_cat_hottest"
    pipelines = ['HottestItemPipeline', 'CategoryPipeline', 'MysqlStorePipeline']
    spider_id = "2"   # youku_cat_hottest
    site_id = "1"     # youku
    allowed_domains = ["www.youku.com", "v.youku.com", "i.youku.com",
                       "index.youku.com", "play.youku.com"]
    url_prefix = 'http://www.youku.com'
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    hottest_played_threshold = get_project_settings().get('HOTTEST_PLAYED_THRESHOLD')
    mgr = DbManager.instance()
    channel_exclude = mgr.get_channel_exclude()

    def __init__(self, cat_urls=None, *args, **kwargs):
        super(YoukuCatHottestSpider, self).__init__(*args, **kwargs)
        if cat_urls:
            cat_urls = json.loads(cat_urls)
            self.max_search_page = get_project_settings().get('MAX_MANUAL_SEARCH_PAGE')
        else:
            cat_urls = self.mgr.get_cat_url("youku")
            self.max_search_page = get_project_settings().get('MAX_SEARCH_PAGE')
        self._cat_urls = cat_urls if cat_urls else []

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                items.append(Request(url=cat['url'], callback=self.parse,
                                     meta={'cat_id': cat['id']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    # for each category, parse all of its sub-categories (types)
    def parse(self, response):
        try:
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)
            subs = sel.xpath('//div[@class="yk-filter-panel"]/div[2]/ul/li/a/@href').extract()
            items.extend([Request(url=url, callback=self.parse_most_played,
                                  meta={'cat_id': cat_id}) for url in subs])
            # the landing page itself is also a valid "most played" page
            inh_item = self.parse_most_played(response)
            if inh_item:
                items.extend(inh_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    # for each sub-category, follow the "most played" listing
    def parse_most_played(self, response):
        try:
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)
            # u'本周' == "this week"
            most_played = sel.xpath(
                "//div[@class='yk-sort']/div[3]/div/div[@class='panel']/ul/li/a[text()='%s']/@href"
                % u'本周').extract()
            items.extend([Request(url=url, callback=self.parse_page,
                                  meta={'page': 1, 'cat_id': cat_id})
                          for url in most_played])
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            log.msg('%s: %s' % (response.request.url, response.request.meta['page']))
            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']
            if int(page) > int(self.max_search_page):
                return
            items = []
            sel = Selector(response)
            # video items
            yk_v = sel.xpath('//div[@class="yk-col4"]')
            for v in yk_v:
                url = v.xpath('./div/div[@class="v-link"]/a/@href').extract()
                pl = v.xpath('./div/div[@class="v-meta va"]/div[@class="v-meta-entry"]/span/text()').extract()
                if url and pl:
                    # only keep videos above the configured play-count threshold
                    pld = Util.normalize_played(pl[0])
                    if int(pld) >= int(self.hottest_played_threshold):
                        items.append(Request(url=url[0], callback=self.parse_episode,
                                             meta={'cat_id': cat_id}))
            # pagination
            next_page = sel.xpath('//div[@class="yk-pager"]/ul/li[@class="next"]/a/@href').extract()
            if next_page:
                items.append(Request(url=self.url_prefix + next_page[0],
                                     callback=self.parse_page,
                                     meta={'page': page + 1, 'cat_id': cat_id}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('%s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)
            # owner
            owner = sel.xpath('//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return
                items.append(Request(url=owner[0], callback=self.parse_owner))
            # video info
            title = sel.xpath('//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
            category = sel.xpath('//div[@class="base_info"]/div[@class="guide"]/div/a/text()').extract()
            scripts = sel.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = sel.xpath('//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
            description = sel.xpath('//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
            vp_url = sel.xpath('//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title).strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if category:
                # drop the u'频道' ("channel") suffix
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = description[0]
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id
            if vp_url:
                items.append(Request(url=vp_url[0], callback=self.parse_vpaction,
                                     meta={'item': ep_item}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_vpaction(self, response):
        try:
            item = response.request.meta['item']
            sel = Selector(response)
            vp = sel.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                # strip the '总播放:' ("total plays:") label before normalizing
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(vp[0].replace('总播放:', '')))
            show_id = item['show_id']
            return Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_playlength(self, response):
        try:
            item = response.request.meta['item']
            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if playlength:
                item['duration'] = playlength
            return item
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_owner(self, response):
        try:
            log.msg('%s' % response.request.url)
            items = []
            sel = Selector(response)
            user_item = UserItem()
            # owner id
            script = sel.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                return
            # user profile
            up = sel.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
                played = up.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
                fans = up.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()
                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_played(Util.normalize_vp(played[0]))
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])
            # youku profile
            yp = sel.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
                if intro:
                    user_item['intro'] = ''.join(intro)
            # video count
            yh = sel.xpath('//div[@class="YK-home"]')
            vcount = '0'
            if yh:
                video_count = yh.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]
            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url
            items.append(user_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
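# A minimal standalone sketch of the duration lookup that parse_playlength()
# performs above. The payload shape ({"data": {"video": {"seconds": ...}}}) is
# inferred from that code; no other fields of the real get.json response are
# assumed here.
def _youku_duration_from_json(body):
    """Return whole seconds as a string, or None on a malformed payload."""
    try:
        jinfo = json.loads(body)
        return str(int(float(jinfo["data"]["video"]["seconds"])))
    except (ValueError, KeyError, TypeError):
        return None

# e.g. _youku_duration_from_json('{"data": {"video": {"seconds": "61.8"}}}') == '61'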
class PageOrderSpider(CrawlSpider):
    name = 'page_order'
    pipelines = ['MysqlStorePipeline']
    spider_id = "2048"
    format_id = 2
    allowed_domains = ["youku.com", "www.youku.com", "www.iqiyi.com",
                       "cache.video.iqiyi.com", "www.soku.com",
                       "index.youku.com", "play.youku.com"]
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playnum_url = 'http://cache.video.iqiyi.com/jp/pc/'
    playlength_url = "http://cache.video.iqiyi.com/a/"
    youku_playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    mgr = DbManager.instance()
    rules = (
        Rule(LinkExtractor(allow=r'http://v.youku.com/v_show/id_.+\.html.*'),
             callback='parse_episode_youku'),
        Rule(LinkExtractor(allow=r'http://www.iqiyi.com/[vw]_.+\.html'),
             callback='parse_episode_iqiyi'),
    )

    def __init__(self, orders=None, *args, **kwargs):
        super(PageOrderSpider, self).__init__(*args, **kwargs)
        if orders:
            orders = json.loads(orders)
        else:
            orders = self.mgr.get_ordered_page(site_name=['iqiyi', 'youku'])
        self._orders = orders if orders else []

    # overridden so the meta of the seed request (pg_id, audit, ...) is
    # propagated to every link the rules follow
    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [l for l in rule.link_extractor.extract_links(response)
                     if l not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = Request(url=link.url, callback=self._response_downloaded)
                r.meta.update(response.request.meta)
                r.meta.update(rule=n, link_text=link.text)
                yield rule.process_request(r)

    def start_requests(self):
        try:
            items = []
            # no callback: CrawlSpider's default parse() applies the rules
            for page in self._orders:
                items.append(Request(url=page['url'],
                                     meta={'pg_id': page['id'],
                                           'cat_name': page['user'],
                                           'site_id': page['site_id'],
                                           'audit': page['audit'],
                                           'priority': page['priority']}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode_youku(self, response):
        try:
            logging.log(logging.INFO, "episode_youku:%s" % response.request.url)
            pg_id = response.request.meta['pg_id']
            cat_name = response.request.meta['cat_name']
            site_id = response.request.meta['site_id']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            # owner
            owner = response.xpath('//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
            # video info
            title = response.xpath('//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath('//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
            description = response.xpath('//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
            vp_url = response.xpath('//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title).strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # the category comes from the ordered page, not from the video page
            ep_item['category'] = cat_name
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = description[0]
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['url'] = response.request.url
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority
            if vp_url:
                items.append(Request(url=vp_url[0], callback=self.parse_vpaction,
                                     meta={'item': ep_item}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        try:
            logging.log(logging.INFO, "parse_vpaction:%s" % response.request.url)
            item = response.request.meta['item']
            vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                # strip the '总播放:' ("total plays:") label before normalizing
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(vp[0].replace('总播放:', '')))
            show_id = item['show_id']
            return Request(url=self.youku_playlength_url + show_id,
                           callback=self.parse_youku_playlength,
                           meta={'item': item})
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_youku_playlength(self, response):
        try:
            logging.log(logging.INFO, "parse_youku_playlength:%s" % response.request.url)
            item = response.request.meta['item']
            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if playlength:
                item['duration'] = playlength
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode_iqiyi(self, response):
        try:
            logging.log(logging.INFO, "parse_episode_iqiyi:%s" % response.request.url)
            pg_id = response.request.meta['pg_id']
            cat_name = response.request.meta['cat_name']
            site_id = response.request.meta['site_id']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            # show_id
            show_id = Util.get_iqiyi_showid(response.request.url)
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))
            # the title lives in different containers depending on the template
            title = response.xpath('//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()').extract()
            if not title:
                title = response.xpath('//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()').extract()
            upload_time = response.xpath('//div[@class="crumb_bar"]/span[3]/span/text()').extract()
            if not upload_time:
                upload_time = response.xpath('//div[@class="crumb_bar"]/span[2]/span/text()').extract()
            tag = response.xpath('//span[@id="widget-videotag"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//span[@class="mod-tags_item vl-block"]/descendant::*/text()').extract()
            if not tag:
                tag = response.xpath('//div[@class="crumb_bar"]/span[2]/a/text()').extract()

            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            # the category comes from the ordered page, not from the video page
            ep_item['category'] = cat_name
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['url'] = response.request.url
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority
            if albumid:
                items.append(Request(url=self.playlength_url + albumid[0],
                                     callback=self.parse_playlength,
                                     meta={'item': ep_item, 'albumid': albumid[0]}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        try:
            logging.log(logging.INFO, "parse_playlength:%s" % response.request.url)
            item = response.request.meta['item']
            albumid = response.request.meta['albumid']
            items = []
            # the body is JSONP-like: everything after "AlbumInfo=" is JSON
            msg = response.body
            index = msg.find("AlbumInfo=") + len("AlbumInfo=")
            jinfo = json.loads(msg[index:])
            playlength = jinfo["data"]["playLength"]
            if playlength:
                item['duration'] = str(playlength)
            items.append(Request(url=self.playnum_url + albumid + "/?qyid=",
                                 callback=self.parse_playnum,
                                 meta={'item': item}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playnum(self, response):
        try:
            logging.log(logging.INFO, "parse_playnum:%s" % response.request.url)
            item = response.request.meta['item']
            items = []
            tplaynum = response.selector.re(re.compile(r':(\d+)'))
            if tplaynum:
                item['played'] = str(tplaynum[0])
            items.append(item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
class YoukuSearchVideoSpider(Spider):
    name = "youku_search_video"
    pipelines = ['MysqlStorePipeline']
    spider_id = "1024"
    site_id = "1"
    format_id = 2
    url_prefix = 'http://www.soku.com'
    vpaction_url = "http://v.youku.com/v_vpactionInfo/id/"
    playlength_url = "http://play.youku.com/play/get.json?ct=10&vid="
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(YoukuSearchVideoSpider, self).__init__(*args, **kwargs)
        # note: the keyword list is passed under the argument name "kwargs"
        keywords = kwargs.get('kwargs')
        if keywords:
            keywords = json.loads(keywords)
        else:
            keywords = self.mgr.get_keyword_url(site_name='youku')
        self._keywords = keywords if keywords else []

    def start_requests(self):
        try:
            items = []
            for kw in self._keywords:
                items.append(Request(url=kw['url'], callback=self.parse,
                                     meta={'audit': kw['audit'],
                                           'cat_name': kw['user'],
                                           'kw_id': kw['id'],
                                           'priority': kw['priority']}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse(self, response):
        try:
            logging.log(logging.INFO, "parse:%s" % response.request.url)
            audit = response.request.meta['audit']
            cat_name = response.request.meta['cat_name']
            kw_id = response.request.meta['kw_id']
            priority = response.request.meta['priority']
            items = []
            # video items on the search result page
            yk_v = response.xpath('//div[@class="sk-vlist clearfix"]/div[@class="v"]')
            for v in yk_v:
                url = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-title"]/a/@href').extract()
                thumb_urls = v.xpath('./div[@class="v-link"]/a/@href').extract()
                # discard the site's placeholder thumbnail
                thumb_url = None
                if thumb_urls and thumb_urls[0] != 'http://g1.ykimg.com/':
                    thumb_url = thumb_urls[0]
                # u'播放: ' is the "plays:" label next to the view count
                pl = v.xpath('./div[@class="v-meta va"]/div[@class="v-meta-entry"]/div/label[text()="%s"]/../span/text()'
                             % u'播放: ').extract()
                played = int(Util.normalize_played(pl[0])) if pl else None
                if url:
                    items.append(Request(url=url[0], callback=self.parse_episode,
                                         meta={'audit': audit,
                                               'thumb_url': thumb_url,
                                               'played': played,
                                               'cat_name': cat_name,
                                               'kw_id': kw_id,
                                               'priority': priority}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_episode(self, response):
        try:
            logging.log(logging.INFO, 'episode:%s' % response.request.url)
            audit = response.request.meta['audit']
            thumb_url = response.request.meta['thumb_url']
            played = response.request.meta['played']
            cat_name = response.request.meta['cat_name']
            kw_id = response.request.meta['kw_id']
            priority = response.request.meta['priority']
            items = []
            # owner
            owner = response.xpath('//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                items.append(Request(url=owner[0], callback=self.parse_owner))
            # video info
            title = response.xpath('//div[@class="base_info"]/h1/descendant-or-self::text()').extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath('//div[@class="yk-videoinfo"]/div[@class="time"]/text()').extract()
            description = response.xpath('//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()').extract()
            vp_url = response.xpath('//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                t = "".join(title).strip("\n").strip()
                ep_item['title'] = Util.strip_title(t)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # the category comes from the keyword record, not from the page
            ep_item['category'] = cat_name
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(datetime.now(), t)
            if description:
                ep_item['description'] = description[0]
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['thumb_url'] = thumb_url
            ep_item['played'] = played
            ep_item['kw_id'] = kw_id
            ep_item['priority'] = priority
            if vp_url:
                items.append(Request(url=vp_url[0], callback=self.parse_vpaction,
                                     meta={'item': ep_item}))
            else:
                items.append(ep_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_vpaction(self, response):
        try:
            logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
            item = response.request.meta['item']
            vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                # strip the '总播放:' ("total plays:") label before normalizing
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(vp[0].replace('总播放:', '')))
            show_id = item['show_id']
            return Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_playlength(self, response):
        try:
            logging.log(logging.INFO, 'playlength:%s' % response.request.url)
            item = response.request.meta['item']
            jinfo = json.loads(response.body)
            playlength = str(int(float(jinfo["data"]["video"]["seconds"])))
            if playlength:
                item['duration'] = playlength
            return item
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_owner(self, response):
        try:
            logging.log(logging.INFO, "owner:%s" % response.request.url)
            items = []
            user_item = UserItem()
            # owner id
            script = response.xpath('/html/head/script')
            owner_id = script.re('ownerId = \"(\d+)\"')
            show_id = script.re('ownerEncodeid = \'(.+)\'')
            if owner_id:
                user_item['owner_id'] = owner_id[0]
            if show_id:
                user_item['show_id'] = show_id[0]
            else:
                return
            # user profile
            up = response.xpath('//div[@class="profile"]')
            if up:
                user_name = up.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
                played = up.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
                fans = up.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()
                if user_name:
                    user_item['user_name'] = user_name[0]
                if played:
                    user_item['played'] = Util.normalize_played(Util.normalize_vp(played[0]))
                if fans:
                    user_item['fans'] = Util.normalize_vp(fans[0])
            # youku profile
            yp = response.xpath('//div[@class="YK-profile"]')
            if yp:
                intro = yp.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
                if intro:
                    user_item['intro'] = ''.join(intro)
            # video count
            yh = response.xpath('//div[@class="YK-home"]')
            vcount = None
            if yh:
                video_count = yh.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
                if video_count:
                    vcount = video_count[0]
            user_item['vcount'] = vcount
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id
            user_item['url'] = response.request.url
            items.append(user_item)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
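# A hypothetical manual run (a sketch): this spider reads its keyword list
# from a spider argument literally named "kwargs", so:
#
#   scrapy crawl youku_search_video \
#       -a kwargs='[{"id": 1, "url": "http://www.soku.com/search_video/q_foo", "user": "other", "audit": 1, "priority": 0}]'
#
# Each dict needs the 'url', 'user', 'id', 'audit' and 'priority' keys that
# start_requests() reads; the URL shown is illustrative only.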
class Acfun_cat(Spider):
    name = "acfun_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "3"
    site_id = "12"
    max_search_page = 1
    request_url = "http://www.acfun.tv/dynamic/channel/1.aspx?channelId=%s&orderBy=0&pageSize=16"
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(Acfun_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('acfun')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                url = self.request_url % Util.get_acfun_showid(cat['url'])
                items.append(Request(url=url, callback=self.parse_page,
                                     meta={'cat_name': cat['cat_name'],
                                           'audit': cat['audit'],
                                           'priority': cat['priority']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            # video items
            qy_v = response.xpath('//body/div')
            for v in qy_v:
                thumb = v.xpath('./a[@class="thumb"]/img/@src').extract()
                url = v.xpath('./a[@class="thumb"]/@href').extract()
                lens = v.xpath('./a[@class="thumb"]/p/text()').extract()
                # duration on the list page is "mm:ss"; fall back to 0 so the
                # episode page can resolve it via the getVideo.aspx API
                if lens:
                    try:
                        a, b = lens[0].strip().split(':')
                        lens = int(a) * 60 + int(b)
                    except Exception as e:
                        lens = 0
                else:
                    lens = 0
                if url:
                    items.append(Request(url="http://www.acfun.tv%s" % url[0].strip(),
                                         callback=self.parse_episode,
                                         meta={'cat_name': cat_name, 'thumb': thumb,
                                               'audit': audit, 'priority': priority,
                                               'lens': lens}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            lens = response.request.meta['lens']
            items = []
            show_id = response.xpath('//div[@id="block-data-view"]/@data-aid').extract()
            title = response.xpath('//div[@id="block-data-view"]/@data-title').extract()
            tags = response.xpath('//div[@id="block-data-view"]/@data-tags').extract()
            if lens == 0:
                # duration was not on the list page; ask the getVideo API.
                # Note: if data-sid is also missing, no item is emitted.
                data_sid = response.xpath('//div[@id="area-part-view"]/div/a/@data-sid').extract()
                if data_sid:
                    second_request = "http://www.acfun.tv/video/getVideo.aspx?id=" + data_sid[0].strip()
                    items.append(Request(url=second_request,
                                         callback=self.parse_duration,
                                         meta={'cat_name': cat_name, 'thumb': thumb_url,
                                               'audit': audit, 'priority': priority,
                                               'show_id': show_id, 'title': title,
                                               'tags': tags,
                                               'url': response.request.url}))
                return items
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id[0].strip()
            if tags:
                ep_item['tag'] = tags[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_duration(self, response):
        try:
            items = []
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            title = response.request.meta['title']
            show_id = response.request.meta['show_id']
            tags = response.request.meta['tags']
            url = response.request.meta['url']
            data = json.loads(response.body)
            success = data.get('success')
            if not success or success == 'false':
                return items
            duration = data.get('time')
            if not duration:
                return items
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id[0].strip()
            if tags:
                ep_item['tag'] = tags[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = int(duration)
            items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
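# The "mm:ss" parsing above appears twice in this module (here and in
# ifeng_cat.parse_page below); an equivalent helper, shown as a sketch. Note
# that "hh:mm:ss" strings are not handled, matching the inline code.
def _mmss_to_seconds(text):
    """'04:31' -> 271; returns 0 for missing or malformed input."""
    try:
        a, b = text.strip().split(':')
        return int(a) * 60 + int(b)
    except (AttributeError, ValueError):
        return 0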
class ifeng_cat(Spider):
    name = "ifeng_cat"
    pipelines = ['MysqlStorePipeline']
    spider_id = "9"
    site_id = "4"
    max_search_page = 1
    mgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        super(ifeng_cat, self).__init__(*args, **kwargs)
        self._cat_urls = []
        try:
            self._cat_urls = self.mgr.get_cat_url('ifeng')
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def start_requests(self):
        try:
            items = []
            for cat in self._cat_urls:
                log.msg('%s' % cat, level=log.INFO)
                items.append(Request(url=cat['url'], callback=self.parse_page,
                                     meta={'cat_name': cat['cat_name'],
                                           'audit': cat['audit'],
                                           'priority': cat['priority']}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_page(self, response):
        try:
            cat_name = response.request.meta['cat_name']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            items = []
            # video items
            qy_v = response.xpath('//div[@class="listwrap"]/div/ul/li')
            for v in qy_v:
                thumb = v.xpath('./div[@class="pic"]/a/img/@src').extract()
                url = v.xpath('./div[@class="pic"]/a/@href').extract()
                lens = v.xpath('./div[@class="pic"]/span[@class="sets"]/text()').extract()
                # duration is "mm:ss"; default to 0 when missing or malformed
                if lens:
                    try:
                        a, b = lens[0].strip().split(':')
                        lens = int(a) * 60 + int(b)
                    except Exception as e:
                        lens = 0
                else:
                    lens = 0
                if url:
                    items.append(Request(url=url[0].strip(),
                                         callback=self.parse_episode,
                                         meta={'cat_name': cat_name, 'thumb': thumb,
                                               'audit': audit, 'priority': priority,
                                               'lens': lens}))
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)

    def parse_episode(self, response):
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_name = response.request.meta['cat_name']
            thumb_url = response.request.meta['thumb']
            audit = response.request.meta['audit']
            priority = response.request.meta['priority']
            lens = response.request.meta['lens']
            items = []
            show_id = Util.get_ifeng_showid(response.request.url)
            title = response.xpath('//head/meta[@property="og:title"]/@content').extract()
            tags = response.xpath('//div[@class="protag"]/a/text()').extract()
            upload_time = response.xpath('//div[@class="vTit_wrap"]/div/p/span[@class="data"]/text()').extract()
            # video info
            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tags:
                ep_item['tag'] = '|'.join(tags)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            if upload_time:
                ep_item['upload_time'] = upload_time[0]
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            items.append(ep_item)
            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
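# ifeng_cat takes no spider arguments: its category URLs always come from
# DbManager.get_cat_url('ifeng'). A hypothetical run, assuming a standard
# Scrapy project layout (a sketch):
#
#   scrapy crawl ifeng_cat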