def detail_page(self, response):
    """Scrape a variety-show detail page and queue its play-source requests.

    Reads the labelled metadata rows under ``dl.dlTxt dd em.emTit``, derives
    the original id from ``response.url``, schedules one ``self.crawl`` call
    per play-source API name (callback ``self.jsonYearList``), and returns
    the scraped fields.

    Args:
        response: fetched page. ``response.doc`` is queried with CSS
            selectors; ``response.save`` must provide the ``small_image``
            and ``rating`` keys.

    Returns:
        dict with the keys ``url``, ``meta_title``, ``title``,
        ``introduction``, ``poster_image``, ``categories``, ``platform``,
        ``location``, ``hosts``, ``small_image``, ``orig_id``, ``rating``
        and ``is_play_source`` (always 0 here).
    """
    # --- basic information -------------------------------------------------
    platform = location = None
    catList = []
    hostList = []
    # TODO: wrap this label dispatch in a helper; it is too messy right now.
    for each in response.doc('dl.dlTxt dd em.emTit'):
        label_node = pq(each)
        label = label_node.text()
        if label == u'类型:':
            catList.extend(pq(cat).text() for cat in label_node.siblings('a'))
        elif label == u'播出平台:':
            platform = label_node.siblings().eq(0).text() or None
        elif label == u'国家/地区:':
            location = label_node.siblings().eq(0).text() or None
        elif label == u'主持人/嘉宾:':
            hostList.extend(pq(host).text() for host in label_node.siblings('a'))

    # Original id, taken from the page URL.
    orig_id = getId(response.url, r'.*zongyi/zy_(\d+)/$')

    # --- play sources ------------------------------------------------------
    # The first tab carries its API name on #playNumTabFirst; the remaining
    # sources are the anchors inside .sourceMoreList.
    apis = [response.doc('#playNumTabFirst').attr.apiname]
    for each in response.doc('.sourceMoreList a').items():
        if each is not None and len(each) > 0:
            apis.append(each.attr.apiname)

    for apiname in apis:
        save = {"api": apiname, "variety_id": orig_id}
        self.crawl(self.ajaxBaseUrl + makeAjaxParam(api=apiname, id=orig_id),
                   callback=self.jsonYearList, save=save)

    # --- images ------------------------------------------------------------
    # The poster URL is resolved up front so it can serve as the fallback
    # below; the fallback previously referenced an undefined name and raised
    # NameError whenever no small image had been passed along.
    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "platform": platform or '',
        "location": location or '',
        "hosts": hostList,
        "small_image": small_image or '',
        "orig_id": orig_id,
        "rating": response.save['rating'],
        "is_play_source": 0,
    }
def detail_page(self, response):
    """Scrape a variety-show detail page and queue its play-source requests.

    Reads the labelled metadata rows under ``dl.dlTxt dd em.emTit``, derives
    the original id from ``response.url``, schedules one ``self.crawl`` call
    per play-source API name (callback ``self.jsonYearList``), and returns
    the scraped fields.

    Args:
        response: fetched page. ``response.doc`` is queried with CSS
            selectors; ``response.save`` must provide the ``small_image``
            and ``rating`` keys.

    Returns:
        dict with the keys ``url``, ``meta_title``, ``title``,
        ``introduction``, ``poster_image``, ``categories``, ``platform``,
        ``location``, ``hosts``, ``small_image``, ``orig_id``, ``rating``
        and ``is_play_source`` (always 0 here).
    """
    # --- basic information -------------------------------------------------
    platform = location = None
    catList = []
    hostList = []
    # TODO: wrap this label dispatch in a helper; it is too messy right now.
    for each in response.doc('dl.dlTxt dd em.emTit'):
        label_node = pq(each)
        label = label_node.text()
        if label == u'类型:':
            catList.extend(pq(cat).text() for cat in label_node.siblings('a'))
        elif label == u'播出平台:':
            platform = label_node.siblings().eq(0).text() or None
        elif label == u'国家/地区:':
            location = label_node.siblings().eq(0).text() or None
        elif label == u'主持人/嘉宾:':
            hostList.extend(pq(host).text() for host in label_node.siblings('a'))

    # Original id, taken from the page URL.
    orig_id = getId(response.url, r'.*zongyi/zy_(\d+)/$')

    # --- play sources ------------------------------------------------------
    # The first tab carries its API name on #playNumTabFirst; the remaining
    # sources are the anchors inside .sourceMoreList.
    apis = [response.doc('#playNumTabFirst').attr.apiname]
    for each in response.doc('.sourceMoreList a').items():
        if each is not None and len(each) > 0:
            apis.append(each.attr.apiname)

    for apiname in apis:
        save = {"api": apiname, "variety_id": orig_id}
        self.crawl(self.ajaxBaseUrl + makeAjaxParam(api=apiname, id=orig_id),
                   callback=self.jsonYearList, save=save)

    # --- images ------------------------------------------------------------
    # The poster URL is resolved up front so it can serve as the fallback
    # below; the fallback previously referenced an undefined name and raised
    # NameError whenever no small image had been passed along.
    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "platform": platform or '',
        "location": location or '',
        "hosts": hostList,
        "small_image": small_image or '',
        "orig_id": orig_id,
        "rating": response.save['rating'],
        "is_play_source": 0,
    }
def detail_page(self, response):
    """Scrape an anime detail page: metadata, play sources and status.

    Calls ``self.handleMetaRedirect(response)`` first, then reads the
    labelled metadata rows under ``dl.dlTxt dd em.emTit``, builds a mapping
    of play-source API name to ``{episode label: url}``, decides whether the
    series is finished, and returns the scraped fields.

    Args:
        response: fetched page. ``response.doc`` is queried with CSS
            selectors; ``response.save`` must provide the ``small_image``
            key.

    Returns:
        dict with the keys ``url``, ``meta_title``, ``title``, ``rating``,
        ``introduction``, ``poster_image``, ``categories``, ``year``,
        ``location``, ``alias``, ``play_source``, ``closed`` (0 or 1),
        ``upd_desc``, ``small_image`` and ``orig_id``.
    """
    self.handleMetaRedirect(response)

    # --- basic information -------------------------------------------------
    year = location = alias = upd_desc = None
    catList = []
    # TODO: wrap this label dispatch in a helper; it is too messy right now.
    for each in response.doc('dl.dlTxt dd em.emTit'):
        label_node = pq(each)
        label = label_node.text()
        if label == u'类型:':
            catList.extend(pq(cat).text() for cat in label_node.siblings('a'))
        elif label == u'年代:':
            year = label_node.siblings().eq(0).text() or None
        elif label == u'国家/地区:':
            location = label_node.siblings().eq(0).text() or None
        elif label == u'别名:':
            alias = label_node.siblings().eq(0).text() or None

    # --- play sources ------------------------------------------------------
    playSources = {}
    for each in response.doc('.sourceList').items():
        source_id = each.attr.id
        if source_id is not None and len(source_id) > 4:
            # The API name is the element id minus its 4-character suffix.
            api = source_id[:-4]
            episodes = {}
            for episode in pq(each).find('.numList>a').items():
                href = episode.attr.href
                # Skip placeholder anchors whose href is a javascript: link.
                if href and href[:10] != 'javascript':
                    episodes[pq(episode).text()] = delUrlParams(href)
            # NOTE(review): the source was whitespace-collapsed; this
            # assignment is assumed to sit after the episode loop, so a
            # source with no episodes still gets an empty entry -- confirm.
            playSources[api] = episodes

    # One source may be split across two lists because it has too many
    # episodes (e.g. "sohu" and "sohu_con_"); fold the overflow list back
    # into its base entry. Iterate over a snapshot because entries are
    # deleted while looping, and avoid dict.has_key / list-concatenated
    # items(), which only exist on Python 2.
    for key in list(playSources):
        overflow_key = key + '_con_'
        if overflow_key in playSources:
            merged = dict(playSources[key])
            merged.update(playSources[overflow_key])
            playSources[key] = merged
            del playSources[overflow_key]

    # --- finished or still updating (closed: 0 | 1) ------------------------
    closed = 0
    wrap = response.doc('.pTxt .sDes')
    sep = wrap.find('i')
    # NOTE(review): the original nesting was lost with the indentation; this
    # reads it as "still updating when the text mentions an update or an <i>
    # separator exists, otherwise closed" -- confirm against the upstream
    # file.
    if re.match(u'.*更新.*', wrap.text()) or sep:
        if sep:
            # Last content node of the description, minus its first two
            # characters.
            upd_desc = wrap.contents()[-1][2:] or None
    else:
        closed = 1

    # Original id, taken from the page URL.
    orig_id = getId(response.url, r'.*dm/(\d+)\.html$')

    # --- images ------------------------------------------------------------
    # The poster URL is resolved up front so it can serve as the fallback
    # below; the fallback previously referenced an undefined name and raised
    # NameError whenever no small image had been passed along.
    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "rating": response.doc('.sScore em').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "year": year or '',
        "location": location or '',
        "alias": alias or '',
        "play_source": playSources,
        "closed": closed,
        "upd_desc": upd_desc or '',
        "small_image": small_image or '',
        "orig_id": orig_id,
    }
def detail_page(self, response):
    """Scrape an anime detail page: metadata, play sources and status.

    Calls ``self.handleMetaRedirect(response)`` first, then reads the
    labelled metadata rows under ``dl.dlTxt dd em.emTit``, builds a mapping
    of play-source API name to ``{episode label: url}``, decides whether the
    series is finished, and returns the scraped fields.

    Args:
        response: fetched page. ``response.doc`` is queried with CSS
            selectors; ``response.save`` must provide the ``small_image``
            key.

    Returns:
        dict with the keys ``url``, ``meta_title``, ``title``, ``rating``,
        ``introduction``, ``poster_image``, ``categories``, ``year``,
        ``location``, ``alias``, ``play_source``, ``closed`` (0 or 1),
        ``upd_desc``, ``small_image`` and ``orig_id``.
    """
    self.handleMetaRedirect(response)

    # --- basic information -------------------------------------------------
    year = location = alias = upd_desc = None
    catList = []
    # TODO: wrap this label dispatch in a helper; it is too messy right now.
    for each in response.doc('dl.dlTxt dd em.emTit'):
        label_node = pq(each)
        label = label_node.text()
        if label == u'类型:':
            catList.extend(pq(cat).text() for cat in label_node.siblings('a'))
        elif label == u'年代:':
            year = label_node.siblings().eq(0).text() or None
        elif label == u'国家/地区:':
            location = label_node.siblings().eq(0).text() or None
        elif label == u'别名:':
            alias = label_node.siblings().eq(0).text() or None

    # --- play sources ------------------------------------------------------
    playSources = {}
    for each in response.doc('.sourceList').items():
        source_id = each.attr.id
        if source_id is not None and len(source_id) > 4:
            # The API name is the element id minus its 4-character suffix.
            api = source_id[:-4]
            episodes = {}
            for episode in pq(each).find('.numList>a').items():
                href = episode.attr.href
                # Skip placeholder anchors whose href is a javascript: link.
                if href and href[:10] != 'javascript':
                    episodes[pq(episode).text()] = delUrlParams(href)
            # NOTE(review): the source was whitespace-collapsed; this
            # assignment is assumed to sit after the episode loop, so a
            # source with no episodes still gets an empty entry -- confirm.
            playSources[api] = episodes

    # One source may be split across two lists because it has too many
    # episodes (e.g. "sohu" and "sohu_con_"); fold the overflow list back
    # into its base entry. Iterate over a snapshot because entries are
    # deleted while looping, and avoid dict.has_key / list-concatenated
    # items(), which only exist on Python 2.
    for key in list(playSources):
        overflow_key = key + '_con_'
        if overflow_key in playSources:
            merged = dict(playSources[key])
            merged.update(playSources[overflow_key])
            playSources[key] = merged
            del playSources[overflow_key]

    # --- finished or still updating (closed: 0 | 1) ------------------------
    closed = 0
    wrap = response.doc('.pTxt .sDes')
    sep = wrap.find('i')
    # NOTE(review): the original nesting was lost with the indentation; this
    # reads it as "still updating when the text mentions an update or an <i>
    # separator exists, otherwise closed" -- confirm against the upstream
    # file.
    if re.match(u'.*更新.*', wrap.text()) or sep:
        if sep:
            # Last content node of the description, minus its first two
            # characters.
            upd_desc = wrap.contents()[-1][2:] or None
    else:
        closed = 1

    # Original id, taken from the page URL.
    orig_id = getId(response.url, r'.*dm/(\d+)\.html$')

    # --- images ------------------------------------------------------------
    # The poster URL is resolved up front so it can serve as the fallback
    # below; the fallback previously referenced an undefined name and raised
    # NameError whenever no small image had been passed along.
    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "rating": response.doc('.sScore em').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "year": year or '',
        "location": location or '',
        "alias": alias or '',
        "play_source": playSources,
        "closed": closed,
        "upd_desc": upd_desc or '',
        "small_image": small_image or '',
        "orig_id": orig_id,
    }