def source_list_page(self, response):
    """Parse an episode-list fragment (JSON payload with an 'html' field).

    Fills ``response.save`` with:
      - ``'source'``: one dict per episode (img, desc, date, guests, url)
      - ``'is_play_source'``: flag set to 1 marking this as play-source data
    and returns the save dict.
    """
    save = response.save
    save['source'] = []
    save['is_play_source'] = 1
    # NOTE(review): removed a no-op `if save['api']: pass` — its only effect
    # was raising KeyError when 'api' was absent from response.save.
    for each in pq(response.json['html']).find('ul.ulPic li').items():
        img = pq(each).find('.pic img')
        # Guest names are the child elements of the .sDes node.
        guests = [pq(x).text() for x in pq(each).find('.sDes').children()]
        episode = {
            "img": img.attr.src or '',
            "desc": img.attr.alt or '',
            # [1:-1] strips the first/last char — presumably surrounding
            # brackets around the date; TODO confirm against live markup.
            "date": pq(each).find('.pic .sExplanation em').text()[1:-1] or '',
            "guests": guests or '',
            "url": delUrlParams(pq(each).find('.aPlayBtn').attr.href)
        }
        save['source'].append(episode)
    return save
def detail_page(self, response):
    """Extract show metadata and per-source episode links from a detail page.

    Returns a dict with title/rating/introduction/poster fields, the category
    list, year, location, alias, merged play sources, a completion flag
    (``closed``: 0 while updating, 1 when finished) and the original id
    parsed from the URL.
    """
    self.handleMetaRedirect(response)

    # -- basic info ---------------------------------------------------------
    year = location = alias = upd_desc = None
    catList = []
    for each in response.doc('dl.dlTxt dd em.emTit'):
        # TODO: extract this label dispatch into a helper; still messy.
        label = pq(each).text()
        if label == u'类型:':
            for cat in pq(each).siblings('a'):
                catList.append(pq(cat).text())
        elif label == u'年代:':
            year = pq(each).siblings().eq(0).text() or None
        elif label == u'国家/地区:':
            location = pq(each).siblings().eq(0).text() or None
        elif label == u'别名:':
            alias = pq(each).siblings().eq(0).text() or None

    # -- play-source urls ---------------------------------------------------
    playSources = {}
    for each in response.doc('.sourceList').items():
        # Element id looks like "<api>XXXX"; the last 4 chars are a suffix.
        if each.attr.id is not None and len(each.attr.id) > 4:
            api = each.attr.id[:-4]
            episodes = {}
            for episode in pq(each).find('.numList>a').items():
                href = episode.attr.href
                if href and href[:10] != 'javascript':
                    episodes[pq(episode).text()] = delUrlParams(href)
            playSources[api] = episodes

    # sohu_con and sohu_con_list are really one play source split over two
    # lists (too many episodes); merge '<key>_con_' back into '<key>'.
    # list(...) so we may delete while iterating (required on Python 3);
    # `in` + update() replaces the Python-2-only has_key / items()+items().
    for key in list(playSources.keys()):
        extra = key + '_con_'
        if extra in playSources:
            playSources[key].update(playSources[extra])
            del playSources[extra]

    # -- completion flag ----------------------------------------------------
    closed = 0
    wrap = response.doc('.pTxt .sDes')
    sep = wrap.find('i')
    if sep:
        # Last text node carries the update description; the first 2 chars
        # are presumably a separator prefix — TODO confirm against markup.
        upd_desc = wrap.contents()[-1][2:] or None
    else:
        closed = 1

    # -- original id from the url, e.g. .../dm/12345.html -------------------
    orig_id = getId(response.url, '.*dm/(\d+)\.html$')

    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    # BUG FIX: the original fell back to an undefined `poster_image` name
    # (NameError whenever small_image was empty); define it first and reuse
    # it for both fields.
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "rating": response.doc('.sScore em').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "year": year or '',
        "location": location or '',
        "alias": alias or '',
        "play_source": playSources,
        "closed": closed,
        "upd_desc": upd_desc or '',
        "small_image": small_image or '',
        "orig_id": orig_id
    }
def detail_page(self, response):
    """Extract show metadata and per-source episode links from a detail page.

    Returns a dict with title/rating/introduction/poster fields, the category
    list, year, location, alias, merged play sources, a completion flag
    (``closed``: 0 while updating, 1 when finished) and the original id
    parsed from the URL.
    """
    self.handleMetaRedirect(response)

    # -- basic info ---------------------------------------------------------
    year = location = alias = upd_desc = None
    catList = []
    for each in response.doc('dl.dlTxt dd em.emTit'):
        # TODO: extract this label dispatch into a helper; still messy.
        label = pq(each).text()
        if label == u'类型:':
            for cat in pq(each).siblings('a'):
                catList.append(pq(cat).text())
        elif label == u'年代:':
            year = pq(each).siblings().eq(0).text() or None
        elif label == u'国家/地区:':
            location = pq(each).siblings().eq(0).text() or None
        elif label == u'别名:':
            alias = pq(each).siblings().eq(0).text() or None

    # -- play-source urls ---------------------------------------------------
    playSources = {}
    for each in response.doc('.sourceList').items():
        # Element id looks like "<api>XXXX"; the last 4 chars are a suffix.
        if each.attr.id is not None and len(each.attr.id) > 4:
            api = each.attr.id[:-4]
            episodes = {}
            for episode in pq(each).find('.numList>a').items():
                href = episode.attr.href
                if href and href[:10] != 'javascript':
                    episodes[pq(episode).text()] = delUrlParams(href)
            playSources[api] = episodes

    # sohu_con and sohu_con_list are really one play source split over two
    # lists (too many episodes); merge '<key>_con_' back into '<key>'.
    # list(...) so we may delete while iterating (required on Python 3);
    # `in` + update() replaces the Python-2-only has_key / items()+items().
    for key in list(playSources.keys()):
        extra = key + '_con_'
        if extra in playSources:
            playSources[key].update(playSources[extra])
            del playSources[extra]

    # -- completion flag ----------------------------------------------------
    closed = 0
    wrap = response.doc('.pTxt .sDes')
    sep = wrap.find('i')
    if sep:
        # Last text node carries the update description; the first 2 chars
        # are presumably a separator prefix — TODO confirm against markup.
        upd_desc = wrap.contents()[-1][2:] or None
    else:
        closed = 1

    # -- original id from the url, e.g. .../dm/12345.html -------------------
    orig_id = getId(response.url, '.*dm/(\d+)\.html$')

    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    # BUG FIX: the original fell back to an undefined `poster_image` name
    # (NameError whenever small_image was empty); define it first and reuse
    # it for both fields.
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "rating": response.doc('.sScore em').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "year": year or '',
        "location": location or '',
        "alias": alias or '',
        "play_source": playSources,
        "closed": closed,
        "upd_desc": upd_desc or '',
        "small_image": small_image or '',
        "orig_id": orig_id
    }
def detail_page(self, response):
    """Extract movie metadata (cast, director, categories, year, location),
    merged play sources and completion state from a detail page; also
    schedules a crawl of the plot-list page.

    Returns a dict of the extracted fields (``is_plot`` fixed at 0).
    """
    # -- basic info ---------------------------------------------------------
    director = year = location = upd_desc = None
    castingList = []
    catList = []
    for each in response.doc('dl.dlTxt dd em.emTit'):
        # TODO: extract this label dispatch into a helper; still messy.
        label = pq(each).text()
        if label == u'主演:':
            for cast in pq(each).siblings('a'):
                # Skip the "show all cast" link.
                if pq(cast).text() != u'全部主演>':
                    castingList.append(pq(cast).text())
        elif label == u'导演:':
            director = pq(each).siblings().eq(0).text() or None
        elif label == u'类型:':
            for cat in pq(each).siblings('a'):
                catList.append(pq(cat).text())
        elif label == u'国家/地区:':
            location = pq(each).siblings().eq(0).text() or None

    # Year fallback: parse "上映于<year>年" out of the meta description;
    # default to 1980 when absent.
    if year is None:
        content = response.doc('meta[name=Description]').eq(0).attr.content or ''
        matchObj = re.search(u'.*上映于(\d+)年.*', content)
        # BUG FIX: the original tested `year is not None` here, which is
        # always False inside this `year is None` branch, so the year was
        # unconditionally 1980. Test the match object instead.
        if matchObj is not None:
            year = matchObj.group(1)
        else:
            year = 1980

    # original id from the url, e.g. .../detail/12345.html
    orig_id = re.search('.*/detail/(\d+).html$', response.url).group(1)

    # -- play-source urls ---------------------------------------------------
    playSources = {}
    for each in response.doc('.sourceList').items():
        # Guard missing/short ids like the sibling detail_page does
        # (the original crashed with TypeError when attr.id was None).
        if each.attr.id is None or len(each.attr.id) <= 4:
            continue
        api = each.attr.id[:-4]
        episodes = {}
        for episode in pq(each).find('.numList>a').items():
            href = episode.attr.href
            # Skip javascript links and the "episode plots" tab link.
            if (href and href[:10] != 'javascript'
                    and pq(episode).text() != u'分集剧情'):
                episodes[pq(episode).text()] = delUrlParams(href)
        playSources[api] = episodes

    # sohu_con and sohu_con_list are really one play source split over two
    # lists (too many episodes); merge '<key>_con_' back into '<key>'.
    # list(...) so we may delete while iterating (required on Python 3);
    # `in` + update() replaces the Python-2-only has_key / items()+items().
    for key in list(playSources.keys()):
        extra = key + '_con_'
        if extra in playSources:
            playSources[key].update(playSources[extra])
            del playSources[extra]

    # -- completion flag: closed is 0 while updating, 1 when finished -------
    closed = 0
    wrap = response.doc('.pTxt .sDes')
    sep = wrap.find('i')
    if sep:
        # Last text node carries the update description; the first 2 chars
        # are presumably a separator prefix — TODO confirm against markup.
        upd_desc = wrap.contents()[-1][2:]
    else:
        closed = 1

    # Crawl the plot-list page in parallel (last tab link).
    self.crawl(response.doc('.pNumTab>a:last-child').attr.href,
               callback=self.plot_list_page)

    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    # BUG FIX: the original fell back to an undefined `poster_image` name
    # (NameError whenever small_image was empty); define it first and reuse
    # it for both fields.
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "rating": response.doc('.sScore em').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "year": year or '',
        "location": location or '',
        "play_source": playSources,
        "small_image": small_image or '',
        "casting": castingList,
        "director": director or '',
        "closed": closed,
        "upd_desc": upd_desc or '',
        "is_plot": 0,
        "orig_id": orig_id
    }
def detail_page(self, response):
    """Extract movie metadata (cast, director, categories, year, location),
    merged play sources and completion state from a detail page; also
    schedules a crawl of the plot-list page.

    Returns a dict of the extracted fields (``is_plot`` fixed at 0).
    """
    # -- basic info ---------------------------------------------------------
    director = year = location = upd_desc = None
    castingList = []
    catList = []
    for each in response.doc('dl.dlTxt dd em.emTit'):
        # TODO: extract this label dispatch into a helper; still messy.
        label = pq(each).text()
        if label == u'主演:':
            for cast in pq(each).siblings('a'):
                # Skip the "show all cast" link.
                if pq(cast).text() != u'全部主演>':
                    castingList.append(pq(cast).text())
        elif label == u'导演:':
            director = pq(each).siblings().eq(0).text() or None
        elif label == u'类型:':
            for cat in pq(each).siblings('a'):
                catList.append(pq(cat).text())
        elif label == u'国家/地区:':
            location = pq(each).siblings().eq(0).text() or None

    # Year fallback: parse "上映于<year>年" out of the meta description;
    # default to 1980 when absent.
    if year is None:
        content = response.doc('meta[name=Description]').eq(0).attr.content or ''
        matchObj = re.search(u'.*上映于(\d+)年.*', content)
        # BUG FIX: the original tested `year is not None` here, which is
        # always False inside this `year is None` branch, so the year was
        # unconditionally 1980. Test the match object instead.
        if matchObj is not None:
            year = matchObj.group(1)
        else:
            year = 1980

    # original id from the url, e.g. .../detail/12345.html
    orig_id = re.search('.*/detail/(\d+).html$', response.url).group(1)

    # -- play-source urls ---------------------------------------------------
    playSources = {}
    for each in response.doc('.sourceList').items():
        # Guard missing/short ids like the sibling detail_page does
        # (the original crashed with TypeError when attr.id was None).
        if each.attr.id is None or len(each.attr.id) <= 4:
            continue
        api = each.attr.id[:-4]
        episodes = {}
        for episode in pq(each).find('.numList>a').items():
            href = episode.attr.href
            # Skip javascript links and the "episode plots" tab link.
            if (href and href[:10] != 'javascript'
                    and pq(episode).text() != u'分集剧情'):
                episodes[pq(episode).text()] = delUrlParams(href)
        playSources[api] = episodes

    # sohu_con and sohu_con_list are really one play source split over two
    # lists (too many episodes); merge '<key>_con_' back into '<key>'.
    # list(...) so we may delete while iterating (required on Python 3);
    # `in` + update() replaces the Python-2-only has_key / items()+items().
    for key in list(playSources.keys()):
        extra = key + '_con_'
        if extra in playSources:
            playSources[key].update(playSources[extra])
            del playSources[extra]

    # -- completion flag: closed is 0 while updating, 1 when finished -------
    closed = 0
    wrap = response.doc('.pTxt .sDes')
    sep = wrap.find('i')
    if sep:
        # Last text node carries the update description; the first 2 chars
        # are presumably a separator prefix — TODO confirm against markup.
        upd_desc = wrap.contents()[-1][2:]
    else:
        closed = 1

    # Crawl the plot-list page in parallel (last tab link).
    self.crawl(response.doc('.pNumTab>a:last-child').attr.href,
               callback=self.plot_list_page)

    poster_image = response.doc('.posterCon .pic>img').attr.src or ''
    # BUG FIX: the original fell back to an undefined `poster_image` name
    # (NameError whenever small_image was empty); define it first and reuse
    # it for both fields.
    small_image = response.save['small_image']
    if not small_image:
        small_image = poster_image

    return {
        "url": response.url,
        "meta_title": response.doc('title').text(),
        "title": response.doc('h1 a').text() or '',
        "rating": response.doc('.sScore em').text() or '',
        "introduction": response.doc('#pIntroId').text() or '',
        "poster_image": poster_image,
        "categories": catList,
        "year": year or '',
        "location": location or '',
        "play_source": playSources,
        "small_image": small_image or '',
        "casting": castingList,
        "director": director or '',
        "closed": closed,
        "upd_desc": upd_desc or '',
        "is_plot": 0,
        "orig_id": orig_id
    }