def parse_third(self, response):
    """Extract detail-page URLs and extra fields from a listing response.

    Reads ``Index_Url``, ``Target_Detail_Page`` and ``Final_Xpath`` from
    ``response.meta``.  The XPath stored in ``Target_Detail_Page['xpath']``
    is evaluated against the response and its matches are handed to
    ``Relative_to_Absolute`` together with ``Index_Url`` and the spider
    name.  When ``Target_Detail_Page`` carries a ``'Some_Info'`` mapping,
    each XPath in it is evaluated and the first match is stored in
    ``Some_Info`` under the same key.

    Args:
        response: response object exposing ``meta`` and ``xpath()``.

    Raises:
        KeyError: if ``Index_Url`` is missing from ``response.meta``.
        TypeError: if ``Target_Detail_Page`` is absent (``None``), because
            it is subscripted unconditionally.

    NOTE(review): as shown here, ``detail_url``, ``Some_Info`` and
    ``Final_Xpath`` are computed but nothing is returned or yielded --
    confirm whether follow-up requests are expected from this callback.
    """
    Index_Url = response.meta['Index_Url']
    Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
    Final_Xpath = response.meta.get('Final_Xpath', None)

    detail_url = Relative_to_Absolute(
        Index_Url,
        response.xpath(Target_Detail_Page['xpath']).extract(),
        self.name,
    )

    Some_Info = {}
    if 'Some_Info' in Target_Detail_Page:
        info_xpaths = Target_Detail_Page['Some_Info']
        for key in info_xpaths:
            # Best effort per field: an XPath with no match raises
            # IndexError on [0]; report it and carry on with the next key.
            try:
                Some_Info[key] = response.xpath(info_xpaths[key]).extract()[0]
            except Exception as e:
                # Single-argument print() emits the same text on Python 2
                # and 3 as the former ``print Exception, ":", e``.
                print("%s : %s" % (Exception, e))
def parse_json2(self, response):
    """Collect detail-page URLs from a JSON listing response.

    ``All_Detail_Page['index']`` (taken from ``response.meta``) is used as a
    key path into the decoded JSON body: every key except the last is
    followed downwards, the value reached is iterated as a list, and the
    last key is read from each entry of that list.  The collected values
    are then passed to ``Relative_to_Absolute`` together with ``Index_Url``
    and the spider name.

    Args:
        response: response object exposing ``meta`` and
            ``body_as_unicode()``; the body must be valid JSON.

    Raises:
        TypeError: if ``All_Detail_Page`` is absent (``None``), because it
            is subscripted unconditionally.
        ValueError: if the body is not valid JSON (from ``json.loads``).

    If the key path does not lead to a list (for example an error payload
    without the expected keys), a diagnostic is printed and the URL list is
    treated as empty instead of failing on ``None``.  Entries of the list
    are expected to be JSON objects.

    NOTE(review): as shown here, ``Signal_Detail_Page``,
    ``Target_Detail_Page`` and ``Final_Xpath`` are read but not used, and
    nothing is returned or yielded -- confirm against the rest of the
    spider.
    """
    Index_Url = response.meta.get('Index_Url', None)
    All_Detail_Page = response.meta.get('All_Detail_Page', None)
    Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
    Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
    Final_Xpath = response.meta.get('Final_Xpath', None)

    res_json = json.loads(response.body_as_unicode())
    index_path = All_Detail_Page['index']

    # Descend to the value of the deepest container: the innermost list has
    # to be iterated, so only the keys up to len - 1 are followed here and
    # the final key is read from each entry below.
    for key in index_path[:-1]:
        if not isinstance(res_json, dict):
            # An earlier key was missing or pointed at a non-object.
            res_json = None
            break
        res_json = res_json.get(key)

    if isinstance(res_json, list):
        entries = res_json
    else:
        print("parse_json2: no list found at %r in JSON response"
              % (index_path[:-1],))
        entries = []

    detail_url = [entry.get(index_path[-1]) for entry in entries]

    # Best effort: if the conversion fails, keep the raw values and report.
    try:
        detail_url = Relative_to_Absolute(Index_Url, detail_url, self.name)
    except Exception as e:
        # Single-argument print() emits the same text on Python 2 and 3 as
        # the former ``print Exception, ":", e``.
        print("%s : %s" % (Exception, e))
Final_Xpath = response.meta.get('Final_Xpath', None) Some_Info = {} if 'Some_Info' in All_Detail_Page.keys(): keys = All_Detail_Page['Some_Info'].keys() for key in keys: try: Some_Info[key] = response.xpath( All_Detail_Page['Some_Info'][key]).extract()[0] except Exception, e: print Exception, ":", e #一个页面可能会需要多个提取的xpath,这里就指定为一个list了 detail_url = [] for xpath in All_Detail_Page['xpath']: for url in Relative_to_Absolute(Index_Url, response.xpath(xpath).extract(), self.name): detail_url.append(url) #在考虑在每一层加一个判断,相当于如果没有(第一个)要传递给下一层的数据,就直接传递给final_parse(注:在传递给final_parse时需要判断是否需要渲染,这里我暂时先默认都渲染,但是之后可以考虑在config.json的Final_Xpath加一个flag,1表示需要渲染,0表示不需要) if Signal_Detail_Page is None: for url in detail_url: request = Request( url, callback=self.parse_final, dont_filter=True, meta={ 'splash': { 'endpoint': 'render.html', 'args': { #只有aiyiyi需要load 10s,才能拿到播放量 'wait': 0.5,