Code example #1
    def parse_third(self, response):
        # Read the crawl context handed down from the previous level.
        Index_Url = response.meta['Index_Url']
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)
        # Resolve the extracted detail-page links against the index URL.
        detail_url = Relative_to_Absolute(
            Index_Url,
            response.xpath(Target_Detail_Page['xpath']).extract(),
            self.name)
        Some_Info = {}
        # Optionally pull extra fields configured under 'Some_Info'.
        if 'Some_Info' in Target_Detail_Page:
            for key in Target_Detail_Page['Some_Info']:
                try:
                    Some_Info[key] = response.xpath(
                        Target_Detail_Page['Some_Info'][key]).extract()[0]
                except Exception as e:
                    print("Exception: %s" % e)
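The helper Relative_to_Absolute is referenced throughout these examples but never shown. From its call sites it appears to take the index-page URL, a list of extracted (possibly relative) hrefs, and the spider name, and return absolute URLs. A minimal sketch under that assumption (the body below is guessed for illustration, not the project's actual helper):

try:
    from urllib.parse import urljoin   # Python 3
except ImportError:
    from urlparse import urljoin       # Python 2

def Relative_to_Absolute(index_url, hrefs, spider_name):
    # Hypothetical sketch: resolve each extracted href against the index URL.
    # spider_name is accepted only to match the call sites above; a real
    # implementation might key per-site fixes on it.
    return [urljoin(index_url, href) for href in hrefs]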
Code example #2
    def parse_json2(self, response):
        Index_Url = response.meta.get('Index_Url', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)
        detail_url = []
        res_json = json.loads(response.body_as_unicode())
        # Walk down the nested keys to reach the value under the bottom-level
        # key (finally worked this out): the last key in
        # All_Detail_Page['index'] names a field inside a list, so we only
        # descend to len - 1 and then loop over that list to collect detail_url.
        depth = 0
        length = len(All_Detail_Page['index'])
        while depth < length - 1:
            res_json = res_json.get(All_Detail_Page['index'][depth])
            depth += 1
        # print("now the res_json is %s" % res_json)
        for i in res_json:
            detail_url.append(i.get(All_Detail_Page['index'][length - 1]))
        try:
            detail_url = Relative_to_Absolute(Index_Url, detail_url, self.name)
        except Exception as e:
            print("Exception: %s" % e)
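To make the nested-key walk above concrete, here is a small standalone illustration. The config shape ({"index": ["data", "list", "url"]}) and the sample response body are assumptions made up for this example, not the project's actual config.json:

import json

# Assumed config entry: every key except the last selects a nested dict,
# and the last key names the URL field inside the bottom-level list.
All_Detail_Page = {"index": ["data", "list", "url"]}

sample_body = '{"data": {"list": [{"url": "/v/1.html"}, {"url": "/v/2.html"}]}}'

res_json = json.loads(sample_body)
keys = All_Detail_Page["index"]
for key in keys[:-1]:                     # descend to res_json["data"]["list"]
    res_json = res_json.get(key)
detail_url = [item.get(keys[-1]) for item in res_json]
print(detail_url)                         # ['/v/1.html', '/v/2.html']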
Code example #3
        Final_Xpath = response.meta.get('Final_Xpath', None)
        Some_Info = {}
        # Optionally pull extra fields configured under 'Some_Info'.
        if 'Some_Info' in All_Detail_Page:
            for key in All_Detail_Page['Some_Info']:
                try:
                    Some_Info[key] = response.xpath(
                        All_Detail_Page['Some_Info'][key]).extract()[0]
                except Exception as e:
                    print("Exception: %s" % e)
        # A single page may need more than one extraction XPath, so it is
        # configured as a list here.
        detail_url = []

        for xpath in All_Detail_Page['xpath']:
            for url in Relative_to_Absolute(Index_Url,
                                            response.xpath(xpath).extract(),
                                            self.name):
                detail_url.append(url)
        # Considering adding a check at each level: if there is no data (for the
        # first level) to pass down to the next level, hand off directly to
        # final_parse. (Note: when handing off to final_parse we need to decide
        # whether rendering is required; for now everything is rendered by
        # default, but later a flag could be added to Final_Xpath in config.json,
        # where 1 means render and 0 means no rendering.)
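        # A hypothetical sketch of such a config.json flag (the key names below
        # are assumptions for illustration, not part of the current config):
        #
        #   "Final_Xpath": {"render": 1, "xpath": ["..."]}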
        if Signal_Detail_Page is None:
            for url in detail_url:
                request = Request(
                    url,
                    callback=self.parse_final,
                    dont_filter=True,
                    meta={
                        'splash': {
                            'endpoint': 'render.html',
                            'args': {
                                # only aiyiyi needs to load for 10s before the
                                # play count is available
                                'wait': 0.5,