コード例 #1
0
	def homeSubjectParse(self, response):
		"""Parse the home-page subject listing and emit one CategoryItem.

		Seeds the list with a synthetic root entry ('首页'), then walks each
		subject's child mix data. Children that carry a name are recorded
		directly; nameless children are resolved with an extra request to the
		subject API (handled by parse_subject_info).
		"""
		content = json.loads(response.body.decode('utf-8'))
		if 'data' not in content:
			return
		root_name = '首页'
		root_path = [root_name]
		root_path_id = [0]
		subject_list = [self.build_subject_info(0, root_name, root_path, root_path_id, 1, 3)]

		for subject in content['data']:
			mix_entries = self.get_child_subject_info(subject)  ## child mix info of the activity
			if not mix_entries:
				continue
			for entry in mix_entries:
				child_id = entry['subject_id']
				child_name = entry['name']
				if child_name:
					subject_list.append(self.build_subject_info(
						child_id, child_name,
						root_path + [child_name], root_path_id + [child_id]))
				else:
					## no name: fetch the subject info from the API
					yield scrapy.Request(
						'http://apiv3.yangkeduo.com/subject/' + str(child_id),
						meta={'path': root_path, 'path_id': root_path_id},
						callback=self.parse_subject_info,
						headers=self.make_headers(),
						dont_filter=True,
						errback=self.errback_httpbin)

		item = CategoryItem()
		item['cat_list'] = subject_list
		yield item
コード例 #2
0
 def brand_parse_subject(self, response):
     """ 品牌馆 """
     # Parse the brand-hall page. The subject tabs live in a JSON blob
     # embedded in the HTML ({"props"...}); each tab's numeric subject id is
     # extracted from its web_url.
     body = response.body.decode()
     logging.debug(json.dumps({'body': body}))
     path = response.meta["path"]
     path_id = response.meta["path_id"]
     result = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
     if result:
         subject_list = []
         result = json.loads(result.group())
         tabList = self.dict_get(result, 'tabList', None)
         if tabList and len(tabList) > 0:
             a = 1
             for i in tabList:
                 # Guard the digit search: the old code called .group() on a
                 # possibly-None match and crashed on urls without digits.
                 id_match = re.search(r"\d+", str(i["web_url"]))
                 if id_match is None:
                     continue
                 subject_id = id_match.group()
                 name = i['tab_name']
                 subject_info = self.build_subject_info(
                     subject_id, name, path + [name],
                     path_id + [subject_id], 31, 3, a)
                 a += 1
                 subject_list.append(subject_info)
                 self.save_log(
                     json.dumps({"subject_info_brand": subject_info}))
             item = CategoryItem()
             logging.debug(json.dumps({'subject_list_brand': subject_list}))
             self.save_log(json.dumps({'subject_list_brand': subject_list}))
             item['cat_list'] = subject_list
             yield item
コード例 #3
0
 def shopping_parse_subject(self, response):
     """ 爱逛街 """
     # Emit one subject entry per element of the response's "list" field.
     meta_path = response.meta["path"]
     meta_path_id = response.meta["path_id"]
     raw = response.body.decode('utf-8')
     logging.debug(json.dumps({'body': raw}))
     entries = self.dict_get(json.loads(raw), 'list', None)
     if not entries:
         return
     subject_list = []
     for rank, entry in enumerate(entries, start=1):
         tab_id = entry["tab_id"]
         title = entry["subject"]
         info = self.build_subject_info(tab_id, title,
                                        meta_path + [title],
                                        meta_path_id + [tab_id],
                                        51, 5, rank)
         subject_list.append(info)
         self.save_log(json.dumps({"subject_info_shopping": info}))
     item = CategoryItem()
     logging.debug(json.dumps({'subject_list_shopping': subject_list}))
     self.save_log(json.dumps({'subject_list_shopping': subject_list}))
     item['cat_list'] = subject_list
     yield item
コード例 #4
0
 def parse_subject_banner(self, response):
     """Home-page carousel (首页轮播).

     Extracts the {"store"...} JSON blob embedded in the page, takes its
     pageTitle as the common name, collects every banner id, and emits a
     CategoryItem with one subject entry per id.
     """
     subject_list = []
     body = response.body.decode('utf-8')
     logging.debug(json.dumps({'body': body}))
     # The page embeds its state as a JSON blob ending in "ssr":true.
     result = re.search(r'{"store".*?"ssr":true}', body)
     path = response.meta['path']
     path_id = response.meta['path_id']
     if result:
         result_dict = json.loads(result.group())
         result_str = result.group()
         try:
             name = result_dict["store"]["pageTitle"]
         except Exception:
             # Missing/odd structure: fall back to an empty name.
             name = ''
         # Ids appear either unquoted or quoted before "DIYGoodsIDs";
         # try the unquoted form first.
         f = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
         if not f:
             f = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
         # Pull the bare digits out of the matched fragments' list repr.
         subject_id_list = re.findall(r"\d+", str(f))
         a = 1
         for subject_id in subject_id_list:
             subject_info = self.build_subject_info(subject_id, name,
                                                    path + [name],
                                                    path_id + [subject_id],
                                                    61, 8, a)
             a += 1
             subject_list.append(subject_info)
             self.save_log(json.dumps({"subject_info_banner":
                                       subject_info}))
         item = CategoryItem()
         logging.debug(json.dumps({'subject_list_banner': subject_list}))
         self.save_log(json.dumps({'subject_list_banner': subject_list}))
         item['cat_list'] = subject_list
         yield item
コード例 #5
0
    def get_third_category(self, response):
        """Append third-level categories from opt_infos and emit the list."""
        payload = json.loads(response.body.decode('utf-8'))
        if not payload['opt_infos']:
            return
        cat_name = response.meta['cat_name']
        cat_id = response.meta['cat_id']
        cat_list = response.meta['cat_list']
        first_name = cat_name['first_name']
        second_name = cat_name['second_name']
        first_id = str(cat_id['first_id'])
        second_id = str(cat_id['second_id'])

        for info in payload['opt_infos']:
            opt_name = info['opt_name']
            opt_id = info['id']
            # path / path_id are '>'-joined three-level strings.
            cat_list.append({
                'subject_id': opt_id,
                'name': opt_name,
                'type': 2,
                'path': '>'.join([first_name, second_name, opt_name]),
                'path_id': '>'.join([first_id, second_id, str(opt_id)]),
            })

        item = CategoryItem()
        item['cat_list'] = cat_list
        yield item
コード例 #6
0
 def short_parse_subject(self, response):
     """ 断码清仓"""
     # Extract filterTabList from the JSON blob embedded in the page and
     # emit one subject entry per tab.
     path = response.meta["path"]
     path_id = response.meta["path_id"]
     body = response.body.decode()
     logging.debug(json.dumps({'body': body}))
     result = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
     if result:
         result = json.loads(result.group())
         result = self.dict_get(result, 'filterTabList', None)
         subject_list = []
         a = 1
         if result and len(result) > 0:
             for i in result:
                 subject_id = i["id"]
                 # Read brand_name via dict_get instead of regex-matching the
                 # dict's repr: the old approach kept the quotes and leading
                 # space in the name, and crashed when no match was found.
                 name = self.dict_get(i, 'brand_name', None)
                 if not name:
                     continue
                 subject_info = self.build_subject_info(
                     subject_id, name, path + [name],
                     path_id + [subject_id], 21, 6, a)
                 a += 1
                 subject_list.append(subject_info)
                 self.save_log(
                     json.dumps({"subject_info_short": subject_info}))
             item = CategoryItem()
             logging.debug(json.dumps({'subject_list_short': subject_list}))
             self.save_log(json.dumps({'subject_list_short': subject_list}))
             item['cat_list'] = subject_list
             yield item
コード例 #7
0
	def parse_keyword_extra(self, response):
		"""Parse keyword-extension data, retrying up to three times on failure.

		On success the extension data is cached in ssdb and merged with the
		base keyword data; after three failed attempts the keyword is emitted
		with an empty extension payload. (Removed a dead `pass` statement.)
		"""
		keyword_data = response.meta['keyword_data']
		times = response.meta['times']
		query_data = response.meta['query_data']
		get_data = True

		result = json.loads(response.body.decode('utf-8'))
		if 'errorCode' in result.keys() and result['errorCode'] == 1000000:
			keyword_extend_data = result['result'][0]
			keyword = keyword_extend_data['word']

			keyword_merge_data = self.merge_keyword_data(keyword_data, keyword_extend_data)
			if keyword_merge_data is False:
				get_data = False
			else:
				self.ssdb_client.hset(self.keyword_extend_hash, keyword, json.dumps(keyword_extend_data))
		else:
			get_data = False

		if get_data is False:  ## no data obtained
			if times < 3:  ## fewer than three attempts so far
				meta = {'keyword_data': keyword_data, 'times': times + 1, 'query_data': query_data}
				headers = self.make_headers()
				yield scrapy.Request(response.url, method="POST", meta=meta, body=query_data, headers=headers,
									 dont_filter=True, callback=self.parse_keyword_extra)
			else:
				## give up: merge with an empty extension payload
				keyword_merge_data = self.merge_keyword_data(keyword_data, {})
				get_data = True

		if get_data is True:
			keywordItem = CategoryItem()
			keywordItem['cat_list'] = keyword_merge_data
			yield keywordItem
コード例 #8
0
	def parse(self, response):
		"""Parse the category tree; recurse into non-leaf categories.

		Emits one CategoryItem with all categories found in this response and
		schedules a child request per category below level 3. On auth failure
		(error_code 43001) the pdd login info is refreshed.
		(Removed a dead `pass` statement.)
		"""
		cat_list = []

		categoryInfo = json.loads(response.body.decode('utf-8'))  ## bytes -> str -> dict

		if 'errorCode' in categoryInfo.keys() and categoryInfo['errorCode'] == 1000000:
			for cat in categoryInfo['result']:
				cat_id = cat['id']
				cat_name = cat['cat_name']
				parent_id = cat['parent_id']
				cat_level = cat['level']
				info = {'cat_id': cat_id, 'cat_name': cat_name, 'level': cat_level, 'parent_id': parent_id}
				info['cat_id_1'] = cat['cat_id_1']
				info['cat_id_2'] = cat['cat_id_2']
				info['cat_id_3'] = cat['cat_id_3']
				info['cat_id_4'] = cat['cat_id_4']

				cat_list.append(info)
				if cat_level != 3:  ## not a leaf: fetch its children
					headers = self.make_headers()
					yield scrapy.FormRequest(self.url + '?&parentId=' + str(cat_id), callback=self.parse, headers=headers)
			CatItem = CategoryItem()
			CatItem['cat_list'] = cat_list
			yield CatItem

		elif 'error_code' in categoryInfo.keys() and categoryInfo['error_code'] == 43001:
			self.get_pdd_login_info()
コード例 #9
0
 def special_parse_subject(self, response):
     """ 9块9特卖 """
     # Emit one subject entry per element of the response's "list" field.
     meta_path = response.meta["path"]
     meta_path_id = response.meta["path_id"]
     raw = response.body.decode()
     logging.debug(json.dumps({'body': raw}))
     data = json.loads(raw)
     if 'list' not in data or len(data['list']) == 0:
         return
     subject_list = []
     for rank, entry in enumerate(data['list'], start=1):
         tab_id = entry["tab_id"]
         title = entry["subject"]
         info = self.build_subject_info(tab_id, title,
                                        meta_path + [title],
                                        meta_path_id + [tab_id],
                                        41, 4, rank)
         subject_list.append(info)
         self.save_log(json.dumps({"subject_info_special": info}))
     item = CategoryItem()
     logging.debug(json.dumps({'subject_list_special': subject_list}))
     self.save_log(json.dumps({'subject_list_special': subject_list}))
     item['cat_list'] = subject_list
     yield item
コード例 #10
0
    def parse_subjects(self, response):
        """Emit one CategoryItem holding every subject in the response list."""
        result = json.loads(response.body.decode('utf-8'))
        base_path = response.meta['path']

        data_list = [
            {
                'subject_id': entry['subject_id'],
                'path': base_path + [entry['subject']],
                'name': entry['subject'],
                'type': 1,
                'activity_type': 2,
                'path_id': []
            }
            for entry in result['list']
        ]

        item = CategoryItem()
        item['cat_list'] = data_list
        yield item
コード例 #11
0
 def kill_parse_subject(self, response):
     """ 限时秒杀"""
     # Extract brandList from the JSON blob embedded in the page and append
     # one subject entry per brand to the list carried in meta.
     path = response.meta["path"]
     path_id = response.meta["path_id"]
     subject_list = response.meta["subject_list"]
     self.save_log(json.dumps({"kill_subject_list": subject_list}))
     body = response.body.decode()
     logging.debug(json.dumps({'body': body}))
     # Check the match object BEFORE calling .group(): the old code called
     # .group() unconditionally, so a missing blob raised AttributeError and
     # the `if result:` guard below could never trigger.
     match = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}', body)
     if match:
         result = json.loads(match.group())
         result = self.dict_get(result, 'brandList', None)
         if result:
             a = 1
             for i in result:
                 subject_id = i["data"]["id"]
                 name = i["data"]["name"]
                 subject_info = self.build_subject_info(
                     subject_id, name, path + [name],
                     path_id + [subject_id], 14, 7, a)
                 a += 1
                 subject_list.append(subject_info)
                 self.save_log(
                     json.dumps({"subject_info_kill": subject_info}))
             item = CategoryItem()
             logging.debug(json.dumps({'subject_list_kill': subject_list}))
             self.save_log(json.dumps({'subject_list_kill': subject_list}))
             item['cat_list'] = subject_list
             yield item
コード例 #12
0
 def short_parse_subject(self, response):
     """ 断码清仓"""
     # Extract filterTabList from the JSON blob embedded in the page and
     # emit one subject entry per tab.
     meta_path = response.meta["path"]
     meta_path_id = response.meta["path_id"]
     blob = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}',
                      response.body.decode())
     if blob is None:
         return
     tabs = self.dict_get(json.loads(blob.group()), 'filterTabList', None)
     if not tabs:
         return
     subject_list = []
     for rank, tab in enumerate(tabs, start=1):
         tab_id = tab["id"]
         tab_name = tab['tabName']
         info = self.build_subject_info_brand_time(
             tab_id, tab_name, meta_path + [tab_name],
             meta_path_id + [tab_id], 21, 6, rank)
         subject_list.append(info)
         self.save_log(json.dumps({"subject_info_short": info}))
     item = CategoryItem()
     item['cat_list'] = subject_list
     yield item
コード例 #13
0
 def special_parse_subject(self, response):
     """ 9块9特卖 """
     # Extract tabList from the JSON blob embedded in the page and emit one
     # subject entry per tab.
     meta_path = response.meta["path"]
     meta_path_id = response.meta["path_id"]
     blob = re.search(r'{"props".*?"https://cdn.yangkeduo.com"}',
                      response.body.decode())
     if blob is None:
         return
     tabs = self.dict_get(json.loads(blob.group()), 'tabList', None)
     if not tabs:
         return
     subject_list = []
     for rank, tab in enumerate(tabs, start=1):
         tab_id = tab["tab_id"]
         title = tab["subject"]
         info = self.build_subject_info_brand_time(
             tab_id, title, meta_path + [title],
             meta_path_id + [tab_id], 41, 4, rank)
         subject_list.append(info)
         self.save_log(json.dumps({"subject_info_special": info}))
     item = CategoryItem()
     item['cat_list'] = subject_list
     yield item
コード例 #14
0
    def parse(self, response):
        """ 获取首页活动信息"""
        # Parse the home page: emit one CategoryItem covering the pull-down
        # activity list (crossSlideList). carouselData is read only for
        # debug logging here.
        body = response.body.decode("utf-8")
        result = json.loads(body)

        logging.debug(json.dumps({'result': result}))
        list_activity = self.dict_get(result, 'crossSlideList', None)
        logging.debug(json.dumps({'list_activity': list_activity}))
        banner = self.dict_get(result, 'carouselData', None)
        logging.debug(json.dumps({'banner': banner}))
        if list_activity:
            subject_list = []
            # Synthetic root entry for the home-page goods feed.
            subject_info = self.build_subject_info(71, "首页商品", "首页商品", [71],
                                                   71, 1)
            subject_list.append(subject_info)
            a = 0
            for activity in list_activity:
                subject_list_id = re.findall(r"'brand_id': '\d+'", str(activity))
                subject_id_list = list(set(subject_list_id))
                a += 1
                logging.debug(json.dumps({'brand_id': subject_list_id}))
                if subject_id_list:
                    if len(subject_id_list) == 1:
                        name = self.dict_get(activity, "subject", None)
                        subject_id = self.dict_get(activity, "subject_id", None)
                        subject_info = self.build_subject_info(
                            subject_id, name, name, [subject_id], 72, 1, a)
                        subject_list.append(subject_info)
                        logging.debug(
                            json.dumps({'subject_info_home_1': subject_info}))
                        self.save_log(
                            json.dumps({"subject_info_home_1": subject_info}))
                    else:
                        # Distinct inner loop variable: the original reused
                        # `i` for both loops, shadowing the outer activity.
                        b = 0
                        for child in activity["subject_list"]:
                            b += 1
                            subject_id = child['p_rec']["brand_id"]
                            name = child["name"]
                            subject_info = self.build_subject_info(
                                subject_id, name, name, [subject_id], 72, 1, a,
                                b)
                            subject_list.append(subject_info)
                            logging.debug(
                                json.dumps(
                                    {'subject_info_home_2': subject_info}))
                            self.save_log(
                                json.dumps(
                                    {"subject_info_home_2": subject_info}))

            item = CategoryItem()
            logging.debug(json.dumps({'subject_list_home': subject_list}))
            self.save_log(json.dumps({'subject_list_home': subject_list}))
            item['cat_list'] = subject_list
            yield item
コード例 #15
0
	def curl_sub_info(self, response):
		"""Fill in a subject's name (and extend its path) from the subject API."""
		payload = json.loads(response.body.decode('utf-8'))
		name = payload['subject']

		info = response.meta['sub_info']
		info['path'] = info['path'] + [name]
		info['name'] = name

		result = CategoryItem()
		result['cat_list'] = [info]
		yield result
コード例 #16
0
def parse_category(response):
    """Load category names and their URLs from the #floor_1 classify block."""
    anchor = ('//div[@id="floor_1"]/div[@class="classify_kind"]'
              '/ul[@class="classify_kind_detail"]/li/a')
    loader = ItemLoader(item=CategoryItem(), response=response)
    loader.add_xpath("category", anchor + '/text()')
    loader.add_xpath("url", anchor + '/@href')
    # e.g. category "影视写真" pairs with url
    # "http://category.dangdang.com/cp01.01.13.00.00.00.html"
    return loader.load_item()
コード例 #17
0
    def parse(self, response):
        """Parse one page of ranked keywords for a category and paginate.

        Emits one CategoryItem per keyword; when more results remain and the
        page cap is not reached, schedules the next page with updated meta.
        (Removed a dangling no-op string literal at the end of the method.)
        """
        result = json.loads(response.body.decode('utf-8'))
        logging.debug(json.dumps(result))
        if 'errorCode' in result.keys() and result['errorCode'] == 1000000:
            if result['result']['items'] is None:
                return None
            meta = response.meta
            cat = meta['cat']
            cat_id = cat['cat_id']
            level = cat['level']
            day = meta['day']
            rank = meta['rank']
            page = meta['page']

            for keyword_item in result['result']['items']:
                keyword = keyword_item['query']
                rank += 1
                keyword_data = {}
                keyword_data['keyword'] = keyword
                keyword_data['category'] = cat_id
                keyword_data['cat_id_1'] = cat['cat_id_1']

                # Levels 2 and 3 carry their intermediate category ids too.
                if level == 2 or level == 3:
                    keyword_data['cat_id_2'] = cat['cat_id_2']
                    if level == 3:
                        keyword_data['cat_id_3'] = cat['cat_id_3']

                keyword_data['hotness'] = keyword_item['heat']
                keyword_data['richness'] = 27
                keyword_data['rank'] = rank
                keyword_data['day'] = day

                keywords_data = self.merge_keyword_data(keyword_data, {})
                keywordItem = CategoryItem()
                keywordItem['cat_list'] = keywords_data
                yield keywordItem

            # Paginate while more results remain and the page cap allows.
            if result['result']['count'] > rank and page < self.max_page:
                page += 1
                meta['page'] = page
                meta['rank'] = rank
                query_data = self.build_query_data(cat, page, self.size, day)
                headers = self.make_headers()
                yield scrapy.Request(self.url,
                                     method="POST",
                                     body=json.dumps(query_data),
                                     meta=meta,
                                     headers=headers,
                                     callback=self.parse)
コード例 #18
0
    def parse_subject_children(self, response):
        """Parse the child subjects of a category page.

        Builds one entry per subject plus entries for its mix children; mix
        children without a name are resolved via an extra request to the
        subject API. (Removed a dead `pass` statement and commented-out code.)
        """
        subject_list = []
        result = json.loads(response.body.decode('utf-8'))
        lists = result['list']
        path = response.meta['path']
        path_id = response.meta['path_id']

        for data in lists:
            name = data['subject']
            subject_id = data['subject_id']
            new_path = path + [name]
            new_path_id = path_id + [subject_id]

            info = self.build_subject_info(subject_id, name, new_path,
                                           new_path_id, 1, 2)
            subject_list.append(info)

            if data['mix']:  ## has child activities
                for mix in data['mix']:
                    subject_mix_data = self.get_child_subject_info(
                        mix)  ## child mix info of the activity
                    if not subject_mix_data:
                        continue

                    for v in subject_mix_data:
                        child_name = v['name']
                        child_subject_id = v['subject_id']
                        if child_name:
                            subject_info = self.build_subject_info(
                                child_subject_id, child_name,
                                new_path + [child_name],
                                new_path_id + [child_subject_id])
                            subject_list.append(subject_info)
                        else:
                            ## no name: fetch the subject info from the API
                            headers = self.make_headers()
                            url = 'http://apiv3.yangkeduo.com/subject/' + str(
                                child_subject_id)
                            meta = {'path': new_path, 'path_id': new_path_id}
                            yield scrapy.Request(
                                url,
                                meta=meta,
                                callback=self.parse_subject_info,
                                headers=headers,
                                dont_filter=True,
                                errback=self.errback_httpbin)

        item = CategoryItem()
        item['cat_list'] = subject_list
        yield item
コード例 #19
0
	def parse_subject_info(self, response):
		"""Build a single subject entry from a subject-detail response."""
		data = json.loads(response.body.decode('utf-8'))
		sid = data['id']
		title = data['subject']
		entry = self.build_subject_info(
			sid, title,
			response.meta['path'] + [title],
			response.meta['path_id'] + [sid],
			1, 2)

		item = CategoryItem()
		item['cat_list'] = [entry]
		yield item
コード例 #20
0
    def parse(self, response):
        """Parse the home banner carousel.

        Each banner links either to a subject collection (followed with a new
        request to parse_subjects) or to a single subject (recorded directly);
        banners with neither query parameter are skipped.
        (Removed a dead `pass` statement and the unused `img_url` local.)
        """
        self.activity_list.clear()
        item = CategoryItem()

        result = json.loads(response.body.decode('utf-8'))
        if result['result']:
            for data in result['result']:
                title = data['title']
                url = data['link_url']

                ## split the URL into its query parameters
                url_arr = urlparse.urlparse(url)
                url_query = urlparse.parse_qs(url_arr.query)

                path = ['首页banner轮播', title]  ## banner image

                query_keys = url_query.keys()
                if 'subjects_id' in query_keys:  ## sub-page has child categories
                    subject_id = url_query['subjects_id'][0]
                    if int(subject_id) in [12, 14]:  ## skip 9.9特卖 and 品牌清仓
                        continue

                    new_url = 'http://apiv4.yangkeduo.com/subject_collection/' + str(
                        subject_id)
                    headers = self.make_headers()
                    meta = {'path': path}
                    yield scrapy.Request(new_url,
                                         meta=meta,
                                         callback=self.parse_subjects,
                                         headers=headers)  ## fetch child categories

                elif 'subject_id' in query_keys:  ## sub-page has no categories
                    subject_id = url_query['subject_id'][0]
                    info = {
                        'path': path,
                        'subject_id': subject_id,
                        'name': title,
                        'type': 1,
                        'activity_type': 1,
                        'path_id': []
                    }
                    self.activity_list.append(info)
                else:  ## cannot be crawled - skip
                    continue

            item['cat_list'] = self.activity_list
            yield item
コード例 #21
0
    def parse(self, response):
        """Parse ranked-keyword stats for one category.

        On success emits one CategoryItem per keyword row; on failure logs the
        response and retries the same request with a mall-scoped PASS_ID
        cookie and a fresh proxy.
        """
        result = json.loads(response.body.decode('utf-8'))
        logging.debug(json.dumps(result))
        meta = response.meta
        cat = meta['cat']
        day = meta['day']
        rank_type = meta['rank_type']
        cat_id = cat['cat_id']
        cat_id_1 = cat['cat_id_1']
        cat_id_2 = cat['cat_id_2']
        cat_id_3 = cat['cat_id_3']

        if 'success' in result.keys() and result['success'] and 'result' in result.keys():
            content = json.dumps({'status': 'success', 'cat': cat, "result": result['result']})
            self.save_cat_log(content)
            for keyword_item in result['result']:
                keyword_data = {'category': cat_id, 'day': day, 'rank_type': rank_type}
                keyword_data['cat_id_1'] = cat['cat_id_1']
                # Deeper category ids are attached only when present.
                if cat_id_2:
                    keyword_data['cat_id_2'] = cat_id_2
                    if cat_id_3:
                        keyword_data['cat_id_3'] = cat_id_3
                keyword_data['rank_num'] = keyword_item['rankNum']
                keyword_data['click_num'] = keyword_item['clickNum']
                keyword_data['compete_value'] = keyword_item['competeValue']
                keyword_data['ctr'] = keyword_item['ctr']
                keyword_data['cvr'] = keyword_item['cvr']
                keyword_data['impr_avg_bid'] = keyword_item['imprAvgBid']
                keyword_data['pv'] = keyword_item['pv']
                keyword_data['word'] = keyword_item['word']
                keywordItem = CategoryItem()
                keywordItem['cat_list'] = keyword_data
                yield keywordItem
        else:
            content = json.dumps({'status': 'fail', 'cat': cat, "result": result})
            self.save_cat_log(content)
            # third-level category
            # NOTE(review): presumably an authenticated retry — a mall id and
            # PASS_ID are looked up for the top-level category and the request
            # is re-sent with that cookie and a new proxy; confirm intent.
            mall_result = self.get_pass_mall_id(cat_id_1)
            mall_id, pass_id = mall_result
            if not mall_id:
                return None
            cookie = 'PASS_ID' + '=' + pass_id + ";"
            cat["mall_id"] = mall_id
            meta["proxy"] = self.get_proxy_ip(False)
            headers = self.make_headers(cookie)
            query_data = self.build_query_data(cat, meta['page'], self.size, day, rank_type)
            yield scrapy.Request(self.url, method="POST", body=json.dumps(query_data), meta=meta, headers=headers,
                                 callback=self.parse, dont_filter=True, errback=self.errback_httpbin)
            return None
コード例 #22
0
ファイル: category.py プロジェクト: hehanlin/jdDA-spider
 def parse(self, response):
     """Yield level-one categories, then recurse into their level-two ones."""
     selector = "//div[contains(@class, 'category-items')]//div[contains(@class, 'category-item')]"
     for node in response.xpath(selector):
         cate_name = node.xpath("div[@class='mt']//span/text()").get()
         # Level-one categories have no URL of their own; path is the name.
         yield CategoryItem(
             level=CATEGORY.LEVEL_ONE,
             name=cate_name,
             url='',
             path=cate_name,
             is_list=CATEGORY.LIST_NO,
             cat_id=None,
         )
         yield from self.parse_level_two_cates(node, cate_name)
コード例 #23
0
	def parse(self, response):
		"""Parse an activity's subject list (plus nested mix children).

		Emits one CategoryItem whose cat_list holds every subject found;
		nameless child subjects are resolved via curl_sub_info requests.
		"""
		meta = response.meta
		sub_list = []
		activity_type		= meta['activity_type']

		data = json.loads(response.body.decode('utf-8'))
		if data['list']:
			for subject_info in data['list']:
				name 		= subject_info['subject']
				subject_id 	= subject_info['subject_id']
				path 		= [meta['name'], name]
				path_id 	= [meta['subject_id'], subject_id]

				info  		= {'subject_id':subject_id,'path':path,'name':name,'type':1,'activity_type':activity_type,'path_id':path_id}
				sub_list.append(info)

				if subject_info['mix']: ## has child subjects
					for child_sub in subject_info['mix']:
						sub_info 	= self.get_child_subject_info(path, path_id, child_sub, activity_type) ## fetch child subject info

						if sub_info:
							for sub in sub_info:
								new_path_id 	= path_id + [int(sub['subject_id'])]
								sub['path_id'] 	= new_path_id
								sub['type']		= 1
								sub['activity_type'] = activity_type

								if sub['name']: ## has a subject name
									# Prefer the banner text for the path when
									# present; the key is dropped afterwards.
									if 'banner' in sub:
										new_path 	= path + [sub['banner']]
										del sub['banner']
									else:
										new_path	= path + [sub['name']]

									sub['path']		= new_path
									
									sub_list.append(sub)
								else: ## no name: pull subject info from the API to get it
									sub['path']		= path
									url 	=	'http://apiv3.yangkeduo.com/subject/'+str(sub['subject_id'])
									yield scrapy.Request(url, meta={'sub_info':sub}, callback=self.curl_sub_info, headers=self.make_headers())

				
			item = CategoryItem()
			item['cat_list'] 	=	sub_list
			yield item
コード例 #24
0
ファイル: category.py プロジェクト: hehanlin/jdDA-spider
 def parse_level_two_cates(self, level_one_cate, level_one_name):
     """
     Parse out the level-two categories.
     :param level_one_cate: html node of the level-one category
     :param level_one_name: name of the level-one category
     :return: generator of CategoryItem (plus nested level-three items)
     """
     level_two_cates = level_one_cate.xpath("dl[@class='mc']/div[@class='items']/dl" if False else "div[@class='mc']/div[@class='items']/dl")
     for each in level_two_cates:
         level = CATEGORY.LEVEL_TWO
         # .get() returns None when the node is missing; coalesce to '' before
         # strip() so a <dt> without an <a> falls through to the text fallback
         # instead of raising AttributeError.
         name = (each.xpath("dt/a/text()").get() or '').strip()
         if not name:
             name = (each.xpath("dt//text()").get() or '').strip()
         url = each.xpath("dt/a/@href").get()
         path = self.generate_path([level_one_name, name])
         is_list = CATEGORY.LIST_NO
         yield CategoryItem(
             level=level, name=name, url=url, path=path, is_list=is_list, cat_id=None
         )
         for three_each in self.parse_level_three_cates(each, level_one_name, name):
             yield three_each
コード例 #25
0
    def parse(self, response):
        """Parse a keyword-rank API response and yield one CategoryItem per keyword.

        Expects ``response.meta`` to carry ``cat`` (with ``cat_id``, ``level``
        and the per-level ``cat_id_N`` fields), ``day`` and ``rank_type``.
        Returns nothing when the API reports no payload.
        """
        result = json.loads(response.body.decode('utf-8'))
        logging.debug(json.dumps(result))
        # errorCode == 1000 is the success code for this endpoint.
        if result.get('errorCode') == 1000:
            if result['result'] is None:
                return None
            meta = response.meta
            cat = meta['cat']
            cat_id = cat['cat_id']
            level = cat['level']
            day = meta['day']
            rank_type = meta['rank_type']

            for keyword_item in result['result']:
                keyword_data = {
                    'category': cat_id,
                    'day': day,
                    'rank_type': rank_type,
                }
                keyword_data['cat_id_1'] = cat['cat_id_1']

                # Deeper levels also carry the intermediate category ids.
                if level == 2 or level == 3:
                    keyword_data['cat_id_2'] = cat['cat_id_2']
                    if level == 3:
                        keyword_data['cat_id_3'] = cat['cat_id_3']

                keyword_data['rank_num'] = keyword_item['rankNum']
                keyword_data['click_num'] = keyword_item['clickNum']
                keyword_data['compete_value'] = keyword_item['competeValue']
                keyword_data['ctr'] = keyword_item['ctr']
                keyword_data['cvr'] = keyword_item['cvr']
                keyword_data['impr_avg_bid'] = keyword_item['imprAvgBid']
                keyword_data['pv'] = keyword_item['pv']
                keyword_data['word'] = keyword_item['word']

                keyword_entry = CategoryItem()
                keyword_entry['cat_list'] = keyword_data
                yield keyword_entry
コード例 #26
0
ファイル: category.py プロジェクト: hehanlin/jdDA-spider
 def parse_level_three_cates(self, level_two_cate, level_one_name, level_two_name):
     """
     Extract the level-3 categories from one level-2 category node.

     :param level_two_cate: html node of the level-2 category
     :param level_one_name: name of the level-1 category
     :param level_two_name: name of the level-2 category
     :return: generator of CategoryItem
     """
     for anchor in level_two_cate.xpath("dd/a"):
         cate_name = anchor.xpath("text()").get()
         cate_url = anchor.xpath("@href").get()
         cate_path = self.generate_path([level_one_name, level_two_name, cate_name])
         # A list.jd.com URL marks a listable category; group(1) is its cat id.
         matched = match(r"/{0,2}list\.jd\.com/list.html\?cat=(\d+,\d+,\d+)?(.*)", cate_url)
         if matched is None:
             listable, cate_id = CATEGORY.LIST_NO, None
         else:
             listable, cate_id = CATEGORY.LIST_YES, matched.group(1)
         yield CategoryItem(
             level=CATEGORY.LEVEL_THREE,
             name=cate_name,
             url=cate_url,
             path=cate_path,
             is_list=listable,
             cat_id=cate_id,
             hot=0,
         )
コード例 #27
0
    def parse_subject_banner(self, response):
        """Parse a home-page carousel (banner) landing page into subject entries.

        Expects ``response.meta`` to carry ``path``, ``path_id`` and
        ``url_type`` (set by the home-page parser).  The page embeds a JSON
        state blob matching ``{"store"..."ssr":true}``; subject ids are scraped
        out of it with regexes, branching on ``url_type``.  Yields one
        CategoryItem whose ``cat_list`` holds every subject found.
        """
        subject_list = []
        body = response.body.decode('utf-8')
        # State blob embedded in the page HTML; absent when the page did not render.
        result = re.search(r'{"store".*?"ssr":true}', body)
        path = response.meta['path']
        path_id = response.meta['path_id']
        url_type = response.meta["url_type"]
        item = CategoryItem()
        if result:
            result_str = result.group()

            if url_type in [1, 2]:
                # Single-subject banner types: record one synthetic entry, id 0.
                name = '轮播活动' + str(1) + str(url_type)
                subject_id = 0
                subject_info = self.build_subject_info(subject_id, name, path + [name], path_id + [subject_id], 61, 7,
                                                       url_type, 1)
                subject_list.append(subject_info)
                self.save_log(json.dumps({"subject_info_banner" + str(url_type): subject_info}))

            if url_type in [4, 6, 12]:
                # Multi-subject banner types: ids appear as {"id":N,"DIYGoodsIDs"...},
                # numeric or quoted depending on the page build.
                f = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
                if not f:
                    f = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
                subject_id_list = re.findall(r"\d+", str(f))
                a = 1
                for subject_id in subject_id_list:
                    name = '轮播活动' + str(2) + str(a)
                    subject_info = self.build_subject_info(subject_id, name, path + [name], path_id + [subject_id], 62,
                                                           7, url_type, a)
                    a += 1
                    subject_list.append(subject_info)
                    self.save_log(json.dumps({"subject_info_banner_" + str(url_type): subject_info}))

            if url_type == 17:  # login endpoint — intentionally skipped
                pass

            if url_type == 20:
                name_list = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
                if not name_list:
                    name_list = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
                # Deduplicate the numeric ids scraped from the matched fragments.
                id_list = re.findall(r'\d+', str(name_list))
                if id_list:
                    id_list = list(set(id_list))
                else:
                    id_list = []
                if len(id_list) > 1:
                    # Several distinct subjects: one entry per id.
                    a = 0
                    for id in id_list:
                        a += 1
                        name = "轮播活动" + str(3) + str(a)
                        subject_info = self.build_subject_info(id, name, path + [name], path_id + [id], 63, 7, url_type,
                                                               a)
                        subject_list.append(subject_info)
                        self.save_log(json.dumps({"subject_info_banner_" + str(url_type): subject_info}))

                else:
                    # Zero or one distinct id: fall back to the DIYGoodsIDs field and
                    # emit a single combined entry with a comma-joined goods string.
                    f = re.findall(r'"DIYGoodsIDs":".*?"', result_str)
                    subject_id_list = re.findall(r"\d+", str(f))
                    subject_str = ''
                    if len(id_list) == 1:
                        subject_id = id_list[0]
                    else:
                        # NOTE(review): raises IndexError when no ids matched at all —
                        # presumably never happens on url_type 20 pages; confirm.
                        subject_id = subject_id_list[0]
                    for i in subject_id_list:
                        i = i + ','
                        subject_str += i
                    name = "轮播活动" + str(4) + str(1)
                    subject_info = self.build_subject_20_info(subject_id, name, path + [name], path_id + [subject_id],
                                                              subject_str, 64, 7, url_type)
                    subject_list.append(subject_info)
                    self.save_log(json.dumps({"subject_info_banner_id_" + str(url_type): subject_info}))

            item['cat_list'] = subject_list
            yield item
コード例 #28
0
    def parse(self, response):
        """Parse the home page: follow carousel banners and yield home subjects.

        Yields a ``scrapy.Request`` (handled by ``parse_subject_banner``) for
        each carousel banner with a resolvable subject, plus one CategoryItem
        holding the subjects scraped from the cross-slide goods list.
        """
        body = response.body.decode("utf-8")
        # The embedded JSON state blob varies between page builds; try the
        # known shapes in order (same order as before).
        result = None
        for blob_pattern in (r'{"pageProps".*?null}}',
                             r'{"props".*?206]}',
                             r'{"props".*?355]}',
                             r'{"props".*?344]}'):
            result = re.search(blob_pattern, body)
            if result:
                break
        if not result:
            # Page layout changed and no blob matched — the original crashed
            # here on result.group(); bail out instead.
            return
        result = json.loads(result.group())

        # 1) Carousel (banner) activities -> follow each into parse_subject_banner.
        banner = self.dict_get(result, 'carouselData', None)
        # dict_get may return None; `len(banner) > 0` crashed in that case.
        if banner:
            path = ['首页滚动banner']
            for entry in banner:
                title = entry["title"]
                if not title:
                    title = '商品'
                end_url = entry["forwardURL"]
                subject = self.get_subject_type_id(end_url)
                if not subject:
                    continue
                subject_id = subject["subject_id"]
                url_type = subject["type"]  # renamed: `type` shadowed the builtin
                headers = self.make_headers()
                new_url = "https://mobile.yangkeduo.com/" + end_url
                meta = {'path': path + [title], 'path_id': [subject_id], 'url_type': url_type}
                yield scrapy.Request(new_url, meta=meta, callback=self.parse_subject_banner, headers=headers,
                                     dont_filter=True, errback=self.errback_httpbin)

        # 2) Activities embedded in the home cross-slide goods list.
        list_activity = self.dict_get(result, 'crossSlideList', None)
        if list_activity:
            subject_list = []
            subject_info = self.build_subject_info(71, "首页商品", "首页商品", [71], 71, 1)
            subject_list.append(subject_info)
            a = 0
            for slide in list_activity:
                subject_list_id = re.findall(r"'brand_id': '\d+'", str(slide))
                subject_id_list = list(set(subject_list_id))
                a += 1
                if not subject_id_list:
                    continue
                if len(subject_id_list) == 1:
                    # Exactly one brand id: the slide itself names the subject.
                    name = self.dict_get(slide, "subject", None)
                    subject_id = self.dict_get(slide, "subject_id", None)
                    subject_info = self.build_subject_info(subject_id, name, name, [subject_id], 72, 1, a)
                    subject_list.append(subject_info)
                    logging.debug(json.dumps({'subject_info_home_1': subject_info}))
                    self.save_log(json.dumps({"subject_info_home_1": subject_info}))
                else:
                    # Several brands: one entry per child subject.  (The original
                    # reused `i` for both loops, shadowing the outer variable.)
                    b = 0
                    for child in slide["subject_list"]:
                        b += 1
                        subject_id = child['p_rec']["brand_id"]
                        if not subject_id:
                            subject_id = 0
                        name = child["name"]
                        if not name:
                            name = '商品' + str(b)
                        subject_info = self.build_subject_info(subject_id, name, name, [subject_id], 72, 1, a,
                                                               b)
                        subject_list.append(subject_info)
                        logging.debug(json.dumps({'subject_info_home_2': subject_info}))
                        self.save_log(json.dumps({"subject_info_home_2": subject_info}))
            item = CategoryItem()
            item['cat_list'] = subject_list
            yield item
コード例 #29
0
    def parse(self, response):
        """Parse a keyword-rank API response; retry rejected requests with a
        fresh PASS_ID, otherwise yield one CategoryItem per keyword row.

        ``response.meta`` must carry ``cat`` (with ``cat_id_1/2/3``, ``level``),
        ``mall_id``, ``pass_id``, ``day``, ``rank_type`` and ``page``.
        """
        result = json.loads(response.body.decode('utf-8'))
        logging.debug(json.dumps(result))
        meta = response.meta
        cat = meta['cat']
        mall_id = meta["mall_id"]
        pass_id = meta["pass_id"]
        day = meta['day']
        rank_type = meta['rank_type']
        cat_id_1 = cat['cat_id_1']
        cat_id_2 = cat['cat_id_2']
        cat_id_3 = cat['cat_id_3']
        level = cat['level']
        if 'errorCode' in result.keys():
            # errorCode 9 / 1000: the credential was rejected — log the failure
            # and re-issue the same query with another pass_id/mall_id.
            if result['errorCode'] == 9 or result['errorCode'] == 1000:
                # NOTE(review): "leval" here (and "seccuss" below) are typos baked
                # into the emitted log payload; fix together with log consumers.
                content = json.dumps({
                    "mall_id": mall_id,
                    "pass_id": pass_id,
                    "status": "fail",
                    "leval": level,
                    "day": day,
                    "rank_type": rank_type,
                    "cat_id_1": cat_id_1,
                    "cat_id_2": cat_id_2,
                    "cat_id_3": cat_id_3,
                    "result": result['errorCode']
                }) + ","
                self.save_mall_log(content)
                # Level-1 category (no cat_id_2).
                # NOTE(review): this branch does not return after yielding, so when
                # cat_id_2 is empty the level-2 and level-3 blocks below also run,
                # issuing duplicate retry requests — confirm whether intended.
                if not cat_id_2:
                    mall_result = self.get_pass_mall_id(cat_id_1)
                    mall_id, pass_id = mall_result
                    if not mall_id:
                        # No credential available for this category: give up.
                        self.fail_count += 1
                        content = json.dumps({
                            "cat_id": cat_id_1,
                            "fail_reason": "当前分类没有pass_id",
                            "fail_count": self.fail_count
                        })
                        self.save_mall_log(content)
                        return None
                    cat["mall_id"] = mall_id
                    cookie = 'PASS_ID' + '=' + pass_id + ";"
                    headers = self.make_headers(cookie)
                    query_data = self.build_query_data(cat, meta['page'],
                                                       self.size, day,
                                                       rank_type)
                    yield scrapy.Request(self.url,
                                         method="POST",
                                         body=json.dumps(query_data),
                                         meta=meta,
                                         headers=headers,
                                         callback=self.parse)

                # Level-2 category (no cat_id_3) — same retry with a new credential.
                if not cat_id_3:
                    mall_result = self.get_pass_mall_id(cat_id_1)
                    mall_id, pass_id = mall_result
                    if not mall_id:
                        self.fail_count += 1
                        content = json.dumps({
                            "cat_id": cat_id_1,
                            "fail_reason": "当前分类没有pass_id",
                            "fail_count": self.fail_count
                        })
                        self.save_mall_log(content)
                        return None
                    cat["mall_id"] = mall_id
                    cookie = 'PASS_ID' + '=' + pass_id + ";"
                    headers = self.make_headers(cookie)
                    query_data = self.build_query_data(cat, meta['page'],
                                                       self.size, day,
                                                       rank_type)
                    yield scrapy.Request(self.url,
                                         method="POST",
                                         body=json.dumps(query_data),
                                         meta=meta,
                                         headers=headers,
                                         callback=self.parse)

                # Level-3 category — same retry, then stop this callback.
                mall_result = self.get_pass_mall_id(cat_id_1)
                mall_id, pass_id = mall_result
                if not mall_id:
                    self.fail_count += 1
                    content = json.dumps({
                        "cat_id": cat_id_1,
                        "fail_reason": "当前分类没有pass_id",
                        "fail_count": self.fail_count
                    })
                    self.save_mall_log(content)
                    return None
                cookie = 'PASS_ID' + '=' + pass_id + ";"
                cat["mall_id"] = mall_id
                headers = self.make_headers(cookie)
                query_data = self.build_query_data(cat, meta['page'],
                                                   self.size, day, rank_type)
                yield scrapy.Request(self.url,
                                     method="POST",
                                     body=json.dumps(query_data),
                                     meta=meta,
                                     headers=headers,
                                     callback=self.parse)
                return None

        # Success path: a 'result' payload with one dict per keyword.
        if 'result' in result.keys():
            self.success_count += 1
            content = json.dumps({
                "pass_id": pass_id,
                "leval": level,
                "day": day,
                "rank_type": rank_type,
                "cat_id_1": cat_id_1,
                "cat_id_2": cat_id_2,
                "cat_id_3": cat_id_3,
                "seccuss": self.success_count,
                "goods_count": len(result['result'])
            }) + ","
            self.save_mall_log(content)
            for keyword_item in result['result']:
                keyword_data = {
                    'category': cat_id_1,
                    'day': day,
                    'rank_type': rank_type
                }
                keyword_data['cat_id_1'] = cat['cat_id_1']
                # Deeper levels also carry the intermediate category ids.
                if cat_id_2:
                    keyword_data['cat_id_2'] = cat_id_2
                    if cat_id_3:
                        keyword_data['cat_id_3'] = cat_id_3
                keyword_data['rank_num'] = keyword_item['rankNum']
                keyword_data['click_num'] = keyword_item['clickNum']
                keyword_data['compete_value'] = keyword_item['competeValue']
                keyword_data['ctr'] = keyword_item['ctr']
                keyword_data['cvr'] = keyword_item['cvr']
                keyword_data['impr_avg_bid'] = keyword_item['imprAvgBid']
                keyword_data['pv'] = keyword_item['pv']
                keyword_data['word'] = keyword_item['word']
                keywordItem = CategoryItem()
                keywordItem['cat_list'] = keyword_data
                yield keywordItem
コード例 #30
0
    def parse_subject_banner(self, response):
        """Parse a carousel (banner) landing page into subject entries.

        Variant that takes the page title from the embedded store state
        (``store.pageTitle``) instead of synthesizing names.  Expects
        ``response.meta`` to carry ``path``, ``path_id`` and ``url_type``.
        Yields one CategoryItem whose ``cat_list`` holds every subject found.
        """
        subject_list = []
        body = response.body.decode('utf-8')
        logging.debug(json.dumps({'body': body}))
        # State blob embedded in the page HTML; absent when the page did not render.
        result = re.search(r'{"store".*?"ssr":true}', body)
        path = response.meta['path']
        path_id = response.meta['path_id']
        url_type = response.meta["url_type"]
        item = CategoryItem()
        if result:
            result_dict = json.loads(result.group())
            result_str = result.group()
            # Page title used as the subject name for all branches below;
            # empty string when the blob lacks store.pageTitle.
            try:
                name = result_dict["store"]["pageTitle"]
            except Exception:
                name = ''
            if url_type in [1, 2]:
                # NOTE(review): assigns the whole path_id *list* as the subject id
                # (and then appends it to itself below) — looks unintended compared
                # with the sibling variant that uses 0 here; confirm against
                # build_subject_info before relying on this.
                subject_id = path_id
                subject_info = self.build_subject_info(subject_id, name, path + [name], path_id + [subject_id], 61, url_type)
                subject_list.append(subject_info)
                self.save_log(json.dumps({"subject_info_banner" + str(url_type): subject_info}))

            if url_type in [4, 6, 12]:
                # Multi-subject banner types: ids appear as {"id":N,"DIYGoodsIDs"...},
                # numeric or quoted depending on the page build.
                f = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
                if not f:
                    f = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
                subject_id_list = re.findall(r"\d+", str(f))
                a = 1
                for subject_id in subject_id_list:
                    subject_info = self.build_subject_info(subject_id, name, path + [name], path_id + [subject_id], 62, url_type, a)
                    a += 1
                    subject_list.append(subject_info)
                    self.save_log(json.dumps({"subject_info_banner_" + str(url_type): subject_info}))

            if url_type == 17:  # endpoint undetermined — intentionally skipped
                pass

            if url_type == 20:
                name_list = re.findall(r'"id":\d+,"DIYGoodsIDs"', result_str)
                if not name_list:
                    name_list = re.findall(r'"id":"\d+","DIYGoodsIDs"', result_str)
                # Deduplicate the numeric ids scraped from the matched fragments.
                id_list = re.findall(r'\d+', str(name_list))
                if id_list:
                    id_list = list(set(id_list))
                else:
                    id_list = []
                if len(id_list) > 1:
                    # Several distinct subjects: one entry per id.
                    a = 0
                    for id in id_list:
                        a += 1
                        name = "商品" + str(a)
                        subject_info = self.build_subject_info(id, name, path + [name], path_id + [id], 63, url_type, a)
                        subject_list.append(subject_info)
                        self.save_log(json.dumps({"subject_info_banner_" + str(url_type): subject_info}))

                else:
                    # Zero or one distinct id: fall back to the DIYGoodsIDs field and
                    # emit a single combined entry with a comma-joined goods string.
                    f = re.findall(r'"DIYGoodsIDs":".*?"', result_str)
                    subject_id_list = re.findall(r"\d+", str(f))
                    subject_str = ''
                    if len(id_list) == 1:
                        subject_id = id_list[0]
                    else:
                        # NOTE(review): raises IndexError when no ids matched at all —
                        # presumably never happens on url_type 20 pages; confirm.
                        subject_id = subject_id_list[0]
                    for i in subject_id_list:
                        i = i + ','
                        subject_str += i
                    subject_info = self.build_subject_20_info(subject_id, name, path + [name], path_id + [subject_id], subject_str, 64, url_type)
                    subject_list.append(subject_info)
                    self.save_log(json.dumps({"subject_info_banner_id_" + str(url_type): subject_info}))

            item['cat_list'] = subject_list
            yield item