def parse_item(self, response):
    """Build a ``MaiGooItem`` from a chinasspp.com brand detail page.

    Most values come from the ``brand_info_ctl00_blink`` list on the page.
    Fields this page does not provide are emitted as empty strings so the
    item always carries the same keys.

    Args:
        response: Detail page response. ``response.meta`` may carry
            ``region`` (lookup key handed to ``get_region_name_id``) and
            ``up_response`` (stored, unquoted, as ``websiteurl_info``).

    Yields:
        The populated ``MaiGooItem``.
    """
    print(star, '进入 parse_item')
    site = 'http://www.chinasspp.com'
    info = '//ul[@id="brand_info_ctl00_blink"]'

    def info_value(path):
        # First match under the brand-info list, or None when absent.
        return response.xpath(info + path).get()

    items = MaiGooItem()
    items['url'] = unquote(response.url)
    items['mongo_id'] = json.dumps(response.text)
    items['spider_url'] = site

    # Industry category, e.g. "a-b"; reversed before the lookup.
    fid = info_value('/li[2]/text()')
    fid_id_fid_name = (
        get_trademard_fid_id_fid_name(fid.split('-')[::-1]) if fid else {}
    )
    items['fid'] = fid_id_fid_name.get('fid_id', '')  # industry category id
    items['fid_name'] = fid_id_fid_name.get('fid_name', '')  # category name
    items['title'] = info_value('/li[1]/text()')  # title
    items['thumb'] = response.xpath('//p[@class="logo"]/img/@src').get()

    region_item = get_region_name_id(key=response.meta.get('region', ''))
    items['region'] = region_item.get('get_region_id', '')  # province id
    items['region_name'] = region_item.get('get_region_name', '')  # province

    items['address'] = ''  # brand place of origin
    items['foundtime'] = ''  # brand founding time
    items['corporation'] = ''  # legal representative

    # The phone number is published as an image; store its absolute URL.
    # Without this guard a missing image produced ".../www.chinasspp.comNone".
    phone_src = info_value('/li[4]/img/@src')
    items['telphone'] = (
        'http://www.chinasspp.com{}'.format(phone_src) if phone_src else ''
    )

    items['companyname'] = info_value('/li[3]/text()')  # company name
    # The same anchor feeds both the official site and the company link.
    company_href = info_value('/li[6]/a/@href')
    items['websiteurl'] = company_href  # official website
    items['content'] = response.xpath('//div[@class="r r_about"]').get()
    items['slideshow'] = response.xpath(
        '//div[@id="brand_info_ctl00_r_banner"]/ul/li/img/@src').get()

    items['bid'] = ''  # brand level
    items['grade'] = ''  # brand score
    items['scope'] = ''  # business scope
    items['idenid'] = 1  # certification tag, always 1
    items['email'] = ''  # email
    items['index_item'] = ''  # ranking
    items['classify_name'] = ''  # classification name

    items['corporation_link'] = company_href  # company introduction link
    items['websiteurl_info'] = unquote(
        response.meta.get('up_response', ''))  # company URL
    items['user_comment'] = ''  # brand comments
    items['bid_name'] = ''  # brand level name
    items['update_biaoshi'] = 10
    items['company_address'] = info_value('/li[6]/text()')
    yield items
def parse_item(self, response):
    """Build a ``ZhaoShanItem`` from an anxjm.com franchise detail page.

    The page body is cut at every ``<h2>`` into sections; sections are
    mapped by position to detail / fee analysis / conditions / advantage /
    process, and a missing section becomes an empty string. List values
    are comma-joined before being copied onto the item.

    Args:
        response: Detail page response.

    Yields:
        The populated ``ZhaoShanItem``. Nothing is yielded when a
        ``ValueError`` is raised while parsing; the error is printed.
    """
    site = 'https://www.anxjm.com'

    def first(path):
        return response.xpath(path).get()

    def absolute(path):
        # Image sources on this site are root-relative.
        return [site + src for src in response.xpath(path).extract()]

    try:
        # The same cell supplies both the province name and the origin.
        origin = first('//ul[@class="basic_others"]/li[2]/span/text()')

        # Split the first body block into "<h2>..." sections. A page with
        # no body block used to raise IndexError here, which the
        # ValueError handler below does not catch.
        bodies = response.xpath('//div[@class="body_tit"]').extract()
        sections = (
            ['<h2>' + part for part in bodies[0].split('<h2>')[1:]]
            if bodies else []
        )

        def section(index):
            return sections[index] if len(sections) > index else ''

        data = {}
        data['title'] = first(
            '//div[@class="context_title"]/h1/text()')  # franchise name
        data['spider_url'] = 'https://www.anxjm.com/'  # crawled site
        data['region_name'] = origin  # province name
        data['region'] = get_region_name_id(origin).get(
            'get_region_id')  # province id
        data['companyname'] = first(
            '//li[@class="fline"][1]/span/text()')  # company name
        # Last two breadcrumb entries, innermost first.
        data['fid_name'] = response.xpath(
            '//div[@class="path"]//a/text()').extract()[::-1][:2]
        data['fid'] = self.get_trademard_fid_id_fid_name(
            data['fid_name']).get('fid_id')  # category id
        data['source'] = origin  # brand origin / company address
        data['shopnum'] = first(
            '//li[@class="fline"][3]/span/text()')  # number of stores
        data['slideshow'] = absolute(
            '//div[@class="img-block"]/img/@src')  # product logo images
        data['contents'] = sections  # all franchise sections
        data['advantage'] = section(3)  # franchise advantages
        data['content'] = section(0)  # franchise details
        data['manage'] = first(
            '//li[@class="clear"][2]/strong/text()')  # business scope
        data['process'] = section(4)  # franchise process
        data['analysis'] = section(1)  # franchise fee analysis
        data['conditions'] = section(2)  # franchise conditions
        data['trademark_address'] = first(
            '//li[@class="clear"][3]/strong/text()')  # company address
        data['trademark_throng'] = first(
            '//li[@class="clear"][1]/span/text()')  # target audience
        data['mongo_id'] = response.text
        # Keep only the part before the "万" unit of each price.
        data['feeid'] = alter_price([
            money.split('万')[0]
            for money in response.xpath(
                '//span[@class="money"]/text()').extract()
        ])  # franchise fee
        data['pattern'] = first(
            '//li[@class="clear"][2]/span/text()')  # business model
        data['jiamengnum'] = random.randint(1, 1000)  # franchisee count
        data['thumb'] = absolute(
            '//div[@class="bd"]/ul//li/img/@src')  # carousel images
        data['url'] = response.url  # page URL

        flattened = {
            key: ','.join(value) if isinstance(value, list) else value
            for key, value in data.items()
        }
        zhaoshang_result = ZhaoShanItem()
        zhaoshang_result.update(flattened)
        yield zhaoshang_result
    except ValueError as e:
        print(e)
def parse_items(self, response):
    """Build a ``ZhaoShanItem`` from a 91jmw.com franchise detail page.

    Values are gathered in a plain dict, then cleaned in one pass: empty
    values are dropped, lists are comma-joined, and a ``slideshow`` URL
    containing ``pro_91`` is blanked.

    Args:
        response: Detail page response.

    Yields:
        The populated ``ZhaoShanItem`` (only keys with non-empty values).
    """
    zhaoshan_items = ZhaoShanItem()
    project_info = '*//div[@class="project-info"][1]'
    company_info = '*//div[@class="prd-item"][3]'
    steps = '//div[@class="jiameng-d-step cont-jiameng "]'

    def text_of(path):
        # Whole text content of the first match, '' when absent.
        return response.xpath(path).xpath('string(.)').get(default='')

    origin_words = text_of(project_info + '/p[5]').split()

    items = {}
    items['title'] = response.xpath(
        '//div[@class="project-name"]/h2/text()').re('【(.*?)】')  # brand name
    items['spider_url'] = 'https://www.91jmw.com'  # crawled site
    items['region_name'] = origin_words[-1:]  # province name
    # NOTE(review): the lookup receives the one-element list, as before.
    items['region'] = get_region_name_id(
        items['region_name']).get('get_region_id')  # province id
    items['companyname'] = text_of(
        company_info + '/p[1]').split()[-1:]  # company name
    items['fid_name'] = [
        word.replace('>', '')
        for word in text_of(project_info + '/p[4]').split()[2:]
    ]  # category names
    items['source'] = origin_words[2:]  # brand origin / company address
    items['content'] = response.xpath(
        steps + '/div[1]').get(default='')  # franchise details
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    items['shopnum'] = response.xpath(project_info + '/p[2]').xpath(
        'string(.)').re(r'(\d+)家')  # number of stores
    items['advantage'] = response.xpath(
        steps + '/div[3]').get(default='')  # franchise advantages
    items['slideshow'] = response.xpath(
        '//img[@class="lazyload"]/@data-src').get(default='')  # logo image
    items['trademark_cp'] = text_of(project_info + '/p[3]')  # products
    items['trademark_address'] = text_of(
        company_info + '/p[3]').split()[-1:]  # company address
    items['trademark_throng'] = text_of(
        project_info + '/p[7]').split()[2:]  # target audience
    items['mongo_id'] = response.text
    items['feeid'] = [
        alter_price(price)
        for price in response.xpath('//span[@class="fl p-price"]').xpath(
            'string(.)').re('¥(.*?)万')
    ]  # franchise fee
    items['thumb'] = response.xpath(
        '//ul[@class="scroll_pic cls"]/li//img/@src').getall()  # carousel
    items['url'] = response.url  # page URL

    cleaned = {}
    for key, value in items.items():
        if not value:
            continue
        if isinstance(value, list):
            value = ','.join(value)
        # Short-circuit on the key: the previous all([...]) evaluated
        # "'pro_91' in value" for every field and raised TypeError on
        # values that do not support "in".
        if key == 'slideshow' and 'pro_91' in value:
            value = ''
        cleaned[key] = value

    zhaoshan_items.update(cleaned)
    yield zhaoshan_items
def parse_navigation_response(self, response):
    """Build a ``MaiGooItem`` from a brand detail page.

    Args:
        response: Detail page response. ``response.meta`` may carry
            ``classify``, stored as the brand level (default ``'None'``).

    Returns:
        The populated ``MaiGooItem``.

    Raises:
        AttributeError: If the origin span, the founding-time span or the
            logo link is missing from the page.
        IndexError: If one of those values lacks its ``:`` / ``=``
            separator.
    """
    items = MaiGooItem()

    def first(path):
        return response.xpath(path).get()

    # Breadcrumb entries 2 and 3 identify the industry category.
    fid_id_fid_name = get_trademard_fid_id_fid_name(
        response.xpath('//div[@class="position"]/a').xpath(
            'string(.)').getall()[1:3])
    items['fid'] = fid_id_fid_name.get('fid_id')  # industry category id
    items['title'] = response.xpath(
        '//span[@class="font22 line18em b"]').xpath('string(.)').get()
    items['thumb'] = first('//div[@class="img"]/a/img/@src')  # thumbnail

    # "label:value" span; the value is the brand's place of origin and its
    # first two characters are the key for the province lookup.
    origin = first('//li[@class="dhidden"]/span[1]/text()').split(':')[1]
    items['address'] = origin  # brand place of origin
    get_region = get_region_name_id(key=origin[:2])
    items['region'] = get_region.get('get_region_id')  # province id

    items['foundtime'] = first(
        '//li[@class="dhidden"]/span[2]/text()').split(':')[1]
    items['corporation'] = first(
        '//span[@class="mgl"]/a/text()')  # legal representative
    items['telphone'] = ','.join(
        response.xpath('//div[@class="info"]').re('i>(.*?)</span'))

    # Strip the parentheses only when a name was found, matching
    # parse_response_item; previously a missing name raised AttributeError.
    company_name = response.xpath('//a[@class="font16"]').xpath(
        'string(.)').get()
    items['companyname'] = (
        company_name.replace('(', '').replace(')', '')
        if company_name else company_name
    )

    # The logo link carries the official site as a quoted query value.
    items['websiteurl'] = unquote(
        first('//div[@class="img"]/a/@href').split('=')[1])
    items['content'] = first('//div[@class="desc"]')  # description
    items['slideshow'] = first('//div[@class="img big"]/img/@src')
    items['bid'] = response.meta.get('classify', 'None')  # brand level
    items['grade'] = random.randint(1, 5)  # random brand score, 1-5
    items['company_email'] = first(
        '//i[contains(@class,"icon-email")]/@ptitle')
    items['mongo_id'] = json.dumps(response.text)
    items['corporation_link'] = first(
        '//span[@class="mgl"]/a/@href')  # legal representative link
    items['websiteurl_info'] = first('//a[@class="font16"]/@href')
    items['url'] = response.url
    return items
def parse_response_item(self, response):
    """Parse the required data from a brand detail page.

    Fills a ``MaiGooItem`` and hands it, through the request ``meta``, to
    ``self.parse_item_comment`` via a request for the first comment page.

    Args:
        response: Detail page response. ``response.meta`` may carry
            ``classify_id`` and ``index_item``.

    Yields:
        A ``scrapy.Request`` for page 1 of the brand's comments. Nothing
        is yielded when any exception is raised; the error is printed.
    """
    items = MaiGooItem()
    try:
        breadcrumb = response.xpath('//div[@class="position"]/a').xpath(
            'string(.)').getall()
        fid_id_fid_name = get_trademard_fid_id_fid_name(breadcrumb[1:3])
        items['fid'] = fid_id_fid_name.get('fid_id')  # industry category id
        items['scope'] = fid_id_fid_name.get('fid_name')  # category name

        title_node = response.xpath('//span[@class="font22 line18em b"]')
        items['title'] = title_node.xpath('string(.)').get()  # title
        items['thumb'] = response.xpath(
            '//div[@class="img"]/a/img/@src').get()  # thumbnail

        # "label:value" span holding the brand's place of origin.
        origin = response.xpath(
            '//li[@class="dhidden"]/span[1]/text()').get().split(':')[1]
        items['address'] = origin
        get_region = get_region_name_id(key=origin[:2])
        items['region'] = get_region.get('get_region_id')  # province id
        region_name = get_region.get('get_region_name')  # not stored

        founded = response.xpath(
            '//li[@class="dhidden"]/span[2]/text()').get()
        items['foundtime'] = founded.split(':')[1]  # brand founding time
        items['corporation'] = response.xpath(
            '//span[@class="mgl"]/a/text()').get()  # legal representative
        phones = response.xpath('//div[@class="info"]').re('i>(.*?)</span')
        items['telphone'] = ','.join(phones)  # landline

        company_name = response.xpath('//a[@class="font16"]').xpath(
            'string(.)').get()
        if company_name:
            company_name = company_name.replace('(', '').replace(')', '')
        items['companyname'] = company_name  # company name

        logo_href = response.xpath('//div[@class="img"]/a/@href').get()
        items['websiteurl'] = unquote(logo_href.split('=')[1])  # website
        items['content'] = response.xpath('//div[@class="desc"]').get()
        items['slideshow'] = ','.join(response.xpath(
            '//div[@class="img big"]/img/@src').getall())  # carousel

        items['bid'] = self.classify_dict_self.get(
            self.classify_dict.get(
                response.meta.get('classify_id', 'None')))  # brand level
        items['bid_name'] = self.classify_dict.get(
            response.meta.get('classify_id', 'None'))  # brand level name
        items['grade'] = random.randint(1, 5)  # random brand score, 1-5
        items['email'] = response.xpath(
            '//i[contains(@class,"icon-email")]/@ptitle').get()

        items['corporation_link'] = response.xpath(
            '//span[@class="mgl"]/a/@href').get()  # legal representative link
        items['websiteurl_info'] = response.xpath(
            '//a[@class="font16"]/@href').get()
        items['mongo_id'] = json.dumps(response.text)
        items['url'] = response.url
        items['index_item'] = response.meta.get('index_item', '')
        items['user_comment'] = []  # comments

        comment_id = response.xpath(
            '//div[@class="brandud"]/@brandid').get()
        comment_url = self.comment_link.format(
            comment_id=comment_id, page=1)
        request_meta = {'pp_items': items, 'comment_id': comment_id}
        yield scrapy.Request(
            url=comment_url,
            callback=self.parse_item_comment,
            dont_filter=False,
            meta=request_meta,
        )
    except Exception as e:
        print(e)
def parse_item(self, response):
    """Build a ``ZhaoShanItem`` from a so.jiameng.com franchise page.

    Args:
        response: Detail page response. ``response.meta`` may carry
            ``company_create``, which takes precedence over the founding
            time scraped from the page.

    Returns:
        The populated ``ZhaoShanItem``.

    Raises:
        IndexError: If the page title has no bracketed brand name.
        AttributeError: If the category breadcrumb is missing, or the fee
            is found neither in the info table nor in the price element.
    """
    items = ZhaoShanItem()
    response_meta = response.meta

    def info_cell(row, col):
        # Text nodes of one cell of the brand info table.
        return response.xpath(
            '//div[@class="jm_info"]//tr[{}]/td[{}]/text()'.format(row, col))

    def joined(path):
        return ','.join(response.xpath(path).extract())

    # ------------------------ franchise
    items['spider_url'] = 'http://so.jiameng.com'  # crawled site
    sections = response.xpath('//div[@class="jm_xq_con"]').extract()
    items['contents'] = sections  # all franchise detail sections

    def section(index):
        # A page with fewer sections used to raise IndexError; fall back
        # to '' the same way the anxjm parser does.
        return sections[index] if len(sections) > index else ''

    items['content'] = section(0)  # franchise details
    items['advantage'] = section(1)  # franchise advantages
    items['process'] = section(2)  # franchise process
    items['manage'] = ','.join(info_cell(3, 2).extract())  # business scope
    items['jiamengnum'] = random.randint(100, 1000)  # franchisee count

    # Prefer the digits in the info table; otherwise split the "a~b"
    # price range shown next to the currency sign.
    feeid = info_cell(8, 2).re(r'(\d+)')
    if not feeid:
        feeid = response.xpath(
            '//em[@class="jm-rmb-num"]/text()').get().split('~')
    items['feeid'] = alter_price(feeid)  # franchise fee

    items['thumb'] = joined('//div[@class="holder"]/i/a/img/@src')  # carousel
    items['title'] = response.xpath(
        '//h2[@class="base_bd_title"]/@title').re(r'【(.*?)】')[0]  # brand name

    # Breadcrumb text split on '>', without its first and last entries.
    fid_names = get_trademard_fid_id_fid_name(
        response.xpath('//div[@class="brand"]').xpath(
            'string(.)').get().replace('\n', '').split('>')[1:-1])
    items['fid'] = fid_names.get('fid_id', '')  # category id
    items['fid_name'] = fid_names.get('fid_name', '')  # category name
    items['trademark_throng'] = ','.join(
        info_cell(3, 4).extract())  # target audience

    company_create = response_meta.get('company_create', None)
    if company_create:
        items['foundtime'] = ','.join(company_create)
    else:
        items['foundtime'] = ''.join(info_cell(2, 2).extract())
    items['shopnum'] = ','.join(info_cell(2, 4).extract())  # store count
    items['slideshow'] = joined(
        '//div[@class="comp_logo"]/img/@src')  # product logo image
    items['websiteurl'] = joined(
        '//li[@class="summary_site clear"]/div[@class="dd"]/span/text()'
    )  # official website

    # --------------------- company
    items['companyname'] = joined(
        '//div[@class="comp_info_con"]/h3/@title')  # company name
    source_parts = info_cell(5, 2).extract()
    items['source'] = ','.join(source_parts)  # brand origin / address
    # When the joined text is empty the raw list is passed instead.
    region = get_region_name_id(
        items['source'] if items['source'] else source_parts)
    items['region'] = region.get('get_region_id', '')  # province id
    items['region_name'] = region.get('get_region_name', '')  # province

    # NOTE(review): the previous expression ended in
    # "if 1 else response_meta.get('company_address', '')", so the meta
    # fallback never ran; only the scraped value is used, as before.
    # Confirm whether the fallback should be restored.
    items['trademark_address'] = joined(
        '//div[@class="comp_info_con"]/ul/li[3]/em/text()')  # company address

    # ----------------------- other
    items['trademark_all_img'] = ''  # all brand images
    items['mongo_id'] = response.text
    items['url'] = response.url  # page URL
    return items