コード例 #1
0
    def parse_item(self, response):
        """Build a ``MaiGooItem`` from a chinasspp.com brand detail page.

        Most scraped fields come from the ``brand_info_ctl00_blink`` list;
        several other fields are set to fixed placeholder values.

        Args:
            response: The downloaded detail page. ``response.meta`` is read
                for the optional ``region`` and ``up_response`` entries.

        Yields:
            The populated ``MaiGooItem``.
        """
        print(star, '进入 parse_item')
        info = '//ul[@id="brand_info_ctl00_blink"]'

        items = MaiGooItem()
        items['url'] = unquote(response.url)
        items['mongo_id'] = json.dumps(response.text)
        # need word
        items['spider_url'] = 'http://www.chinasspp.com'

        # Industry category text: split on '-' and reversed before the lookup.
        fid = response.xpath(info + '/li[2]/text()').get()
        fid_id_fid_name = (
            get_trademard_fid_id_fid_name(fid.split('-')[::-1]) if fid else {})
        items['fid'] = fid_id_fid_name.get('fid_id', '')  # industry category ID
        items['fid_name'] = fid_id_fid_name.get('fid_name', '')  # industry category name

        items['title'] = response.xpath(info + '/li[1]/text()').get()  # title
        items['thumb'] = response.xpath(
            '//p[@class="logo"]/img/@src').get()  # thumbnail
        region_item = get_region_name_id(key=response.meta.get('region', ''))
        items['region'] = region_item.get('get_region_id', '')  # province id
        items['region_name'] = region_item.get('get_region_name', '')  # province name
        items['address'] = ''  # brand place of origin
        items['foundtime'] = ''  # brand founding time
        items['corporation'] = ''  # company legal representative

        # The landline number is published as an image. Only build the
        # absolute URL when the image exists; formatting None would store
        # the bogus value 'http://www.chinasspp.comNone'.
        phone_src = response.xpath(info + '/li[4]/img/@src').get()
        items['telphone'] = (
            'http://www.chinasspp.com{}'.format(phone_src) if phone_src else '')

        items['companyname'] = response.xpath(
            info + '/li[3]/text()').get()  # company name
        # The same link feeds both 'websiteurl' and 'corporation_link'.
        site_href = response.xpath(info + '/li[6]/a/@href').get()
        items['websiteurl'] = site_href  # official website
        items['content'] = response.xpath(
            '//div[@class="r r_about"]').get()  # content (raw HTML)
        items['slideshow'] = response.xpath(
            '//div[@id="brand_info_ctl00_r_banner"]/ul/li/img/@src'
        ).get()  # carousel image (first match only)

        items['bid'] = ''  # brand tier (Top 500, ...)
        items['grade'] = ''  # brand rating (random 1-5)
        items['scope'] = ''  # business scope (fid before modification)
        items['idenid'] = 1  # certification tag (idenid is always 1)
        items['email'] = ''  # email
        items['index_item'] = ''  # ranking
        items['classify_name'] = ''  # category name

        # Additional fields.
        items['corporation_link'] = site_href  # company profile link
        items['websiteurl_info'] = unquote(
            response.meta.get('up_response', ''))  # company URL
        items['user_comment'] = ''  # brand comments
        items['bid_name'] = ''  # brand tier name
        items['update_biaoshi'] = 10
        items['company_address'] = response.xpath(
            info + '/li[6]/text()').get()
        yield items
コード例 #2
0
ファイル: AnxjmSpider.py プロジェクト: veritastry/ScrapyX
    def parse_item(self, response):
        """Build a ``ZhaoShanItem`` from an anxjm.com franchise detail page.

        Fields are collected into a plain dict first; list-valued fields are
        then flattened to comma-separated strings before the item is filled.

        Args:
            response: The downloaded detail page.

        Yields:
            The populated ``ZhaoShanItem``. If a ``ValueError`` is raised
            during extraction it is printed and nothing is yielded.
        """
        image_prefix = 'https://www.anxjm.com'
        try:
            fields = {}
            fields['title'] = response.xpath(
                '//div[@class="context_title"]/h1/text()').get()  # franchise name
            fields['spider_url'] = 'https://www.anxjm.com/'  # crawled site

            # The same node supplies the province name and the 'source' field.
            region_name = response.xpath(
                '//ul[@class="basic_others"]/li[2]/span/text()').get()
            fields['region_name'] = region_name  # province name
            fields['region'] = get_region_name_id(region_name).get(
                'get_region_id')  # province id
            fields['companyname'] = response.xpath(
                '//li[@class="fline"][1]/span/text()').get()  # company name
            # Breadcrumb texts, reversed, keeping the first two.
            fields['fid_name'] = response.xpath(
                '//div[@class="path"]//a/text()').extract()[::-1][:2]  # category name
            fields['fid'] = self.get_trademard_fid_id_fid_name(
                fields['fid_name']).get('fid_id')  # category id
            fields['source'] = region_name  # brand origin / company address
            fields['shopnum'] = response.xpath(
                '//li[@class="fline"][3]/span/text()').get()  # total store count
            fields['slideshow'] = [
                image_prefix + src for src in response.xpath(
                    '//div[@class="img-block"]/img/@src').extract()
            ]  # product logo images

            # Split the first body block on '<h2>' and re-attach the tag to
            # each section; text before the first heading is dropped. A page
            # without the block yields no sections instead of raising
            # IndexError (which the ValueError handler would not catch).
            body_html = response.xpath('//div[@class="body_tit"]').extract()
            sections = (
                ['<h2>' + part for part in body_html[0].split('<h2>')[1:]]
                if body_html else [])
            fields['contents'] = sections  # all franchise sections

            def section(index):
                # Section at ``index``, or '' when the page has fewer sections.
                return sections[index] if len(sections) > index else ''

            fields['advantage'] = section(3)  # franchise advantages
            fields['content'] = section(0)  # franchise details
            fields['manage'] = response.xpath(
                '//li[@class="clear"][2]/strong/text()').get()  # business scope
            fields['process'] = section(4)  # franchise process
            fields['analysis'] = section(1)  # franchise fee analysis
            fields['conditions'] = section(2)  # franchise requirements

            # Not extracted for this site: foundtime, trademark_all_img,
            # telephone, mobilephone, email, fax, corporation, unite,
            # websiteurl, grade, trademark_img, trademark_cp, title_picture,
            # update_biaoshi.
            fields['trademark_address'] = response.xpath(
                '//li[@class="clear"][3]/strong/text()').get()  # company address
            fields['trademark_throng'] = response.xpath(
                '//li[@class="clear"][1]/span/text()').get()  # target audience
            fields['mongo_id'] = response.text  # raw page text
            # Keep only the part before '万' of each price text.
            fields['feeid'] = alter_price([
                price.split('万')[0] for price in response.xpath(
                    '//span[@class="money"]/text()').extract()
            ])  # franchise fee
            fields['pattern'] = response.xpath(
                '//li[@class="clear"][2]/span/text()').get()  # business model
            fields['jiamengnum'] = random.randint(1, 1000)  # franchisee count (random)
            fields['thumb'] = [
                image_prefix + src for src in response.xpath(
                    '//div[@class="bd"]/ul//li/img/@src').extract()
            ]  # carousel images
            fields['url'] = response.url  # page url

            # Flatten list values to comma-separated strings.
            flattened = {
                key: ','.join(value) if isinstance(value, list) else value
                for key, value in fields.items()
            }

            zhaoshang_result = ZhaoShanItem()
            zhaoshang_result.update(flattened)
            yield zhaoshang_result
        except ValueError as e:
            print(e)
コード例 #3
0
ファイル: Jmw91Spider.py プロジェクト: fulongyang/ScrapyX
    def parse_items(self, response):
        """Build a ``ZhaoShanItem`` from a 91jmw.com franchise detail page.

        Fields are collected into a plain dict, empty values are dropped,
        list values are flattened to comma-separated strings, and the result
        is copied into the item.

        Args:
            response: The downloaded detail page.

        Yields:
            The populated ``ZhaoShanItem``.
        """
        def info_text(xpath):
            # String value of the first node matching ``xpath`` ('' if none).
            return response.xpath(xpath).xpath('string(.)').get(default='')

        items = {}
        items['title'] = response.xpath(
            '//div[@class="project-name"]/h2/text()').re('【(.*?)】')  # brand name
        items['spider_url'] = 'https://www.91jmw.com'  # crawled site

        # One paragraph supplies both the province (last word) and the
        # origin/address (everything after the first two words).
        origin_words = info_text(
            '*//div[@class="project-info"][1]/p[5]').split()
        items['region_name'] = origin_words[-1:]  # province name
        # NOTE(review): the helper receives a one-element list here, not a
        # string — confirm that is what get_region_name_id expects.
        items['region'] = get_region_name_id(
            items['region_name']).get('get_region_id')  # province id

        items['companyname'] = info_text(
            '*//div[@class="prd-item"][3]/p[1]').split()[-1:]  # company name
        items['fid_name'] = [
            word.replace('>', '') for word in info_text(
                '*//div[@class="project-info"][1]/p[4]').split()[2:]
        ]  # category name, breadcrumb separators removed

        items['source'] = origin_words[2:]  # brand origin / company address
        items['content'] = response.xpath(
            '//div[@class="jiameng-d-step cont-jiameng "]/div[1]'
        ).get(default='')  # franchise details
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        items['shopnum'] = response.xpath(
            '*//div[@class="project-info"][1]/p[2]').xpath('string(.)').re(
                r'(\d+)家')  # total store count
        items['advantage'] = response.xpath(
            '//div[@class="jiameng-d-step cont-jiameng "]/div[3]'
        ).get(default='')  # franchise advantages
        items['slideshow'] = response.xpath(
            '//img[@class="lazyload"]/@data-src').get(default='')  # product logo image
        items['trademark_cp'] = info_text(
            '*//div[@class="project-info"][1]/p[3]')  # brand products
        items['trademark_address'] = info_text(
            '*//div[@class="prd-item"][3]/p[3]').split()[-1:]  # company address
        items['trademark_throng'] = info_text(
            '*//div[@class="project-info"][1]/p[7]').split()[2:]  # target audience
        items['mongo_id'] = response.text  # raw page text
        # ``.re`` always returns a list, so each match is converted directly.
        # NOTE(review): the results are joined with ',' below, which
        # presumably requires alter_price to return strings — confirm.
        items['feeid'] = [
            alter_price(amount) for amount in response.xpath(
                '//span[@class="fl p-price"]').xpath('string(.)').re('¥(.*?)万')
        ]  # franchise fee

        items['thumb'] = response.xpath(
            '//ul[@class="scroll_pic cls"]/li//img/@src').getall()  # carousel images
        items['url'] = response.url  # page url

        # Drop empty values, then flatten lists to comma-separated strings.
        flattened = {
            key: ','.join(value) if isinstance(value, list) else value
            for key, value in items.items() if value
        }
        # Blank out logo URLs containing 'pro_91'. Only the slideshow value is
        # inspected, so non-string values of other fields cannot raise here.
        if 'pro_91' in flattened.get('slideshow', ''):
            flattened['slideshow'] = ''

        zhaoshan_items = ZhaoShanItem()
        zhaoshan_items.update(flattened)
        yield zhaoshan_items
コード例 #4
0
ファイル: MaiGooSpider.py プロジェクト: veritastry/ScrapyX
    def parse_navigation_response(self, response):
        """Build a ``MaiGooItem`` from a brand detail page.

        Missing page elements produce empty values instead of raising, so a
        partially filled page still returns an item.

        Args:
            response: The downloaded detail page. ``response.meta`` is read
                for the optional ``classify`` entry.

        Returns:
            The populated ``MaiGooItem``.
        """
        def after_colon(text):
            # 'label:value' -> 'value'; '' when the node or colon is missing.
            parts = text.split(':') if text else []
            return parts[1] if len(parts) > 1 else ''

        items = MaiGooItem()
        # Breadcrumb entries 2-3 are used for the category lookup.
        fid_id_fid_name = get_trademard_fid_id_fid_name(
            response.xpath('//div[@class="position"]/a').xpath(
                'string(.)').getall()[1:3])
        items['fid'] = fid_id_fid_name.get('fid_id')  # industry category ID
        items['title'] = response.xpath(
            '//span[@class="font22 line18em b"]').xpath(
                'string(.)').get()  # title
        items['thumb'] = response.xpath(
            '//div[@class="img"]/a/img/@src').get()  # thumbnail

        # The origin span is read once; its first two characters are used as
        # the key for the region lookup.
        address = after_colon(response.xpath(
            '//li[@class="dhidden"]/span[1]/text()').get())
        items['address'] = address  # brand place of origin
        get_region = get_region_name_id(key=address[:2])
        items['region'] = get_region.get('get_region_id')  # province id
        items['foundtime'] = after_colon(response.xpath(
            '//li[@class="dhidden"]/span[2]/text()').get())  # brand founding time
        items['corporation'] = response.xpath(
            '//span[@class="mgl"]/a/text()').get()  # company legal representative
        items['telphone'] = ','.join(
            response.xpath('//div[@class="info"]').re(
                'i>(.*?)</span'))  # landline numbers

        company_name = response.xpath('//a[@class="font16"]').xpath(
            'string(.)').get()
        items['companyname'] = (
            company_name.replace('(', '').replace(')', '')
            if company_name else company_name)  # company name, parentheses removed

        # The link carries the real site after the first '='.
        redirect_href = response.xpath('//div[@class="img"]/a/@href').get()
        href_parts = redirect_href.split('=') if redirect_href else []
        items['websiteurl'] = (
            unquote(href_parts[1]) if len(href_parts) > 1 else '')  # official website

        items['content'] = response.xpath(
            '//div[@class="desc"]').get()  # content (raw HTML)
        items['slideshow'] = response.xpath(
            '//div[@class="img big"]/img/@src').get()  # carousel image
        items['bid'] = response.meta.get('classify', 'None')  # brand tier (Top 500, ...)
        items['grade'] = random.randint(1, 5)  # brand rating (random 1-5)
        items['company_email'] = response.xpath(
            '//i[contains(@class,"icon-email")]/@ptitle').get()
        items['mongo_id'] = json.dumps(response.text)

        # Links back into the site itself.
        items['corporation_link'] = response.xpath(
            '//span[@class="mgl"]/a/@href').get()  # legal representative link
        items['websiteurl_info'] = response.xpath(
            '//a[@class="font16"]/@href').get()
        items['url'] = response.url

        return items
コード例 #5
0
ファイル: MaiGooSpiderV2.py プロジェクト: veritastry/ScrapyX
    def parse_response_item(self, response):
        """Parse a brand detail page and request its first comment page.

        The partially filled ``MaiGooItem`` is not yielded here; it is handed
        to ``self.parse_item_comment`` through the request ``meta`` under
        ``pp_items`` together with the page's ``comment_id``.

        Args:
            response: The downloaded detail page. ``response.meta`` is read
                for the optional ``classify_id`` and ``index_item`` entries.

        Yields:
            A ``scrapy.Request`` for page 1 of the brand's comments. Any
            exception raised while extracting is printed and nothing is
            yielded.
        """
        items = MaiGooItem()
        # NOTE(review): the broad handler below means a single missing node
        # (e.g. the origin span) drops the whole item with only a print.
        try:
            # Breadcrumb entries 2-3 are used for the category lookup.
            fid_id_fid_name = get_trademard_fid_id_fid_name(
                response.xpath('//div[@class="position"]/a').xpath(
                    'string(.)').getall()[1:3])
            items['fid'] = fid_id_fid_name.get('fid_id')  # industry category ID
            items['scope'] = fid_id_fid_name.get('fid_name')  # industry category name
            items['title'] = response.xpath(
                '//span[@class="font22 line18em b"]').xpath(
                    'string(.)').get()  # title
            items['thumb'] = response.xpath(
                '//div[@class="img"]/a/img/@src').get()  # thumbnail

            # The origin span is read once; its first two characters are
            # used as the key for the region lookup.
            origin = response.xpath(
                '//li[@class="dhidden"]/span[1]/text()').get().split(':')[1]
            items['address'] = origin  # brand place of origin
            get_region = get_region_name_id(key=origin[:2])
            items['region'] = get_region.get('get_region_id')  # province id
            items['foundtime'] = response.xpath(
                '//li[@class="dhidden"]/span[2]/text()').get().split(':')[
                    1]  # brand founding time
            items['corporation'] = response.xpath(
                '//span[@class="mgl"]/a/text()').get()  # company legal representative
            items['telphone'] = ','.join(
                response.xpath('//div[@class="info"]').re(
                    'i>(.*?)</span'))  # landline numbers
            company_name = response.xpath('//a[@class="font16"]').xpath(
                'string(.)').get()
            items['companyname'] = (
                company_name.replace('(', '').replace(')', '')
                if company_name else company_name)  # company name, parentheses removed
            # The link carries the real site after the first '='.
            items['websiteurl'] = unquote(
                response.xpath('//div[@class="img"]/a/@href').get().split('=')
                [1])  # official website
            items['content'] = response.xpath(
                '//div[@class="desc"]').get()  # content (raw HTML)
            items['slideshow'] = ','.join(
                response.xpath(
                    '//div[@class="img big"]/img/@src').getall())  # carousel images

            # classify_id -> tier name -> tier value; the name is looked up once.
            classify_name = self.classify_dict.get(
                response.meta.get('classify_id', 'None'))
            items['bid'] = self.classify_dict_self.get(classify_name)  # brand tier
            items['bid_name'] = classify_name  # brand tier name
            items['grade'] = random.randint(1, 5)  # brand rating (random 1-5)
            items['email'] = response.xpath(
                '//i[contains(@class,"icon-email")]/@ptitle').get()

            # Links back into the site itself.
            items['corporation_link'] = response.xpath(
                '//span[@class="mgl"]/a/@href').get()  # legal representative link
            items['websiteurl_info'] = response.xpath(
                '//a[@class="font16"]/@href').get()
            items['mongo_id'] = json.dumps(response.text)
            items['url'] = response.url
            items['index_item'] = response.meta.get('index_item', '')
            items['user_comment'] = []

            # Comments.
            # NOTE(review): comment_id is None when the 'brandud' div is
            # absent, which would format the literal text 'None' into the URL.
            comment_id = response.xpath(
                '//div[@class="brandud"]/@brandid').get()

            yield scrapy.Request(
                url=self.comment_link.format(comment_id=comment_id, page=1),
                callback=self.parse_item_comment,
                dont_filter=False,
                meta={'pp_items': items, 'comment_id': comment_id, })
        except Exception as e:
            print(e)
コード例 #6
0
    def parse_item(self, response):
        """Build a ``ZhaoShanItem`` from a so.jiameng.com franchise page.

        Missing sections, a missing title match, or missing fee/breadcrumb
        nodes produce empty values instead of raising.

        Args:
            response: The downloaded detail page. ``response.meta`` is read
                for the optional ``company_create`` and ``company_address``
                entries.

        Returns:
            The populated ``ZhaoShanItem``.
        """
        def info_cell(row, col):
            # Text nodes of one cell of the 'jm_info' table.
            return response.xpath(
                '//div[@class="jm_info"]//tr[{}]/td[{}]/text()'.format(
                    row, col))

        items = ZhaoShanItem()
        response_meta = response.meta

        # ------------------------ franchise
        items['spider_url'] = 'http://so.jiameng.com'  # crawled site
        sections = response.xpath(
            '//div[@class="jm_xq_con"]').extract()
        items['contents'] = sections  # all franchise detail sections
        # Pages with fewer than three sections get '' for the missing ones.
        items['content'] = sections[0] if len(sections) > 0 else ''  # franchise details
        items['advantage'] = sections[1] if len(sections) > 1 else ''  # franchise advantages
        items['process'] = sections[2] if len(sections) > 2 else ''  # franchise process
        items['manage'] = ','.join(info_cell(3, 2).extract())  # business scope
        items['jiamengnum'] = random.randint(100, 1000)  # franchisee count (random)

        # Fee: digits from the table cell if any, otherwise the '~'-separated
        # range text; an empty list when neither node exists.
        fee_parts = info_cell(8, 2).re(r'(\d+)')
        if not fee_parts:
            fee_text = response.xpath(
                '//em[@class="jm-rmb-num"]/text()').get()
            fee_parts = fee_text.split('~') if fee_text is not None else []
        items['feeid'] = alter_price(fee_parts)  # franchise fee

        items['thumb'] = ','.join(
            response.xpath(
                '//div[@class="holder"]/i/a/img/@src').extract())  # carousel images
        title_matches = response.xpath(
            '//h2[@class="base_bd_title"]/@title').re(r'【(.*?)】')
        items['title'] = title_matches[0] if title_matches else ''  # brand name

        # Breadcrumb text split on '>', first and last entries dropped.
        brand_text = response.xpath('//div[@class="brand"]').xpath(
            'string(.)').get() or ''
        fid_names = get_trademard_fid_id_fid_name(
            brand_text.replace('\n', '').split('>')[1:-1])
        items['fid'] = fid_names.get('fid_id', '')  # category id
        items['fid_name'] = fid_names.get('fid_name', '')  # category name
        items['trademark_throng'] = ','.join(
            info_cell(3, 4).extract())  # target audience

        # Prefer the value passed by the caller, else read it from the page.
        # NOTE(review): ','.join on company_create assumes it is a list of
        # strings; a plain string would be joined character by character.
        company_create = response_meta.get('company_create', None)
        if company_create:
            items['foundtime'] = ','.join(company_create)  # brand founding time
        else:
            items['foundtime'] = ''.join(info_cell(2, 2).extract())
        items['shopnum'] = ','.join(info_cell(2, 4).extract())  # total store count
        items['slideshow'] = ','.join(
            response.xpath(
                '//div[@class="comp_logo"]/img/@src').extract())  # product logo image
        items['websiteurl'] = ','.join(
            response.xpath(
                '//li[@class="summary_site clear"]/div[@class="dd"]/span/text()'
            ).extract())  # brand official website

        # --------------------- company
        items['companyname'] = ','.join(
            response.xpath(
                '//div[@class="comp_info_con"]/h3/@title').extract())  # company name
        source_parts = info_cell(5, 2).extract()
        items['source'] = ','.join(source_parts)  # brand origin / company address
        # When the joined text is empty the raw list is passed instead.
        region = get_region_name_id(
            items['source'] if items['source'] else source_parts)
        items['region'] = region.get('get_region_id', '')  # province id
        items['region_name'] = region.get('get_region_name', '')  # province name
        # Fall back to the caller-supplied address when the page has none;
        # the previous constant condition made this fallback unreachable.
        page_address = ','.join(
            response.xpath(
                '//div[@class="comp_info_con"]/ul/li[3]/em/text()').extract())
        items['trademark_address'] = page_address or response_meta.get(
            'company_address', '')  # company address

        # ----------------------- other
        items['trademark_all_img'] = ''  # all brand images
        items['mongo_id'] = response.text  # raw page text
        items['url'] = response.url  # page url
        return items