def parse(self, response):
    """Yield one item per ``<dd class="detail">`` node of the response.

    Each item stores the raw ``extract()`` result lists for the name, price
    and sold-count text nodes found under that node.
    """
    for node in response.xpath('//dd[@class="detail"]'):
        # A fresh item per node: re-using a single instance makes every
        # yielded item alias the same object, so a later iteration overwrites
        # the fields of items that consumers are still holding.
        item = QichachaItem()
        item['name'] = node.xpath(
            'a[@class="item-name J_TGoldData"]/text()').extract()
        item['price'] = node.xpath(
            'div[@class="attribute"]/div[@class="cprice-area"]'
            '/span[@class="c-price"]/text()').extract()
        item['sold'] = node.xpath(
            'div[@class="attribute"]/div[@class="sale-area"]'
            '/span[@class="sale-num"]/text()').extract()
        yield item
Example #2
0
    def NewNewInfo(self, gd_list1, title4, companyname, creditnumber):
        """Build one shareholder item per data row of a shareholder table.

        Args:
            gd_list1: row selectors of the table; the first row is treated as
                the header and skipped.
            title4: header text of the 4th column. Its unit marker decides how
                the subscribed amount is scaled; ``None`` means "no unit".
            companyname: value stored in ``company_name`` of every item.
            creditnumber: value stored in ``credit_number`` of every item.

        Returns:
            A list of ``QichachaItem``. ``subscription_capital`` is a float
            when a known unit marker is present in ``title4``; otherwise it is
            the stripped cell text (or ``0`` for an empty / ``'-'`` cell).
        """
        print(title4, 'NewNewInfo')

        # The unit only depends on the header, so detect it once.
        unit_title = title4 or ''
        if '(万元)' in unit_title:
            unit = 'wan'
        elif '亿' in unit_title:
            unit = 'yi'
        elif '万美元' in unit_title:
            unit = 'wan_usd'
        else:
            unit = None

        # Known markups for the shareholder name, tried in order.
        name_xpaths = (
            './td[2]/table/tr/td[2]/a/h3/text()',
            './td[2]/table/tr/td[2]/div/a/h3/text()',
            './td[2]/table/tr/td[2]/div/h3/text()',
        )

        def cell_text(row, xpath):
            # extract_first() returns None for a missing cell; normalise to ''
            # so the .strip() calls below cannot raise AttributeError.
            return row.xpath(xpath).extract_first() or ''

        items = []
        for gd in gd_list1[1:]:
            item = QichachaItem()
            item['company_name'] = companyname
            item['credit_number'] = creditnumber

            # Shareholder name
            shareholder_nm = None
            for name_xpath in name_xpaths:
                shareholder_nm = gd.xpath(name_xpath).extract_first()
                if shareholder_nm is not None:
                    break
            print(shareholder_nm, '       股东姓名')
            item['shareholder'] = shareholder_nm

            # Shareholding ratio
            shareholding_bl = cell_text(gd, './td[3]/text()').strip()
            print(shareholding_bl, '        股份比例')
            item['shareholding_ratio'] = shareholding_bl

            # Subscription date
            subscription_dt = cell_text(gd, './td[5]/text()').strip('\t\n\r ,')
            print(subscription_dt, '           认缴出资日期')
            item['subscription_date'] = subscription_dt

            # Subscribed capital; an empty or '-' cell counts as zero.
            subscription_cz = cell_text(gd, './td[4]/text()').strip('\t\n\r ,')
            if subscription_cz in ('', '-'):
                subscription_cz = 0

            if unit is not None:
                # Drop thousands separators so values like "1,000" convert.
                amount = float(str(subscription_cz).replace(',', ''))
                if unit == 'wan':
                    subscription_cz = amount * 10000
                elif unit == 'yi':
                    subscription_cz = amount * 100000000
                else:
                    # NOTE(review): 7.1 is presumably a fixed USD->CNY rate;
                    # confirm it is still the intended value.
                    subscription_cz = amount * 10000 * 7.1
            print(subscription_cz, '            认缴出资额')
            item['subscription_capital'] = subscription_cz

            items.append(item)

        return items
Example #3
0
 def parse(self, response):
     """Yield one item per company link of a result-list page.

     For every ``<a class="list-group-item clearfix">`` element the name,
     status, representative, fund, date, area, location and absolute company
     URL (``self.url`` + href) are stored. Fields that cannot be located are
     set to the string ``'null'``; entries without an href are skipped.
     """
     def text_at(values, index):
         # Stripped element at ``index`` or 'null' when the list is too short.
         return values[index].strip() if len(values) > index else 'null'

     # Work coarse to fine: select each company block, then parse inside it.
     for company in response.xpath('//a[@class="list-group-item clearfix"]'):
         abs_url = company.xpath('@href').extract_first()
         if abs_url is None:
             # Without a link there is no company URL to record.
             continue

         name_and_status = company.xpath(
             'span[@class="clear"]/span/text()').extract()
         # Text fragments of the <small> tags; the number of fragments varies
         # between entries, which is why the layout is probed below.
         details = company.xpath(
             'span[@class="clear"]/small/text()').extract()

         location = details[-1].strip() if details else 'null'
         if len(details) > 6:
             fund = details[3].strip()
             area = details[4].strip()
         elif len(details) > 3 and '元' in details[3]:
             # A currency marker in slot 3 means it holds the fund.
             area = 'null'
             fund = details[3].strip()
         elif len(details) == 5:
             area = details[3].strip()
             fund = 'null'
         else:
             area = 'null'
             fund = 'null'

         # The last span fragment is the status; everything before it is the
         # (possibly split) company name.
         name = ''.join(name_and_status[:-1]).strip()
         status = name_and_status[-1].strip() if name_and_status else 'null'

         # Instantiate the item class.
         item = QichachaItem()
         item['name'] = name
         item['status'] = status
         item['representative'] = text_at(details, 1)
         item['fund'] = fund
         item['date'] = text_at(details, 2)
         item['area'] = area
         item['location'] = location
         item['company_url'] = self.url + abs_url
         yield item
 def parse(self, response):
     """Follow the first ``a.ma_h1`` link to the company detail page.

     Yields a single ``scrapy.Request`` for the absolute company URL, handled
     by ``self.parse_basecontent``, with a partially filled item attached as
     ``meta['item']``. Nothing is yielded when the link is absent.
     """
     _company_url = response.css('a.ma_h1::attr(href)').extract_first()
     if _company_url is None:
         return

     company_url = parse.urljoin(response.url, _company_url)
     # The id is the first run of 10+ alphanumeric characters in the href.
     match_obj = re.match(r'.*?([a-zA-Z0-9]{10,})', _company_url)

     item = QichachaItem()
     # Previously company_id was only bound when the pattern matched, so an
     # unexpected href raised UnboundLocalError; record None instead.
     item['company_id'] = match_obj.group(1) if match_obj else None
     item['company_url'] = company_url

     request = scrapy.Request(company_url, headers=self.headers,
                              callback=self.parse_basecontent)
     request.meta['item'] = item
     yield request
Example #5
0
    def parse(self, response):
        """Yield a single item listing the report links of the page.

        ``pdfwei`` receives every ``href`` of the ``<a>`` elements located in
        cells of ``<table class="ntable">``, as a list of strings.
        """
        report = QichachaItem()
        # Report URLs
        link_nodes = response.xpath('//table[@class="ntable"]//td/a/@href')
        report['pdfwei'] = link_nodes.extract()
        yield report
Example #6
0
 def detail_parse(self, response):
     """Yield one item with the company name and its first two table values.

     The rows of the second table in ``section#Cominfo`` are flattened into
     a list: columns 2 and 4 of every row except the last two, then column 2
     of the last two rows. ``registered_capital`` and ``real_capital`` are
     the first and second entries of that list, or ``''`` when the table has
     too few rows to provide them.
     """
     def cell_text(row, column):
         # Join all text nodes of the cell, drop newlines, trim whitespace.
         fragments = row.xpath('.//td[%d]/text()' % column).extract()
         return ''.join(fragments).replace('\n', '').strip()

     items = QichachaItem()
     items['qiye_name'] = response.xpath(
         "//*[@id='company-top']/div[2]/div[2]/div[1]/h1/text()").extract()
     print(items['qiye_name'])

     trs = response.xpath("//section[@id='Cominfo']/table[2]/tr")
     qiye_info = []
     # Slicing instead of trs[-2] / trs[-1] keeps short tables from raising
     # IndexError.
     for row in trs[:-2]:
         qiye_info.append(cell_text(row, 2))
         qiye_info.append(cell_text(row, 4))
     for row in trs[-2:]:
         qiye_info.append(cell_text(row, 2))
     print(qiye_info)

     items['registered_capital'] = qiye_info[0] if len(qiye_info) > 0 else ''
     items['real_capital'] = qiye_info[1] if len(qiye_info) > 1 else ''
     yield items
Example #7
0
    def page_parse(self, response):
        """Yield one item describing the company shown on a detail page.

        Header fields (name, phone, website, email) come from
        ``div.content``; the remaining fields come from fixed row/column
        positions of the second ``ntable`` inside ``section#Cominfo``.
        Extracted text has newlines removed and surrounding whitespace
        stripped; a field whose node is missing receives its placeholder
        string instead.
        """
        info_cell = '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[%d]/td[%d]'

        def first(xpath):
            return response.xpath(xpath).extract_first()

        def cleaned(raw, placeholder):
            # Newline removal and strip commute here, so one order serves
            # every field.
            return raw.replace('\n', '').strip() if raw else placeholder

        # (item key, xpath, placeholder) for the page header.
        header_fields = (
            # Company name
            ('name', '//div[@class="content"]/div[1]/h1/text()', '暂无公司名信息'),
            # Phone
            ('phone', '//div[@class="content"]/div[2]/span[1]/span[2]/span/text()', '暂无电话信息'),
            # Official website
            ('website', '//div[@class="content"]/div[2]/span[3]/a/@href', '暂无网站信息'),
        )

        # (item key, table row, table column, placeholder), in output order.
        table_fields = (
            ('address', 10, 2, '暂无地址信息'),                    # address
            ('registered_capital', 1, 2, '暂无注册资本'),          # registered capital
            ('contributed_capital', 1, 4, '暂无实缴资本'),         # paid-in capital
            ('status', 2, 2, '暂无经营状态信息'),                  # operating status
            ('establishment', 2, 4, '暂无成立日期信息'),           # date of establishment
            ('social_code', 3, 2, '暂无统一社会信息代码信息'),     # unified social credit code
            ('taxpayer_num', 3, 4, '暂无纳税人识别号信息'),        # taxpayer identification number
            ('registrate_num', 4, 2, '暂无注册号信息'),            # registration number
            ('organization_code', 4, 4, '暂无组织机构代码信息'),   # organization code
            ('company_type', 5, 2, '暂无公司类型信息'),            # company type
            ('industry_involed', 5, 4, '暂无所属行业信息'),        # industry
            ('approval_date', 6, 2, '暂无核准日期信息'),           # approval date
            ('registration_authority', 6, 4, '暂无登记机关信息'),  # registration authority
            ('area', 7, 2, '暂无所属地区信息'),                    # region
            ('english_name', 7, 4, '暂无英文名信息'),              # English name
            ('used_name', 8, 2, '暂无曾用名'),                     # former name
            ('insured_num', 8, 4, '暂无参保人数信息'),             # number of insured staff
            ('staff_size', 9, 2, '暂无人员规模信息'),              # staff size
            ('operate_period', 9, 4, '暂无营业期限信息'),          # business term
            ('business_scope', 11, 2, '暂无经营范围信息'),         # business scope
        )

        item = QichachaItem()

        for key, xpath, placeholder in header_fields:
            item[key] = cleaned(first(xpath), placeholder)

        # Email: prefer the linked address, otherwise the plain text of the
        # same span. The fallback used to call .strip() on None when neither
        # node existed; it now receives a placeholder like every other field.
        email = first('//div[@class="content"]/div[3]/span[1]/span[2]/a/text()')
        if email:
            item['email'] = email
        else:
            email2 = first('//div[@class="content"]/div[3]/span[1]/span[2]/text()')
            item['email'] = cleaned(email2, '暂无邮箱信息')

        for key, row, col, placeholder in table_fields:
            if key == 'used_name':
                # The former-name cell is read through string(.) so nested
                # markup is included, and non-breaking spaces are dropped.
                cell = response.xpath(info_cell % (row, col))
                used_name = cell.xpath('string(.)').extract_first()
                if used_name:
                    item[key] = used_name.replace('\n', '').strip().replace('\xa0', '')
                else:
                    item[key] = placeholder
            else:
                item[key] = cleaned(first(info_cell % (row, col) + '/text()'), placeholder)

        yield item
Example #8
0
    def gd_parse(self, response):
        """Extract shareholder rows from the ``#partnerslist`` table.

        Returns:
            ``None`` when the page has no shareholder table; the result of
            ``self.fzNewInfo(response)`` when the table has data rows and the
            page has exactly 9 ``div`` children under
            ``div.company-nav-contain``; otherwise a list with one
            ``QichachaItem`` per data row (the first row is the header).

        ``subscription_capital`` is a float when the amount column header
        carries a known unit marker; otherwise it is the stripped cell text
        (or ``0`` for an empty / ``'-'`` cell).
        """
        logging.warning('11111111111111')
        print(response)

        # Shareholder table(s)
        table_gd = response.xpath('//*[@id="partnerslist"]/table')
        print(table_gd, '   得到股东信息表格的内容')

        # Company name
        raw_name = response.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first()
        companyname = (raw_name or '').strip().replace('\n', '')
        print(companyname, '    公司名')

        # Unified social credit code
        raw_credit = response.xpath(
            '//section[@id="Cominfo"]/table[@class="ntable"]/tr[3]/td[2]/text()').extract_first()
        creditnumber = (raw_credit or '').strip().replace('\n', '')
        print(creditnumber, '    统一社会信用代码')

        if not table_gd:
            # %s formatting also copes with a missing name (None).
            logging.warning('%s    没有股东信息', raw_name)
            return

        # Rows of the shareholder table; the first one is the header.
        gd_list = table_gd[0].xpath('./tr')
        rows = gd_list[1:]
        items = []
        if not rows:
            return items

        # Number of company-info navigation sections (e.g. listing info,
        # basic info). It does not depend on the row, so count it once.
        shuliang = len(response.xpath('//div[@class="company-nav-contain"]/div'))
        print(shuliang)
        if shuliang == 9:
            # Such pages use the new layout; let fzNewInfo parse it.
            return self.fzNewInfo(response)

        # Title of the subscribed-amount column. When the 4th header cell has
        # no title, amount and date sit one column further right.
        header = gd_list[0]
        czett = header.xpath('./th[4]/span/text()').extract_first()
        amount_col, date_col = 4, 5
        if czett is None:
            czett = header.xpath('./th[5]/span/text()').extract_first()
            amount_col, date_col = 5, 6
        print(czett, '  gd_parse')

        unit_title = czett or ''
        if '(万元)' in unit_title:
            unit = 'wan'
        elif '亿' in unit_title:
            unit = 'yi'
        elif '万美元' in unit_title:
            unit = 'wan_usd'
        else:
            unit = None

        # Known markups for the shareholder name, tried in order.
        name_xpaths = (
            './td[2]/table/tr/td[2]/a/h3/text()',
            './td[2]/table/tr/td[2]/div/a/h3/text()',
            './td[2]/table/tr/td[2]/div/h3/text()',
        )

        def cell_text(row, column):
            # extract_first() returns None for a missing cell; normalise to ''.
            return row.xpath('./td[%d]/text()' % column).extract_first() or ''

        for gd in rows:
            item = QichachaItem()
            item['company_name'] = companyname
            item['credit_number'] = creditnumber

            # Shareholder name
            shareholder_nm = None
            for name_xpath in name_xpaths:
                shareholder_nm = gd.xpath(name_xpath).extract_first()
                if shareholder_nm is not None:
                    break
            print(shareholder_nm, '股东姓名')
            item['shareholder'] = shareholder_nm

            # Shareholding ratio
            shareholding_bl = cell_text(gd, 3).strip()
            print(shareholding_bl, '   股份比例')
            item['shareholding_ratio'] = shareholding_bl

            # Subscription date
            subscription_dt = cell_text(gd, date_col).strip('\t\n\r ,')
            print(subscription_dt, '    认缴出资日期')
            item['subscription_date'] = subscription_dt

            # Subscribed capital; an empty or '-' cell counts as zero.
            subscription_cz = cell_text(gd, amount_col).strip('\t\n\r ,')
            if subscription_cz in ('', '-'):
                subscription_cz = 0

            if unit is not None:
                # Drop thousands separators so values like "1,000" convert.
                amount = float(str(subscription_cz).replace(',', ''))
                if unit == 'wan':
                    subscription_cz = amount * 10000
                elif unit == 'yi':
                    subscription_cz = amount * 100000000
                else:
                    # NOTE(review): 7.1 is presumably a fixed USD->CNY rate;
                    # confirm it is still the intended value.
                    subscription_cz = amount * 10000 * 7.1
            print(subscription_cz, '        认缴出资额')
            item['subscription_capital'] = subscription_cz

            items.append(item)

        return items
Example #9
0
    def fzNewInfo(self, response):
        """Extract shareholder rows from the new-layout shareholder table.

        Among the ``table.ntable.ntable-odd.npth.nptd`` tables, the first one
        whose third header cell reads "持股比例" is used, falling back to the
        first table. When the 4th header title contains "认缴出资额" the rows
        are handed to ``self.NewNewInfo``; otherwise each data row yields an
        item with shareholder, shareholding ratio and number of shares.

        Returns:
            A list of ``QichachaItem``; empty when the page has no such table
            or the chosen table has no rows.
        """
        # Company name
        raw_name = response.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first()
        companyname = (raw_name or '').strip().replace('\n', '')
        print(companyname, '    公司名')

        # Unified social credit code
        raw_credit = response.xpath(
            '//section[@id="Cominfo"]/table[@class="ntable"]/tr[3]/td[2]/text()').extract_first()
        creditnumber = (raw_credit or '').strip().replace('\n', '')
        print(creditnumber, '    统一社会信用代码')

        # Candidate shareholder tables
        table_gd3 = response.xpath('//table[@class="ntable ntable-odd npth nptd"]')
        if not table_gd3:
            # Indexing [0] on an empty result used to raise IndexError.
            return []

        # Pick the table that matches the expected new-format header.
        table_gdx1 = table_gd3[0]
        for candidate in table_gd3:
            tname = candidate.xpath('./tr[1]/th[3]/text()').extract_first()
            if tname == "持股比例":
                table_gdx1 = candidate
                break

        gd_list1 = table_gdx1.xpath('./tr')
        print(len(gd_list1))
        if not gd_list1:
            return []

        # Inspect the title of the 4th column.
        gd_title = gd_list1[0]
        print(gd_title)
        title4 = gd_title.xpath('./th[4]/span/text()').extract_first()
        print(title4, '   fzNewInfo')

        # A subscribed-amount column means the rows follow the layout that
        # NewNewInfo understands.
        if title4 is not None and '认缴出资额' in title4:
            return self.NewNewInfo(gd_list1, title4, companyname, creditnumber)

        # Known markups for the shareholder name, tried in order.
        name_xpaths = (
            './td[2]/table/tr/td[2]/a/h3/text()',
            './td[2]/table/tr/td[2]/div/a/h3/text()',
            './td[2]/table/tr/td[2]/div/h3/text()',
        )

        items = []
        for gd in gd_list1[1:]:
            item = QichachaItem()
            item['company_name'] = companyname
            item['credit_number'] = creditnumber

            # Shareholder name
            shareholder_nm = None
            for name_xpath in name_xpaths:
                shareholder_nm = gd.xpath(name_xpath).extract_first()
                if shareholder_nm is not None:
                    break
            print(shareholder_nm, '   股东姓名')
            item['shareholder'] = shareholder_nm

            # Shareholding ratio ('' when the cell is missing)
            shareholding_bl = (gd.xpath('./td[3]/text()').extract_first() or '').strip()
            print(shareholding_bl, '      股份比例')
            item['shareholding_ratio'] = shareholding_bl

            # Number of shares, without thousands separators
            number_of_shares = (gd.xpath('./td[4]/text()').extract_first() or '').strip()
            number_of_shares = number_of_shares.replace(',', '')
            item['number_of_shares'] = number_of_shares
            print(number_of_shares, '        持股数')

            items.append(item)

        return items
Example #10
0
    def detail_parse(self, response):
        """Parse a company detail page into a single item.

        Collects the logo, title and contact fields from ``#company-top``,
        the ``h2.seo.font-20`` text as ``faren``, and the registration table
        (second table of ``section#Cominfo``) as the ``detail`` dict keyed by
        field label. If the section contains a ``div/a`` link, a request for
        ``self.base_url`` + that link is yielded (callback
        ``self.detail_parse2``, item passed via ``meta``); otherwise the item
        itself is yielded.
        """
        # (item key, absolute xpath) for the contact line of the header.
        contact_fields = (
            ('centent_mobile',
             '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[1]/span[2]/span/text()'),
            ('centent_index',
             '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[3]/a/text()'),
            ('centent_email',
             '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[2]/span[1]/span[2]/a/text()'),
            ('centent_address',
             '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[2]/span[3]/a[1]/text()'),
        )

        # (label, xpath relative to the table). Each field keeps its own
        # path so a later change to the parsing rules touches one entry.
        detail_fields = (
            ('注册资本', 'tr[1]/td[2]/text()'),
            ('实缴资本', 'tr[1]/td[4]/text()'),
            ('经营状态', 'tr[2]/td[2]/text()'),
            ('成立日期', 'tr[2]/td[4]/text()'),
            ('统一社会信用代码', 'tr[3]/td[2]/text()'),
            ('纳税人识别号', 'tr[3]/td[4]/text()'),
            ('注册号', 'tr[4]/td[2]/text()'),
            ('组织机构代码', 'tr[4]/td[4]/text()'),
            ('公司类型', 'tr[5]/td[2]/text()'),
            ('所属行业', 'tr[5]/td[4]/text()'),
            ('核准日期', 'tr[6]/td[2]/text()'),
            ('登记机关', 'tr[6]/td[4]/text()'),
            ('所属地区', 'tr[7]/td[2]/text()'),
            ('英文名', 'tr[7]/td[4]/text()'),
            ('曾用名', 'tr[8]/td[2]/span/text()'),
            ('参保人数', 'tr[8]/td[4]/text()'),
            ('人员规模', 'tr[9]/td[2]/text()'),
            ('营业期限', 'tr[9]/td[4]/text()'),
            ('企业地址', 'tr[10]/td[2]/text()'),
            ('经营范围', 'tr[11]/td[2]/text()'),
        )

        item = QichachaItem()
        item['url'] = response.url

        top = response.xpath('//div[@id="company-top"]/div')

        # Logo: second div of the logo block, else the first.
        item['logo_ico'] = (
            top.xpath('div[@class="logo"]/div[2]/img/@src').extract_first()
            or top.xpath('div[@class="logo"]/div[1]/img/@src').extract_first()
        )

        # Title: the <h1> text, else the bare text of the wrapping div.
        item['centent_title'] = (
            top.xpath('div[@class="content"]/div/h1/text()').extract_first()
            or top.xpath('div[@class="content"]/div/text()').extract_first()
        )

        for key, xpath in contact_fields:
            item[key] = response.xpath(xpath).extract_first()

        # Business registration information
        info = response.xpath('//section[@id="Cominfo"]')
        item['faren'] = response.xpath(
            '//h2[@class="seo font-20"]/text()').extract_first()

        table = info.xpath('table[2]')
        detail_data = {}
        for label, cell_xpath in detail_fields:
            raw = table.xpath(cell_xpath).extract_first()
            # Newlines and spaces are removed; missing cells become ''.
            detail_data[label] = raw.replace('\n', '').replace(' ', '') if raw else ''
        item['detail'] = detail_data

        item['license'] = ''

        # Business licence link
        info_img_url = info.xpath('div/a/@href').extract_first()
        if not info_img_url:
            yield item
            return

        yield scrapy.Request(self.base_url + info_img_url,
                             callback=self.detail_parse2,
                             cookies=self.cookies,
                             meta={'item': item},
                             dont_filter=True)