def parse(self, response):
    """Yield one QichachaItem per product entry (name, price, units sold).

    BUG FIX: the original created a single QichachaItem *before* the loop and
    yielded that same mutated instance for every entry; any downstream
    consumer holding references (pipelines, dedup filters) would see every
    yielded item overwritten with the last entry's data.  A fresh item is now
    created for each entry.

    :param response: page whose ``dd.detail`` nodes each describe a product
    """
    for entry in response.xpath('//dd[@class="detail"]'):
        item = QichachaItem()
        item['name'] = entry.xpath('a[@class="item-name J_TGoldData"]/text()').extract()
        item['price'] = entry.xpath('div[@class="attribute"]/div[@class="cprice-area"]/span[@class="c-price"]/text()').extract()
        item['sold'] = entry.xpath('div[@class="attribute"]/div[@class="sale-area"]/span[@class="sale-num"]/text()').extract()
        yield item
def NewNewInfo(self,gd_list1,title4,companyname,creditnumber):
    """Extract shareholder rows from the "new" table layout.

    Called (by fzNewInfo) when the 4th column header contains 认缴出资额
    (subscribed capital).  Builds one QichachaItem per shareholder row;
    gd_list1[0] is the header row and is skipped.

    :param gd_list1: selector list of the shareholder table's <tr> rows
    :param title4: text of the 4th column header; its unit suffix
        (万元 / 亿 / 万美元) selects the capital multiplier
    :param companyname: company name copied onto every item
    :param creditnumber: unified social credit code copied onto every item
    :return: list of populated QichachaItem objects
    """
    items= []
    print(title4 , 'NewNewInfo')
    for gd in gd_list1[1:]:
        item = QichachaItem()
        item['company_name'] = companyname
        items.append(item)
        items[-1]['credit_number'] = creditnumber
        # Shareholder name: the <h3> node appears in three slightly different
        # DOM shapes across pages, so fall through until one matches.
        shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/a/h3/text()').extract_first()
        if shareholder_nm == None:
            shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/div/a/h3/text()').extract_first()
        if shareholder_nm == None:
            shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/div/h3/text()').extract_first()
        print(shareholder_nm,' 股东姓名')
        items[-1]['shareholder'] = shareholder_nm
        # Shareholding ratio (column 3)
        shareholding_bl = gd.xpath('./td[3]/text()').extract_first().strip()
        print(shareholding_bl,' 股份比例')
        items[-1]['shareholding_ratio'] = shareholding_bl
        # Subscription date (column 5); strip tabs/newlines/commas
        subscription_dt = gd.xpath('./td[5]/text()').extract_first().strip('\t\n\r ,')
        print(subscription_dt,' 认缴出资日期')
        items[-1]['subscription_date'] = subscription_dt
        # Subscribed capital amount (column 4)
        subscription_cz = gd.xpath('./td[4]/text()').extract_first().strip('\t\n\r ,')
        # number_of_shares = number_of_shares.replace(',', '')
        if subscription_cz == '-':
            # '-' means no amount was published; treat it as zero
            # subscription_cz = 0
            subscription_cz = 0
        # Scale the amount to yuan according to the unit in the header.
        # NOTE(review): 7.1 looks like a hard-coded USD/CNY rate — confirm.
        if '(万元)' in title4:
            subscription_cz = float(subscription_cz) * 10000
        elif '亿' in title4:
            subscription_cz = float(subscription_cz) * 100000000
        elif '万美元' in title4:
            subscription_cz = float(subscription_cz) * 10000 * 7.1
        # print("认缴出资额")
        print(subscription_cz,' 认缴出资额')
        # items[-1]['subscription_capital'] = int(subscription_cz)
        items[-1]['subscription_capital'] = subscription_cz
    return items
def parse(self, response):
    """Parse one page of company search results into QichachaItem objects.

    Each result anchor carries the company name/status in
    ``span.clear > span`` nodes and a variable-length run of ``<small>``
    text fields (representative, date, and optionally fund / area, with
    location always last).  The number of fields decides which optional
    columns are present.

    FIX: the original named its field list ``list``, shadowing the builtin;
    renamed to ``fields``.  Dead commented-out code removed.
    """
    companys = response.xpath('//a[@class="list-group-item clearfix"]')
    for each_company in companys:
        abs_url = each_company.xpath('@href').extract()[0]
        nameandstatus = each_company.xpath(
            'span[@class="clear"]/span/text()').extract()
        item = QichachaItem()
        fields = each_company.xpath(
            'span[@class="clear"]/small/text()').extract()
        # Location is always the last <small> field.
        location = fields[-1].strip()
        # Layout-dependent optional columns.
        # NOTE(review): the '元' probe assumes fields has at least 4 entries
        # whenever len(fields) <= 6 — confirm against live pages.
        if len(fields) > 6:
            fund = fields[3].strip()
            area = fields[4].strip()
        elif '元' in fields[3]:
            # A currency marker in slot 3 means fund is present, area absent.
            area = 'null'
            fund = fields[3].strip()
        elif len(fields) == 5:
            area = fields[3].strip()
            fund = 'null'
        else:
            area = 'null'
            fund = 'null'
        # All leading spans except the last form the name; the last is status.
        name = ''.join(nameandstatus[:-1]).strip()
        status = nameandstatus[-1].strip()
        representative = fields[1].strip()
        date = fields[2].strip()
        item['name'] = name
        item['status'] = status
        item['representative'] = representative
        item['fund'] = fund
        item['date'] = date
        item['area'] = area
        item['location'] = location
        item['company_url'] = self.url + abs_url
        yield item
def parse(self, response):
    """Follow the first company link on the page.

    Extracts the company id (a 10+ char alphanumeric token in the href),
    stores id and absolute URL on the item, and hands the item to
    parse_basecontent via request.meta.
    """
    item = QichachaItem()
    href = response.css('a.ma_h1::attr(href)').extract_first()
    if href is not None:
        company_url = parse.urljoin(response.url, href)
        matched = re.match('.*?([a-zA-Z0-9]{10,})', href)
        if matched:
            item['company_id'] = matched.group(1)
        item['company_url'] = company_url
        request = scrapy.Request(company_url,
                                 headers=self.headers,
                                 callback=self.parse_basecontent)
        request.meta['item'] = item
        yield request
def parse(self, response):
    """Collect the hrefs of every report link into the item's `pdfwei` field."""
    item = QichachaItem()
    report_links = response.xpath(
        '//table[@class="ntable"]//td/a/@href').extract()
    item['pdfwei'] = report_links
    yield item
def detail_parse(self, response):
    """Scrape the company-info table; keep registered and real capital.

    Walks the rows of the second Cominfo table, collecting cleaned cell
    texts into a flat list; only the first two entries (registered / real
    capital) end up on the yielded item.
    """
    def cell(node, xp):
        # Join all text under the cell, drop newlines, trim edges.
        return ''.join(node.xpath(xp).extract()).replace('\n', '').strip()

    item = QichachaItem()
    rows = response.xpath("//section[@id='Cominfo']/table[2]/tr")
    item['qiye_name'] = response.xpath(
        "//*[@id='company-top']/div[2]/div[2]/div[1]/h1/text()").extract()
    print(item['qiye_name'])
    info = []
    # All rows except the last two contribute both td[2] and td[4] ...
    for row in rows[:len(rows) - 2]:
        info.append(cell(row, ".//td[2]/text()"))
        info.append(cell(row, ".//td[4]/text()"))
    # ... while the final two rows contribute only td[2].
    info.append(cell(rows[-2], ".//td[2]/text()"))
    info.append(cell(rows[-1], ".//td[2]/text()"))
    print(info)
    item['registered_capital'] = info[0]
    item['real_capital'] = info[1]
    yield item
def page_parse(self, response):
    """Parse a company detail page into a fully-populated QichachaItem.

    Every field follows the same pattern — take a text node, strip
    newlines/whitespace, or substitute a Chinese "暂无…" placeholder when
    absent.  The original repeated that pattern ~25 times inline; it is
    factored into the local helpers below.

    BUG FIX: when both e-mail nodes were missing, the original called
    ``.strip()`` on None and raised AttributeError; it now falls back to a
    placeholder like every other field.
    """
    def clean(text, fallback):
        # Normalize a scraped text node; use the placeholder when missing.
        return text.replace('\n', '').strip() if text else fallback

    def table_cell(row, col):
        # Text of cell (row, col) in the second Cominfo registration table.
        return response.xpath(
            '//section[@id="Cominfo"]/table[@class="ntable"][2]'
            '/tr[%d]/td[%d]/text()' % (row, col)).extract_first()

    item = QichachaItem()
    # Header block: name / phone / website / e-mail
    item['name'] = clean(response.xpath(
        '//div[@class="content"]/div[1]/h1/text()').extract_first(), '暂无公司名信息')
    item['phone'] = clean(response.xpath(
        '//div[@class="content"]/div[2]/span[1]/span[2]/span/text()').extract_first(), '暂无电话信息')
    item['website'] = clean(response.xpath(
        '//div[@class="content"]/div[2]/span[3]/a/@href').extract_first(), '暂无网站信息')
    email = response.xpath(
        '//div[@class="content"]/div[3]/span[1]/span[2]/a/text()').extract_first()
    if email:
        # Linked e-mail is used verbatim (no cleanup), as before.
        item['email'] = email
    else:
        email2 = response.xpath(
            '//div[@class="content"]/div[3]/span[1]/span[2]/text()').extract_first()
        item['email'] = clean(email2, '暂无邮箱信息')
    # Registration table fields (row, col) -> item key, with per-field placeholder.
    item['address'] = clean(table_cell(10, 2), '暂无地址信息')
    item['registered_capital'] = clean(table_cell(1, 2), '暂无注册资本')
    item['contributed_capital'] = clean(table_cell(1, 4), '暂无实缴资本')
    item['status'] = clean(table_cell(2, 2), '暂无经营状态信息')
    item['establishment'] = clean(table_cell(2, 4), '暂无成立日期信息')
    item['social_code'] = clean(table_cell(3, 2), '暂无统一社会信息代码信息')
    item['taxpayer_num'] = clean(table_cell(3, 4), '暂无纳税人识别号信息')
    item['registrate_num'] = clean(table_cell(4, 2), '暂无注册号信息')
    item['organization_code'] = clean(table_cell(4, 4), '暂无组织机构代码信息')
    item['company_type'] = clean(table_cell(5, 2), '暂无公司类型信息')
    item['industry_involed'] = clean(table_cell(5, 4), '暂无所属行业信息')
    item['approval_date'] = clean(table_cell(6, 2), '暂无核准日期信息')
    item['registration_authority'] = clean(table_cell(6, 4), '暂无登记机关信息')
    item['area'] = clean(table_cell(7, 2), '暂无所属地区信息')
    item['english_name'] = clean(table_cell(7, 4), '暂无英文名信息')
    # Former name may wrap its text in child tags, hence string(.); the
    # cell also needs non-breaking spaces removed.
    used = response.xpath(
        '//section[@id="Cominfo"]/table[@class="ntable"][2]/tr[8]/td[2]')
    used_name = used.xpath('string(.)').extract_first()
    if used_name:
        item['used_name'] = used_name.replace('\n', '').strip().replace('\xa0', '')
    else:
        item['used_name'] = '暂无曾用名'
    item['insured_num'] = clean(table_cell(8, 4), '暂无参保人数信息')
    item['staff_size'] = clean(table_cell(9, 2), '暂无人员规模信息')
    item['operate_period'] = clean(table_cell(9, 4), '暂无营业期限信息')
    item['business_scope'] = clean(table_cell(11, 2), '暂无经营范围信息')
    yield item
def gd_parse(self,response):
    """Parse the shareholder table (#partnerslist) of a company page.

    Builds one QichachaItem per shareholder row.  Two quirks are handled:
    * pages with 9 navigation sections use a different ("new") layout and
      are delegated to fzNewInfo, which returns the items instead;
    * when the 4th header cell is empty, the amount/date columns are
      shifted one position to the right (th[5] / td[5] / td[6]).

    :param response: company detail page response
    :return: list of QichachaItem, or None when there is no shareholder table
    """
    logging.warning('11111111111111')
    # time.sleep(2)
    items = []
    print(response)
    # Shareholder table container
    table_gd = response.xpath('//*[@id="partnerslist"]/table')
    print(table_gd,' 得到股东信息表格的内容')
    # Company name
    companyname = response.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first().strip().replace('\n','')
    print(companyname, ' 公司名')
    # Unified social credit code
    creditnumber = response.xpath('//section[@id="Cominfo"]/table[@class="ntable"]/tr[3]/td[2]/text()').extract_first().strip().replace('\n','')
    print(creditnumber, ' 统一社会信用代码')
    try:
        table_gdx = table_gd[0]
    except:
        # No shareholder table on this page: log and bail out.
        name = response.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first()
        logging.warning(name + " 没有股东信息")
        return
    # Rows of the shareholder table; row 0 is the header.
    gd_list = table_gdx.xpath('./tr')
    for gd in gd_list[1:]:
        item = QichachaItem()
        item['company_name'] = companyname
        items.append(item)
        items[-1]['credit_number'] =creditnumber
        # Count of company-info section headers (e.g. listing info, basic info);
        # used below as a page-layout fingerprint.
        shuliang = response.xpath('//div[@class="company-nav-contain"]/div')
        print(len(shuliang))
        shuliang = len(shuliang)
        # Shareholder name: three possible DOM shapes, tried in turn.
        shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/a/h3/text()').extract_first()
        if shareholder_nm == None:
            shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/div/a/h3/text()').extract_first()
        if shareholder_nm == None:
            shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/div/h3/text()').extract_first()
        print(shareholder_nm,'股东姓名')
        items[-1]['shareholder'] = shareholder_nm
        # 9 nav sections marks the new page format: delegate and return its items.
        if shuliang ==9:
            items = self.fzNewInfo(response)
            return items
        # Shareholding ratio
        shareholding_bl =gd.xpath('./td[3]/text()').extract_first().strip()
        print(shareholding_bl, ' 股份比例')
        items[-1]['shareholding_ratio'] = shareholding_bl
        # Header text of the subscribed-capital column
        czett = gd_list[0].xpath('./th[4]/span/text()').extract_first()
        print(czett , ' gd_parse')
        # Subscription date
        subscription_dt = gd.xpath('./td[5]/text()').extract_first().strip('\t\n\r ,')
        print(subscription_dt,' 认缴出资日期')
        items[-1]['subscription_date'] = subscription_dt
        # Subscribed capital amount
        subscription_cz = gd.xpath('./td[4]/text()').extract_first().strip('\t\n\r ,')
        print(subscription_cz,' 认缴出资额')
        # Header missing at th[4]: the columns are shifted right by one,
        # so re-read header, amount and date from the next positions.
        if czett == None:
            czett = gd_list[0].xpath('./th[5]/span/text()').extract_first()
            print(czett)
            subscription_cz = gd.xpath('./td[5]/text()').extract_first().strip('\t\n\r ,')
            subscription_dt = gd.xpath('./td[6]/text()').extract_first().strip('\t\n\r ,')
            print(subscription_dt, ' 认缴出资日期')
            items[-1]['subscription_date'] = subscription_dt
        if subscription_cz == '-':
            # subscription_cz = 0
            # items.pop(0)
            # '-' on a 9-section page also signals the new format.
            if shuliang == 9:
                items = self.fzNewInfo(response)
                return items
            subscription_cz = 0
        # Scale to yuan by the unit named in the column header.
        # NOTE(review): 7.1 looks like a hard-coded USD/CNY rate — confirm.
        if '(万元)' in czett:
            subscription_cz = float(subscription_cz)*10000
        elif '亿' in czett:
            subscription_cz = float(subscription_cz)*100000000
        elif '万美元' in czett:
            subscription_cz = float(subscription_cz)*10000*7.1
        # print("认缴出资额")
        print(subscription_cz,' 认缴出资额')
        # items[-1]['subscription_capital'] = int(subscription_cz)
        items[-1]['subscription_capital'] = subscription_cz
    return items
def fzNewInfo(self,response):
    """Handle the alternative ("new-format") shareholder page layout.

    Locates the shareholder table among the ``ntable ntable-odd npth nptd``
    candidates (the one whose 3rd header reads 持股比例), then either
    delegates to NewNewInfo when the 4th column is 认缴出资额, or extracts
    shareholder / ratio / share-count per row itself.

    :param response: company detail page response
    :return: list of populated QichachaItem objects
    """
    # logging.warning('*'*100)
    items = []
    # print(items)
    # Company name
    companyname = response.xpath('//div[@class="content"]/div[1]/h1/text()').extract_first().strip().replace('\n', '')
    print(companyname, ' 公司名')
    # Unified social credit code
    creditnumber = response.xpath(
        '//section[@id="Cominfo"]/table[@class="ntable"]/tr[3]/td[2]/text()').extract_first().strip().replace('\n','')
    print(creditnumber, ' 统一社会信用代码')
    # Candidate shareholder tables in the new layout
    table_gd3 = response.xpath('//table[@class="ntable ntable-odd npth nptd"]')
    table_gdx1 = table_gd3[0]
    i = 0
    # Find which candidate is the shareholder table: its 3rd header cell
    # reads 持股比例 (shareholding ratio).  Defaults to the first table.
    for title in table_gd3:
        tname = title.xpath('./tr[1]/th[3]/text()').extract_first()
        # print(tname)
        if tname == "持股比例":
            # ls_index = table_gd.index(tname)
            table_gdx1 = table_gd3[i]
            break
        i += 1
    gd_list1 = table_gdx1.xpath('./tr')
    print(len(gd_list1))
    # Inspect the header of column 4
    print(gd_list1[0])
    gd_title = gd_list1[0]
    title4 = gd_title.xpath('./th[4]/span/text()').extract_first()
    print(title4 ,' fzNewInfo')
    # If column 4 is 认缴出资额 (subscribed capital), this is yet another
    # variant: delegate to NewNewInfo and return its items.
    if title4 is not None:
        if '认缴出资额' in title4:
            items = self.NewNewInfo(gd_list1,title4,companyname,creditnumber)
            return items
    for gd in gd_list1[1:]:
        item = QichachaItem()
        item['company_name'] = companyname
        items.append(item)
        items[-1]['credit_number'] = creditnumber
        # Shareholder name: three possible DOM shapes, tried in turn.
        shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/a/h3/text()').extract_first()
        if shareholder_nm == None:
            shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/div/a/h3/text()').extract_first()
        if shareholder_nm == None:
            shareholder_nm = gd.xpath('./td[2]/table/tr/td[2]/div/h3/text()').extract_first()
        print(shareholder_nm,' 股东姓名')
        items[-1]['shareholder'] = shareholder_nm
        # Shareholding ratio
        shareholding_bl = gd.xpath('./td[3]/text()').extract_first().strip()
        print(shareholding_bl,' 股份比例')
        items[-1]['shareholding_ratio'] = shareholding_bl
        # Number of shares held; thousands separators removed.
        number_of_shares = gd.xpath('./td[4]/text()').extract_first().strip()
        number_of_shares = number_of_shares.replace(',', '')
        items[-1]['number_of_shares'] = number_of_shares
        print(number_of_shares,' 持股数')
    return items
def detail_parse(self, response):
    """Parse a full company detail page.

    Fills header fields (logo, title, phone, website, e-mail, address),
    then builds the ``detail`` dict from the business-registration table,
    and finally either follows the business-licence link (detail_parse2)
    or yields the item directly.

    IMPROVEMENT: the original read the registration table with 20
    copy-pasted ``dN = table.xpath(...)`` statements; the label→xpath
    spec table below drives a single loop and produces the identical
    ``detail`` dict (same keys, same insertion order).
    """
    item = QichachaItem()
    item['url'] = response.url
    c_top = response.xpath('//div[@id="company-top"]/div')
    # Logo may sit in the first or second div depending on page layout.
    logo_ico = c_top.xpath(
        'div[@class="logo"]/div[2]/img/@src').extract_first()
    if not logo_ico:
        logo_ico = c_top.xpath(
            'div[@class="logo"]/div[1]/img/@src').extract_first()
    item['logo_ico'] = logo_ico
    # Title likewise has two possible positions.
    centent_title = c_top.xpath(
        'div[@class="content"]/div/h1/text()').extract_first()
    if not centent_title:
        centent_title = c_top.xpath(
            'div[@class="content"]/div/text()').extract_first()
    item['centent_title'] = centent_title
    item['centent_mobile'] = response.xpath(
        '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[1]/span[2]/span/text()'
    ).extract_first()
    item['centent_index'] = response.xpath(
        '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[1]/span[3]/a/text()'
    ).extract_first()
    item['centent_email'] = response.xpath(
        '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[2]/span[1]/span[2]/a/text()'
    ).extract_first()
    item['centent_address'] = response.xpath(
        '//*[@id="company-top"]/div[2]/div[2]/div[3]/div[2]/span[3]/a[1]/text()'
    ).extract_first()
    # Business-registration info section
    info = response.xpath('//section[@id="Cominfo"]')
    item['faren'] = response.xpath(
        '//h2[@class="seo font-20"]/text()').extract_first()
    table = info.xpath('table[2]')
    # Field label -> cell xpath inside the registration table.
    cell_specs = [
        ('注册资本', 'tr[1]/td[2]/text()'),
        ('实缴资本', 'tr[1]/td[4]/text()'),
        ('经营状态', 'tr[2]/td[2]/text()'),
        ('成立日期', 'tr[2]/td[4]/text()'),
        ('统一社会信用代码', 'tr[3]/td[2]/text()'),
        ('纳税人识别号', 'tr[3]/td[4]/text()'),
        ('注册号', 'tr[4]/td[2]/text()'),
        ('组织机构代码', 'tr[4]/td[4]/text()'),
        ('公司类型', 'tr[5]/td[2]/text()'),
        ('所属行业', 'tr[5]/td[4]/text()'),
        ('核准日期', 'tr[6]/td[2]/text()'),
        ('登记机关', 'tr[6]/td[4]/text()'),
        ('所属地区', 'tr[7]/td[2]/text()'),
        ('英文名', 'tr[7]/td[4]/text()'),
        ('曾用名', 'tr[8]/td[2]/span/text()'),   # former name is wrapped in a <span>
        ('参保人数', 'tr[8]/td[4]/text()'),
        ('人员规模', 'tr[9]/td[2]/text()'),
        ('营业期限', 'tr[9]/td[4]/text()'),
        ('企业地址', 'tr[10]/td[2]/text()'),
        ('经营范围', 'tr[11]/td[2]/text()'),
    ]
    detail_data = {}
    for label, xp in cell_specs:
        value = table.xpath(xp).extract_first()
        # Missing cells become '', matching the original's normalization.
        detail_data[label] = value.replace('\n', '').replace(' ', '') if value else ''
    item['detail'] = detail_data
    item['license'] = ''
    # Business licence image: fetch it first when present, else emit now.
    info_img_url = info.xpath('div/a/@href').extract_first()
    if info_img_url:
        yield scrapy.Request(self.base_url + info_img_url,
                             callback=self.detail_parse2,
                             cookies=self.cookies,
                             meta={'item': item},
                             dont_filter=True)
    else:
        yield item