def analyze_detail(self, html, company_id, url):
    """Extract company details from a detail page and call ``insert_db()``.

    The page is expected to carry a JSON document inside
    ``<script id="companyInfoData">``. Detailed address, city and company
    name are mandatory: when any of them cannot be read, the problem is
    logged and the page is skipped. Every other field is copied onto the
    record only when it is present and non-empty.

    Args:
        html: Markup of the company detail page, parsed with ``etree.HTML``.
        company_id: Identifier forwarded to ``Company(company_id=...)``.
        url: Address of the page; stored on the record and used in log
            messages.

    Returns:
        None in every case. ``company.insert_db()`` is only reached when
        the mandatory fields were found.
    """
    xpath_html = etree.HTML(html)
    company = Company(company_id=company_id, company_source=self.source)

    # The script tag holds one JSON string with nearly all company data.
    try:
        company_text = xpath_html.xpath(
            "//script[@id='companyInfoData']/text()")[0]
    except (AttributeError, IndexError):
        # IndexError: the script tag is absent. AttributeError: the parse
        # result has no ``xpath`` method (e.g. it is None).
        log.error('{} has no companyInfoData script'.format(url))
        return

    try:
        company_info = json.loads(company_text)
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        log.error(
            '{} companyInfoData is not valid JSON, error={}'.format(url, e))
        return

    # Mandatory fields: first address entry, city and company name.
    # TypeError also covers a payload or section that is not a JSON object.
    try:
        address = company_info['addressList'][0]
        detail_address = address['detailAddress']
        city = address['city']
        core_info = company_info['coreInfo']
        company_name = core_info['companyName']
    except (KeyError, IndexError, TypeError) as e:
        log.error('{}缺少必要字段,error={}'.format(url, e))
        return

    company.address = detail_address
    company.city = city
    company.company_name = company_name

    def section(name):
        """Return the JSON object stored under *name*, or {} if unusable."""
        value = company_info.get(name)
        return value if isinstance(value, dict) else {}

    introduction = section('introduction')
    base_info = section('baseInfo')  # industry, funding stage, head count

    # (record attribute, JSON object to read from, key inside that object)
    optional_fields = (
        ('company_info', introduction, 'companyProfile'),      # long intro
        ('company_short_info', core_info, 'companyIntroduce'),  # short intro
        ('business', base_info, 'industryField'),
        ('development_stage', base_info, 'financeStage'),
        ('company_size', base_info, 'companySize'),
        ('region', address, 'district'),                        # district
    )
    for attr, source, key in optional_fields:
        value = source.get(key)
        if value:
            setattr(company, attr, value)

    company.url = url
    company.insert_db()
def analyze_detail(self, html, company_id, url):
    """Scrape company details from a detail page and call ``insert_db()``.

    All values are read from fixed XPath locations in the page. Company
    name and city are treated as mandatory: when either node is missing,
    the page is logged and skipped instead of raising ``IndexError``.
    Short introduction, business, development stage and company size are
    set only when their node exists. Address and long introduction are
    read with XPath ``string()`` and are always set.

    Args:
        html: Markup of the company detail page, parsed with ``etree.HTML``.
        company_id: Identifier forwarded to ``Company(company_id=...)``.
        url: Address of the page; stored on the record and used in the
            log message.

    Returns:
        None in every case. ``company.insert_db()`` is only reached when
        company name and city were found.
    """
    import logging

    xpath_html = etree.HTML(html)
    company = Company(company_id=company_id, company_source=self.source)

    def first_text(path):
        """Return the stripped first match of *path*, or None if no match."""
        matches = xpath_html.xpath(path)
        return matches[0].strip() if matches else None

    # The page header holds the company name and the one-line introduction.
    header = '/html/body/div[2]/div/div/div[1]'
    # The basic-info list is positional: li[1]=business, li[2]=development
    # stage, li[3]=company size, li[4]=city.
    basic_item = '//*[@id="basic_container"]/div[2]/ul/li[{}]/span/text()'

    company_name = first_text(header + '/h1/a/text()')
    city = first_text(basic_item.format(4))
    if company_name is None or city is None:
        logging.getLogger(__name__).error(
            '%s is missing a required field (company name or city)', url)
        return

    # string() yields '' rather than failing when the node is absent.
    company.address = xpath_html.xpath(
        'string(//*[@id="location_container"]/div[2]/div[2])').strip()
    company.company_info = xpath_html.xpath(
        'string(//*[@id="company_intro"])').strip()
    company.city = city
    company.company_name = company_name

    optional_fields = (
        ('company_short_info', header + '/div/text()'),
        ('business', basic_item.format(1)),
        ('development_stage', basic_item.format(2)),
        ('company_size', basic_item.format(3)),
    )
    for attr, path in optional_fields:
        value = first_text(path)
        if value is not None:
            setattr(company, attr, value)

    company.url = url
    company.insert_db()