예제 #1
0
    def analyze_detail(self, html, company_id, url):
        """Parse a company detail page and store the extracted company.

        The page is expected to carry a JSON document inside a
        ``<script id="companyInfoData">`` element. The address, city and
        company name are required; every other field is copied only when
        it is present and non-empty.

        Args:
            html: Markup of the company detail page.
            company_id: Identifier handed to ``Company``.
            url: Page address; stored on the record and used in log messages.

        Returns:
            None. The record is written through ``company.insert_db()``.
            The method returns early, without inserting, when the embedded
            JSON is absent or malformed, or when a required field is missing.
        """
        xpath_html = etree.HTML(html)
        company = Company(company_id=company_id, company_source=self.source)

        # One JSON string that carries nearly all of the company information.
        try:
            company_text = xpath_html.xpath(
                "//script[@id='companyInfoData']/text()")[0]
        except (IndexError, AttributeError) as e:
            # IndexError: the script element is missing or empty.
            # AttributeError: the parser did not yield an element to query
            # (e.g. xpath_html is None).
            log.error('{} has no companyInfoData script, error={!r}'.format(
                url, e))
            return

        try:
            company_info = json.loads(company_text)
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass.
            log.error('{} companyInfoData is not valid JSON, error={}'.format(
                url, e))
            return

        # Required fields: give up on the page when any of them is missing.
        try:
            address = company_info['addressList'][0]
            core_info = company_info['coreInfo']
            company.address = address['detailAddress']  # detailed address
            company.city = address['city']
            company.company_name = core_info['companyName']
        except (KeyError, IndexError, TypeError) as e:
            log.error('{}缺少必要字段,error={}'.format(url, e))
            return

        # Optional sections: a missing or null section is treated as empty so
        # that it cannot abort a record whose required fields are all present.
        introduction = company_info.get('introduction') or {}
        base_info = company_info.get('baseInfo') or {}

        # Long description.
        if introduction.get('companyProfile'):
            company.company_info = introduction['companyProfile']
        # Short description.
        if core_info.get('companyIntroduce'):
            company.company_short_info = core_info['companyIntroduce']
        # Basic facts: industry, financing stage, head count.
        if base_info.get('industryField'):
            company.business = base_info['industryField']
        if base_info.get('financeStage'):
            company.development_stage = base_info['financeStage']
        if base_info.get('companySize'):
            company.company_size = base_info['companySize']
        # District the address belongs to.
        if address.get('district'):
            company.region = address['district']

        company.url = url
        company.insert_db()
예제 #2
0
    def analyze_detail(self, html, company_id, url):
        """Fill a ``Company`` from fixed XPath locations and insert it.

        Args:
            html: Markup of the company detail page.
            company_id: Identifier handed to ``Company``.
            url: Page address stored on the record.

        Returns:
            None. The record is written through ``company.insert_db()``.

        Note:
            No lookup is guarded. If a first-match expression finds nothing,
            the ``[0]`` index fails and the exception propagates to the caller.
        """
        tree = etree.HTML(html)
        company = Company(company_id=company_id, company_source=self.source)

        # Fields read through an XPath string() call; the result is stripped
        # directly.
        string_fields = (
            ('address',
             'string(//*[@id="location_container"]/div[2]/div[2])'),
            ('company_info',
             'string(//*[@id="company_intro"])'),
        )
        # Fields taken from the first match of the expression, then stripped.
        first_match_fields = (
            ('company_short_info',
             '/html/body/div[2]/div/div/div[1]/div/text()'),
            ('city',
             '//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()'),
            ('business',
             '//*[@id="basic_container"]/div[2]/ul/li[1]/span/text()'),
            ('development_stage',
             '//*[@id="basic_container"]/div[2]/ul/li[2]/span/text()'),
            ('company_name',
             '/html/body/div[2]/div/div/div[1]/h1/a/text()'),
            ('company_size',
             '//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()'),
        )

        for attribute, expression in string_fields:
            setattr(company, attribute, tree.xpath(expression).strip())
        for attribute, expression in first_match_fields:
            setattr(company, attribute, tree.xpath(expression)[0].strip())

        company.url = url
        company.insert_db()