예제 #1
0
    def analyze_detail(self, html, company_id, url):
        """Parse a company detail page and persist the company it describes.

        The page is expected to carry a JSON document in the text of
        ``<script id="companyInfoData">``. The detailed address, city and
        company name are required: if any of them is missing the page is
        logged and skipped. Every other field is copied onto the record only
        when it is present and non-empty.

        Args:
            html: HTML text of the detail page.
            company_id: Identifier handed to ``Company``.
            url: URL of the page; stored on the record and used in log output.

        Returns:
            None. On success the record is written via ``company.insert_db()``;
            on any parsing failure the method logs and returns early.
        """
        xpath_html = etree.HTML(html)
        company = Company(company_id=company_id, company_source=self.source)

        # NOTE(review): etree.HTML may hand back None for an empty document
        # (version dependent -- confirm); treat that like a missing script tag.
        if xpath_html is None:
            script_texts = []
        else:
            script_texts = xpath_html.xpath(
                "//script[@id='companyInfoData']/text()")
        if not script_texts:
            log.error('{}没有找到companyInfoData'.format(url))
            return

        # json.JSONDecodeError is a ValueError subclass, so this also works on
        # interpreters that predate JSONDecodeError.
        try:
            company_info = json.loads(script_texts[0])
        except ValueError as e:
            log.error('{}的companyInfoData不是合法的json,error={}'.format(url, e))
            return

        # Required fields. TypeError covers a payload that is not shaped like
        # a dict (e.g. JSON ``null`` or a list).
        try:
            address = company_info['addressList'][0]
            core_info = company_info['coreInfo']
            company.address = address['detailAddress']  # detailed address
            company.city = address['city']  # city
            company.company_name = core_info['companyName']  # company name
        except (KeyError, IndexError, TypeError) as e:
            log.error('{}缺少必要字段,error={}'.format(url, e))
            return

        # Optional sections: fall back to an empty dict so a missing section
        # simply leaves the corresponding attributes unset.
        base_info = company_info.get('baseInfo') or {}
        introduction = company_info.get('introduction') or {}

        # Long introduction.
        if introduction.get('companyProfile'):
            company.company_info = introduction['companyProfile']
        # Short introduction.
        if core_info.get('companyIntroduce'):
            company.company_short_info = core_info['companyIntroduce']
        # Basic information: industry, financing stage, head count.
        if base_info.get('industryField'):
            company.business = base_info['industryField']
        if base_info.get('financeStage'):
            company.development_stage = base_info['financeStage']
        if base_info.get('companySize'):
            company.company_size = base_info['companySize']
        # District the company is located in.
        if address.get('district'):
            company.region = address['district']

        company.url = url
        company.insert_db()
예제 #2
0
 def analyse_detail(self, res, address_dict, rest):
     """Store the companies found among the first ten POIs of a result.

     A POI is stored only when its address contains
     ``address_dict['match_word']`` and its combined ``std_tag`` + ``di_tag``
     text contains '公司'. POIs lacking a required key are logged and
     skipped so that one malformed entry does not abort the whole batch.

     Args:
         res: Mapping with a 'content' sequence of POI mappings
             (presumably the decoded search response -- confirm with caller).
         address_dict: Mapping providing 'match_word', 'city' and 'region'.
         rest: Object whose ``url`` attribute is used in log messages.

     Returns:
         None. Matching companies are written via ``company.insert_db()``.
     """
     try:
         contents = res['content']
     except (KeyError, TypeError):
         log.error('{}没有找到content这个字段'.format(rest.url))
         return

     # Only the first ten entries are considered.
     for poi in contents[:10]:
         try:
             poi_address = poi['addr']
         except (KeyError, TypeError):
             log.error('{}没有addr这个字段'.format(rest.url))
             continue

         match_word = address_dict['match_word']
         # NOTE(review): with match_word set to None nothing is ever stored;
         # this mirrors the original nesting -- confirm it is intended.
         if match_word is None or match_word not in poi_address:
             continue

         # A missing or None tag counts as an empty string.
         tag = (poi.get('std_tag') or '') + (poi.get('di_tag') or '')
         if '公司' not in tag:
             continue

         try:
             company_id = poi['primary_uid']
         except KeyError:
             log.error('{}中无法匹配到company_id'.format(rest.url))
             continue
         try:
             company_name = poi['name']
         except KeyError:
             log.error('{}没有name这个字段'.format(rest.url))
             continue

         company = Company(company_id, self.source)
         city = address_dict['city']
         region = address_dict['region']
         print(company_name, poi_address, city, region)
         company.company_name = company_name
         company.address = poi_address
         company.city = city
         company.region = region
         company.insert_db()
예제 #3
0
    def send_url(self, url):
        """Fetch a company page, scrape its fields and persist the record.

        The company id is taken from the ``/company/<digits>/`` part of the
        URL and is required; every other field is filled in only when the
        page provides it.

        Args:
            url: Address of the company page to download.

        Returns:
            None. On success the record is written via ``company.insert_db()``;
            on a failed request or a URL without a company id the method logs
            and returns early.
        """
        # Network boundary: any failure is logged and the page is skipped.
        # NOTE(review): no timeout is passed, so a stalled proxy can block
        # this call indefinitely -- consider adding one.
        try:
            rest = requests.get(url, headers=self.headers, proxies=self.proxies)
        except Exception as e:
            log.error('{}失败,原因是{}'.format(url, e))
            return
        res = rest.text
        html = etree.HTML(res)

        # Raw string so that ``\d`` reaches the regex engine untouched.
        id_match = re.search(r'/company/(\d+)/', url)
        if id_match is None:
            log.error('{}中无法匹配到company_id'.format(url))
            return
        # NOTE(review): etree.HTML may hand back None for an empty body
        # (version dependent -- confirm); nothing can be scraped in that case.
        if html is None:
            log.error('{}失败,原因是{}'.format(url, 'empty html document'))
            return

        company_id = id_match.group(1)
        print(company_id)
        company = Company(company_id, self.source)
        company.url = url

        company_name = html.xpath('//h1/text()')
        if company_name:
            company.company_name = company_name[0]

        city_list = html.xpath(
            '//div[@class="comp-summary-tag"]/a[@class="comp-summary-tag-dq"]/text()')
        if city_list:
            company.city = city_list[0]

        address_match = re.search('data-address="(.*?)"', res)
        # An empty captured address is ignored, as before.
        if address_match and address_match.group(1):
            address = address_match.group(1)
            print(address)
            company.address = address
            # The page offers no separate district, so the full address is
            # stored as the region too.
            company.region = address

        business_list = html.xpath(
            '//div[@class="comp-summary-tag"]/a[@data-selector="comp-industry"]/text()')
        if business_list:
            company.business = business_list[0]

        # The first summary tag is either the head count (contains '人') or
        # the development stage; in the latter case the head count, if any,
        # is looked for in the second tag.
        first_tag = html.xpath('//div[@class="comp-summary-tag"]/a[1]/text()')
        if first_tag:
            if '人' in first_tag[0]:
                company.company_size = first_tag[0]
            else:
                company.development_stage = first_tag[0]
                second_tag = html.xpath(
                    '//div[@class="comp-summary-tag"]/a[2]/text()')
                if second_tag and '人' in second_tag[0]:
                    company.company_size = second_tag[0]

        text_list = html.xpath("//p[@class='profile']/text()")
        if text_list:
            # Join the paragraphs with commas, then drop all whitespace.
            company.company_info = ''.join(','.join(text_list).split())

        # Registration details: each pattern is searched once and copied to
        # the matching attribute when found.
        registration_fields = (
            ('经营期限:(.*?)</li', 'operating_period'),
            ('注册时间:(.*?)</li', 'registration_time'),
            ('注册资本:(.*?)</li', 'registered_capital'),
        )
        for pattern, attribute in registration_fields:
            found = re.search(pattern, res)
            if found is not None:
                setattr(company, attribute, found.group(1))

        company.insert_db()