def analyse_detail(self, res, address_dict, rest):
    """Store the company entries found in a search response's ``content`` list.

    Only the first ten entries of ``res['content']`` are examined.  An entry
    is stored when the concatenation of its ``std_tag`` and ``di_tag`` values
    contains ``公司`` and, if ``address_dict['match_word']`` is not ``None``,
    its ``addr`` value contains that word.

    Args:
        res: Response payload; expected to be subscriptable with a
            ``content`` key holding a list of entry dicts.
        address_dict: Mapping that provides ``match_word`` (may be ``None``),
            ``city`` and ``region``.
        rest: Object whose ``url`` attribute is used in log messages.

    Returns:
        None.  Each accepted entry is persisted with ``Company.insert_db()``.
    """
    try:
        contents = res['content']
    except (KeyError, TypeError):
        log.error('{}没有找到content这个字段'.format(rest.url))
        return

    for poi in contents[:10]:
        try:
            poi_address = poi['addr']
        except (KeyError, TypeError):
            log.error('{}没有addr这个字段'.format(rest.url))
            continue

        match_word = address_dict['match_word']
        if match_word is not None and match_word not in poi_address:
            continue

        # A missing tag key is treated like a None tag, so one incomplete
        # entry no longer aborts the remaining entries with a KeyError.
        tag = ''.join(
            part
            for part in (poi.get('std_tag'), poi.get('di_tag'))
            if part is not None
        )
        if '公司' not in tag:
            continue

        try:
            company = Company(poi['primary_uid'], self.source)
        except Exception:
            log.error('{}中无法匹配到company_id'.format(rest.url))
            continue

        company_name = poi['name']
        city = address_dict['city']
        region = address_dict['region']
        print(company_name, poi_address, city, region)
        company.company_name = company_name
        company.address = poi_address
        company.city = city
        company.region = region
        company.insert_db()
def analyze_detail(self, html, company_id, url):
    """Parse a company detail page and store the company it describes.

    The data is read from the JSON text inside the page's
    ``<script id="companyInfoData">`` element.  The first address entry
    (detail address and city) and the company name are required; if any of
    them is missing the problem is logged and nothing is stored.  All other
    fields are copied only when present and non-empty.

    Args:
        html: HTML text of the detail page.
        company_id: Identifier passed to ``Company``.
        url: Page URL; stored on the record and used in log messages.

    Returns:
        None.  The record is persisted with ``Company.insert_db()``.
    """
    xpath_html = etree.HTML(html)
    company = Company(company_id=company_id, company_source=self.source)

    # The script element carries one JSON document with nearly all the data.
    try:
        company_text = xpath_html.xpath(
            "//script[@id='companyInfoData']/text()")[0]
    except (AttributeError, IndexError):
        # AttributeError: no tree was produced for this HTML (presumably
        # etree.HTML returned None); IndexError: the script element is absent.
        log.error('{}没有找到companyInfoData'.format(url))
        return

    try:
        company_info = json.loads(company_text)
    except ValueError as e:
        log.error('{}的companyInfoData解析失败,error={}'.format(url, e))
        return

    # Required fields: first address entry and the company name.
    try:
        address = company_info['addressList'][0]
        core_info = company_info['coreInfo']
        company.address = address['detailAddress']
        company.city = address['city']
        company.company_name = core_info['companyName']
    except Exception as e:
        log.error('{}缺少必要字段,error={}'.format(url, e))
        return

    # Optional sections: a missing or null section is treated as empty so it
    # cannot raise KeyError after the required fields were already accepted.
    base_info = company_info.get('baseInfo') or {}
    introduction = company_info.get('introduction') or {}

    optional_fields = (
        ('company_info', introduction, 'companyProfile'),     # long profile
        ('company_short_info', core_info, 'companyIntroduce'),  # short intro
        ('business', base_info, 'industryField'),
        ('development_stage', base_info, 'financeStage'),
        ('company_size', base_info, 'companySize'),
        ('region', address, 'district'),
    )
    for attribute, section, key in optional_fields:
        value = section.get(key)
        if value:
            setattr(company, attribute, value)

    company.url = url
    company.insert_db()
async def detail_parse(self, res):
    """Parse a company page and store the company it describes.

    The company id, name, address and city are required: if any of them
    cannot be extracted, the error is logged and nothing is stored.  The
    profile text, company size, operating period, registration time,
    registered capital and industry are copied only when the page has them.

    Args:
        res: HTML text of the company page.

    Returns:
        None.  The record is persisted with ``Company.insert_db()``; any
        exception raised while parsing or inserting is logged, not raised.
    """
    html = etree.HTML(res)
    try:
        company_id = re.search(r'ecomp_id : "(\d+)"', res).group(1)
        company = Company(company_id, self.source)
        company.company_name = html.xpath('//h1/text()')[0]
        company.address = re.search('公司地址:</span>(.*?)</li>', res).group(1)
        company.city = re.search('data-city="(.*?)"', res).group(1)

        # xpath() returns a list (never None), so test for emptiness; a page
        # without a profile must not fail the whole parse with IndexError.
        profile = html.xpath("//p[@class='profile']/text()")
        if profile:
            company.company_info = profile[0].strip()

        optional_patterns = (
            ('company_size', '公司规模:</span>(.*?)</li>'),
            ('operating_period', '经营期限:(.*?)</li'),
            ('registration_time', '注册时间:(.*?)</li'),
            ('registered_capital', '注册资本:(.*?)</li'),
        )
        for attribute, pattern in optional_patterns:
            found = re.search(pattern, res)
            if found is not None:
                setattr(company, attribute, found.group(1))

        industry = html.xpath("//a[@class='comp-industry']/text()")
        if industry:
            company.business = industry[0]

        company.insert_db()
    except Exception as e:
        log.error('解析失败{}'.format(e))
def send_url(self, url):
    """Fetch a company page and store the company it describes.

    The company id is taken from the ``/company/<digits>/`` part of ``url``.
    A failed request, or a failure to build the ``Company`` record, is
    logged and nothing is stored.  Every other field is optional and is
    copied only when the page provides it.

    Args:
        url: URL of the company page.

    Returns:
        None.  The record is persisted with ``Company.insert_db()``.
    """
    try:
        rest = requests.get(url, headers=self.headers, proxies=self.proxies)
    except Exception as e:
        log.error('{}失败,原因是{}'.format(url, e))
        return

    res = rest.text
    html = etree.HTML(res)

    try:
        company_id = re.search(r'/company/(\d+)/', url).group(1)
        print(company_id)
        company = Company(company_id, self.source)
        company.url = url
    except Exception as e:
        log.error('缺少必要字段,原因{}'.format(e))
        return

    company_name = html.xpath('//h1/text()')
    if company_name:
        company.company_name = company_name[0]

    address = None
    address_match = re.search('data-address="(.*?)"', res)
    if address_match:
        address = address_match.group(1)
        print(address)
        company.address = address

    city_list = html.xpath(
        '//div[@class="comp-summary-tag"]/a[@class="comp-summary-tag-dq"]/text()')
    if city_list:
        company.city = city_list[0]

    # NOTE(review): region is set to the full address string whenever the
    # address is non-empty -- confirm this is the intended value.
    if address:
        company.region = address

    business_list = html.xpath(
        '//div[@class="comp-summary-tag"]/a[@data-selector="comp-industry"]/text()')
    if business_list:
        company.business = business_list[0]

    # The first summary tag is treated as the company size when it contains
    # '人', otherwise as the development stage.
    first_tag = html.xpath('//div[@class="comp-summary-tag"]/a[1]/text()')
    if first_tag:
        if '人' in first_tag[0]:
            company.company_size = first_tag[0]
        else:
            company.development_stage = first_tag[0]

    # The second summary tag is used only when it looks like a size.
    second_tag = html.xpath('//div[@class="comp-summary-tag"]/a[2]/text()')
    if second_tag and '人' in second_tag[0]:
        company.company_size = second_tag[0]

    text_list = html.xpath("//p[@class='profile']/text()")
    if text_list:
        # Join the text nodes, then strip every whitespace character.
        company.company_info = ''.join(','.join(text_list).split())

    optional_patterns = (
        ('operating_period', '经营期限:(.*?)</li'),
        ('registration_time', '注册时间:(.*?)</li'),
        ('registered_capital', '注册资本:(.*?)</li'),
    )
    for attribute, pattern in optional_patterns:
        found = re.search(pattern, res)
        if found is not None:
            setattr(company, attribute, found.group(1))

    company.insert_db()
def analyze_detail(self, html, company_id, url):
    """Scrape a company detail page with fixed XPaths and store the result.

    The address and long description are read with XPath ``string(...)``
    expressions and stripped.  The remaining fields take the first text node
    matched by their XPath, so a page on which one of those paths matches
    nothing raises ``IndexError`` and nothing is stored.

    Args:
        html: HTML text of the detail page.
        company_id: Identifier passed to ``Company``.
        url: Page URL, stored on the record.

    Returns:
        None.  The record is persisted with ``Company.insert_db()``.
    """
    tree = etree.HTML(html)
    record = Company(company_id=company_id, company_source=self.source)

    # Fields read as the string value of a container element.
    string_fields = (
        ('address', 'string(//*[@id="location_container"]/div[2]/div[2])'),
        ('company_info', 'string(//*[@id="company_intro"])'),
    )
    for attribute, expression in string_fields:
        setattr(record, attribute, tree.xpath(expression).strip())

    # Fields read from the first matching text node.
    first_text_fields = (
        ('company_short_info',
         '/html/body/div[2]/div/div/div[1]/div/text()'),
        ('city',
         '//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()'),
        ('business',
         '//*[@id="basic_container"]/div[2]/ul/li[1]/span/text()'),
        ('development_stage',
         '//*[@id="basic_container"]/div[2]/ul/li[2]/span/text()'),
        ('company_name',
         '/html/body/div[2]/div/div/div[1]/h1/a/text()'),
        ('company_size',
         '//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()'),
    )
    for attribute, expression in first_text_fields:
        setattr(record, attribute, tree.xpath(expression)[0].strip())

    record.url = url
    record.insert_db()
def data_fetch(self, url):
    """Download a company page and store the company it describes.

    Only responses with status 200 are processed; any other status code is
    printed.  The company id is taken from the ``co<digits>.html`` part of
    ``url`` and the company name from the page header; if either is missing
    a message is printed and nothing is stored.  Address, description, size
    and business are best-effort: if any of them cannot be extracted, none
    of the four is set and the record is still stored.

    Args:
        url: Company page URL, expected to look like
            ``https://jobs.51job.com/all/co<digits>.html``.

    Returns:
        None.  The record is persisted with ``Company.insert_db()``.
        Exceptions raised by ``requests.get`` propagate to the caller.
    """
    response = requests.get(url=url, proxies=self.proxies, headers=self.headers)
    print(url)
    if response.status_code != 200:
        print(response.status_code)
        return

    response.encoding = 'GBK'
    tree = etree.HTML(response.text)

    id_match = re.search(r'https://jobs\.51job\.com/all/co(\d+)\.html', url)
    if id_match is None:
        # Previously this surfaced as AttributeError on None.group().
        print('company id not found in url: {}'.format(url))
        return
    company_id = id_match.group(1)

    # Guard against an unparsable page (tree is None) and a missing header.
    names = []
    if tree is not None:
        names = tree.xpath("//div[@class='tHeader tHCop']/div[1]/h1/text()")
    if not names:
        print('company name not found: {}'.format(url))
        return

    company_source = '51job'
    company = Company(company_id=company_id, company_source=company_source)

    try:
        raw_address = tree.xpath(
            "/html/body/div[2]/div[2]/div[3]/div[2]/div/p/text()")[1]
        # Drop spaces/newlines, then anything from the first '(' onwards;
        # split() returns the whole string when there is no '('.
        address = raw_address.replace(' ', '').replace('\n', '').split('(')[0]
        company_info = tree.xpath("//div[@class='con_txt']/text()")[0]
        parts = tree.xpath("//p[@class='ltype']/text()")[0].split('|')
        # With more than two '|'-separated parts the first one is skipped.
        if len(parts) > 2:
            company_size, business = parts[1], parts[2]
        else:
            company_size, business = parts[0], parts[1]
    except Exception as e:
        print(e)
    else:
        company.address = address
        company.company_info = company_info
        company.business = business
        company.company_size = company_size

    company.company_id = company_id
    company.company_name = names[0]
    company.company_source = company_source
    company.url = url
    company.insert_db()