import pprint
import random
import time

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# HOST, logger, session, cookies and the query parameters (query_key,
# city_code, industry, position) are assumed to be defined elsewhere
# in the module.


def parse_list_page(html_text, detail=True):
    """Parse a job-list page into a list of job dicts.

    When `detail` is True, each job's detail page is fetched and parsed too.
    """
    jobs = []
    try:
        # lxml mishandles the doctype here, so swap it for a plain tag.
        html_text = html_text.replace("<!DOCTYPE html>", "<html>")
        html = BeautifulSoup(html_text, "lxml")
        job_primarys = html.find_all("div", {"class": "job-primary"})
        for primary in job_primarys:
            try:
                if not primary:
                    logger.error("primary is None")
                    continue
                job_dict = {}
                job_info = primary.find("div", {'class': 'info-primary'})
                job_dict["title"] = job_info.find("div", {'class': "job-title"}).text
                job_dict['detail_url'] = '{}{}'.format(HOST, job_info.find("a").attrs.get("href", ''))
                if detail:
                    # Use fresh local names: assigning to `detail` would shadow
                    # the flag and break every later iteration of this loop.
                    jd, td = parse_detail_page_by_url(job_dict['detail_url'])
                    job_dict['job_description'] = jd
                    job_dict['team_description'] = td
                else:
                    job_dict['job_description'] = ''
                    job_dict['team_description'] = ''
                job_dict['money'] = job_info.find("span", {'class': 'red'}).text
                # The <p> holds area / experience / education as alternating
                # text nodes and separator tags.
                pa = job_info.find("p")
                job_dict['area'] = str(pa.contents[0]).strip()
                job_dict['experience'] = str(pa.contents[2]).strip()
                if len(pa.contents) >= 5:
                    job_dict['education'] = str(pa.contents[4])
                else:
                    job_dict['education'] = 'unknown'
                company_info = primary.find("div", {'class': 'info-company'})
                ac = company_info.find('a')
                job_dict['company_url'] = '{}{}'.format(HOST, ac.attrs.get("href", ''))
                job_dict['company_name'] = ac.text
                pc = company_info.find('p')
                job_dict['company_industry'] = str(pc.contents[0])
                job_dict['company_stage'] = str(pc.contents[2])
                if len(pc.contents) >= 5:
                    job_dict['company_scale'] = str(pc.contents[4])
                else:
                    job_dict['company_scale'] = "unknown"
                publish_info = primary.find('div', {'class': 'info-publis'})
                job_dict['company_contact'] = publish_info.find('h3').text
                jobs.append(job_dict)
            except Exception:
                logger.exception("failed to parse a job card, skipping it")
                continue
    except Exception as e:
        logger.exception("parse exception, e={}".format(e))
    return jobs
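def demo_parse_list_page():
    # Usage sketch for parse_list_page(). The query URL is illustrative, and
    # the login cookies that main() installs on the module-level session are
    # assumed to be present already; detail=False skips per-job detail requests.
    res = session.get("https://www.zhipin.com/job_detail/?query=python&city=101010100")
    res.encoding = "utf-8"
    for job in parse_list_page(res.text, detail=False):
        print(job["title"], job["money"], job["detail_url"])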
def parse_detail_page_by_html(html_text):
    """Extract the job description and team/tags text from a detail page."""
    detail = "unknown"
    team = "unknown"
    try:
        logger.info("parse_detail_page_by_html")
        html_text = html_text.replace("<!DOCTYPE html>", "<html>")
        detail_page = BeautifulSoup(html_text, 'lxml')
        sec = detail_page.find("div", {'class': 'job-sec'})
        # Guard against a missing job-sec block (e.g. a captcha page) instead
        # of letting an AttributeError fall through to the except clause.
        if sec:
            detail_div = sec.find('div', {'class': 'text'})
            if detail_div:
                detail = detail_div.text.strip()
        team_div = detail_page.find("div", {'class': 'job-tags'})
        if team_div:
            team = team_div.text.strip()
    except Exception as e:
        logger.exception("e={}".format(e))
    return detail, team
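def parse_detail_page_by_url(detail_url):
    # parse_list_page() calls this helper, but its definition is not part of
    # this section. A minimal sketch, assuming it fetches the page with the
    # module-level session and delegates to parse_detail_page_by_html():
    logger.info("parse_detail_page_by_url, url={}".format(detail_url))
    res = session.get(detail_url)
    if res.status_code != 200:
        logger.error("detail request failed, status={}".format(res.status_code))
        return "unknown", "unknown"
    res.encoding = "utf-8"
    return parse_detail_page_by_html(res.text)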
def browse_page(self, browse_times=0, distance=0, interval=0, back_top=True):
    """Scroll through the page to simulate a human browsing.

    :param browse_times: number of scroll steps; <= 0 means a random count
    :param distance: pixels per step; <= 0 means a random distance each step
    :param interval: pause between steps in seconds; <= 0 means a random pause
    :param back_top: whether to scroll back to the top when done
    :return: True on success, False on any exception
    """
    # Scroll the page step by step via JavaScript.
    try:
        logger.info('browse_page start.')
        y_dis = 0
        if browse_times <= 0:
            browse_times = random.randint(3, 15)
        for i in range(browse_times):
            if interval <= 0:
                self.sleep(1, 10)
            else:
                time.sleep(interval)
            if distance > 0:
                y_dis += distance
            else:
                y_dis += random.randint(20, 200)
            self.driver.execute_script("window.scrollTo(0,{})".format(y_dis))
        if back_top:
            self.driver.execute_script("window.scrollTo(0,0)")
        logger.info('browse_page end.')
        return True
    except Exception as e:
        logger.exception('browse_page exception. e={}'.format(e))
        return False
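def sleep(self, min_seconds=1, max_seconds=5):
    # browse_page() and search() rely on this randomized-delay helper, which
    # is not shown in this section. A minimal sketch, assuming it is a method
    # on the same spider class and that the default bounds are in this range:
    time.sleep(random.uniform(min_seconds, max_seconds))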
def search(self, keyword='', page=1):
    assert self.driver, "Driver is not valid! Please invoke start_chrome before login!"
    self.driver.get(self.start_url)
    self.sleep()
    all_jobs = []
    try:
        if keyword:
            query_box = WebDriverWait(self.driver, 6).until(
                EC.presence_of_element_located((By.NAME, 'query')))
            self.send_keys(query_box, keyword)
            self.sleep()
            query_box.send_keys(Keys.ENTER)
            self.sleep()
        start_page = 1
        while start_page <= page:
            logger.info("-------- start fetching page {}".format(start_page))
            jobs = parse_list_page(self.driver.page_source, detail=False)
            index_in_page = 0
            # Remember the list-page URL so we can return to it after
            # visiting the detail pages.
            list_page_url = self.driver.current_url
            for jp in jobs:
                index_in_page += 1
                try:
                    detail_url = jp.get("detail_url", "")
                    if not detail_url:
                        continue
                    self.driver.get(detail_url)
                    self.sleep(1, 3)
                    logger.info("fetching detail {} of page {}, title: {}, detail URL={}".format(
                        index_in_page, start_page, jp.get("title", ''), self.driver.current_url))
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'job-sec')))
                    jd, td = parse_detail_page_by_html(self.driver.page_source)
                    jp['job_description'] = jd
                    jp['team_description'] = td
                    self.sleep(1, 3)
                except Exception as e:
                    logger.exception("error while fetching a detail page, skipping it! url={}, e={}".format(detail_url, e))
                    self.sleep(5, 15)  # slow down after an error
                    continue
            all_jobs += jobs
            logger.info("fetched {} page(s), {} jobs in total".format(start_page, len(all_jobs)))
            if start_page >= page:
                break
            self.driver.get(list_page_url)
            self.sleep()
            logger.info("back on the job-list page: {}".format(self.driver.current_url))
            # Scroll to the bottom of the list page before paging forward.
            self.browse_page(browse_times=1, distance=3000, back_top=False)
            # A compound class name is not valid with By.CLASS_NAME, so use
            # a CSS selector to detect the disabled "next" button.
            next_btn_disabled = self.driver.find_elements(By.CSS_SELECTOR, ".next.disabled")
            if next_btn_disabled:
                logger.warning("already on the last page, next button is disabled. page={}".format(start_page))
                break
            next_btn = WebDriverWait(self.driver, 6).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'next')))
            self.click(next_btn)
            self.sleep()
            start_page += 1
            logger.info("moved to page {}, url={}".format(start_page, self.driver.current_url))
    except Exception as e:
        logger.exception("error while fetching data: {}".format(e))
    self.quit()
    return all_jobs
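def demo_search():
    # Hypothetical driver code for the Selenium flow. The class name
    # `ZhipinSpider` is an assumption; start_chrome() is referenced by the
    # assert message in search() above.
    spider = ZhipinSpider()
    spider.start_chrome()
    jobs = spider.search(keyword="python", page=3)  # crawl up to 3 list pages
    print("collected {} jobs".format(len(jobs)))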
def main():
    # Warm-up requests: hit the home page and the common data endpoints the
    # site itself loads, so the session looks like a normal browser visit.
    session.get("https://www.zhipin.com/")
    session.get("https://www.zhipin.com/wapi/zpCommon/data/position.json")
    session.get("https://www.zhipin.com/wapi/zpCommon/data/city.json")
    session.get("https://www.zhipin.com/wapi/zpgeek/qrcode/generate.json?content=https%3A%2F%2Fwww.zhipin.com%2Fd%2Fv2%2F%3Ftype%3Dqr%26pkn%3Dmain-m%26sid%3Dmoren_14&w=200&h=200")
    # Install the externally captured login cookies on the session.
    for k, v in cookies.items():
        session.cookies[k] = v
    request_url = "https://www.zhipin.com/job_detail/?query={}&city={}&industry={}&position={}".format(
        query_key, city_code, industry, position)
    res = session.get(request_url)
    page = 1
    jobs = []
    while res.status_code == 200 and page <= 20:
        try:
            page += 1
            res.encoding = "utf-8"
            # The per-card parsing is identical to parse_list_page(), so
            # delegate to it instead of repeating the BeautifulSoup code.
            jobs += parse_list_page(res.text, detail=True)
            url = "https://www.zhipin.com/c{}/?query={}&page={}&ka=page-{}".format(
                city_code, query_key, page, page)
            logger.info("------fetch page={}".format(page))
            time.sleep(5)  # throttle between list pages
            res = session.get(url)
        except Exception as e:
            logger.exception("parse exception, e={}".format(e))
            continue
    if res.status_code != 200:
        logger.error("request error: status code={}, text={}".format(res.status_code, res.text))
    print(len(jobs))
    pprint.pprint(jobs)
    data2csv(jobs)
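def data2csv(jobs, path="jobs.csv"):
    # main() ends by calling data2csv(), whose definition is not in this
    # section. A minimal sketch, assuming it simply dumps the job dicts to a
    # CSV file; the output file name is an assumption.
    import csv
    if not jobs:
        return
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=list(jobs[0].keys()))
        writer.writeheader()
        writer.writerows(jobs)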