def send_keys(self, ele, inputs, smart=True):
    """
    Wrapper for sending keyboard input to an element.
    :param ele: element to type into
    :param inputs: text to send
    :param smart: whether to simulate human-like typing
    :return: True on success, False otherwise
    """
    if not all([ele, inputs]):
        return False
    try:
        if smart:
            # Type in random chunks of 2-8 characters with random pauses
            # between them, so the input looks less scripted.
            while inputs:
                n = random.randint(2, 8)
                if n > len(inputs):
                    n = len(inputs)
                split_inputs = inputs[:n]
                ele.send_keys(split_inputs)
                self.sleep(0.2, 1.5)
                inputs = inputs[n:]
        else:
            ele.send_keys(inputs)
    except Exception as e:
        logger.warning("send_keys caught an exception, e={}".format(e))
        return False
    return True
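
# send_keys usage sketch (hedged: `bsa` stands for an instance of this class
# with a live driver; the query-box locator mirrors the search() method below):
#
# box = bsa.driver.find_element_by_name("query")
# bsa.send_keys(box, "python", smart=True)   # typed in 2-8 char chunks with pauses
# bsa.send_keys(box, "python", smart=False)  # typed in one shot
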
def start_chrome(self, force_display=False, force_client=""):
    """
    Configure and launch the browser.
    :param force_display: force Chrome to show its UI (ignore headless)
    :param force_client: force the Chrome environment; valid values are "pc" or "mobile"
    :return: True/False
    """
    try:
        # Customize browser launch options
        chrome_options = webdriver.ChromeOptions()
        if self.headless and not force_display:
            chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-extensions')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--disable-infobars')
        chrome_options.add_argument('--disable-popup-blocking')  # disable the popup blocker
        chrome_options.add_argument("--ignore-certificate-errors")  # suppress certificate error warnings
        chrome_options.add_argument('lang=zh_cn')
        prefs = {'profile.default_content_setting_values': {'notifications': 2}}
        chrome_options.add_experimental_option('prefs', prefs)
        if self.user_agent and force_client != "mobile":
            chrome_options.add_argument('--user-agent={}'.format(self.user_agent))
        if self.device and force_client != "pc":
            # Mobile device emulation
            mobile_emulation = {
                'deviceName': self.device
                # "deviceMetrics": {"width": 600, "height": 800, "pixelRatio": 4.0},
                # "userAgent": "Mozilla/5.0 (Linux; Android 8.0.0; XT1635-02 Build/OPNS27.76-12-22-9)"
            }
            chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        chrome_driver = webdriver.Chrome(chrome_options=chrome_options)
        time.sleep(1)
        logger.info("start_chrome device={}, user-agent={}, headless={}, options={}".format(
            self.device, self.user_agent, self.headless, chrome_options.arguments))
        self.driver = chrome_driver
        self.options = chrome_options
        return True
    except Exception as e:
        logger.error("The browser did not start successfully, exception: {}".format(str(e)))
        return False
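
# Note: webdriver.Chrome(chrome_options=...) is the Selenium 3 calling style,
# consistent with the find_elements_by_* helpers used in this project. On
# Selenium 4 the equivalent construction would be roughly:
#
# from selenium import webdriver
# opts = webdriver.ChromeOptions()
# opts.add_argument('--headless')
# driver = webdriver.Chrome(options=opts)  # `options=` replaced `chrome_options=`
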
def parse_detail_page_by_html(html_text):
    detail = "unknown"
    team = "unknown"
    try:
        logger.info("parse_detail_page_by_html")
        html_text = html_text.replace("<!DOCTYPE html>", "<html>")
        detail_page = BeautifulSoup(html_text, 'lxml')
        sec = detail_page.find("div", {'class': 'job-sec'})
        # Guard against a missing job-sec block instead of raising on None
        detail_div = sec.find('div', {'class': 'text'}) if sec else None
        if detail_div:
            detail = detail_div.text.strip()
        team_div = detail_page.find("div", {'class': 'job-tags'})
        if team_div:
            team = team_div.text.strip()
    except Exception as e:
        logger.exception("e={}".format(e))
    return detail, team
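
# Minimal self-test for parse_detail_page_by_html (the markup mirrors the
# class names the parser expects; it is illustrative, not a real page):
#
# sample = ('<html><div class="job-sec"><div class="text">JD text</div></div>'
#           '<div class="job-tags">backend team</div></html>')
# detail, team = parse_detail_page_by_html(sample)
# assert (detail, team) == ("JD text", "backend team")
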
def run_task(query_key, city_num, page, industry='', position='', headless=0):
    query_key_url = urllib.parse.quote(query_key)
    city_code = hot_city[city_num].get("code", "100010000")
    page = 10 if page > 10 else page  # BOSS rarely serves more than 10 pages
    start_url = "https://www.zhipin.com/job_detail/?query={}&city={}&industry={}&position={}".format(
        query_key_url, city_code, industry, position)
    file_name = u"{city}_{key}_{time}.csv".format(
        city=hot_city[city_num].get("name", ""),
        key=query_key,
        time=datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    logger.info(u"----------- Start fetching from BOSS Zhipin: jobs in {} about {}, first {} pages!".format(
        hot_city[city_num].get("name", ""), query_key, page))
    start_time = datetime.datetime.now()
    bsa = BossActions(finger_print=HEADERS, start_url=start_url,
                      headless=True if headless == 0 else False)
    bsa.connect_chrome()
    jobs = bsa.search("", page=page)
    data2csv(jobs, file_name=file_name)
    logger.info("Data export finished, file name: {}, {} records exported".format(file_name, len(jobs)))
    run_time = round((datetime.datetime.now() - start_time).total_seconds() / 60.0, 2)
    logger.info(u"----------- Finished fetching from BOSS Zhipin: jobs in {} about {}, "
                u"first {} pages, {} records in total, took {} minutes".format(
                    hot_city[city_num].get("name", ""), query_key, page, len(jobs), run_time))
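
# Example invocation (hedged: assumes hot_city has an entry at index 0 and
# that HEADERS / data2csv exist at module level, as the body implies):
#
# run_task("python", city_num=0, page=3, headless=1)  # visible browser, 3 list pages
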
def __init__(self, finger_print=None, headless=False, start_url="https://www.zhipin.com/", account_info=None):
    """
    Initialize.
    :param account_info: account-related info such as account, password and gender; must be a dict
    :param finger_print: browser fingerprint to use, including device/user-agent
    :param headless: whether to run the browser headless
    :param start_url: URL to open on start
    """
    # Avoid mutable default arguments
    finger_print = finger_print if finger_print is not None else {}
    account_info = account_info if account_info is not None else {}
    assert isinstance(account_info, dict), "account_info must be a dict"
    assert isinstance(finger_print, dict), "finger_print must be a dict"
    self.account = account_info.get("account", "")
    self.password = account_info.get("password", "")
    self.gender = account_info.get("gender", 1)
    self.phone_number = account_info.get("phone_number", "")
    self.cookies = account_info.get("configure", {}).get("cookies", "")
    self.start_url = start_url
    self.fb_exp = None
    super(BossActions, self).__init__(finger_print=finger_print, headless=headless)
    logger.info("BossActions init, account_info={}, device={}, user_agent={}, headless={}, "
                "start_url={}".format(account_info, self.device, self.user_agent, headless, start_url))
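
# Construction sketch (hedged: the account_info key layout mirrors what
# __init__ reads above; concrete values are placeholders, and HEADERS is the
# fingerprint dict used elsewhere in this project):
#
# account_info = {
#     "account": "user@example.com",
#     "password": "******",
#     "gender": 1,
#     "phone_number": "13800000000",
#     "configure": {"cookies": ""},
# }
# bsa = BossActions(finger_print=HEADERS, headless=True, account_info=account_info)
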
def browse_page(self, browse_times=0, distance=0, interval=0, back_top=True):
    """
    Browse the page by scrolling.
    :param browse_times: number of scroll steps; zero (default) means a random count
    :param distance: scroll distance per step; zero (default) means a random distance
    :param interval: pause between steps in seconds; zero (default) means a random pause
    :param back_top: whether to scroll back to the top afterwards
    :return:
    """
    # Scroll the page via JS
    try:
        logger.info('browse_page start.')
        y_dis = 0
        if browse_times <= 0:
            browse_times = random.randint(3, 15)
        for i in range(browse_times):
            if interval <= 0:
                self.sleep(1, 10)
            else:
                time.sleep(interval)
            if distance > 0:
                y_dis += distance
            else:
                y_dis += random.randint(20, 200)
            self.driver.execute_script("window.scrollTo(0,{})".format(y_dis))
        if back_top:
            self.driver.execute_script("window.scrollTo(0,0)")
        logger.info('browse_page end.')
        return True
    except Exception as e:
        logger.exception('browse_page exception. e={}'.format(e))
        return False
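
# browse_page usage sketches (the second mirrors how search() paginates below):
#
# bsa.browse_page()                                                # random human-like scrolling
# bsa.browse_page(browse_times=1, distance=3000, back_top=False)   # one jump near the page bottom
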
def search(self, keyword='', page=1):
    assert self.driver, "Driver is not valid! Please invoke start_chrome before searching!"
    self.driver.get(self.start_url)
    self.sleep()
    all_jobs = []
    try:
        if keyword:
            query_box = WebDriverWait(self.driver, 6).until(
                EC.presence_of_element_located((By.NAME, 'query')))
            self.send_keys(query_box, keyword)
            self.sleep()
            query_box.send_keys(Keys.ENTER)
            self.sleep()
        start_page = 1
        while start_page <= page:
            logger.info("-------- Start fetching page {}".format(start_page))
            jobs = parse_list_page(self.driver.page_source, detail=False)
            index_in_page = 0
            # Keep the job list page URL so we can return to it after the detail pages
            list_page_url = self.driver.current_url
            for jp in jobs:
                index_in_page += 1
                try:
                    detail_url = jp.get("detail_url", "")
                    if not detail_url:
                        continue
                    self.driver.get(detail_url)
                    self.sleep(1, 3)
                    logger.info("Page {}, job {}: fetching detail, title: {}, detail URL={}".format(
                        start_page, index_in_page, jp.get("title", ''), self.driver.current_url))
                    WebDriverWait(self.driver, 15).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'job-sec')))
                    jd, td = parse_detail_page_by_html(self.driver.page_source)
                    jp['job_description'] = jd
                    jp['team_description'] = td
                    self.sleep(1, 3)
                except Exception as e:
                    logger.exception("Exception while fetching a detail page, skipping it! url={}, e={}".format(
                        detail_url, e))
                    self.sleep(5, 15)  # slow down here after an exception
                    continue
            all_jobs += jobs
            logger.info("Fetched {} pages, {} records in total".format(start_page, len(all_jobs)))
            if start_page >= page:
                break
            self.driver.get(list_page_url)
            self.sleep()
            logger.info("Back to the job list page: {}".format(self.driver.current_url))
            # After returning to the list page, scroll to the bottom and paginate
            self.browse_page(browse_times=1, distance=3000, back_top=False)
            # By.CLASS_NAME cannot match compound class names like "next disabled",
            # so use a CSS selector requiring both classes
            next_btn_disabled = self.driver.find_elements_by_css_selector(".next.disabled")
            if next_btn_disabled:
                logger.warning("Reached the last page, the next button is disabled. page={}".format(start_page))
                break
            next_btn = WebDriverWait(self.driver, 6).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'next')))
            self.click(next_btn)
            self.sleep()
            start_page += 1
            logger.info("Turned to page {}, url={}".format(start_page, self.driver.current_url))
    except Exception as e:
        logger.exception("Exception while fetching data: {}".format(e))
    self.quit()
    return all_jobs
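
# search() usage sketch (hedged: mirrors how run_task drives it above):
#
# bsa = BossActions(finger_print=HEADERS, start_url=start_url, headless=True)
# bsa.connect_chrome()           # or start_chrome(), depending on the runner
# jobs = bsa.search("", page=5)  # empty keyword: start_url already encodes the query
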
def main():
    res = session.get("https://www.zhipin.com/")
    res = session.get("https://www.zhipin.com/wapi/zpCommon/data/position.json")
    res = session.get("https://www.zhipin.com/wapi/zpCommon/data/city.json")
    res = session.get("https://www.zhipin.com/wapi/zpgeek/qrcode/generate.json?content=https%3A%2F%2Fwww.zhipin.com%2Fd%2Fv2%2F%3Ftype%3Dqr%26pkn%3Dmain-m%26sid%3Dmoren_14&w=200&h=200")
    for k, v in cookies.items():
        session.cookies[k] = v
    request_url = "https://www.zhipin.com/job_detail/?query={}&city={}&industry={}&position={}".format(
        query_key, city_code, industry, position)
    res = session.get(request_url)
    page = 1
    jobs = []
    while res.status_code == 200 and page <= 20:
        try:
            page += 1
            res.encoding = "utf-8"
            html_text = res.text.replace("<!DOCTYPE html>", "<html>")
            html = BeautifulSoup(html_text, "lxml")
            job_primarys = html.find_all("div", {"class": "job-primary"})
            for primary in job_primarys:
                try:
                    if not primary:
                        logger.error("primary is None")
                        continue
                    job_dict = {}
                    job_info = primary.find("div", {'class': 'info-primary'})
                    job_dict["title"] = job_info.find("div", {'class': "job-title"}).text
                    job_dict['detail_url'] = '{}{}'.format(HOST, job_info.find("a").attrs.get("href", ''))
                    detail, team = parse_detail_page_by_url(job_dict['detail_url'])
                    job_dict['job_description'] = detail
                    job_dict['team_description'] = team
                    job_dict['money'] = job_info.find("span", {'class': 'red'}).text
                    pa = job_info.find("p")
                    job_dict['area'] = str(pa.contents[0]).strip()
                    job_dict['experience'] = str(pa.contents[2]).strip()
                    if len(pa.contents) >= 5:
                        job_dict['education'] = str(pa.contents[4])
                    else:
                        job_dict['education'] = 'unknown'
                    company_info = primary.find("div", {'class': 'info-company'})
                    ac = company_info.find('a')
                    job_dict['company_url'] = '{}{}'.format(HOST, ac.attrs.get("href", ''))
                    job_dict['company_name'] = ac.text
                    pc = company_info.find('p')
                    job_dict['company_industry'] = str(pc.contents[0])
                    job_dict['company_stage'] = str(pc.contents[2])
                    if len(pc.contents) >= 5:
                        job_dict['company_scale'] = str(pc.contents[4])
                    else:
                        job_dict['company_scale'] = "unknown"
                    publis_info = primary.find('div', {'class': 'info-publis'})
                    job_dict['company_contact'] = publis_info.find('h3').text
                    jobs.append(job_dict)
                except Exception:
                    # Skip a malformed job card rather than abort the whole page
                    continue
            url = "https://www.zhipin.com/c{}/?query={}&page={}&ka=page-{}".format(
                city_code, query_key, page, page)
            logger.info("------fetch page={}".format(page))
            time.sleep(5)
            res = session.get(url)
        except Exception as e:
            logger.exception("parse exception, e={}".format(e))
            continue
    if res.status_code != 200:
        logger.error("request error: status code={}, text={}".format(res.status_code, res.text))
    print(len(jobs))
    pprint.pprint(jobs)
    data2csv(jobs)
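
# main() relies on module-level globals defined elsewhere in the script; a
# hedged sketch of what they might look like (values are placeholders, not
# taken from the original source):
#
# import requests
# session = requests.Session()
# cookies = {}                          # e.g. copied from a logged-in browser
# HOST = "https://www.zhipin.com"
# query_key, city_code = "python", "101010100"
# industry = position = ""
#
# if __name__ == "__main__":
#     main()
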