def init_browser(self, headless=False):
    """Initialize the Chrome browser with anti-detection options."""
    chrome_options = webdriver.ChromeOptions()
    if self.headless or headless:
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")  # required on Windows
    if self.proxy_server:
        chrome_options.add_argument('--proxy-server={}'.format(self.proxy_server))
    # Hide the "controlled by automated test software" banner and the automation extension.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    browser = webdriver.Chrome(chrome_options=chrome_options,
                               executable_path=self.executable_path)
    # Overwrite navigator.webdriver before any page script runs, so the site
    # cannot detect Selenium through it.
    browser.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                })
            """
        })
    # Attach custom request headers via the DevTools protocol.
    browser.execute_cdp_cmd("Network.enable", {})
    browser.execute_cdp_cmd("Network.setExtraHTTPHeaders",
                            {"headers": generate_header()})
    logger.info('Browser initialized')
    return browser
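A quick way to confirm that the navigator.webdriver override above actually took effect is to evaluate the property after a page load. This is an illustrative sketch, not part of the original code; it assumes a spider instance named spider that exposes init_browser and quit_browser as defined in this listing.

browser = spider.init_browser(headless=True)   # 'spider' is a hypothetical instance
browser.get('https://www.lagou.com')
# With the CDP patch in place this should print None instead of True.
print(browser.execute_script('return navigator.webdriver'))
spider.quit_browser(browser)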
def login(self, username, password):
    self.browser.get(self.url)
    self.load_cookie()
    time.sleep(self.sleep_interval)
    self.browser.refresh()
    try:
        # If the login element is still present, we are not logged in yet.
        self.browser.find_element_by_class_name('login')
        is_login = False
    except Exception:
        is_login = True
        logger.info('Already logged in to Lagou')
    if not is_login:
        logger.warning('Not logged in to Lagou yet, logging in now')
        self.browser.get('https://passport.lagou.com/login/login.html')
        time.sleep(self.sleep_interval)
        self.browser.find_element_by_xpath(
            "//input[@placeholder='请输入常用手机号/邮箱']").send_keys(username)
        time.sleep(self.sleep_interval)
        self.browser.find_element_by_xpath(
            "//input[@placeholder='请输入密码']").send_keys(password)
        time.sleep(self.sleep_interval)
        self.browser.find_element_by_xpath(
            "//form[@class='active']/div[5]/input[@type='submit']").click()
        # Block and wait for manual captcha verification.
        input('If a captcha appears, solve it manually and press Enter; otherwise just press Enter')
        self.save_cookie()
        logger.info('Logged in to Lagou successfully')
def run(self, keyword='', save_method='file'):
    logger.info('Generating report, please wait!')
    self.read_data_to_frame(keyword, save_method)
    city_max_jobs, city_max_jobs_num = self.address_map_chart()
    # Left column
    self.experience_pie_chart()
    self.eduction_pie_chart()
    self.character_pie_chart()
    self.address_pie_chart()
    self.city_experience_chart()
    self.city_eduction_chart()
    # Right column
    self.company_size_bar_chart()
    self.company_scale_bar_chart()
    self.company_field_chart()
    self.company_name_chart()
    self.city_company_scale_chart()
    self.city_company_size_chart()
    # Middle column
    self.salary_pie_chart()
    self.advantage_word_cloud()
    self.requirement_word_cloud()
    self.generate_report_javascript()
    self.generate_report(keyword, city_max_jobs, city_max_jobs_num)
    # Open the generated report in the default web browser.
    import webbrowser
    base_dir = os.path.dirname(__file__)
    webbrowser.open_new(os.path.join(base_dir, 'report', 'report.html'))
def save_to_mongodb(self, data_id, url):
    saved_data = self.mongo_db.find_one('lagou', {'id': data_id})
    if saved_data:
        logger.warning(f'Data with ID {data_id} already exists in MongoDB, skipping')
    else:
        data = self.parse_details(url)
        self.mongo_db.insert_one('lagou', data)
        logger.info(f'Saved data {data["id"]} to MongoDB')
def save_to_file(self, data, file_path):
    create_or_get_directory(os.path.dirname(file_path))
    if os.path.exists(file_path):
        # Append to the existing CSV; relies on data.keys() matching the
        # column order of the file that was created earlier.
        with open(file_path, 'a', newline='', encoding='utf-8') as f:
            dict_writer = csv.DictWriter(f, data.keys())
            dict_writer.writerow(data)
            logger.info(f'Saved data to {file_path}')
    else:
        # Create a new CSV and write the header row first.
        with open(file_path, 'w', newline='', encoding='utf-8') as f:
            dict_writer = csv.DictWriter(f, data.keys())
            dict_writer.writeheader()
            dict_writer.writerow(data)
            logger.info(f'Saved data to {file_path}')
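For reference, a hedged sketch of how rows written this way can be read back as dictionaries. The get_saved_jobs helper used in main below presumably does something similar, but this implementation is only an assumption for illustration.

import csv
import os

def read_saved_jobs(file_path):
    """Illustrative sketch only: return previously saved rows as dicts."""
    if not os.path.exists(file_path):
        return []
    with open(file_path, 'r', newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))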
def load_cookie(self):
    # Load cookies from file and add them to the browser session.
    if not os.path.exists('configures/cookies_file'):
        return False
    logger.info('Loading cookies')
    with open('configures/cookies_file', 'rb') as f:
        cookies = pickle.load(f)
    for cookie in cookies:
        self.browser.add_cookie({
            'name': cookie['name'],
            'value': cookie['value'],
            'path': cookie['path'],
            'secure': cookie['secure'],
        })
    return True
def run(self, username, password, keyword=None):
    """Crawler entry point."""
    self.login(username, password)
    # common_cities, all_cities = self.get_cities()
    # if not common_cities:
    #     common_cities = COMMON_CITIES
    # if not all_cities:
    #     all_cities = ALL_CITIES
    common_cities = COMMON_CITIES
    all_cities = ALL_CITIES
    logger.info(f'Common cities: {common_cities}')
    logger.info(f'All cities: {all_cities}')
    time.sleep(self.sleep_interval)
    if keyword:
        categories = [keyword]
    else:
        # categories = self.get_categories()
        # if not categories:
        #     categories = CATEGORIES
        categories = CATEGORIES
    logger.info(f'Search categories: {categories}')
    for category in categories:
        for common_city in common_cities:
            districts = self.get_districts(common_city)
            logger.info(f'Districts of {common_city}: {districts}')
            # If the anti-crawler check is triggered, main() returns False and the
            # browser is reopened; keep retrying until the run succeeds, then move on.
            while True:
                if self.main(keyword=category, city=common_city):
                    break
            for district in districts:
                while True:
                    if self.main(keyword=category, city=common_city,
                                 district=district):
                        break
def quit_browser(self, browser=None):
    logger.info('Closing browser window')
    if browser:
        browser.quit()
    else:
        self.browser.quit()
def save_cookie(self):
    # Save cookies to file.
    logger.info('Saving cookies')
    with open('configures/cookies_file', 'wb') as f:
        pickle.dump(self.browser.get_cookies(), f)
def main(self, keyword='python', city='全国', district=None):
    """Main crawl method for one keyword/city/district ('全国' means nationwide)."""
    try:
        keyword = keyword.lower()
        logger.info(f'Searching keyword: {keyword}, city: {city}, district: {district}')
        self.browser.get(url=self.url)
        time.sleep(self.sleep_interval)
        try:
            # Dismiss the city-confirmation tip if it appears.
            WebDriverWait(driver=self.browser, timeout=10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//p[@class='checkTips']/a"))).click()
        except Exception:
            pass
        self.browser.find_element_by_id('search_input').send_keys(keyword)
        time.sleep(self.sleep_interval)
        self.browser.find_element_by_class_name('search_button').click()
        try:
            # Close the pop-up banner if it appears.
            WebDriverWait(driver=self.browser, timeout=10).until(
                EC.presence_of_element_located(
                    (By.CLASS_NAME, "body-btn"))).click()
        except Exception:
            pass
        time.sleep(self.sleep_interval)
        # Open the city selector ('全部城市' = "all cities").
        self.browser.find_element_by_class_name(
            'position-head').find_element_by_class_name('btn-more').click()
        self.browser.find_element_by_xpath(
            "//*[contains(text(), '全部城市')]").click()
        self.browser.find_element_by_xpath(
            f"//*[contains(text(), '{city}')]").click()
        time.sleep(self.sleep_interval)
        if district:
            district_div_element = WebDriverWait(
                driver=self.browser, timeout=10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//div[@data-type='district']")))
            district_div_element.find_element_by_xpath(
                f"//*[contains(text(), '{district}')]").click()
        logger.info(
            f'Collecting job links for keyword {keyword}, city {city}, district {district} ...')
        self.urls = []
        self.get_urls()
        self.page_scroll()
        for url in self.urls:
            try:
                m = re.search(r'jobs/(.+?)\.html', url)
                data_id = m.group(1)
            except Exception:
                data_id = url
            if self.save_method == 'database':
                self.save_to_mongodb(data_id, url)
            else:
                if not district:
                    district = '不限'  # "unrestricted"
                file_path = os.path.join(self.save_file_path, f'{keyword}.csv')
                saved_data = self.get_saved_jobs(file_path)
                saved_data = [saved['id'] for saved in saved_data]
                if data_id in saved_data:
                    logger.warning(
                        f'Data with ID {data_id} already exists in {file_path}, skipping')
                else:
                    try:
                        data = self.parse_details(url)
                        self.save_to_file(data, file_path)
                    except Exception as error:
                        logger.error(f'Error while fetching job details: {error}')
            time.sleep(self.sleep_interval)
        return True
    except Exception as error:
        logger.error(f'Error during crawl, possibly triggered the anti-crawler check. '
                     f'Reopening the browser: {error}')
        self.quit_browser(self.browser)
        time.sleep(self.sleep_interval)
        self.browser = self.init_browser()
        return False
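A minimal usage sketch under stated assumptions: the class owning these methods is called LagouSpider here only as a placeholder (its real name and constructor are not shown in this listing), and credentials are read from environment variables rather than hard-coded.

import os

if __name__ == '__main__':
    spider = LagouSpider()                      # hypothetical class name
    spider.browser = spider.init_browser()      # assumes __init__ does not already do this
    spider.run(username=os.environ['LAGOU_USER'],
               password=os.environ['LAGOU_PASS'],
               keyword='python')
    spider.quit_browser()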