Example #1
 def init_browser(self, headless=False):
     """初始化浏览器"""
     chrome_options = webdriver.ChromeOptions()
     if self.headless or headless:
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--disable-gpu")  # windows 平台需要添加此参数
     if self.proxy_server:
         chrome_options.add_argument('--proxy-server={}'.format(
             self.proxy_server))
     chrome_options.add_experimental_option("excludeSwitches",
                                            ["enable-automation"])
     chrome_options.add_experimental_option('useAutomationExtension', False)
     browser = webdriver.Chrome(options=chrome_options,
                                executable_path=self.executable_path)
     browser.execute_cdp_cmd(
         "Page.addScriptToEvaluateOnNewDocument", {
             "source": """
                 Object.defineProperty(navigator, 'webdriver', {
                     get: () => undefined
                 })
             """
         })
     browser.execute_cdp_cmd("Network.enable", {})
     browser.execute_cdp_cmd("Network.setExtraHTTPHeaders",
                             {"headers": generate_header()})
     logger.info('Browser initialized')
     return browser
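`init_browser` forwards extra HTTP headers built by a `generate_header()` helper that is not shown in these excerpts. A minimal sketch of what such a helper could look like, assuming it only rotates the User-Agent (the real implementation may set additional fields):

import random

def generate_header():
    """Hypothetical sketch of generate_header(): return extra HTTP headers
    with a randomly chosen User-Agent. The project's real helper is not shown."""
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    ]
    return {'User-Agent': random.choice(user_agents)}

Network.setExtraHTTPHeaders expects a plain name-to-value mapping, which is why a dict is returned here.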
Example #2
 def login(self, username, password):
     self.browser.get(self.url)
     self.load_cookie()
     time.sleep(self.sleep_interval)
     self.browser.refresh()
     try:
         self.browser.find_element_by_class_name('login')
         is_login = False
     except Exception:
         is_login = True
         logger.info('Successfully logged in to Lagou')
     if not is_login:
         logger.warning('Not yet logged in to Lagou, logging in now')
         self.browser.get('https://passport.lagou.com/login/login.html')
         time.sleep(self.sleep_interval)
         self.browser.find_element_by_xpath(
             "//input[@placeholder='请输入常用手机号/邮箱']").send_keys(username)
         time.sleep(self.sleep_interval)
         self.browser.find_element_by_xpath(
             "//input[@placeholder='请输入密码']").send_keys(password)
         time.sleep(self.sleep_interval)
         self.browser.find_element_by_xpath(
             "//form[@class='active']/div[5]/input[@type='submit']").click()
         # If a captcha appears, block here and wait for manual verification
         input('If a captcha appears, complete it manually and then press Enter; otherwise just press Enter')
         self.save_cookie()
         logger.info('Successfully logged in to Lagou')
Example #3
 def run(self, keyword='', save_method='file'):
     logger.info('Generating the report, please wait!')
     self.read_data_to_frame(keyword, save_method)
     city_max_jobs, city_max_jobs_num = self.address_map_chart()
     # left side of the report
     self.experience_pie_chart()
     self.eduction_pie_chart()
     self.character_pie_chart()
     self.address_pie_chart()
     self.city_experience_chart()
     self.city_eduction_chart()
     # right side of the report
     self.company_size_bar_chart()
     self.company_scale_bar_chart()
     self.company_field_chart()
     self.company_name_chart()
     self.city_company_scale_chart()
     self.city_company_size_chart()
     # middle of the report
     self.salary_pie_chart()
     self.advantage_word_cloud()
     self.requirement_word_cloud()
     self.generate_report_javascript()
     self.generate_report(keyword, city_max_jobs, city_max_jobs_num)
     import webbrowser
     base_dir = os.path.dirname(__file__)
     webbrowser.open_new(os.path.join(base_dir, 'report', 'report.html'))
Example #4
 def save_to_mongodb(self, data_id, url):
     saved_data = self.mongo_db.find_one('lagou', {'id': data_id})
     if saved_data:
         logger.warning(f'Record with ID {data_id} already exists in MongoDB, skipping!')
     else:
         data = self.parse_details(url)
         self.mongo_db.insert_one('lagou', data)
         logger.info(f'Saved record {data["id"]} to MongoDB')
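`self.mongo_db` above exposes `find_one` and `insert_one` that take the collection name as their first argument, so it appears to be a thin wrapper around pymongo rather than a raw database handle. A minimal sketch of such a wrapper, assuming a local MongoDB server and a database named `lagou` (both assumptions, not confirmed by these excerpts):

from pymongo import MongoClient

class MongoDB:
    """Hypothetical wrapper matching the find_one/insert_one calls above."""

    def __init__(self, host='localhost', port=27017, db_name='lagou'):
        self.client = MongoClient(host, port)
        self.db = self.client[db_name]

    def find_one(self, collection, query):
        # Return a single matching document, or None if nothing matches.
        return self.db[collection].find_one(query)

    def insert_one(self, collection, document):
        # Insert one document and return its generated _id.
        return self.db[collection].insert_one(document).inserted_id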
Example #5
 def save_to_file(self, data, file_path):
     create_or_get_directory(os.path.dirname(file_path))
     # Write the header only when the file is created, then append the row.
     is_new_file = not os.path.exists(file_path)
     with open(file_path, 'a', newline='', encoding='utf-8') as f:
         dict_writer = csv.DictWriter(f, data.keys())
         if is_new_file:
             dict_writer.writeheader()
         dict_writer.writerow(data)
     logger.info(f'Saved data to {file_path}')
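`create_or_get_directory` is called here but not defined in these excerpts. A plausible implementation, assuming it only needs to make sure the target directory exists:

import os

def create_or_get_directory(directory):
    """Hypothetical helper: create the directory if it is missing and return its path."""
    os.makedirs(directory, exist_ok=True)
    return directory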
Example #6
 def load_cookie(self):
     # Load cookies from the saved cookie file
     if not os.path.exists('configures/cookies_file'):
         return False
     logger.info('Loading cookies')
     with open('configures/cookies_file', 'rb') as f:
         cookies = pickle.load(f)
         for cookie in cookies:
             self.browser.add_cookie({
                 'name': cookie['name'],
                 'value': cookie['value'],
                 'path': cookie['path'],
                 'secure': cookie['secure'],
             })
     return True
Example #7
 def run(self, username, password, keyword=None):
     """运行入口"""
     self.login(username, password)
     # common_cities, all_cities = self.get_cities()
     # if not common_cities: common_cities = COMMON_CITIES
     # if not all_cities: all_cities = ALL_CITIES
     common_cities = COMMON_CITIES
     all_cities = ALL_CITIES
     logger.info(f'Common cities: {common_cities}')
     logger.info(f'All cities: {all_cities}')
     time.sleep(self.sleep_interval)
     if keyword:
         categories = [keyword]
     else:
         # categories = self.get_categories()
         # if not categories: categories = CATEGORIES
         categories = CATEGORIES
         logger.info(f'All search categories: {categories}')
     for category in categories:
         for common_city in common_cities:
             districts = self.get_districts(common_city)
             logger.info(f'Districts of {common_city}: {districts}')
             # If the anti-crawler check is triggered, the browser has to be closed
             # and the crawl restarted; otherwise continue to the next location.
             while True:
                 running = self.main(keyword=category, city=common_city)
                 if running:
                     break
             for district in districts:
                 while True:
                     running = self.main(keyword=category,
                                         city=common_city,
                                         district=district)
                     if running:
                         break
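For context, a hypothetical invocation of this entry point could look like the following; the class name `LagouSpider` and the constructor arguments are illustrative assumptions, not taken from these excerpts.

# Hypothetical usage sketch; the real class name and constructor are not shown above.
spider = LagouSpider(headless=True, sleep_interval=3, save_method='file')
spider.run(username='your_phone_or_email', password='your_password', keyword='python')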
Example #8
 def quit_browser(self, browser=None):
     logger.info('Closing browser window')
     if browser:
         browser.quit()
     else:
         self.browser.quit()
Example #9
 def save_cookie(self):
     # Persist cookies to the cookie file
     logger.info('Saving cookies')
     with open('configures/cookies_file', 'wb') as f:
         pickle.dump(self.browser.get_cookies(), f)
Example #10
 def main(self, keyword='python', city='全国', district=None):
     """爬虫主方法"""
     try:
         keyword = keyword.lower()
         logger.info(f'Current search keyword: {keyword}, city: {city}, district: {district}')
         self.browser.get(url=self.url)
         time.sleep(self.sleep_interval)
         try:
             WebDriverWait(driver=self.browser, timeout=10).until(
                 EC.presence_of_element_located(
                     (By.XPATH, "//p[@class='checkTips']/a"))).click()
         except Exception:
             # The checkTips link may not be present; ignore and continue.
             pass
         self.browser.find_element_by_id('search_input').send_keys(keyword)
         time.sleep(self.sleep_interval)
         self.browser.find_element_by_class_name('search_button').click()
         try:
             WebDriverWait(driver=self.browser, timeout=10).until(
                 EC.presence_of_element_located(
                     (By.CLASS_NAME, "body-btn"))).click()
         except Exception:
             # The popup button may not appear; ignore and continue.
             pass
         time.sleep(self.sleep_interval)
         # Open the city selector
         self.browser.find_element_by_class_name(
             'position-head').find_element_by_class_name(
                 'btn-more').click()
         self.browser.find_element_by_xpath(
             "//*[contains(text(), '全部城市')]").click()
         self.browser.find_element_by_xpath(
             f"//*[contains(text(), '{city}')]").click()
         time.sleep(self.sleep_interval)
         if district:
             district_div_element = WebDriverWait(
                 driver=self.browser, timeout=10).until(
                     EC.presence_of_element_located(
                         (By.XPATH, "//div[@data-type='district']")))
             district_div_element.find_element_by_xpath(
                 f"//*[contains(text(), '{district}')]").click()
         logger.info(
             f'Collecting job links for {keyword}, {city}, {district}...')
         self.urls = []
         self.get_urls()
         self.page_scroll()
         for url in self.urls:
             try:
                 m = re.search('jobs/(.+?).html', url)
                 data_id = m.group(1)
             except AttributeError:
                 # re.search returned None (unexpected URL format); fall back to the full URL.
                 data_id = url
             if self.save_method == 'database':
                 self.save_to_mongodb(data_id, url)
             else:
                 if not district:
                     district = '不限'
                 file_path = os.path.join(self.save_file_path,
                                          f'{keyword}.csv')
                 saved_data = self.get_saved_jobs(file_path)
                 saved_data = [saved['id'] for saved in saved_data]
                 if data_id in saved_data:
                     logger.warning(
                         f'Record with ID {data_id} already exists in {file_path}, skipping')
                 else:
                     try:
                         data = self.parse_details(url)
                         self.save_to_file(data, file_path)
                     except Exception as error:
                         logger.error(f'Error while fetching job details: {error}')
                     time.sleep(self.sleep_interval)
         return True
     except Exception as error:
         logger.error(f'Error during the crawl, possibly the anti-crawler check was triggered. Reopening the browser: {error}')
         self.quit_browser(self.browser)
         time.sleep(self.sleep_interval)
         self.browser = self.init_browser()
         return False
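`get_saved_jobs(file_path)` is expected to return the rows already written to the CSV (each with an `id` key) so duplicates can be skipped. A plausible sketch, assuming the CSV produced by `save_to_file` is the only source and reusing the `os` and `csv` modules already imported by the crawler (not confirmed by these excerpts):

 def get_saved_jobs(self, file_path):
     """Hypothetical helper: read back previously saved rows from the CSV file."""
     if not os.path.exists(file_path):
         return []
     with open(file_path, newline='', encoding='utf-8') as f:
         return list(csv.DictReader(f))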