Example #1
    def send_keys(self, ele, inputs, smart=True):
        """
        封装输入的函数
        :param ele: 需要输入的元素
        :param inputs: 传入的输入内容
        :param smart: 是否智能输入
        :return:
        """
        if not all([ele, inputs]):
            return False

        try:
            if smart:
                while inputs:
                    n = random.randint(2, 8)
                    if n > len(inputs):
                        n = len(inputs)
                    split_inputs = inputs[:n]
                    ele.send_keys(split_inputs)
                    self.sleep(0.2, 1.5)
                    inputs = inputs[n:]
            else:
                ele.send_keys(inputs)
        except Exception as e:
            logger.info("send_keys catch exception, e={}".format(e))
            return False

        return True
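
A minimal usage sketch, assuming a started Selenium driver, an instance of the class above (here called actions), and a hypothetical input field named "query":

    box = actions.driver.find_element_by_name("query")   # hypothetical target field
    if not actions.send_keys(box, "python backend engineer", smart=True):
        logger.warning("chunked typing failed, falling back to a single send_keys call")
        box.send_keys("python backend engineer")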
Example #2
    def start_chrome(self, force_display=False, force_client=""):
        """
        配置并启动浏览器
        :param force_display: 强制显示chrome界面
        :param force_client: 强制指定启动chrome的环境, 可选输入项为"pc"或“mobile"
        :return: True/False
        """
        try:
            # Customize browser launch options
            chrome_options = webdriver.ChromeOptions()
            if self.headless and not force_display:
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--no-sandbox')
                chrome_options.add_argument('--disable-extensions')
                chrome_options.add_argument('--disable-gpu')

            chrome_options.add_argument('--disable-infobars')
            chrome_options.add_argument('--disable-popup-blocking')  # disable the popup blocker
            chrome_options.add_argument("--ignore-certificate-errors")  # suppress certificate error warnings
            chrome_options.add_argument('--lang=zh-CN')

            prefs = {'profile.default_content_setting_values': {'notifications': 2}}
            chrome_options.add_experimental_option('prefs', prefs)

            if self.user_agent and force_client != "mobile":
                chrome_options.add_argument('--user-agent={}'.format(self.user_agent))

            if self.device and force_client != "pc":
                # Mobile device emulation
                mobile_emulation = {
                    'deviceName': self.device
                    # "deviceMetrics": {"width": 600, "height":800, "pixelRatio": 4.0},
                    # "userAgent": "Mozilla/5.0 (Linux; Android 8.0.0; XT1635-02 Build/OPNS27.76-12-22-9)"
                }
                chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)

            chrome_driver = webdriver.Chrome(options=chrome_options)
            time.sleep(1)

            logger.info("start_chrome device={}, user-agent={}, headless={}, options={}".format(self.device, self.user_agent,
                                                                                    self.headless, chrome_options.arguments))

            self.driver = chrome_driver
            self.options = chrome_options
            return True
        except Exception as e:
            logger.error("The browser did not start successfully, exception: {}".format(str(e)))
            return False
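
A minimal usage sketch, assuming the class exposes self.headless, self.user_agent and self.device (as BossActions in Example #5 sets up via its parent) and that the fingerprint key used below is what the parent reads:

    actions = BossActions(finger_print={"device": "iPhone X"}, headless=True)
    if actions.start_chrome(force_client="mobile"):
        actions.driver.get("https://www.zhipin.com/")
    else:
        logger.error("Chrome did not start")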
Example #3
def parse_detail_page_by_html(html_text):
    detail = "unknown"
    team = "unknown"
    try:
        logger.info("parse_detail_page_by_html")
        html_text = html_text.replace("<!DOCTYPE html>", "<html>")
        detail_page = BeautifulSoup(html_text, 'lxml')
        sec = detail_page.find("div", {'class': 'job-sec'})
        if sec:
            detail_div = sec.find('div', {'class': 'text'})
            if detail_div:
                detail = detail_div.text.strip()

        team_div = detail_page.find("div", {'class': 'job-tags'})
        if team_div:
            team = team_div.text.strip()
    except Exception as e:
        logger.exception("e={}".format(e))

    return detail, team
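
A quick usage sketch with a hand-written HTML fragment that mirrors the markup this parser expects (real zhipin.com detail pages differ; this only illustrates the return values):

    sample = ('<div class="job-sec"><div class="text"> Build and maintain crawlers. </div></div>'
              '<div class="job-tags"> Backend Team </div>')
    detail, team = parse_detail_page_by_html(sample)
    # detail == "Build and maintain crawlers.", team == "Backend Team"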
Example #4
def run_task(query_key, city_num, page, industry='', position='', headless=0):
    query_key_url = urllib.parse.quote(query_key)
    city_code = hot_city[city_num].get("code", "100010000")
    page = min(page, 10)  # BOSS listings rarely go beyond 10 pages

    start_url = "https://www.zhipin.com/job_detail/?query={}&city={}&industry={}&position={}".format(
        query_key_url, city_code, industry, position)

    file_name = u"{city}_{key}_{time}.csv".format(
        city=hot_city[city_num].get("name", ""),
        key=query_key,
        time=datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    logger.info(u"-----------开始从BOSS直聘抓取:{}有关{}的职位, 取前{}页!".format(
        hot_city[city_num].get("name", ""), query_key, page))
    start_time = datetime.datetime.now()
    bsa = BossActions(finger_print=HEADERS,
                      start_url=start_url,
                      headless=(headless == 0))
    bsa.connect_chrome()
    jobs = bsa.search("", page=page)
    data2csv(jobs, file_name=file_name)
    logger.info("导出数据完成, 文件名:{}, 导出数据{}条".format(file_name, len(jobs)))
    run_time = round(
        (datetime.datetime.now() - start_time).total_seconds() / 60.0, 2)
    logger.info(
        "----------- Finished scraping BOSS Zhipin: jobs in {} matching '{}', first {} pages, {} rows in total, elapsed {} minutes".format(
            hot_city[city_num].get("name", ""), query_key, page, len(jobs),
            run_time))
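
A minimal invocation sketch (hot_city, HEADERS and data2csv come from the surrounding module; the city index 0 and the query are placeholders):

    if __name__ == "__main__":
        # Scrape the first 3 result pages for "python" in the first hot city; headless=0 maps to a headless browser
        run_task("python", city_num=0, page=3, headless=0)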
Example #5
    def __init__(self, finger_print=None,
                 headless=False, start_url="https://www.zhipin.com/", account_info=None):
        """
        Initialization.
        :param account_info: account related info (account, password, gender, etc.); must be a dict
        :param finger_print: browser fingerprint to use, including device/user-agent
        :param headless: whether to run the browser headless
        :param start_url: URL to open on start
        """
        # Avoid mutable default arguments; fall back to empty dicts
        finger_print = finger_print if finger_print is not None else {}
        account_info = account_info if account_info is not None else {}
        assert isinstance(account_info, dict), "account_info must be a dict"
        assert isinstance(finger_print, dict), "finger_print must be a dict"

        self.account = account_info.get("account", "")
        self.password = account_info.get("password", "")
        self.gender = account_info.get("gender", 1)
        self.phone_number = account_info.get("phone_number", "")
        self.cookies = account_info.get("configure", {}).get("cookies", "")
        self.start_url = start_url
        self.fb_exp = None
        super(BossActions, self).__init__(finger_print=finger_print, headless=headless)
        logger.info("BossActions init, account_info={}, device={}, user_agent={}, headless={}, "
                    "start_url={}".format(account_info, self.device, self.user_agent, headless, start_url))
Example #6
    def browse_page(self, browse_times=0, distance=0, interval=0, back_top=True):
        """
        浏览页面
        :param browse_times: 浏览次数
        :param distance: 每次间隔距离,默认为零,代表使用随机距离
        :param interval: 间隔时间, 单位秒, 默认为零,代表使用随机停顿时间
        :param back_top: 是否回到顶点
        :return:
        """
        # 浏览页面js
        try:
            logger.info('browse_page start.')
            y_dis = 0
            if browse_times <= 0:
                browse_times = random.randint(3, 15)

            for i in range(browse_times):
                if interval <= 0:
                    self.sleep(1, 10)
                else:
                    time.sleep(interval)

                if distance > 0:
                    y_dis += distance
                else:
                    y_dis += random.randint(20, 200)

                self.driver.execute_script("window.scrollTo(0,{})".format(y_dis))

            if back_top:
                self.driver.execute_script("window.scrollTo(0,0)")

            logger.info('browse_page end.')
            return True
        except Exception as e:
            logger.exception('browse_page exception. e={}'.format(e))
            return False
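
A short usage sketch; both calls rely only on the parameters documented above and assume the driver is already started:

    actions.browse_page()                 # 3-15 random scroll steps with random pauses
    actions.browse_page(browse_times=5,   # five fixed 500 px steps, one second apart,
                        distance=500,     # staying at the bottom of the page
                        interval=1,
                        back_top=False)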
Example #7
    def search(self, keyword='', page=1):
        assert self.driver, "Driver is not valid! Please invoke start_chrome before login!"
        self.driver.get(self.start_url)
        self.sleep()
        all_jobs = []
        try:
            if keyword:
                query_box = WebDriverWait(self.driver, 6).until(EC.presence_of_element_located((By.NAME, 'query')))
                self.send_keys(query_box, keyword)
                self.sleep()
                query_box.send_keys(Keys.ENTER)
                self.sleep()

            start_page = 1
            while start_page <= page:
                logger.info("--------开始抓取第{}页".format(start_page))
                jobs = parse_list_page(self.driver.page_source, detail=False)
                # job_primarys = self.driver.find_elements_by_class_name("info-primary")
                index_in_page = 0
                list_page_url = self.driver.current_url     # remember the job-list page URL so we can come back to it after the detail pages
                for jp in jobs:
                    index_in_page += 1
                    try:
                        detail_url = jp.get("detail_url", "")
                        if not detail_url:
                            continue
                        self.driver.get(detail_url)
                        self.sleep(1, 3)
                        logger.info("获取第{}页中第{}个工作职位详情, 职位标题:{}, 详情面URL={}".format(start_page, index_in_page, jp.get("title", ''), self.driver.current_url))
                        WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'job-sec')))
                        jd, td = parse_detail_page_by_html(self.driver.page_source)
                        jp['job_description'] = jd
                        jp['team_description'] = td
                        self.sleep(1, 3)
                    except Exception as e:
                        logger.exception("遍历获取详情页数据时异常, 跳过该详情页解析! url={}, e={}".format(detail_url, e))
                        self.sleep(5, 15)   # 如果异常,在这里降降速
                        continue

                all_jobs += jobs
                logger.info("已获取{}页数据, 累计{}条".format(start_page, len(all_jobs)))

                if start_page >= page:
                    break

                self.driver.get(list_page_url)
                self.sleep()
                logger.info("回到工作列表页面:{}".format(self.driver.current_url))
                # 回到列表页面后,拉到最下面,进行翻页
                self.browse_page(browse_times=1, distance=3000, back_top=False)

                next_btn_disabled = self.driver.find_elements_by_css_selector(".next.disabled")
                if next_btn_disabled:
                    logger.warning("Already on the last page, the next button is disabled. page={}".format(start_page))
                    break

                next_btn = WebDriverWait(self.driver, 6).until(EC.presence_of_element_located((By.CLASS_NAME, 'next')))
                self.click(next_btn)
                self.sleep()
                start_page += 1
                logger.info("翻到第{}页, url={}".format(start_page, self.driver.current_url))
        except Exception as e:
            logger.exception("获取数据异常:{}".format(e))

        self.quit()
        return all_jobs
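
A keyword-driven usage sketch (Example #4 shows the empty-keyword variant; connect_chrome, HEADERS and data2csv come from the surrounding module):

    bsa = BossActions(finger_print=HEADERS, headless=True)
    bsa.connect_chrome()
    jobs = bsa.search("data engineer", page=2)
    data2csv(jobs, file_name="data_engineer_jobs.csv")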
Example #8
def main():
    # Warm-up requests, likely intended to pick up the site's initial cookies before searching
    res = session.get("https://www.zhipin.com/")
    res = session.get("https://www.zhipin.com/wapi/zpCommon/data/position.json")
    res = session.get("https://www.zhipin.com/wapi/zpCommon/data/city.json")
    res = session.get("https://www.zhipin.com/wapi/zpgeek/qrcode/generate.json?content=https%3A%2F%2Fwww.zhipin.com%2Fd%2Fv2%2F%3Ftype%3Dqr%26pkn%3Dmain-m%26sid%3Dmoren_14&w=200&h=200")

    for k, v in cookies.items():
        session.cookies[k] = v
    requst_url = "https://www.zhipin.com/job_detail/?query={}&city={}&industry={}&position={}".format(query_key, city_code, industry,
                                                                                         position)

    res = session.get(requst_url)

    page = 1
    jobs = []
    while res.status_code == 200 and page <= 20:
        try:
            page += 1
            res.encoding = "utf-8"
            html_text = res.text.replace("<!DOCTYPE html>", "<html>")
            html = BeautifulSoup(html_text, "lxml")
            job_primarys = html.find_all("div", {"class": "job-primary"})
            for primary in job_primarys:
                try:
                    if not primary:
                        logger.error("primary is None")
                        continue

                    job_dict = {}
                    job_info = primary.find("div", {'class': 'info-primary'})
                    job_dict["title"] = job_info.find("div", {'class': "job-title"}).text
                    job_dict['detail_url'] = '{}{}'.format(HOST, job_info.find("a").attrs.get("href", ''))

                    detail, team = parse_detail_page_by_url(job_dict['detail_url'])
                    job_dict['job_description'] = detail
                    job_dict['team_description'] = team
                    job_dict['money'] = job_info.find("span", {'class': 'red'}).text
                    pa = job_info.find("p")
                    job_dict['area'] = str(pa.contents[0]).strip()
                    job_dict['experience'] = str(pa.contents[2]).strip()
                    if len(pa.contents) >= 5:
                        job_dict['education'] = str(pa.contents[4])
                    else:
                        job_dict['education'] = 'unknown'

                    company_info = primary.find("div", {'class': 'info-company'})
                    ac = company_info.find('a')
                    job_dict['company_url'] = '{}{}'.format(HOST, ac.attrs.get("href", ''))
                    job_dict['company_name'] = ac.text

                    pc = company_info.find('p')
                    job_dict['company_industry'] = str(pc.contents[0])
                    job_dict['company_stage'] = str(pc.contents[2])
                    if len(pc.contents) >= 5:
                        job_dict['company_scale'] = str(pc.contents[4])
                    else:
                        job_dict['company_scale'] = "unknown"

                    publish_info = primary.find('div', {'class': 'info-publis'})
                    job_dict['company_contact'] = publish_info.find('h3').text
                    jobs.append(job_dict)
                except Exception:
                    continue
            url = "https://www.zhipin.com/c{}/?query={}&page={}&ka=page-{}".format(city_code, query_key, page, page)
            logger.info("------fetch page={}".format(page))
            time.sleep(5)
            res = session.get(url)
        except Exception as e:
            logger.exception("parse exception,e={}".format(e))
            continue
    if res.status_code != 200:
        logger.error("request error: status code={}, text={}".format(res.status_code, res.text))

    print(len(jobs))
    pprint.pprint(jobs)
    data2csv(jobs)
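
data2csv itself is not shown in these examples; a plausible minimal version, assuming it simply writes the list of job dicts to a UTF-8 CSV (the default file name and the use of csv.DictWriter are assumptions):

    import csv

    def data2csv(jobs, file_name="jobs.csv"):
        if not jobs:
            return
        # Union of all keys so rows with missing fields still fit the header
        fields = sorted({k for job in jobs for k in job})
        with open(file_name, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.DictWriter(f, fieldnames=fields)
            writer.writeheader()
            writer.writerows(jobs)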