Example #1
def parse_list_page(html_text, detail=True):
    jobs = []
    try:
        # strip the doctype before handing the page to lxml (workaround used throughout these examples)
        html_text = html_text.replace("<!DOCTYPE html>", "<html>")
        html = BeautifulSoup(html_text, "lxml")
        job_primarys = html.find_all("div", {"class": "job-primary"})
        for primary in job_primarys:
            try:
                if not primary:
                    logger.error("primary is None")
                    continue

                job_dict = {}
                job_info = primary.find("div", {'class': 'info-primary'})
                job_dict["title"] = job_info.find("div", {'class': "job-title"}).text
                job_dict['detail_url'] = '{}{}'.format(HOST, job_info.find("a").attrs.get("href", ''))

                if detail:
                    # use distinct names so the `detail` flag is not shadowed by the parsed text
                    jd, td = parse_detail_page_by_url(job_dict['detail_url'])
                    job_dict['job_description'] = jd
                    job_dict['team_description'] = td
                else:
                    job_dict['job_description'] = ''
                    job_dict['team_description'] = ''

                job_dict['money'] = job_info.find("span", {'class': 'red'}).text
                pa = job_info.find("p")
                job_dict['area'] = str(pa.contents[0]).strip()
                job_dict['experience'] = str(pa.contents[2]).strip()
                if len(pa.contents) >= 5:
                    job_dict['education'] = str(pa.contents[4])
                else:
                    job_dict['education'] = 'unknown'

                company_info = primary.find("div", {'class': 'info-company'})
                ac = company_info.find('a')
                job_dict['company_url'] = '{}{}'.format(HOST, ac.attrs.get("href", ''))
                job_dict['company_name'] = ac.text

                pc = company_info.find('p')
                job_dict['company_industry'] = str(pc.contents[0])
                job_dict['company_stage'] = str(pc.contents[2])
                if len(pc.contents) >= 5:
                    job_dict['company_scale'] = str(pc.contents[4])
                else:
                    job_dict['company_scale'] = "unknown"


                publis_info = primary.find('div', {'class': 'info-publis'})
                job_dict['company_contact'] = publis_info.find('h3').text
                jobs.append(job_dict)
            except Exception as e:
                logger.exception("failed to parse a job entry, skipping it. e={}".format(e))
                continue

    except Exception as e:
        logger.exception("parse exception,e={}".format(e))

    return jobs
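
A minimal usage sketch for parse_list_page (not part of the original example): it assumes the requests library is available and that HOST and logger are defined in the surrounding module; the listing URL below is a placeholder.

import requests

# Hypothetical usage: fetch one listing page and parse it without visiting detail pages.
resp = requests.get("https://www.zhipin.com/job_detail/?query=python&page=1", timeout=10)
resp.encoding = "utf-8"
for job in parse_list_page(resp.text, detail=False):
    print(job["title"], job["money"], job["detail_url"])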
Example #2
def parse_detail_page_by_html(html_text):
    detail = "unknown"
    team = "unknown"
    try:
        logger.info("parse_detail_page_by_html")
        html_text = html_text.replace("<!DOCTYPE html>", "<html>")
        detail_page = BeautifulSoup(html_text, 'lxml')
        # guard against a missing job-sec block instead of relying on the except below
        sec = detail_page.find("div", {'class': 'job-sec'})
        if sec:
            detail_div = sec.find('div', {'class': 'text'})
            if detail_div:
                detail = detail_div.text.strip()

        team_div = detail_page.find("div", {'class': 'job-tags'})
        if team_div:
            team = team_div.text.strip()
    except Exception as e:
        logger.exception("e={}".format(e))

    return detail, team
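
parse_detail_page_by_url is called in Examples #1 and #5 but not listed here; the sketch below is a guessed reconstruction, assuming it fetches the URL with requests and delegates to parse_detail_page_by_html above.

import requests

def parse_detail_page_by_url(url):
    # assumed implementation: fetch the detail page, then reuse the HTML parser above
    try:
        res = requests.get(url, timeout=10)
        res.encoding = "utf-8"
        return parse_detail_page_by_html(res.text)
    except Exception as e:
        logger.exception("fetch detail failed, url={}, e={}".format(url, e))
        return "unknown", "unknown"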
Example #3
    def browse_page(self, browse_times=0, distance=0, interval=0, back_top=True):
        """
        浏览页面
        :param browse_times: 浏览次数
        :param distance: 每次间隔距离,默认为零,代表使用随机距离
        :param interval: 间隔时间, 单位秒, 默认为零,代表使用随机停顿时间
        :param back_top: 是否回到顶点
        :return:
        """
        # 浏览页面js
        try:
            logger.info('browse_page start.')
            y_dis = 0
            if browse_times <= 0:
                browse_times = random.randint(3, 15)

            for i in range(browse_times):
                if interval <= 0:
                    self.sleep(1, 10)
                else:
                    time.sleep(interval)

                if distance > 0:
                    y_dis += distance
                else:
                    y_dis += random.randint(20, 200)

                self.driver.execute_script("window.scrollTo(0,{})".format(y_dis))

            if back_top:
                self.driver.execute_script("window.scrollTo(0,0)")

            logger.info('browse_page end.')
            return True
        except Exception as e:
            logger.exception('browse_page exception. e={}'.format(e))
            return False
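
The self.sleep helper used above (and in Example #4) is not shown; a plausible sketch of it as a method on the same class, assuming it pauses for a random number of seconds between the two bounds:

    def sleep(self, min_seconds=1, max_seconds=5):
        # assumed helper: random pause between actions to mimic a human reader
        time.sleep(random.uniform(min_seconds, max_seconds))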
Example #4
    def search(self, keyword='', page=1):
        assert self.driver, "Driver is not valid! Please invoke start_chrome before login!"
        self.driver.get(self.start_url)
        self.sleep()
        all_jobs = []
        try:
            if keyword:
                query_box = WebDriverWait(self.driver, 6).until(EC.presence_of_element_located((By.NAME, 'query')))
                self.send_keys(query_box, keyword)
                self.sleep()
                query_box.send_keys(Keys.ENTER)
                self.sleep()

            start_page = 1
            while start_page <= page:
                logger.info("--------开始抓取第{}页".format(start_page))
                jobs = parse_list_page(self.driver.page_source, detail=False)
                index_in_page = 0
                list_page_url = self.driver.current_url     # keep the list-page URL so we can return to it after scraping the detail pages
                for jp in jobs:
                    index_in_page += 1
                    try:
                        detail_url = jp.get("detail_url", "")
                        if not detail_url:
                            continue
                        self.driver.get(detail_url)
                        self.sleep(1, 3)
                        logger.info("获取第{}页中第{}个工作职位详情, 职位标题:{}, 详情面URL={}".format(start_page, index_in_page, jp.get("title", ''), self.driver.current_url))
                        WebDriverWait(self.driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'job-sec')))
                        jd, td = parse_detail_page_by_html(self.driver.page_source)
                        jp['job_description'] = jd
                        jp['team_description'] = td
                        self.sleep(1, 3)
                    except Exception as e:
                        logger.exception("遍历获取详情页数据时异常, 跳过该详情页解析! url={}, e={}".format(detail_url, e))
                        self.sleep(5, 15)   # 如果异常,在这里降降速
                        continue

                all_jobs += jobs
                logger.info("已获取{}页数据, 累计{}条".format(start_page, len(all_jobs)))

                if start_page >= page:
                    break

                self.driver.get(list_page_url)
                self.sleep()
                logger.info("回到工作列表页面:{}".format(self.driver.current_url))
                # 回到列表页面后,拉到最下面,进行翻页
                self.browse_page(browse_times=1, distance=3000, back_top=False)

                # By.CLASS_NAME cannot match the compound class "next disabled", so use a CSS selector
                next_btn_disabled = self.driver.find_elements_by_css_selector(".next.disabled")
                if next_btn_disabled:
                    logger.warning("reached the last page, the next button is disabled. page={}".format(start_page))
                    break

                next_btn = WebDriverWait(self.driver, 6).until(EC.presence_of_element_located((By.CLASS_NAME, 'next')))
                self.click(next_btn)
                self.sleep()
                start_page += 1
                logger.info("翻到第{}页, url={}".format(start_page, self.driver.current_url))
        except Exception as e:
            logger.exception("获取数据异常:{}".format(e))

        self.quit()
        return all_jobs
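
A usage sketch for search (the BossSpider class name is an assumption; start_chrome is taken from the assertion message above):

spider = BossSpider()    # hypothetical class name, not shown in the original code
spider.start_chrome()    # referenced in the assertion message above
jobs = spider.search(keyword="python", page=3)
print("scraped {} jobs".format(len(jobs)))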
Example #5
def main():
    # warm-up requests so the session collects the site's initial cookies
    res = session.get("https://www.zhipin.com/")
    res = session.get("https://www.zhipin.com/wapi/zpCommon/data/position.json")
    res = session.get("https://www.zhipin.com/wapi/zpCommon/data/city.json")
    res = session.get("https://www.zhipin.com/wapi/zpgeek/qrcode/generate.json?content=https%3A%2F%2Fwww.zhipin.com%2Fd%2Fv2%2F%3Ftype%3Dqr%26pkn%3Dmain-m%26sid%3Dmoren_14&w=200&h=200")

    # inject the pre-captured login cookies into the session
    for k, v in cookies.items():
        session.cookies[k] = v
    request_url = "https://www.zhipin.com/job_detail/?query={}&city={}&industry={}&position={}".format(
        query_key, city_code, industry, position)

    res = session.get(request_url)

    page = 1
    jobs = []
    while res.status_code == 200 and page <= 20:
        try:
            page += 1
            res.encoding = "utf-8"
            html_text = res.text.replace("<!DOCTYPE html>", "<html>")
            html = BeautifulSoup(html_text, "lxml")
            job_primarys = html.find_all("div", {"class": "job-primary"})
            for primary in job_primarys:
                try:
                    if not primary:
                        logger.error("primary is None")
                        continue

                    job_dict = {}
                    job_info = primary.find("div", {'class': 'info-primary'})
                    job_dict["title"] = job_info.find("div", {'class': "job-title"}).text
                    job_dict['detail_url'] = '{}{}'.format(HOST, job_info.find("a").attrs.get("href", ''))

                    detail, team = parse_detail_page_by_url(job_dict['detail_url'])
                    job_dict['job_description'] = detail
                    job_dict['team_description'] = team
                    job_dict['money'] = job_info.find("span", {'class': 'red'}).text
                    pa = job_info.find("p")
                    job_dict['area'] = str(pa.contents[0]).strip()
                    job_dict['experience'] = str(pa.contents[2]).strip()
                    if len(pa.contents) >= 5:
                        job_dict['education'] = str(pa.contents[4])
                    else:
                        job_dict['education'] = 'unknown'

                    company_info = primary.find("div", {'class': 'info-company'})
                    ac = company_info.find('a')
                    job_dict['company_url'] = '{}{}'.format(HOST, ac.attrs.get("href", ''))
                    job_dict['company_name'] = ac.text

                    pc = company_info.find('p')
                    job_dict['company_industry'] = str(pc.contents[0])
                    job_dict['company_stage'] = str(pc.contents[2])
                    if len(pc.contents) >= 5:
                        job_dict['company_scale'] = str(pc.contents[4])
                    else:
                        job_dict['company_scale'] = "unknown"

                    publis_info = primary.find('div', {'class': 'info-publis'})
                    job_dict['company_contact'] = publis_info.find('h3').text
                    jobs.append(job_dict)
                except Exception as e:
                    logger.exception("failed to parse a job entry, skipping it. e={}".format(e))
                    continue
            url = "https://www.zhipin.com/c{}/?query={}&page={}&ka=page-{}".format(city_code, query_key, page, page)
            logger.info("------fetch page={}".format(page))
            time.sleep(5)
            res = session.get(url)
        except Exception as e:
            logger.exception("parse exception,e={}".format(e))
            continue

    # the original while/else logged "request error" even after a normal exit; check the status explicitly
    if res.status_code != 200:
        logger.error("request error: status code={}, text={}".format(res.status_code, res.text))

    print(len(jobs))
    pprint.pprint(jobs)
    data2csv(jobs)
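
data2csv is called above but not defined in these examples; a minimal sketch, assuming it writes one CSV row per job dict (the jobs.csv filename is a placeholder):

import csv

def data2csv(jobs, path="jobs.csv"):
    # assumed helper: dump the scraped job dicts to a CSV file
    if not jobs:
        return
    fieldnames = sorted({key for job in jobs for key in job})
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(jobs)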