Example #1
0
def get_hit_rescruit():
    """Fetch HIT (Harbin Institute of Technology) recruitment data.

    Posts one JSON request per month to the school's job site, starting at
    2016-09 (when the current site went live), and hands each response to
    ``parse_hit_info`` for parsing and storage in redis.
    """
    base_url = "http://job.hit.edu.cn/index/getZczphData"
    host = "job.hit.edu.cn"
    header = util.get_header(host)
    header['referer'] = "http://job.hit.edu.cn/info?dj=MQ--"
    header['accept'] = "*/*"
    header['X-Requested-With'] = "XMLHttpRequest"
    req = requests.Session()
    # NOTE(review): hard-coded session cookie, presumably captured from a
    # browser; it will expire. TODO: obtain the session dynamically.
    header['cookie'] = "JSESSIONID=A36AAA74D82B3F39C3FD2455853EC081"
    req.get("http://job.hit.edu.cn/info?dj=MQ--")
    # Renamed from `re` to avoid shadowing the stdlib `re` module.
    redis = jedis.jedis()
    redis.connect_redis()
    # The current HIT job site starts at 2016-09; 13 months of data so far.
    for i in range(0, 14):
        # divmod-based date arithmetic generalizes the previous hard-coded
        # "year = 2017" branch (which would break past 2017-12).
        year, month = divmod(8 + i, 12)
        year += 2016
        month += 1

        date = datetime.date(year, month, 1)
        params = {'Month': util.get_month(date)}
        params = json.dumps(params)
        print(params)
        res = req.post(headers=header, url=base_url, data=params)
        content = res.content.decode("utf-8")
        parse_hit_info(content, redis)
Example #2
0
def get_scut_recuit():
    """Crawl 60 pages of SCUT (South China Univ. of Technology) recruitment data."""
    print("开始获取华南理工大学数据=====================")
    url = "http://jyzx.6ihnep7.cas.scut.edu.cn/jyzx/xs/zpxx/xyxj/"

    session = requests.Session()

    headers = util.get_header(host='jyzx.6ihnep7.cas.scut.edu.cn')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for page in range(1, 61):
        form = {
            'pageNo': '60',
            'daoxv1': '0',
            'entName': '',
            'time': '-1',
            'pageNO': str(page),
        }
        try:
            response = session.post(url=url, headers=headers, data=form)
            parse_info(redis, response.content.decode('utf-8'))
        except BaseException as err:
            util.format_err(err)
            break
    redis.add_to_file(table_name)
    redis.add_university(table_name)
    print("获取华南理工大学数据完成=======================")
def get_zzu_recruit():
    """Walk the ZZU job-fair calendar backwards, month by month (2017-12 → 2012-01)."""
    url = "http://job.zzu.edu.cn:9009/service/business/college/jobfair/jobFairInfo/getCalendarInfo.xf"

    session = requests.Session()
    headers = util.get_header('job.zzu.edu.cn:9009')
    headers['referer'] = 'http://job.zzu.edu.cn/p/page/jobCalendar.html?channel_code=XJH&type=0'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    year = 2018
    # 72 months counted down: the first iteration rolls the year to 2017.
    for idx in range(72, 0, -1):
        month = idx % 12
        if not month:
            month = 12
            year -= 1
        params = {
            'remark': '0',
            'year': str(year),
            'month': str(month)
        }
        print(params)
        html = session.post(url=url, headers=headers, data=params).content.decode('utf-8')
        parse_info(html, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
Example #4
0
def get_ouc_recruit():
    """Crawl the OUC (Ocean University of China) campus-recruitment listing."""
    print("开始获取中国海洋大学数据=====================")
    index_url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
    headers = util.get_header('career.ouc.edu.cn')
    session = requests.Session()
    first_page = session.get(url=index_url, headers=headers).content.decode('gbk')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    soup = BeautifulSoup(first_page, 'html5lib')
    # The total record count is embedded in the page; `pattern` is module-level.
    total_infos = int(re.findall(pattern, str(soup))[0][14:])
    page_num = total_infos // 20 + 1  # 20 entries per page
    for page in range(1, page_num + 1):
        try:
            if page == 1:
                page_url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
            else:
                page_url = "http://career.ouc.edu.cn/html/zp_info/campus/" + str(page) + ".html"
            html = session.get(url=page_url, headers=headers).content.decode('gbk')
            parse_info(html, redis)
        except BaseException as err:
            util.format_err(err)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("获取中国海洋大学数据完成=====================")
Example #5
0
def get_gzu_recruit():
    """Crawl GZU (Guizhou University) recruitment-meeting pages.

    Fetches page 1 to discover the page count, then walks every page and
    feeds each one to ``parse_info``.
    """
    base_url = 'http://jobs.gzu.edu.cn/gzujobs/client/recruitment/meet?page='
    req = requests.Session()
    host = 'jobs.gzu.edu.cn'
    # Build the header dict once: it is loop-invariant (the original
    # rebuilt it with util.get_header on every request).
    headers = util.get_header(host)
    content = req.get(url=base_url + str(1),
                      headers=headers).content.decode('utf-8')
    page_num = get_page_num(content)
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, page_num + 1):
        url = base_url + str(i)
        print(url)
        content = req.get(url=url, headers=headers).content.decode('utf-8')
        parse_info(content, redis, i)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_sufe_recruit():
    """Fetch SUFE (Shanghai Univ. of Finance & Economics) recruitment data.

    The endpoint accepts eachPageRows=600, large enough to return every
    record in a single request, so no paging loop is needed.
    """
    # Fixed: the host previously carried a trailing slash
    # ("career.sufe.edu.cn/"), which is not a valid host name and is
    # inconsistent with every other crawler in this file.
    host = "career.sufe.edu.cn"
    headers = util.get_header(host)
    # Renamed from `re` to avoid shadowing the stdlib `re` module.
    redis = jedis.jedis()
    redis.clear_list(table_name)
    url = "http://careersys.sufe.edu.cn/pros_jiuye/s/zxh/owebsiteData/recruitmentAndPreaching?callback=&type=list&eachPageRows=600&currentPageno=1&_="
    req = requests.Session()
    res = req.get(headers=headers, url=url)
    content = res.content.decode("utf-8")
    parse_info(content, redis)
def get_top_500(base_url, page_num, company_type):
    """Scrape Fortune-China top-500 listing pages and parse each one.

    NOTE: iterates pages 1..page_num-1, matching the original contract
    (page_num is treated as an exclusive bound).
    """
    header = util.get_header("www.fortunechina.com")
    session = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    for page in range(1, page_num):
        print(page)
        page_html = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_top500(page_html, redis, company_type)
Example #8
0
def get_nju_rescruit():
    """Page through the NJU job-fair search results (117 pages) and parse each."""
    base_url = "http://job.nju.edu.cn/login/nju/home.jsp?type=zph&DZPHBH=&sfss=sfss&zphzt=&jbksrq=&jbjsrq=&sfgq=&pageSearch=2&pageNow="
    session = requests.Session()
    header = util.get_header("job.nju.edu.cn")
    redis = jedis.jedis()
    redis.connect_redis()
    for page in range(1, 118):
        print(page)
        html = session.get(headers=header,
                           url=base_url + str(page)).content.decode("utf-8")
        parse_nju_info(html, redis)
    redis.add_university("nju_company_info")
    print("finish")
Example #9
0
def get_fdu_rescruit():
    """Fetch all Fudan career-talk records in one oversized-page request."""
    headers = util.get_header("www.career.fudan.edu.cn")
    headers['cookie'] = 'JSESSIONID=0000qZlE0QPPNarjW8SKyrjJPEW:19b14rm85'
    # Setting count >= the total record number returns everything at once.
    url = "http://www.career.fudan.edu.cn/jsp/career_talk_list.jsp?count=3000&list=true"
    session = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    body = session.get(headers=headers, url=url).content.decode("utf-8")
    parse_info(body, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_lzu_rescruit():
    """Crawl 50 pages of the LZU (Lanzhou University) article list."""
    base_url = "http://job.lzu.edu.cn/htmlfile/article/list/119/list_"
    url_tail = ".shtml"
    header = util.get_header("job.lzu.edu.cn")
    session = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    for page in range(1, 51):
        page_url = base_url + str(page) + url_tail
        html = session.get(headers=header, url=page_url).content.decode("utf-8")
        parse_html(html, redis)
        print(page)
    redis.add_university("lzu_company_info")
    print("finish")
def get_cufe_rescruit():
    """Crawl the CUFE recruitment listing, pages 1..421."""
    base_url = "http://scc.cufe.edu.cn/recruitment-datas/15/"
    url_tail = "/2.html"
    session = requests.Session()
    header = util.get_header("scc.cufe.edu.cn")
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for page in range(1, 422):
        print(page)
        page_url = base_url + str(page) + url_tail
        html = session.get(headers=header, url=page_url).content.decode("utf-8")
        parse_info(html, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
Example #12
0
def get_ncepu_recruit():
    """Crawl NCEPU presentation pages plus the double-choose fair listing.

    Parsed rows are stored under the ``ncepu_company_info`` table.
    """
    table_name = "ncepu_company_info"
    base_url = "http://job.ncepu.edu.cn/teachin/index?domain=ncepu&page="
    req = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    host = "job.ncepu.edu.cn"
    header = util.get_header(host)
    # Fetch presentation ("宣讲会") listings, pages 1..33.
    for i in range(1, 34):
        res = req.get(headers=header, url=base_url + str(i))
        html = res.content.decode("utf-8")
        parse_info(html, redis, table_name)
    # BUG FIX: the original passed the stdlib `re` module here instead of
    # the jedis connection object; get_double_choose needs the redis wrapper.
    get_double_choose(req, header, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
Example #13
0
def get_nju_rescruit():
    """Crawl NJU (port 9081) job-fair pages 1..119, then the zph listing."""
    print("NJU Begin===================================================")
    base_url = "http://job.nju.edu.cn:9081/login/nju/home.jsp?type=zph&DZPHBH=&sfss=sfss&zphzt=&jbksrq=&jbjsrq=&sfgq=&pageSearch=2&pageNow="
    session = requests.Session()
    header = util.get_header("job.nju.edu.cn")
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    for page in range(1, 120):
        print(page)
        html = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_nju_info(html, redis)
    get_zph_info(session, header, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("NJU finish ===================================================")
 def get_scu_recruit(self):
     """Crawl SCU presentation info: parse the first page, then 224 pager requests."""
     host = 'jy.scu.edu.cn'
     first_url = "http://jy.scu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx"
     base_url = "http://jy.scu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next&currentPage="
     session = requests.Session()
     scu_header = util.get_header(host)
     content = session.get(headers=scu_header, url=first_url).content.decode("utf-8")
     table_name = "scu_company_info"
     page_num = 224
     index_begin, index_end = 8, 28
     self.parse_info(content, table_name, index_begin, index_end, 2)
     self.get_rescruit(base_url, session, scu_header, table_name, page_num,
                       index_begin, index_end, 2)
     self.re.add_university(table_name)
     self.re.add_to_file(table_name)
def get_xju_recruitment():
    """Crawl XJU recruitment pages, newest numbered page down to page 1."""
    base_url = 'http://zsjy.xju.edu.cn/zpxx/'
    first_url = 'http://zsjy.xju.edu.cn/zpxx.htm'
    session = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    headers = util.get_header('zsjy.xju.edu.cn')
    first_page = session.get(url=first_url, headers=headers).content.decode('utf-8')
    page_num = get_total_page(first_page)
    parse_info(first_page, redis)
    headers['Referer'] = first_url
    # Numbered archive pages count down: page_num-1 is the newest.
    for page in range(page_num - 1, 0, -1):
        page_url = base_url + str(page) + '.htm'
        print(page_url)
        html = session.get(url=page_url, headers=headers).content.decode('utf-8')
        parse_info(html, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
Example #16
0
def get_szu_recruit():
    """Crawl every page of the SZU all-meeting engagement list."""
    print("深圳大学开始==================================")
    url = 'http://job.szu.edu.cn/EngageListAllMeeting.aspx?index=1'
    headers = util.get_header('job.szu.edu.cn')
    session = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    first_page = session.get(url=url, headers=headers).content.decode("utf-8")
    base_url = url[:-1]  # strip the trailing page index "1"
    total_num = get_total_page(first_page)
    for page in range(1, total_num + 1):
        page_url = base_url + str(page)
        print(page_url)
        html = session.get(url=page_url, headers=headers).content.decode("utf-8")
        parse_info(html, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("深圳大学结束==================================")
Example #17
0
def get_sjtu_rescruit():
    """Crawl SJTU presentation info via the shared pager helper.

    Fetches the search page to discover the page count, pages through the
    results with ``get_rescruit``, then registers and dumps the table.
    """
    host = "www.job.sjtu.edu.cn"
    first_url = "http://www.job.sjtu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx&xjhType=yjb"
    base_url = "http://www.job.sjtu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next&currentPage="
    header = util.get_header(host)

    req = requests.Session()
    res = req.get(headers=header, url=first_url).content.decode("utf-8")
    table_name = "sjtu_company_info"
    # Page count comes from the first page (the old hard-coded 39 was dead
    # code, immediately overwritten).
    page_num = get_page_num(content=res)
    # BUG FIX: `redis` was never created in this function, so the two calls
    # below raised NameError at runtime.
    redis = jedis.jedis()
    # Parse the paged data.
    get_rescruit(base_url, req, header, table_name, page_num, 14, 64, 1)
    # Register the table in the university list.
    redis.add_university(table_name)

    # Dump the table to a json file.
    redis.add_to_file(table_name)
Example #18
0
def get_ustc_recruit():
    """Crawl USTC dedicated recruitment pages, then the communication fair."""
    # Dedicated recruitment-fair ("专场招聘会") endpoint.
    base_url = "http://www.job.ustc.edu.cn/API/Web/Recruit.ashx?rand=0.10286254897924929&pagesize=20&action=list&keyword=&pageindex="
    session = requests.Session()
    table_name = "ustc_company_info"
    header = util.get_header("www.job.ustc.edu.cn")
    redis = jedis.jedis()
    redis.connect_redis()
    for page in range(1, 25):
        body = session.get(headers=header, url=base_url + str(page)).content.decode("utf-8")
        parse_info(body, redis, table_name)
    get_communicate(session, redis, header, table_name)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("finish")
def get_bnu_recuit():
    """Query 81 pages of the BNU recruitment-query endpoint (JSON API)."""
    print("开始获取北京师范大学数据=====================")
    url = "http://career.bnu.edu.cn/front/zp_query/zphQuery.jspa?"
    headers = util.get_header(host="career.bnu.edu.cn")
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for page in range(1, 82):  # 81 pages in total
        try:
            query = {'paramMap.xxlx': '1', 'page.curPage': '%d' % page}
            payload = requests.get(url=url, headers=headers,
                                   params=query).json()
            parse_info(payload, redis)
        except BaseException as err:
            util.format_err(err)
        finally:
            print('获取北京师范大学第 %d 页(共81页)数据完成' % page)
    redis.add_university(table_name)  # register the school
    redis.add_to_file(table_name)  # dump the table to file
def get_pku_recruit():
    """Crawl PKU career-center data and persist it via redis.

    Two phases: (1) POST once per month of 2017 to the fair-by-month
    endpoint for presentation info; (2) prime the session with a GET, then
    POST for the big double-choose fair listing. Statement order matters:
    the server is session-stateful.

    NOTE(review): cookies and referers below are hard-coded captures from a
    browser session and will expire — confirm before relying on this.
    """
    print("PKU Begin ===================================================")
    base_url = "https://scc.pku.edu.cn/information/base-job-fair!findFairInfoByMonth.action"
    host = "scc.pku.edu.cn"
    headers = util.get_header(host)
    headers[
        'referer'] = "https://scc.pku.edu.cn/timeline?fairDate=2017-11-03%2000:00"
    headers[
        'Cookie'] = "Hm_lvt_f77188aadf0698598108fbf1f0e5df52=1509938240,1510453941; JSESSIONID=A07EA9A7A0B89A27E64ABB70E7D2C5FD; Hm_lpvt_f77188aadf0698598108fbf1f0e5df52=1510454286"
    req = requests.Session()
    # NOTE(review): `re` shadows the stdlib re module within this function.
    re = jedis.jedis()
    re.connect_redis()
    re.clear_list(table_name)
    #
    # Phase 1: fetch presentation ("宣讲会") info, one POST per month of 2017.
    for i in range(1, 13):
        month = i
        # Day component (i) is arbitrary here; presumably util.get_month
        # only uses the year/month — TODO confirm.
        yearMonth = datetime.date(2017, month, i)
        yearMonth = util.get_month(yearMonth)
        data = {'yearMonth': yearMonth}
        # data = {'yearMonth': '2017-01'}
        # req.get(url="https://scc.pku.edu.cn", verify=False)
        res = req.post(headers=headers, url=base_url, data=data, verify=False)
        content = res.content.decode("utf-8")
        parse_info(content, re)

    # Phase 2: fetch double-choose fair ("双选会") info.
    url = "https://scc.pku.edu.cn/home!bigFairJobInfo.action"
    url2 = "https://scc.pku.edu.cn/home!bigFairJobInfo.action"
    # limit=600 is large enough to fetch everything in one page.
    data2 = {'start': 0, 'limit': 600, 'currentPage': 1}
    headers['referer'] = "https://scc.pku.edu.cn/home!speciaPreach.action"
    headers[
        'cookie'] = 'JSESSIONID=AFBCF8D631C5F757F2790373BE5AB090; Hm_lvt_f77188aadf0698598108fbf1f0e5df52=1513048945,1514907282; Hm_lpvt_f77188aadf0698598108fbf1f0e5df52=1514907290'
    headers['X-Requested-With'] = "X-Requested-WithXMLHttpRequest"
    headers['Cache-Control'] = "no-cache"
    # Priming GET: establishes server-side session state before the POST.
    req.get(url=url, headers=headers, verify=False)
    headers['referer'] = "https://scc.pku.edu.cn/home!bigFairJobInfo.action"
    info = req.post(headers=headers, url=url2, data=data2, verify=False)
    print("get info success")
    parse_info2(info.content.decode("utf-8"), re)
    re.add_university(table_name)
    re.add_to_file(table_name)
    print("PKU Finish ===================================================")
def get_hnu_recuit():
    """Page through the HNU news-job endpoint and parse each page."""
    print("开始获取湖南大学数据=====================")
    url = "http://scc.hnu.edu.cn/newsjob!getMore.action?"
    headers = util.get_header(host="scc.hnu.edu.cn")
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for page in range(1, 310):  # 310 pages; pages <= 102 are from 2017
        try:
            query = {'p.currentPage': '%d' % page, 'Lb': '1'}
            body = requests.get(url=url, headers=headers, params=query).text
            parse_info(body, redis)
        except BaseException as err:
            util.format_err(err)
            break
        finally:
            print('获取湖南大学第 %d 页(共310页)数据完成' % page)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_nxu_recruit():
    """Crawl the NXU news list (47 pages), following each detail link.

    Each listing page is expected to hold 10 detail links; the last page
    may hold fewer, so iteration is bounded by the links actually found.
    """
    url = 'http://www.nxujob.com/news/news-list.php?id=27&page='
    req = requests.Session()
    headers = util.get_header('www.nxujob.com')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, 48):
        print(url + str(i))
        content = req.get(url=url + str(i),
                          headers=headers).content.decode('gbk')
        soup = BeautifulSoup(content, 'html5lib')
        url_list = soup.find_all(
            href=re.compile('http://www.nxujob.com/news/news-show.php'),
            attrs={'target': '_blank'})
        # BUG FIX: iterate at most the first 10 links actually found instead
        # of blindly indexing 0..9, which raised IndexError whenever a page
        # (typically the last one) had fewer than 10 entries.
        for link in url_list[:10]:
            detail_url = link.attrs['href']
            print(detail_url)
            detail = req.get(url=detail_url,
                             headers=headers).content.decode('gbk')
            parse_info(detail, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_hit_rescruit():
    """Crawl HIT monthly recruitment data from 2016-09 onward (16 months)."""
    print("HIT Begin ===================================================")
    base_url = "http://job.hit.edu.cn/index/getZczphData"
    header = util.get_header("job.hit.edu.cn")

    session = requests.Session()
    header[
        'Cookie'] = 'UM_distinctid=12d92-04155388a776ed-49566e-1fa400-15fef2bf12e643; CNZZDATA1261107882=1341678225-1511543504-https%253A%252F%252Fwww.baidu.com%252F%7C1513672466; JSESSIONID=E8EAAFC1F662C83D57C2D504594BD6CF'
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    # The current HIT job site has data starting at 2016-09.
    for offset in range(0, 16):
        month = 9 + offset
        year = 2016
        if month > 12:
            month -= 12
            year = 2017

        date = datetime.date(year, month, 1)
        payload = json.dumps({'Month': util.get_month(date)})
        print(payload)
        print(base_url)
        res = session.post(url=base_url, headers=header, data=payload)
        print(res.status_code)
        parse_hit_info(res.content.decode("utf-8"), redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
    print("HIT finish ===================================================")
def get_nku_recruit():
    """Crawl NKU (Nankai University) presentation and double-choose data.

    Posts the presentation calendar once per year, then walks the 2017
    double-choose list and fetches each fair's detail page.
    """
    # Presentation ("宣讲会") endpoint.
    url = 'http://career.nankai.edu.cn/Home/Reccalender/doxuanjiang'
    # Double-choose fair ("双选会") endpoint.
    url2 = 'http://career.nankai.edu.cn/Home/Reccalender/doshuangxuan'
    host = 'career.nankai.edu.cn'
    header = util.get_header(host)
    header['referer'] = 'http://career.nankai.edu.cn/reccalender/index.html'
    header[
        'cookie'] = 'yunsuo_session_verify=5374b1e89d110421560f5e8e3182d03c; PHPSESSID=632an0himtafj6me8379r8fkn4; Hm_lvt_6eb8a37eb57545b46494b26e6136af4a=1511532968; Hm_lpvt_6eb8a37eb57545b46494b26e6136af4a=1511533002'
    # BUG FIX: the original list was ['2016, 2017'] — a single string — so
    # the endpoint was queried once with year='2016, 2017' instead of once
    # per year.
    years = ['2016', '2017']
    req = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for year in years:
        company_list = req.post(url=url, headers=header, data={
            'year': year
        }).content.decode('unicode-escape')
        parse_info(redis, company_list)

    # Fetch the double-choose fairs for 2017.
    recruit_list = req.post(url=url2, headers=header, data={
        'year': 2017
    }).content.decode('unicode-escape')
    recruit_list = json.loads(recruit_list)
    for item in recruit_list:
        fair_id = item['id']  # renamed from `id` to avoid shadowing the builtin
        date = item['starttime']
        title = item['title']
        print("===============================")
        print(title, fair_id)
        recruit_url = 'http://career.nankai.edu.cn/Home/Recruitment/content/type/1/id/' + str(
            fair_id) + '.html'
        content = req.get(url=recruit_url,
                          headers=header).content.decode("utf-8")
        parse_recruit_info(redis, content, date, fair_id)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
 def get_scu_recruit(self):
     """Crawl SCU presentation info via the session-stateful pager.

     The server pages relative to the session's current search, so the
     sequence matters: GET the index page to prime the session, POST the
     search form, then page with ``get_rescruit``. Results are stored
     under ``scu_company_info``.
     """
     table_name = "scu_company_info"
     host = 'jy.scu.edu.cn'
     referer = "http://jy.scu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx"
     base_url = "http://jy.scu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next&currentPage="
     # NOTE(review): `url` is defined but never used in this method.
     url = "http://jy.scu.edu.cn/eweb/wfc/app/pager.so?type=goPager&requestPager=pager&pageMethod=next&currentPage=0"
     self.re.clear_list(table_name)
     req = requests.Session()
     scu_header = util.get_header(host)
     scu_header[
         'Referer'] = 'http://jy.scu.edu.cn/eweb/jygl/zpfw.so?modcode=jygl_xjhxxck&subsyscode=zpfw&type=searchXjhxx&xjhType=all'
     data = {'xjhType': 'yjb', 'jbrq': '', 'zpzt': ''}
     # Prime the server-side session before posting the search.
     req.get(url='http://jy.scu.edu.cn/eweb/jygl/index.so',
             headers=scu_header)
     # NOTE(review): posts the dict's repr (str(data)) as the request body —
     # looks odd but presumably matches what the server expects; confirm.
     res = req.post(headers=scu_header, url=referer, data=str(data))
     content = res.content.decode("utf-8")
     # Column slice of the result table handed to the parser.
     index_begin = 8
     index_end = 28
     page_num = self.get_page_num(content)
     scu_header['Referer'] = referer
     self.get_rescruit(base_url, req, scu_header, table_name, page_num,
                       index_begin, index_end, 2)
     self.re.add_university(table_name)
     self.re.add_to_file(table_name)
Example #26
0
# coding = utf-8
import re
import requests
from bs4 import BeautifulSoup

from jedis import jedis
from util import util

# Yunnan University (云南大学) crawler: module-level shared state.
# redis list name that holds the parsed YNU records
table_name = 'ynu_company_info'
# header set and HTTP session shared by every request in this module
headers = util.get_header('jobs.ynu.edu.cn')
req = requests.Session()
# matches ISO-style dates (YYYY-MM-DD) inside page text
date_pattern = re.compile('[0-9]{4}-[0-9]{2}-[0-9]{2}')


def get_ynu_recruitment():
    base_url = "http://jobs.ynu.edu.cn/wszplist.jsp?urltype=tree.TreeTempUrl&wbtreeid=1091"
    url = 'http://jobs.ynu.edu.cn/wszplist.jsp?urltype=tree.TreeTempUrl&wbtreeid=1091'

    redis = jedis.jedis()
    redis.clear_list(table_name)
    content = req.get(url=base_url, headers=headers).content.decode('utf-8')
    total_page_num = get_total_num(content)
    params = {
        'reqformCURURI': '3187540095DC12E6C9C66ED4973512AD',
        'reqformKEYTYPES': '4, 12, 93',
        'actiontype': 'Find',
        'reqformORDER': 'desc',
        'reqformORDERKEY': 'wbrelease',
        'reqformCountNo': total_page_num,
        'reqformGOPAGE': '',