def get_ecnu_recruitment():
    # East China Normal University (ECNU)
    table_name = 'ecnu_company_info'
    url = 'http://www.career.ecnu.edu.cn/commonpage/ListMessage.aspx?infotype=el'

    redis = jedis.jedis()
    redis.clear_list(table_name)

    # initial ASP.NET postback parameters (__VIEWSTATE / __EVENTVALIDATION)
    __VIEWSTATE = 'S+WPBHsLsy12D8przcyE7qpkPYu9cdeAgBUifZLLgj2ut3Fs4gXEqRxDo53bG5pn6gpuyVbz6z4e/ERztf5uEEU/rSTn6GdPhUhZ0yRuyTW45lZpMU2xF2qfExaOT6AmTSx1dHGOPHWYofJSK62Vur+uIMgf4En6eCrhhmsyX72Hsy19sQPEeWC5Qaea6FddgJrFo+GjkukltQrrQho2iDFFYK2HPvceAqdmHVpL1gS16SWRF+oZabG7ptN0brscjEVCS/5sEHvNUrMLwg/9b+osSqda1jJJJTVidnu0yjAAD/JlSZx60O6i5zmHdlgDIHDVyD/oqndryIRowYuo1oRd3cQ0f2qr9yeipbvDXBLRpXA6Z0qPCo6/JoRj6vYQGwLHwqA7SPovunrwM3tO1ZQxeMajIUENxhqzNOSFXNGO60GAFaIunfhe/b7F5sAGBIniqIX2W+U66Np7nAmoqEzaTGZHjadMnEDDheTAg1yXcFZxWKCXtjT5i3aQ6FdMtwpi3U7nX2qHnxBsmMaKqkm86liXgR1WwnUlzf8t8YvT/O/j88lnbcZomMNR1xxpGL3LIkD6XxiyiLtSB9iPY/uKs1mcRWUJxYfbNAHMn7hqTsfBvACUqOP72LTf9QYtgRDPyXv5jp4kIMuV9VhhG/kzggveXUFK1UZ0Oy6tQamYS52lBMp/6F8ibpwAmNEf'
    __EVENTVALIDATION = 'PzbwP/Zoy+Gxtp8jeRMHlsqDlG8FHGyiIB0rnsPeHlR5GCKT3S/ijcmOCnvo+xG7JwLukL/LacFOXZFw/Ksx+KUsoZ5uBc6y8n05Seo7Wade+Y1hYQWQa9JCP2Ftf596eTYB4Q7kpKSm44YMgva6YoHVJtDHgW0rEB2az5xE5eqNtr2nqNUAkubtOxNWgSwSbPHlDtx84OZ27rw2cwClP+qtthyJx2oeH1S/SfIguB74I1who3k+Hc18vSj1+QGOHdP44gI6o3KDbDh4ZkAsT7+lj0uAZGq7ICg9UIgKWkprRoQKnd0QdrpexSij4KvJP3rQ0a8Q1ZV0K4/Fi5tAAXWtL/n0oQOwoxUbD0PUbTBZLY1FrcLH0Gdm6SsS45G9'

    session = get_session()

    page_count = 1
    # the loop ends when a page contains only one entry
    try:
        while True:
            content = get_content(session, url, __EVENTVALIDATION, __VIEWSTATE)
            __EVENTVALIDATION, __VIEWSTATE = parse_parameters(content)
            end = parse_content(content, redis, table_name)
            print('parsed page %d!' % page_count)
            page_count += 1
            if end:
                print('end!')
                break
    except BaseException as e:
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
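# A possible implementation of the parse_parameters helper used above
# (an assumption -- the real helper is not included in this collection).
# ASP.NET WebForms pages carry their postback state in hidden <input>
# fields, so the values for the next request can be lifted from the
# current response:
from bs4 import BeautifulSoup

def parse_parameters(content):
    soup = BeautifulSoup(content, 'html5lib')
    event_validation = soup.find('input', attrs={'name': '__EVENTVALIDATION'})['value']
    view_state = soup.find('input', attrs={'name': '__VIEWSTATE'})['value']
    return event_validation, view_state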
def get_ouc_recruit():
    print("开始获取中国海洋大学数据=====================")
    url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
    host = 'career.ouc.edu.cn'
    headers = util.get_header(host)
    req = requests.Session()
    res = req.get(url=url, headers=headers).content.decode('gbk')
    redis = jedis.jedis()
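    # NOTE: table_name here (and in several later examples) and pattern
    # (used below) appear to be module-level globals defined elsewhere in
    # the source file; they are not shown in this excerpt.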
    redis.clear_list(table_name)
    soup = BeautifulSoup(res, 'html5lib')
    total_infos = int(re.findall(pattern, str(soup))[0][14:])
    page_num = total_infos // 20 + 1
    for i in range(1, page_num + 1):
        try:
            if i == 1:
                url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
            else:
                url = "http://career.ouc.edu.cn/html/zp_info/campus/" + str(i) + ".html"
            content = req.get(url=url, headers=headers).content.decode('gbk')
            parse_info(content, redis)
        except BaseException as e:
            util.format_err(e)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("获取中国海洋大学数据完成=====================")
def get_zju_rescruit():
    table_name = "zju_company_info_2018"
    base_url = "http://www.career.zju.edu.cn/ejob/zczphxxmorelogin.do"
    params = {
        'zphix': 0,
        'dwmc': '',
        'hylb': '',
        'zphrq': '',
        'pages.pageSize': 30,
        'pages.currentPage': 0,
        'pages.maxPage': 20,
        'pageno': ''
    }
    req = requests.Session()
    re = jedis.jedis()
    re.connect_redis()
    re.clear_list(table_name)
    for i in range(1, 23):
        print(i)
        params['pages.currentPage'] = i
        res = req.post(base_url, data=params)
        content = res.content.decode("GBK")
        # print(content)
        parse_info(content, re, table_name)
    re.add_to_file(table_name)
    re.add_university(table_name)
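# jedis is this project's thin wrapper around redis-py; every scraper in
# this collection instantiates it. A minimal sketch of its surface,
# inferred from the call sites here -- the method bodies and key layout
# are assumptions, and add_to_file / add_university / handle_error are
# left out:
import redis as redis_lib

class jedis:
    def __init__(self):
        self.connect_redis()  # some callers also invoke this explicitly

    def connect_redis(self):
        self.re = redis_lib.StrictRedis(host='localhost', port=6379, db=0)

    def get_re(self):
        return self.re

    def clear_list(self, table_name):
        self.re.delete(table_name)  # drop any list left by a previous run

    def save_info(self, table_name, *fields):
        # one record per list entry; a tab-joined format is assumed
        self.re.rpush(table_name, '\t'.join(str(f) for f in fields))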
def get_consult_top100():
    url = 'https://www.sohu.com/a/160219346_169875'
    content = requests.get(url).content.decode("utf-8")
    soup = BeautifulSoup(content, "html5lib")
    company_list = soup.find_all('p')
    com_info = []
    analysis = SmartAnalysisByName()
    re = jedis.jedis()
    table_name = "best_consulting_company_info"
    for i in range(8, 99):
        info = company_list[i].text.strip()
        if info.find('.') != -1:
            com_info.append(info)
            # print(info)
    print(len(com_info))
    for i in range(len(com_info)):
        print("=========================================")
        print(i)
        print(com_info[i])

        if i < 10 or (50 <= i < 59) or (75 <= i < 86):
            com_info[i] = com_info[i][2: -5].strip()
        else:
            com_info[i] = com_info[i][3: -5].strip()
        print(com_info[i])
    # jieba word segmentation
    for item in com_info:
        short_names = analysis.get_jieba_fenci(item)
        re.save_dict(table_name, data=dict(company_name=item, short_name=short_names))
    analysis.add_to_file(table_name)
    print("获取咨询行业各种排名完成")
def get_zzu_recruit():
    url = "http://job.zzu.edu.cn:9009/service/business/college/jobfair/jobFairInfo/getCalendarInfo.xf"

    req = requests.Session()
    host = 'job.zzu.edu.cn:9009'
    headers = util.get_header(host)
    headers['referer'] = 'http://job.zzu.edu.cn/p/page/jobCalendar.html?channel_code=XJH&type=0'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    year = 2018
    # walk backward month by month, from 2017-12 down to 2012-01
    for i in range(72, 0, -1):
        month = i % 12
        if month == 0:
            year = year - 1
            month = 12
        params = {
            'remark': '0',
            'year': str(year),
            'month': str(month)
        }
        print(params)
        res = req.post(url=url, headers=headers, data=params)
        content = res.content.decode('utf-8')
        parse_info(content, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
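# Sanity check of the calendar walk above (standalone, not part of the
# scraper): with year initialised to 2018, i = 72 maps to 2017-12 and the
# countdown ends at 2012-01.
year = 2018
for i in range(72, 0, -1):
    month = i % 12
    if month == 0:
        year -= 1
        month = 12
assert (year, month) == (2012, 1)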
def get_scut_recuit():
    print("开始获取华南理工大学数据=====================")
    url = "http://jyzx.6ihnep7.cas.scut.edu.cn/jyzx/xs/zpxx/xyxj/"

    req = requests.Session()

    headers = util.get_header(host='jyzx.6ihnep7.cas.scut.edu.cn')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, 61):
        try:
            data = {
                'pageNo': '60',
                'daoxv1': '0',
                'entName': '',
                'time': '-1',
                'pageNO': str(i)
            }
            content = req.post(url=url, headers=headers,
                               data=data).content.decode('utf-8')
            parse_info(redis, content)
        except BaseException as e:
            util.format_err(e)
            break
    redis.add_to_file(table_name)
    redis.add_university(table_name)
    print("获取华南理工大学数据完成=======================")
def get_muc_recuitment():
    # Minzu University of China
    table_name = 'muc_company_info'

    redis = jedis.jedis()
    redis.clear_list(table_name)

    session = requests.session()
    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',
    }
    session.headers.update(header)
    # weeks 0 through 50
    begin_week = 0
    end_week = 51
    try:
        for i in range(begin_week, end_week):
            get_one_week_data(i, redis, table_name, session)
            print('week ' + str(i) + ' done!')
    except TimeoutError as e:
        print('timeout!')
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
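# get_one_week_data drives both week-based scrapers (MUC above, USTB
# below). The real URL template and parser are not shown in this
# collection, so this is only a shape sketch with a hypothetical WEEK_URL:
def get_one_week_data(week, redis, table_name, session):
    WEEK_URL = 'http://example.edu.cn/recruit?week=%d'  # placeholder, not the real endpoint
    content = session.get(WEEK_URL % week).content.decode('utf-8')
    parse_info(content, redis, table_name)  # parser assumed from the sibling scrapers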
def get_ustbr_recuitment():
    # University of Science and Technology Beijing (USTB)
    table_name = 'ustbr_company_info'

    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)

    session = requests.session()
    header = {
        'Host':
        'job.ustb.edu.cn',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',
        'Referer':
        'http://job.ustb.edu.cn/front/channel.jspa?channelId=766&parentId=763',
    }
    session.headers.update(header)
    # weeks -121 through 50
    begin_week = -121
    end_week = 51
    try:
        for i in range(begin_week, end_week):
            get_one_week_data(i, redis, table_name, session)
            print('week ' + str(i) + ' done!')
            # the site seems to have some anti-crawling measures
            # sleep(1)
    except TimeoutError as e:
        print('timeout!')
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
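# Given the suspected anti-crawling measures noted above, throttling plus
# a simple retry is a reasonable hedge; a sketch (not in the original code):
import time
import requests

def fetch_with_retry(session, url, retries=3, delay=1.0):
    for attempt in range(retries):
        try:
            return session.get(url, timeout=10)
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))  # back off a little more each time
    raise TimeoutError('gave up on ' + url)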
def get_XJTU_recruit():
    # Xi'an Jiaotong University

    table_name = 'xjtu_company_info'
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    # career fairs
    max_page = 516
    # max_page = 20
    for page in range(1, max_page):
        try:
            get_data1(page, redis, table_name)
            print('page ' + str(page) + ' done!')
        except BaseException as e:
            redis.handle_error(e, table_name)

    # recruitment postings
    max_page = 172
    # max_page = 20
    for page in range(1, max_page):
        try:
            get_data1(page, redis, table_name)
            print('page ' + str(page) + ' done!')
        except BaseException as e:
            redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_hit_rescruit():
    base_url = "http://job.hit.edu.cn/index/getZczphData"
    host = "job.hit.edu.cn"
    header = util.get_header(host)
    header['referer'] = "http://job.hit.edu.cn/info?dj=MQ--"
    header['accept'] = "*/*"
    header['X-Requested-With'] = "XMLHttpRequest"
    req = requests.Session()
    header['cookie'] = "JSESSIONID=A36AAA74D82B3F39C3FD2455853EC081"
    req.get("http://job.hit.edu.cn/info?dj=MQ--")
    re = jedis.jedis()
    re.connect_redis()
    # HIT's current careers site launched in September 2016; the loop walks
    # each month from then (September 2016 through October 2017)
    for i in range(0, 14):
        month = 9
        year = 2016
        month = month + i
        if month > 12:
            year = 2017
            month = month - 12

        date = datetime.date(year, month, 1)
        params = {'Month': util.get_month(date)}
        # params = {'Month': '2017-10'}
        params = json.dumps(params)
        print(params)
        res = req.post(headers=header, url=base_url, data=params)
        content = res.content.decode("utf-8")
        # print(content)
        parse_hit_info(content, re)
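# util.get_month evidently formats a date as "YYYY-MM" (see the
# commented-out example above); a one-line sketch under that assumption:
def get_month(date):
    return date.strftime('%Y-%m')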
def get_it_top100():
    url = "http://www.sohu.com/a/162100864_608782"
    req = requests.Session()
    re = jedis.jedis()
    re.clear_list(table_name)
    res = req.get(url=url)
    content = res.content.decode("utf-8")
    parse_info(content, re)
def get_scu_recruit():
    f = open('scu_jy.html', 'r', encoding='utf-8')
    data = f.read()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    parse_info(data, redis, 8, 5300)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_cuit_recruit():
    # Chengdu University of Information Technology
    table_name = 'cuit_company_info'
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    get_data(table_name, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_sufe_recruit():
    host = "career.sufe.edu.cn/"
    headers = util.get_header(host)
    re = jedis.jedis()
    re.clear_list(table_name)
    url = "http://careersys.sufe.edu.cn/pros_jiuye/s/zxh/owebsiteData/recruitmentAndPreaching?callback=&type=list&eachPageRows=600&currentPageno=1&_="
    req = requests.Session()
    res = req.get(headers=headers, url=url)
    content = res.content.decode("utf-8")
    parse_info(content, re)
def get_scc_recuit():
    # Shanghai Customs College
    table_name = 'scc_company_info'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # only one page
    get_data(table_name, redis)

    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_top_500(base_url, page_num, company_type):
    host = "www.fortunechina.com"
    header = util.get_header(host)
    req = requests.Session()
    re = jedis.jedis()
    re.connect_redis()
    for i in range(1, page_num):
        url = base_url + str(i)
        print(i)
        res = req.get(headers=header, url=url).content.decode("utf-8")
        parse_top500(res, re, company_type)
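# Usage sketch (the arguments are hypothetical -- the real paginated list
# URLs on fortunechina.com are not shown in this excerpt):
# get_top_500('http://www.fortunechina.com/...?page=', page_num=3, company_type='world')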
def __init__(self):
    self.re = jedis.jedis().get_re()
    self.USA_company_list = []
    self.China_company_list = []
    self.World_company_list = []
    self.china_top500_dict = {}
    self.world_top500_dict = {}
    self.usa_top500_dict = {}
    self.usa_company_str = ""
    self.china_company_str = ""
    self.world_company_str = ""
def get_raw_data():
    re = jedis.jedis().get_re()
    company_list = []
    temp = re.lrange("intelli_drive", 0, -1)
    company_list.append(temp)
    temp = re.lrange("car_net", 0, -1)
    company_list.append(temp)
    temp = re.lrange("driving_without_man", 0, -1)
    company_list.append(temp)
    temp = re.lrange("intelli_car", 0, -1)
    company_list.append(temp)
    return company_list
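# Note: redis-py returns bytes from lrange unless the connection was
# created with decode_responses=True; callers may therefore need
# something like:
# names = [item.decode('utf-8') for item in get_raw_data()[0]]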
def get_nju_rescruit():
    base_url = "http://job.nju.edu.cn/login/nju/home.jsp?type=zph&DZPHBH=&sfss=sfss&zphzt=&jbksrq=&jbjsrq=&sfgq=&pageSearch=2&pageNow="
    req = requests.Session()
    header = util.get_header("job.nju.edu.cn")
    re = jedis.jedis()
    re.connect_redis()
    for i in range(1, 118):
        print(i)
        content = req.get(headers=header,
                          url=base_url + str(i)).content.decode("utf-8")
        parse_nju_info(content, re)
    re.add_university("nju_company_info")
    print("finish")
def get_csu_recruit():
    # Central South University
    table_name = 'CSU_company_info'
    redis = jedis.jedis()
    max_page = 706
    for i in range(1, max_page):
        try:
            get_one_page_data(i, redis, table_name)
            print('page ' + str(i) + ' done!')
        except Exception as e:
            redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_hust_recruit():
    table_name = 'HUST_company_info'
    re = jedis.jedis()
    max_page = 212
    try:
        # start crawling from page 3
        for i in range(3, max_page):
            get_data(i, re, table_name)
            print('page ' + str(i) + ' done!')
    except BaseException as e:
        re.handle_error(e, table_name)
    re.add_to_file(table_name)
    re.add_university(table_name)
def get_jincheng_recruit():
    base_url = "http://www.scujcc.com.cn/channels/229"
    req = requests.Session()
    content = req.get(base_url + ".html").content.decode("utf-8")
    re = jedis.jedis()
    re.connect_redis()
    parse_jincheng(content, re)
    for i in range(2, 99):
        print(i)
        url = base_url + "_" + str(i) + ".html"
        content = req.get(url).content.decode("utf-8")
        parse_jincheng(content, re)
    re.add_university("jincheng")
    print("finish")
def get_fdu_rescruit():
    host = "www.career.fudan.edu.cn"
    headers = util.get_header(host)
    headers['cookie'] = 'JSESSIONID=0000qZlE0QPPNarjW8SKyrjJPEW:19b14rm85'
    # setting count to a value greater than or equal to the total number of
    # entries fetches all the data in one request
    url = "http://www.career.fudan.edu.cn/jsp/career_talk_list.jsp?count=3000&list=true"
    req = requests.Session()
    re = jedis.jedis()
    re.connect_redis()
    res = req.get(headers=headers, url=url)
    content = res.content.decode("utf-8")
    parse_info(content, re)
    re.add_university(table_name)
    re.add_to_file(table_name)
def get_tsinghua_recruit():
    print("THU Begin ===================================================")
    base_url = "http://career.cic.tsinghua.edu.cn/xsglxt/b/jyxt/anony/jrqzph?callback=jQuery18303533298941862095_1508665403743&_=1508665403779"
    list_url = "http://career.cic.tsinghua.edu.cn/xsglxt/b/jyxt/anony/queryTodayHdList?&callback=&_=&rq="
    req = requests.Session()
    res = req.get(url=base_url)
    content = res.content.decode("utf-8")
    th_infos = content.split("[")[1]
    th_info = th_infos.split("]")[0]
    th_info = "[" + th_info + "]"
    json_info = json.loads(th_info)
    th_info_dict = []

    # de-duplicate consecutive entries by fair name
    item_name = ""
    for item in json_info:
        if item['zphmc'] != item_name:
            th_info_dict.append(item)
            item_name = item['zphmc']
        else:
            continue
    re = jedis.jedis()
    re.connect_redis()
    re.clear_list(table_name)
    for item in th_info_dict:
        # collect the exhibiting companies for job-fair ("就业洽谈会") events
        if item['zphmc'].find("就业洽谈会") != -1:
            rq = util.get_short_date(item['qsrq'])
            print(rq)
            company_list = req.get(url=list_url + str(rq)).content.decode("utf-8")
            json_company_list = json.loads(company_list[1:len(company_list) - 1])
            for info in json_company_list:
                if info['bt'].find("就业洽谈会") != -1:
                    zphid = info['zphid']
                    company_list_detail = parse_tsinghua_info(zphid).split("\t\t\t\t\t\t\t\t\t\t\t\t")
                    # company_list_dict = []
                    for company in company_list_detail:
                        if company.strip() != "":
                            # company_list_dict.append({'date': item['qsrq'], 'company': company.strip()})
                            re.save_info(table_name, item['qsrq'], company.strip())

                            # re.save_infos("thu_company_info", company_list_dict)
                else:
                    continue
        else:
            re.save_info(table_name, item['qsrq'], item['zphmc'])

    re.add_university(table_name)
    re.add_to_file(table_name)
    print("THU Finish ===================================================")
def get_cqu_recruit():
    table_name = 'cqu_company_info'
    re = jedis.jedis()
    url = 'http://www.job.cqu.edu.cn/jyxt/zczphxxlistlogin.do'
    # max_page = 117
    max_page = 10
    try:
        for i in range(1, max_page):
            get_data(url, i, re, table_name)
            print('page ' + str(i) + ' done!')
    except BaseException as e:
        re.handle_error(e, table_name)
    re.add_to_file(table_name)
    re.add_university(table_name)
def get_uestc_recruit():
    table_name = "uestc_company_info"
    re = jedis.jedis()
    max_page_num = 407
    try:
        for i in range(1, max_page_num):
            get_data(i, re, table_name)
            print("page " + str(i) + " done!")
    except BaseException as e:
        # save the data collected so far if we exit unexpectedly
        re.handle_error(e, table_name)
    # save to file
    re.add_to_file(table_name)
    # append the school name to the university list
    re.add_university(table_name)
def get_investment_top100():
    analysis = SmartAnalysisByName()
    url = 'http://www.sohu.com/a/165350436_499106'
    res = requests.get(url).content.decode('utf-8')
    soup = BeautifulSoup(res, 'html5lib')
    company_list = soup.find_all('td')
    table_name = 'investment_top100'
    re = jedis.jedis()
    re.clear_list(table_name)
    for i in range(5, len(company_list), 5):
        company_name = company_list[i + 1].text.strip()
        short_names = analysis.get_jieba_fenci(company_name)
        re.save_info(table_name, company_name, short_names)
    analysis.add_to_file(table_name)
    print(table_name + ' done')
def get_cufe_rescruit():
    base_url = "http://scc.cufe.edu.cn/recruitment-datas/15/"
    url_tail = "/2.html"
    host = "scc.cufe.edu.cn"
    req = requests.Session()
    header = util.get_header(host)
    re = jedis.jedis()
    re.connect_redis()
    max_page_num = 422
    for i in range(1, max_page_num):
        print(i)
        url = base_url + str(i) + url_tail
        res = req.get(headers=header, url=url).content.decode("utf-8")
        parse_info(res, re)
    re.add_university("cufe_company_info")
def get_cuit_recruit():
    # 成都信息工程大学
    print("成都信息工程大学开始================================")
    table_name = 'cuit_company_info'
    try:
        redis = jedis.jedis()
        redis.clear_list(table_name)
        get_data(table_name, redis)
        redis.add_to_file(table_name)
        redis.add_university(table_name)
    except BaseException as e:
        print("Chengdu University of Information Technology:")
        print(e)
    print("Chengdu University of Information Technology finished ================================")
def get_lzu_rescruit():
    base_url = "http://job.lzu.edu.cn/htmlfile/article/list/119/list_"
    url_tail = ".shtml"
    host = "job.lzu.edu.cn"
    header = util.get_header(host)
    max_page_num = 50
    req = requests.Session()
    re = jedis.jedis()
    re.connect_redis()
    for i in range(1, max_page_num + 1):
        url = base_url + str(i) + url_tail
        html = req.get(headers=header, url=url).content.decode("utf-8")
        parse_html(html, re)
        print(i)
    re.add_university("lzu_company_info")
    print("finish")