def get_ecnu_recruitment():
    # East China Normal University
    table_name = 'ecnu_company_info'
    url = 'http://www.career.ecnu.edu.cn/commonpage/ListMessage.aspx?infotype=el'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # Initial ASP.NET postback tokens for the first request
    __VIEWSTATE = 'S+WPBHsLsy12D8przcyE7qpkPYu9cdeAgBUifZLLgj2ut3Fs4gXEqRxDo53bG5pn6gpuyVbz6z4e/ERztf5uEEU/rSTn6GdPhUhZ0yRuyTW45lZpMU2xF2qfExaOT6AmTSx1dHGOPHWYofJSK62Vur+uIMgf4En6eCrhhmsyX72Hsy19sQPEeWC5Qaea6FddgJrFo+GjkukltQrrQho2iDFFYK2HPvceAqdmHVpL1gS16SWRF+oZabG7ptN0brscjEVCS/5sEHvNUrMLwg/9b+osSqda1jJJJTVidnu0yjAAD/JlSZx60O6i5zmHdlgDIHDVyD/oqndryIRowYuo1oRd3cQ0f2qr9yeipbvDXBLRpXA6Z0qPCo6/JoRj6vYQGwLHwqA7SPovunrwM3tO1ZQxeMajIUENxhqzNOSFXNGO60GAFaIunfhe/b7F5sAGBIniqIX2W+U66Np7nAmoqEzaTGZHjadMnEDDheTAg1yXcFZxWKCXtjT5i3aQ6FdMtwpi3U7nX2qHnxBsmMaKqkm86liXgR1WwnUlzf8t8YvT/O/j88lnbcZomMNR1xxpGL3LIkD6XxiyiLtSB9iPY/uKs1mcRWUJxYfbNAHMn7hqTsfBvACUqOP72LTf9QYtgRDPyXv5jp4kIMuV9VhhG/kzggveXUFK1UZ0Oy6tQamYS52lBMp/6F8ibpwAmNEf'
    __EVENTVALIDATION = 'PzbwP/Zoy+Gxtp8jeRMHlsqDlG8FHGyiIB0rnsPeHlR5GCKT3S/ijcmOCnvo+xG7JwLukL/LacFOXZFw/Ksx+KUsoZ5uBc6y8n05Seo7Wade+Y1hYQWQa9JCP2Ftf596eTYB4Q7kpKSm44YMgva6YoHVJtDHgW0rEB2az5xE5eqNtr2nqNUAkubtOxNWgSwSbPHlDtx84OZ27rw2cwClP+qtthyJx2oeH1S/SfIguB74I1who3k+Hc18vSj1+QGOHdP44gI6o3KDbDh4ZkAsT7+lj0uAZGq7ICg9UIgKWkprRoQKnd0QdrpexSij4KvJP3rQ0a8Q1ZV0K4/Fi5tAAXWtL/n0oQOwoxUbD0PUbTBZLY1FrcLH0Gdm6SsS45G9'
    session = get_session()
    page_count = 1
    # Stop condition: the last page holds only one entry
    try:
        while True:
            content = get_content(session, url, __EVENTVALIDATION, __VIEWSTATE)
            __EVENTVALIDATION, __VIEWSTATE = parse_parameters(content)
            end = parse_content(content, redis, table_name)
            print('parse %d page !' % page_count)
            page_count += 1
            if end:
                print('end!')
                break
    except BaseException as e:
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
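
# parse_parameters is not shown in this section. A minimal sketch of what it
# presumably does, assuming the ECNU page is a standard ASP.NET WebForms page
# that carries the next __VIEWSTATE/__EVENTVALIDATION as hidden inputs; the
# body below is an illustration inferred from the call site, not the
# project's actual helper.
from bs4 import BeautifulSoup

def parse_parameters(content):
    """Pull the ASP.NET postback tokens for the next page request
    out of the returned HTML (sketch under the assumptions above)."""
    soup = BeautifulSoup(content, 'html5lib')
    event_validation = soup.find('input', id='__EVENTVALIDATION')['value']
    viewstate = soup.find('input', id='__VIEWSTATE')['value']
    return event_validation, viewstate
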
def get_ouc_recruit():
    print("Fetching Ocean University of China data =====================")
    url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
    host = 'career.ouc.edu.cn'
    headers = util.get_header(host)
    req = requests.Session()
    res = req.get(url=url, headers=headers).content.decode('gbk')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    soup = BeautifulSoup(res, 'html5lib')
    # `pattern` and `table_name` are module-level globals; the match carries
    # the total record count after a fixed 14-character prefix
    total_infos = int(re.findall(pattern, str(soup))[0][14:])
    page_num = total_infos // 20 + 1  # 20 entries per page
    for i in range(1, page_num + 1):
        try:
            if i == 1:
                url = "http://career.ouc.edu.cn/html/zp_info/campus/index.html"
            else:
                url = "http://career.ouc.edu.cn/html/zp_info/campus/" + str(i) + ".html"
            content = req.get(url=url, headers=headers).content.decode('gbk')
            parse_info(content, redis)
        except BaseException as e:
            util.format_err(e)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("Finished fetching Ocean University of China data =====================")
def get_zju_rescruit():
    table_name = "zju_company_info_2018"
    base_url = "http://www.career.zju.edu.cn/ejob/zczphxxmorelogin.do"
    params = {
        'zphix': 0,
        'dwmc': '',
        'hylb': '',
        'zphrq': '',
        'pages.pageSize': 30,
        'pages.currentPage': 0,
        'pages.maxPage': 20,
        'pageno': ''
    }
    req = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    for i in range(1, 23):
        print(i)
        params['pages.currentPage'] = i
        res = req.post(base_url, data=params)
        content = res.content.decode("GBK")
        parse_info(content, redis, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_consult_top100():
    url = 'https://www.sohu.com/a/160219346_169875'
    content = requests.get(url).content.decode("utf-8")
    soup = BeautifulSoup(content, "html5lib")
    company_list = soup.find_all('p')
    com_info = []
    analysis = SmartAnalysisByName()
    redis = jedis.jedis()
    table_name = "best_consulting_company_info"
    # Paragraphs 8-98 hold the ranking; keep only the numbered entries
    for i in range(8, 99):
        info = company_list[i].text.strip()
        if info.find('.') != -1:
            com_info.append(info)
    print(len(com_info))
    for i in range(len(com_info)):
        print("=========================================")
        print(i)
        print(com_info[i])
        # Slice off the rank prefix and the trailing score; the prefix width
        # varies with how the source page formats each rank range
        if i < 10 or (50 <= i < 59) or (75 <= i < 86):
            com_info[i] = com_info[i][2:-5].strip()
        else:
            com_info[i] = com_info[i][3:-5].strip()
        print(com_info[i])
    # jieba word segmentation for candidate short names
    for item in com_info:
        short_names = analysis.get_jieba_fenci(item)
        redis.save_dict(table_name, data=dict(company_name=item, short_name=short_names))
    analysis.add_to_file(table_name)
    print("Finished fetching consulting industry rankings")
def get_zzu_recruit():
    url = "http://job.zzu.edu.cn:9009/service/business/college/jobfair/jobFairInfo/getCalendarInfo.xf"
    req = requests.Session()
    host = 'job.zzu.edu.cn:9009'
    headers = util.get_header(host)
    headers['referer'] = 'http://job.zzu.edu.cn/p/page/jobCalendar.html?channel_code=XJH&type=0'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    year = 2018
    # Walk backwards month by month, from December 2017 to January 2012
    for i in range(72, 0, -1):
        month = i % 12
        if month == 0:
            year = year - 1
            month = 12
        params = {
            'remark': '0',
            'year': str(year),
            'month': str(month)
        }
        print(params)
        res = req.post(url=url, headers=headers, data=params)
        content = res.content.decode('utf-8')
        parse_info(content, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_scut_recuit():
    print("Fetching South China University of Technology data =====================")
    url = "http://jyzx.6ihnep7.cas.scut.edu.cn/jyzx/xs/zpxx/xyxj/"
    req = requests.Session()
    headers = util.get_header(host='jyzx.6ihnep7.cas.scut.edu.cn')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, 61):
        try:
            data = {
                'pageNo': '60',
                'daoxv1': '0',
                'entName': '',
                'time': '-1',
                'pageNO': str(i)  # the page index; the endpoint distinguishes 'pageNo' and 'pageNO'
            }
            content = req.post(url=url, headers=headers, data=data).content.decode('utf-8')
            parse_info(redis, content)
        except BaseException as e:
            util.format_err(e)
            break
    redis.add_to_file(table_name)
    redis.add_university(table_name)
    print("Finished fetching South China University of Technology data =======================")
def get_muc_recuitment():
    # Minzu University of China
    table_name = 'muc_company_info'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    session = requests.session()
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',
    }
    session.headers.update(header)
    # Weeks 0 through 50
    begin_week = 0
    end_week = 51
    try:
        for i in range(begin_week, end_week):
            get_one_week_data(i, redis, table_name, session)
            print('week ' + str(i) + ' done!')
    except TimeoutError as e:
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_ustbr_recuitment():
    # University of Science and Technology Beijing
    table_name = 'ustbr_company_info'
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    session = requests.session()
    header = {
        'Host': 'job.ustb.edu.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',
        'Referer': 'http://job.ustb.edu.cn/front/channel.jspa?channelId=766&parentId=763',
    }
    session.headers.update(header)
    # Weeks -121 through 50
    begin_week = -121
    end_week = 51
    try:
        for i in range(begin_week, end_week):
            get_one_week_data(i, redis, table_name, session)
            print('week ' + str(i) + ' done!')
            # The site seems to have an anti-crawling mechanism
            # sleep(1)
    except TimeoutError as e:
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_XJTU_recruit():
    # Xi'an Jiaotong University
    table_name = 'xjtu_company_info'
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    # Job fairs
    max_page = 516
    # max_page = 20
    for page in range(1, max_page):
        try:
            get_data1(page, redis, table_name)
            print('page ' + str(page) + ' done!')
        except BaseException as e:
            redis.handle_error(e, table_name)
    # Recruitment postings (note: this loop also calls get_data1, only with a
    # different page count)
    max_page = 172
    # max_page = 20
    for page in range(1, max_page):
        try:
            get_data1(page, redis, table_name)
            print('page ' + str(page) + ' done!')
        except BaseException as e:
            redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_hit_rescruit():
    base_url = "http://job.hit.edu.cn/index/getZczphData"
    host = "job.hit.edu.cn"
    header = util.get_header(host)
    header['referer'] = "http://job.hit.edu.cn/info?dj=MQ--"
    header['accept'] = "*/*"
    header['X-Requested-With'] = "XMLHttpRequest"
    req = requests.Session()
    header['cookie'] = "JSESSIONID=A36AAA74D82B3F39C3FD2455853EC081"
    req.get("http://job.hit.edu.cn/info?dj=MQ--")
    redis = jedis.jedis()
    redis.connect_redis()
    # HIT's current careers site has data from September 2016 onward; walk
    # forward month by month (14 iterations: 2016-09 through 2017-10)
    for i in range(0, 14):
        month = 9 + i
        year = 2016
        if month > 12:
            year = 2017
            month = month - 12
        date = datetime.date(year, month, 1)
        params = {'Month': util.get_month(date)}
        # e.g. params = {'Month': '2017-10'}
        params = json.dumps(params)
        print(params)
        res = req.post(headers=header, url=base_url, data=params)
        content = res.content.decode("utf-8")
        parse_hit_info(content, redis)
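
# util.get_month is not shown in this section; judging from the commented
# example above ({'Month': '2017-10'}), it presumably formats a date as
# 'YYYY-MM'. A one-line sketch under that assumption:
def get_month(date):
    """Hypothetical helper: render a datetime.date as the 'YYYY-MM'
    string the HIT endpoint expects."""
    return date.strftime('%Y-%m')
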
def get_it_top100():
    url = "http://www.sohu.com/a/162100864_608782"
    req = requests.Session()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    res = req.get(url=url)
    content = res.content.decode("utf-8")
    parse_info(content, redis)
def get_scu_recruit():
    # Parse a locally saved copy of the SCU careers page
    with open('scu_jy.html', 'r', encoding='utf-8') as f:
        data = f.read()
    redis = jedis.jedis()
    redis.clear_list(table_name)
    parse_info(data, redis, 8, 5300)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_cuit_recruit():
    # Chengdu University of Information Technology
    table_name = 'cuit_company_info'
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    get_data(table_name, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_sufe_recruit():
    host = "career.sufe.edu.cn"
    headers = util.get_header(host)
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # eachPageRows=600 fetches everything in one request
    url = ("http://careersys.sufe.edu.cn/pros_jiuye/s/zxh/owebsiteData/"
           "recruitmentAndPreaching?callback=&type=list&eachPageRows=600"
           "&currentPageno=1&_=")
    req = requests.Session()
    res = req.get(headers=headers, url=url)
    content = res.content.decode("utf-8")
    parse_info(content, redis)
def get_scc_recuit():
    # Shanghai Customs College
    table_name = 'scc_company_info'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # The site has only one page
    get_data(table_name, redis)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_top_500(base_url, page_num, company_type):
    host = "www.fortunechina.com"
    header = util.get_header(host)
    req = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    for i in range(1, page_num):
        url = base_url + str(i)
        print(i)
        res = req.get(headers=header, url=url).content.decode("utf-8")
        parse_top500(res, redis, company_type)
def __init__(self):
    self.re = jedis.jedis().get_re()
    self.USA_company_list = []
    self.China_company_list = []
    self.World_company_list = []
    self.china_top500_dict = {}
    self.world_top500_dict = {}
    self.usa_top500_dict = {}
    self.usa_company_str = ""
    self.china_company_str = ""
    self.world_company_str = ""
def get_raw_data():
    # Collect the four auto-industry lists straight from Redis
    client = jedis.jedis().get_re()
    company_list = []
    for key in ('intelli_drive', 'car_net', 'driving_without_man', 'intelli_car'):
        company_list.append(client.lrange(key, 0, -1))
    return company_list
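
# Every crawler in this module leans on the same jedis.jedis wrapper, whose
# source is not part of this section. The sketch below reconstructs the
# interface from the call sites alone; the redis-py backing, the host/port,
# the record format, and every method body are assumptions, not the
# project's actual implementation.
import redis as redis_py

class jedis:
    """Thin Redis helper, reconstructed from usage above (illustrative)."""

    def __init__(self):
        self.re = None
        self.connect_redis()

    def connect_redis(self):
        self.re = redis_py.StrictRedis(host='localhost', port=6379,
                                       decode_responses=True)

    def get_re(self):
        # Expose the raw redis-py client (see get_raw_data above)
        return self.re

    def clear_list(self, table_name):
        # Drop any previous crawl of this list
        self.re.delete(table_name)

    def save_info(self, table_name, key, value):
        # Append one record to the list (tab-separated is an assumption)
        self.re.rpush(table_name, '%s\t%s' % (key, value))

    def add_university(self, table_name):
        # Register the table in the global index of crawled schools
        self.re.sadd('university_list', table_name)

    def add_to_file(self, table_name):
        # Dump the list to a local file as a crude backup
        with open(table_name + '.txt', 'w', encoding='utf-8') as f:
            for item in self.re.lrange(table_name, 0, -1):
                f.write(str(item) + '\n')

    def handle_error(self, e, table_name):
        # On failure, persist whatever was collected so far
        print(e)
        self.add_to_file(table_name)
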
def get_nju_rescruit():
    base_url = ("http://job.nju.edu.cn/login/nju/home.jsp?type=zph&DZPHBH=&sfss=sfss"
                "&zphzt=&jbksrq=&jbjsrq=&sfgq=&pageSearch=2&pageNow=")
    req = requests.Session()
    header = util.get_header("job.nju.edu.cn")
    redis = jedis.jedis()
    redis.connect_redis()
    for i in range(1, 118):
        print(i)
        content = req.get(headers=header, url=base_url + str(i)).content.decode("utf-8")
        parse_nju_info(content, redis)
    redis.add_university("nju_company_info")
    print("finish")
def get_csu_recruit():
    # Central South University
    table_name = 'CSU_company_info'
    redis = jedis.jedis()
    max_page = 706
    for i in range(1, max_page):
        try:
            get_one_page_data(i, redis, table_name)
            print('page ' + str(i) + ' done!')
        except Exception as e:
            redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_hust_recruit():
    table_name = 'HUST_company_info'
    redis = jedis.jedis()
    max_page = 212
    try:
        # Start crawling from page 3
        for i in range(3, max_page):
            get_data(i, redis, table_name)
            print('page ' + str(i) + ' done!')
    except BaseException as e:
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_jincheng_recruit():
    base_url = "http://www.scujcc.com.cn/channels/229"
    req = requests.Session()
    content = req.get(base_url + ".html").content.decode("utf-8")
    redis = jedis.jedis()
    redis.connect_redis()
    parse_jincheng(content, redis)
    # Pages 2 onward use the "229_<n>.html" naming scheme
    for i in range(2, 99):
        print(i)
        url = base_url + "_" + str(i) + ".html"
        content = req.get(url).content.decode("utf-8")
        parse_jincheng(content, redis)
    redis.add_university("jincheng")
    print("finish")
def get_fdu_rescruit():
    host = "www.career.fudan.edu.cn"
    headers = util.get_header(host)
    headers['cookie'] = 'JSESSIONID=0000qZlE0QPPNarjW8SKyrjJPEW:19b14rm85'
    # Setting count to a value >= the total number of records returns
    # everything in a single request
    url = "http://www.career.fudan.edu.cn/jsp/career_talk_list.jsp?count=3000&list=true"
    req = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    res = req.get(headers=headers, url=url)
    content = res.content.decode("utf-8")
    parse_info(content, redis)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_tsinghua_recruit():
    print("THU Begin ===================================================")
    base_url = ("http://career.cic.tsinghua.edu.cn/xsglxt/b/jyxt/anony/jrqzph"
                "?callback=jQuery18303533298941862095_1508665403743&_=1508665403779")
    list_url = "http://career.cic.tsinghua.edu.cn/xsglxt/b/jyxt/anony/queryTodayHdList?&callback=&_=&rq="
    req = requests.Session()
    res = req.get(url=base_url)
    content = res.content.decode("utf-8")
    # The endpoint returns JSONP; cut the JSON array out of the wrapper
    th_infos = content.split("[")[1]
    th_info = th_infos.split("]")[0]
    th_info = "[" + th_info + "]"
    json_info = json.loads(th_info)
    th_info_dict = []
    # Deduplicate consecutive entries with the same fair name
    item_name = ""
    for item in json_info:
        if item['zphmc'] != item_name:
            th_info_dict.append(item)
            item_name = item['zphmc']
    redis = jedis.jedis()
    redis.connect_redis()
    redis.clear_list(table_name)
    for item in th_info_dict:
        # For job negotiation fairs (就业洽谈会), fetch the exhibiting companies
        if item['zphmc'].find("就业洽谈会") != -1:
            rq = util.get_short_date(item['qsrq'])
            print(rq)
            company_list = req.get(url=list_url + str(rq)).content.decode("utf-8")
            json_company_list = json.loads(company_list[1:len(company_list) - 1])
            for info in json_company_list:
                if info['bt'].find("就业洽谈会") != -1:
                    zphid = info['zphid']
                    company_list_detail = parse_tsinghua_info(zphid).split("\t\t\t\t\t\t\t\t\t\t\t\t")
                    for company in company_list_detail:
                        # Skip whitespace-only fragments
                        if company.strip():
                            redis.save_info(table_name, item['qsrq'], company.strip())
        else:
            redis.save_info(table_name, item['qsrq'], item['zphmc'])
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("THU Finish ===================================================")
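
# The split("[")/split("]") unwrapping above breaks if the JSON payload
# itself contains brackets. A more robust way to strip a JSONP wrapper is to
# take everything between the first '(' and the last ')'. This helper is a
# sketch for illustration, not part of the crawler:
import json

def strip_jsonp(payload):
    """Strip a JSONP wrapper like callback({...}); and parse the body."""
    start = payload.index('(') + 1
    end = payload.rindex(')')
    return json.loads(payload[start:end])
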
def get_cqu_recruit():
    table_name = 'cqu_company_info'
    redis = jedis.jedis()
    url = 'http://www.job.cqu.edu.cn/jyxt/zczphxxlistlogin.do'
    # max_page = 117
    max_page = 10
    try:
        for i in range(1, max_page):
            get_data(url, i, redis, table_name)
            print('page ' + str(i) + ' done!')
    except BaseException as e:
        redis.handle_error(e, table_name)
    redis.add_to_file(table_name)
    redis.add_university(table_name)
def get_uestc_recruit():
    table_name = "uestc_company_info"
    redis = jedis.jedis()
    max_page_num = 407
    try:
        for i in range(1, max_page_num):
            get_data(i, redis, table_name)
            print("page " + str(i) + " done!")
    except BaseException as e:
        # On an unexpected exit, save the collected data to file
        redis.handle_error(e, table_name)
    # Persist to file
    redis.add_to_file(table_name)
    # Register the school in the university list
    redis.add_university(table_name)
def get_investment_top100():
    analysis = SmartAnalysisByName()
    url = 'http://www.sohu.com/a/165350436_499106'
    res = requests.get(url).content.decode('utf-8')
    soup = BeautifulSoup(res, 'html5lib')
    company_list = soup.find_all('td')
    table_name = 'investment_top100'
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # The ranking table has five cells per row; the company name sits in the
    # second cell of each row
    for i in range(5, len(company_list), 5):
        company_name = company_list[i + 1].text.strip()
        short_names = analysis.get_jieba_fenci(company_name)
        redis.save_info(table_name, company_name, short_names)
    analysis.add_to_file(table_name)
    print(table_name + ' done')
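
# get_jieba_fenci is not shown in this section; judging by its use here and
# in get_consult_top100, it segments a company name with jieba to produce
# candidate short names. A minimal sketch under that assumption; the
# filtering rule is illustrative, not the project's actual logic:
import jieba

def get_jieba_fenci(company_name):
    """Segment a company name and keep multi-character tokens as
    candidate abbreviations (hypothetical sketch)."""
    tokens = jieba.lcut(company_name)
    # Single characters are rarely useful as a short name
    return [t for t in tokens if len(t) > 1]
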
def get_cufe_rescruit():
    base_url = "http://scc.cufe.edu.cn/recruitment-datas/15/"
    url_tail = "/2.html"
    host = "scc.cufe.edu.cn"
    req = requests.Session()
    header = util.get_header(host)
    redis = jedis.jedis()
    redis.connect_redis()
    max_page_num = 422
    for i in range(1, max_page_num):
        print(i)
        url = base_url + str(i) + url_tail
        res = req.get(headers=header, url=url).content.decode("utf-8")
        parse_info(res, redis)
    redis.add_university("cufe_company_info")
def get_cuit_recruit():
    # Chengdu University of Information Technology
    print("Chengdu University of Information Technology begin ================================")
    table_name = 'cuit_company_info'
    try:
        redis = jedis.jedis()
        redis.clear_list(table_name)
        get_data(table_name, redis)
        redis.add_to_file(table_name)
        redis.add_university(table_name)
    except BaseException as e:
        print("Chengdu University of Information Technology:")
        print(e)
    print("Chengdu University of Information Technology done ================================")
def get_lzu_rescruit():
    base_url = "http://job.lzu.edu.cn/htmlfile/article/list/119/list_"
    url_tail = ".shtml"
    host = "job.lzu.edu.cn"
    header = util.get_header(host)
    max_page_num = 50
    req = requests.Session()
    redis = jedis.jedis()
    redis.connect_redis()
    for i in range(1, max_page_num + 1):
        url = base_url + str(i) + url_tail
        html = req.get(headers=header, url=url).content.decode("utf-8")
        parse_html(html, redis)
        print(i)
    redis.add_university("lzu_company_info")
    print("finish")