def calculate_activity_degree(self):
    """Count recruitment records per date for every university, grouped by tier.

    For each table name returned by ``self.get_university_list()`` the
    ``_company_info`` suffix is stripped to obtain the key into
    ``UNIVERSITY_INFO``; element ``[1]`` of that entry selects which per-tier
    counter dict the table's records are tallied into via
    ``self.get_company_num_in_diff_date``. Entries with an unrecognised tier
    are reported through ``util.format_err``. Once all tables are processed
    the counters are persisted with ``self.save_date_dict()``.
    """
    suffix_len = len('_company_info')
    # Tier label -> (name of the counter attribute on self, type key passed on).
    # Attribute names are resolved lazily so only the needed dict is touched.
    tier_targets = {
        'C9': ('c9_company_date_dict', "c9"),
        '985': ('p985_company_date_dict', "p985"),
        '211': ('p211_company_date_dict', "p211"),
        '一本': ('top_company_date_dict', "top"),
        '二本': ('basic_company_date_dict', "basic"),
    }
    for table in self.get_university_list():
        info = UNIVERSITY_INFO[table[:-suffix_len]]
        records = self.re.lrange(table, 0, -1)
        target = tier_targets.get(info[1])
        if target is None:
            util.format_err(info)
            continue
        counter_attr, type_key = target
        self.get_company_num_in_diff_date(
            records, getattr(self, counter_attr), type_key)
    self.save_date_dict()
    print("finish")
def get_ouc_recruit():
    """Download every campus-recruitment listing page of career.ouc.edu.cn.

    The index page is fetched first to read the total number of entries
    (extracted with the module-level ``pattern``), the target list
    ``table_name`` is cleared, and each listing page is handed to
    ``parse_info``. A failing page is reported and skipped; the remaining
    pages are still processed.
    """
    print("开始获取中国海洋大学数据=====================")
    base_url = "http://career.ouc.edu.cn/html/zp_info/campus/"
    index_url = base_url + "index.html"
    host = 'career.ouc.edu.cn'
    headers = util.get_header(host)
    req = requests.Session()
    res = req.get(url=index_url, headers=headers).content.decode('gbk')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    soup = BeautifulSoup(res, 'html5lib')
    total_infos = int(re.findall(pattern, str(soup))[0][14:])
    # 20 entries per page: round up instead of "// 20 + 1", which asked for a
    # non-existent extra page whenever the total was a multiple of 20.
    # The index page is always processed, even if the counter reads 0.
    page_num = max(1, -(-total_infos // 20))
    for i in range(1, page_num + 1):
        try:
            if i == 1:
                # Page 1 is the index page that was already downloaded above.
                content = res
            else:
                url = base_url + str(i) + ".html"
                content = req.get(url=url, headers=headers).content.decode('gbk')
            parse_info(content, redis)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still stop the run.
            util.format_err(e)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("获取中国海洋大学数据完成=====================")
def print_and_save_result2(self, result_dict, filename):
    """Print ``result_dict``, sort it by timestamp and save the recent part.

    ``result_dict`` maps a numeric timestamp (as a string) to a value. Every
    pair is printed and collected into ``self.data_array`` as
    ``{'name': key, 'value': value}``, sorted ascending by ``float(name)``.
    Entries whose timestamp is at least ``1356969600`` get their ``name``
    replaced by ``util.get_standard_time_from_mktime(...)`` and are appended
    to ``self.data_array_range``, which is then passed to
    ``self.save_result2(filename, ...)``.

    NOTE(review): ``self.data_array_range`` is appended to but never reset
    here, so repeated calls accumulate — confirm this is intended.
    """
    # Per the original author's note: data before 2013-01-01 is discarded.
    cutoff = 1356969600
    self.data_array = []
    for key, values in result_dict.items():
        print('--------------------------------------')
        try:
            print(key, values)
            self.data_array.append(dict(name=key, value=values))
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
            continue
    self.data_array = sorted(self.data_array, key=lambda x: float(x['name']))
    for item in self.data_array:
        stamp = int(float(item['name']))
        if stamp < cutoff:
            continue
        # The dict is updated in place, so self.data_array sees the new name too.
        item['name'] = util.get_standard_time_from_mktime(stamp)
        self.data_array_range.append(item)
    self.save_result2(filename, self.data_array_range)
def get_scut_recuit():
    """Download the recruitment listing pages of the SCUT career site.

    Clears the list ``table_name``, then POSTs the listing form for pages
    1..60 and hands each decoded response to ``parse_info``. The first
    failing page is reported and ends the loop. Afterwards the table is
    registered via ``redis.add_to_file`` and ``redis.add_university``.
    """
    print("开始获取华南理工大学数据=====================")
    url = "http://jyzx.6ihnep7.cas.scut.edu.cn/jyzx/xs/zpxx/xyxj/"
    req = requests.Session()
    headers = util.get_header(host='jyzx.6ihnep7.cas.scut.edu.cn')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, 61):
        try:
            # NOTE(review): the form carries both 'pageNo' (fixed '60') and
            # 'pageNO' (the page being requested); kept exactly as found.
            data = {
                'pageNo': '60',
                'daoxv1': '0',
                'entName': '',
                'time': '-1',
                'pageNO': str(i),
            }
            content = req.post(url=url, headers=headers,
                               data=data).content.decode('utf-8')
            parse_info(redis, content)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
            break
    redis.add_to_file(table_name)
    redis.add_university(table_name)
    print("获取华南理工大学数据完成=======================")
def parse_info(html, redis):
    """Store every entry of an already-decoded JSON listing.

    ``html`` is a mapping whose ``'data'`` item is a sequence of entries;
    each entry provides ``'startTime'`` (divided by 1000 before being passed
    to ``datetime.fromtimestamp``) and ``'name'``. The resulting
    ``(date, name)`` pair is printed and saved with ``redis.save_info`` under
    ``table_name``. A malformed entry is reported and skipped.
    """
    for entry in html['data']:
        try:
            timestamp = entry['startTime'] / 1000
            date = str(datetime.fromtimestamp(timestamp).date())
            company_name = entry['name']
            print(date, company_name)
            redis.save_info(table_name, date, company_name)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
            continue
def parse_recruit_info(redis, content, date, id):
    """Extract company names from one job-fair page and store them under ``date``.

    Every link whose href matches ``/recruitment/company`` is saved first.
    Three fair ids get extra handling because their company names sit in
    styled elements rather than links:

    * 62 and 69: elements with the 14px style, read at fixed index positions;
      the walk stops at the first failure (e.g. running past the end).
    * 68: elements with ``font-size: 19px``, every second element.

    Prints ``"failed"`` when the most recent lookup returned nothing.
    """
    soup = BeautifulSoup(content, "html5lib")
    company_list = soup.find_all(href=re.compile("/recruitment/company"))
    print(date)
    for item in company_list:
        redis.save_info(table_name, date, item.text.strip())
    print("获取双选会完成")

    fair_id = int(id)
    # Fair id -> index positions to walk (the name is read from index i + 1).
    # 62: "2018 large job fair", 69: "large job fair" (per the original notes).
    indexed_fairs = {
        62: range(13, 211, 2),
        69: range(14, 179, 2),
    }
    if fair_id in indexed_fairs:
        company_list = soup.find_all(
            attrs={
                'style':
                "font-size:14px;font-family:'微软雅黑',sans-serif;color:#666666"
            })
        for i in indexed_fairs[fair_id]:
            try:
                company_name = company_list[i + 1].text.strip()
                print(company_name)
                redis.save_info(table_name, date, company_name)
            except Exception as e:
                # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
                util.format_err(e)
                break
    elif fair_id == 68:
        # "State-owned enterprise job fair" (per the original note).
        company_list = soup.find_all(attrs={'style': 'font-size: 19px'})
        for i in range(0, len(company_list) - 1, 2):
            company_name = company_list[i + 1].text.strip()
            redis.save_info(table_name, date, company_name)

    if len(company_list) == 0:
        print("failed")
def print_and_save_result(self, result_dict, filename):
    """Print a per-university summary and save it via ``self.save_result``.

    ``result_dict`` maps a table name ending in ``_company_info`` to a
    sequence of strings. For each pair one entry
    ``{'name': UNIVERSITY_INFO[...], 'data': values, 'total_num': ...}`` is
    appended to ``self.data_array`` (reset at the start of the call);
    ``total_num`` comes from ``self.university_company_list_length_dict``.
    A key missing from either lookup is reported and skipped.
    """
    suffix_len = len('_company_info')
    self.data_array = []
    for key, values in result_dict.items():
        print('--------------------------------------')
        print(key + ":" + str(len(values)) + " ".join(values))
        university_short_name = key[:-suffix_len]
        try:
            university = UNIVERSITY_INFO[university_short_name]
            print(university[0] + ":" + str(len(values)))
            self.data_array.append(
                dict(name=university,
                     data=values,
                     total_num=self.university_company_list_length_dict[key]))
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
            continue
    self.save_result(filename)
def parse_info(html, redis):
    """Parse one HTML listing page and store its (date, company) pairs.

    Only the ``div.r_list1`` containers are considered. Within them every
    ``<span>`` is taken as a date and the link at the same position (href
    matching ``articledetail?t.PostId=``) as the company. Slashes in the date
    become dashes. Rows whose title contains ``取消`` ("cancelled") or whose
    date is empty are not stored. A row that fails — for instance because
    there are fewer links than spans — is reported and skipped.
    """
    bf = BeautifulSoup(html, 'lxml')
    containers = bf.find_all('div', class_='r_list1')
    listing = BeautifulSoup(str(containers), 'lxml')
    date_list = listing.find_all('span')
    # Raw string: '\?' in a plain string literal is an invalid escape sequence.
    company_list = listing.find_all(
        href=re.compile(r'articledetail\?t.PostId='))
    for i, date_tag in enumerate(date_list):
        try:
            date = date_tag.text.replace('/', '-')
            company_name = company_list[i].text.strip()
            if '取消' not in company_name and date != '':
                redis.save_info(table_name, date, company_name)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
            continue
def parse_info(redis, content):
    """Parse one HTML listing page and store its (date, company) pairs.

    Links whose href matches the notice-detail pattern are paired by position
    with the elements selected by ``.date``. The first ten characters of the
    date text are kept, and the row is stored only when that text matches the
    module-level ``pattern``. A failing row is reported and skipped.
    """
    soup = BeautifulSoup(content, 'html5lib')
    # NOTE(review): '.' and the trailing '?' are regex metacharacters here,
    # so this matches more loosely than a literal "noticeDetail.jsp?".
    # Left unchanged to keep matching exactly the same links.
    company_list = soup.find_all(
        href=re.compile('/jyzx/newSystem/noticeDetail.jsp?'))
    date_list = soup.select('.date')
    for i, link in enumerate(company_list):
        try:
            date = date_list[i].text.strip()[:10]
            company_name = link.text.strip()
            if pattern.match(date):
                print(date, company_name)
                redis.save_info(table_name, date, company_name)
                print("=====")
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
def get_2017_company_list(self, university_table_name):
    """Return the stored records of one table whose date mentions 2017.

    Every element of ``self.re.lrange(university_table_name, 0, -1)`` is a
    dict written with single quotes; the quotes are swapped for double quotes
    so it can be parsed as JSON. Records whose ``'date'`` contains ``'2017'``
    are kept. A record that cannot be parsed is reported and skipped.
    ``self.count`` is incremented once per call and printed.

    Returns:
        A tuple ``(records, len(records))``.
    """
    company_list = self.re.lrange(university_table_name, 0, -1)
    company_list_2017 = []
    for item in company_list:
        try:
            item = item.replace('\'', '"')
            item = json.loads(item)
            if '2017' in item['date']:
                company_list_2017.append(item)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e, university_table_name, item)
            continue
    print("Finish to find 2017 Recruitment-->" + university_table_name)
    self.count += 1
    print(self.count)
    return company_list_2017, len(company_list_2017)
def get_bnu_recuit():
    """Download the recruitment listing pages of career.bnu.edu.cn.

    Clears the list ``table_name``, then requests pages 1..81 of the query
    endpoint and hands each JSON-decoded response to ``parse_info``. A
    failing page is reported and the loop continues; a progress line is
    printed for every page either way. Finally the table is registered via
    ``redis.add_university`` and ``redis.add_to_file``.
    """
    print("开始获取北京师范大学数据=====================")
    url = "http://career.bnu.edu.cn/front/zp_query/zphQuery.jspa?"
    host = "career.bnu.edu.cn"
    headers = util.get_header(host=host)
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, 82):  # 81 pages in total
        try:
            params = {'paramMap.xxlx': '1', 'page.curPage': '%d' % i}
            # The endpoint answers with JSON.
            html = requests.get(url=url, headers=headers, params=params).json()
            parse_info(html, redis)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
        finally:
            print('获取北京师范大学第 %d 页(共81页)数据完成' % i)
    redis.add_university(table_name)  # register the university
    redis.add_to_file(table_name)  # record the table in the file
def parse_info(content, redis):
    """Parse a JSON calendar payload and store every listed event.

    ``content`` is a JSON string; ``result.trList`` holds rows, each with a
    ``tdList`` of day cells. A cell that is ``None``, lacks ``careerList`` or
    has ``careerList`` set to ``None`` is skipped. For every event,
    ``mergeStartTime`` and ``mergeTitle`` are printed and saved with
    ``redis.save_info`` under ``table_name``. If an event fails, the error is
    reported and the remaining events of that same cell are skipped.
    """
    content = json.loads(content)
    trList = content['result']['trList']
    print(trList)
    for item in trList:
        for day in item['tdList']:
            if day is None or 'careerList' not in day:
                continue
            careerList = day['careerList']
            if careerList is None:
                continue
            try:
                for data in careerList:
                    date = data['mergeStartTime']
                    company = data['mergeTitle']
                    print(date, company)
                    redis.save_info(table_name, date, company)
            except Exception as e:
                # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
                util.format_err(e)
def get_hnu_recuit():
    """Download the recruitment listing pages of scc.hnu.edu.cn.

    Clears the list ``table_name``, then requests pages 1..310 and hands each
    response body to ``parse_info``. The first failing page is reported and
    ends the loop; a progress line is printed for every attempted page.
    Finally the table is registered via ``redis.add_university`` and
    ``redis.add_to_file``.
    """
    print("开始获取湖南大学数据=====================")
    url = "http://scc.hnu.edu.cn/newsjob!getMore.action?"
    host = "scc.hnu.edu.cn"
    headers = util.get_header(host=host)
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # 310 pages in total (original note: page 102 and earlier are from 2017).
    # The upper bound is exclusive, so 311 is needed to include page 310;
    # the previous bound of 310 silently dropped the last page.
    for i in range(1, 311):
        try:
            params = {'p.currentPage': '%d' % i, 'Lb': '1'}
            html = requests.get(url=url, headers=headers, params=params).text
            parse_info(html, redis)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
            break
        finally:
            print('获取湖南大学第 %d 页(共310页)数据完成' % i)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
def get_company_num_in_diff_date(self, company_list, result_dict, type):
    """Count the records of ``company_list`` per date.

    Each record is a dict written with single quotes; the quotes are swapped
    for double quotes so it can be parsed as JSON. Its ``'date'`` is converted
    with ``util.get_mktime`` and used as follows:

    * ``result_dict[str(stamp)]`` is incremented (created at 1 if missing);
    * if ``str(int(float(stamp)))`` is already a key of
      ``self.data_array_each_day_dict``, that entry's ``[type]`` counter is
      incremented as well.

    A record that fails at any step is reported and skipped.
    """
    for company in company_list:
        try:
            record = json.loads(company.replace('\'', '\"'))
            date_time = record['date']
            print(date_time)
            # Convert the textual date into a timestamp.
            date_time = util.get_mktime(date_time)
            stamp_key = str(date_time)
            result_dict[stamp_key] = result_dict.get(stamp_key, 0) + 1
            day_key = str(int(float(date_time)))
            if day_key in self.data_array_each_day_dict:
                self.data_array_each_day_dict[day_key][type] += 1
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e)
def get_top_500_list(self):
    """Sort the stored ``company_info`` records into the three Top-500 lists.

    Each record is a dict written with single quotes. To parse it as JSON,
    single quotes become double quotes and the ``==`` placeholder becomes an
    apostrophe; after parsing, ``++`` in the company name becomes a double
    quote. The name is appended to ``self.USA_company_list``,
    ``self.China_company_list`` or ``self.World_company_list`` depending on
    ``company_type``; other types are ignored. A record that fails is
    reported and skipped.
    """
    # company_type -> name of the list attribute that collects it.
    targets = {
        "USATop500": "USA_company_list",
        "ChinaTop500": "China_company_list",
        "WorldTop500": "World_company_list",
    }
    company_info = self.re.lrange("company_info", 0, -1)
    for item in company_info:
        try:
            item = item.replace('\'', '"')
            item = item.replace('==', '\'')
            item = json.loads(item)
            company_name = item['company_name'].replace('++', '\"')
            company_type = item['company_type']
            target = targets.get(company_type)
            if target is not None:
                getattr(self, target).append(company_name)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
            util.format_err(e, item)
            continue
def parse_info(content, redis, page):
    """Parse one listing page whose dates lack a year and store the entries.

    Links whose href matches ``/gzujobs/client/jobsinfor/`` are paired by
    position with the elements selected by ``.time``. The two-digit year is
    derived from the link text:

    * ``page < 128``: the first ``pattern2`` match (outer characters
      stripped) is read as a number; above 624 means ``'17'``, otherwise
      ``'16'``. Without a ``pattern2`` match, the ``pattern3`` match is used
      as the year directly.
    * otherwise: the first two characters of an issue marker such as
      ``NN-第N期`` or, failing that, ``NN-N期``.

    ``pattern2`` and ``pattern3`` are module-level patterns. Entries for
    which no year can be found are reported and skipped. The stored date is
    ``'20' + year + '-' + <time text without its first and last character>``;
    the stored name is the link text up to the first ``(``.
    """
    soup = BeautifulSoup(content, 'html5lib')
    company_list = soup.find_all(href=re.compile('/gzujobs/client/jobsinfor/'))
    date_list = soup.select('.time')
    # Compiled once here instead of on every loop iteration.
    issue_with_marker = re.compile('[0-9]+-?-?第[0-9]+期')
    issue_plain = re.compile('[0-9]+-?-?[0-9]+期')
    for i, link in enumerate(company_list):
        company_name = link.text.strip()
        if page < 128:
            try:
                year = re.findall(pattern2, company_name)[0][1:-1]
                year = '17' if int(year) > 624 else '16'
            except IndexError:
                try:
                    year = re.findall(pattern3, company_name)[0][1:-1]
                except Exception as e:
                    # Exception (not BaseException) so Ctrl-C still propagates.
                    util.format_err(e)
                    continue
        else:
            try:
                year = issue_with_marker.findall(company_name)[0][0:2]
            except IndexError:
                try:
                    year = issue_plain.findall(company_name)[0][0:2]
                except Exception as e:
                    util.format_err(e)
                    continue
        # NOTE(review): a '44' prefix is remapped to 2009 — presumably one
        # known mislabelled entry; confirm against the site.
        if year == '44':
            year = '09'
        date = '20' + str(year) + '-' + date_list[i].text[1:-1]
        company_name = company_name.split('(')[0].strip()
        print(company_name, date)
        redis.save_info(table_name, date, company_name)
def get_top_public_infos():
    """Run every first-tier ("一本") university scraper, one after another.

    A scraper that raises is reported through ``util.format_err`` together
    with its school label, and the batch moves on to the next one.
    """
    print("开始获取一本数据=====================")
    # Each scraper is wrapped in a lambda so its name is only resolved when
    # its turn comes: a missing scraper is then reported like any other
    # failure instead of aborting the whole batch.
    scrapers = (
        ("ncepu", lambda: get_ncepu_recruit()),
        ("ncut", lambda: get_ncut_recuitment()),
        ("njupt", lambda: get_njupt_recruitment()),
        ("ysu", lambda: get_ysu_recruitment()),
        ("hqu", lambda: get_hqu_recruitment()),
        ("hznu", lambda: get_hznu_recruitment()),
        # Was reported as "hznu" (copy-paste slip).
        ("cueb", lambda: get_cueb_recuitment()),
        ("wust", lambda: get_wust_recruitment()),
        ("hbu", lambda: get_hbu_recruitment()),
        ("sxu", lambda: get_sxu_recuit()),
        ("anu", lambda: get_anu_recruitment()),
        ("gdut", lambda: get_gdut_recruitment()),
    )
    for label, run in scrapers:
        try:
            run()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C stops the whole batch.
            util.format_err(e, label)
    print("获取一本数据完成=====================")
def get_basic_public_info():
    """Run every scraper of the "basic" university group, one after another.

    A scraper that raises is reported through ``util.format_err`` together
    with its school label, and the batch moves on to the next one.
    """
    # Each scraper is wrapped in a lambda so its name is only resolved when
    # its turn comes: a missing scraper is then reported like any other
    # failure instead of aborting the whole batch.
    scrapers = (
        ("bipt", lambda: get_bipt_recruitment()),
        ("cuit", lambda: get_cuit_recruit()),
        ("jhu", lambda: get_jhu_recruitment()),
        ("jcxy", lambda: get_jincheng_recruit()),
        ("scc", lambda: get_scc_recuit()),
        ("tjpu", lambda: get_tjpu_recruitment()),
        ("wzu", lambda: get_wzu_recruitment()),
        ("ytu", lambda: get_ytu_recruitment()),
        ("yangtzeu", lambda: get_yangtzeu_recruitment()),
        # Was reported as "yangtzeu" (copy-paste slip).
        ("lut", lambda: get_lut_recruitment()),
    )
    for label, run in scrapers:
        try:
            run()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C stops the whole batch.
            util.format_err(e, label)
def get_211_infos():
    """Run every "211" university scraper, one after another.

    A scraper that raises is reported through ``util.format_err`` together
    with its school label, and the batch moves on to the next one. The
    completion message is emitted through ``util.format_err`` as well, as in
    the original code.
    """
    # Each scraper is wrapped in a lambda so its name is only resolved when
    # its turn comes: a missing scraper is then reported like any other
    # failure instead of aborting the whole batch.
    scrapers = (
        ("cufe", lambda: get_cufe_rescruit()),
        ("sufe", lambda: get_sufe_recruit()),
        ("ustb", lambda: get_ustbr_recuitment()),
        ("swu", lambda: get_swu_recruitment()),
        ("zzu", lambda: get_zzu_recruit()),
        ("shzu", lambda: get_shzu_recruitment()),
        ("gzu", lambda: get_gzu_recruit()),
        # Was reported as "gzu" (copy-paste slip).
        ("hnu", lambda: get_hnu_recruitment()),
        # Was reported as "cnu" (typo).
        ("cau", lambda: get_cau_recruitment()),
        ("lmu", lambda: get_lmu_recruitment()),
        ("lnu", lambda: get_lnu_recruitment()),
        ("ccnu", lambda: get_ccnu_recruitment()),
        ("tyut", lambda: get_tyut_recruitment()),
        ("xju", lambda: get_xju_recruitment()),
        ("ynu", lambda: get_ynu_recruitment()),
    )
    for label, run in scrapers:
        try:
            run()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C stops the whole batch.
            util.format_err(e, label)
    util.format_err("获取211数据完成")
def get_985_infos():
    """Run every "985" university scraper, one after another.

    A scraper that raises is reported through ``util.format_err`` together
    with its school label, and the batch moves on to the next one.

    Differences from the previous version: failures of the first scraper
    (scu) used to be swallowed silently and are now reported, and every
    report carries a school label, as the other batch functions already do.
    """
    # Each scraper is wrapped in a lambda so its name is only resolved when
    # its turn comes: a missing scraper is then reported like any other
    # failure instead of aborting the whole batch.
    scrapers = (
        ("scu", lambda: get_scu_recruit()),
        ("csu", lambda: get_csu_recruit()),
        ("cqu", lambda: get_cqu_recruit()),
        ("hust", lambda: get_hust_recruit()),
        ("lzu", lambda: get_lzu_rescruit()),
        ("uestc", lambda: get_uestc_recruit()),
        ("nku", lambda: get_nku_recruit()),
        ("scut", lambda: get_scut_recuit()),
        ("ouc", lambda: get_ouc_recruit()),
        ("bhu", lambda: get_bhu_recruitment()),
        ("jlu", lambda: get_jlu_recruitment()),
        ("nwafu", lambda: get_nwafu_recruitment()),
        ("hnu", lambda: get_hnu_recuit()),
        ("muc", lambda: get_muc_recuitment()),
        ("dlut", lambda: get_dlut_recruitment()),
        ("bnu", lambda: get_bnu_recuit()),
        ("ecnu", lambda: get_ecnu_recruitment()),
        ("tju", lambda: get_tju_recruitment()),
        ("cau", lambda: get_cau_recruitment()),
        ("ruc", lambda: get_ruc_recruitment()),
    )
    for label, run in scrapers:
        try:
            run()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C stops the whole batch.
            util.format_err(e, label)
def get_c9_info():
    """Run every C9 university scraper, one after another.

    A scraper that raises is reported through ``util.format_err`` together
    with its school label, and the batch moves on to the next one.
    """
    print("Begin to collect c9's information")
    # Each scraper is wrapped in a lambda so its name is only resolved when
    # its turn comes: a missing scraper is then reported like any other
    # failure instead of aborting the whole batch.
    scrapers = (
        ("sjtu", lambda: get_sjtu_rescruit()),
        ("thu", lambda: get_tsinghua_recruit()),
        ("fdu", lambda: get_fdu_rescruit()),
        ("ustc", lambda: get_ustc_recruit()),
        ("hit", lambda: get_hit_rescruit()),
        ("zju", lambda: get_zju_rescruit()),
        ("xjtu", lambda: get_XJTU_recruit()),
        ("nju", lambda: get_nju_rescruit()),
        # Original note: the PKU scraper needs its cookie refreshed first.
        ("pku", lambda: get_pku_recruit()),
    )
    for label, run in scrapers:
        try:
            run()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C stops the whole batch.
            util.format_err(e, label)