コード例 #1
0
 def calculate_activity_degree(self):
     """Tally recruitment events per date for every university tier.

     Every table returned by ``get_university_list`` is resolved to its
     ``UNIVERSITY_INFO`` entry (keyed by the table name without the
     ``_company_info`` suffix). The entry's second field selects which
     per-tier date dictionary ``get_company_num_in_diff_date`` updates.
     Entries with an unrecognised tier are handed to ``util.format_err``.
     Finally ``save_date_dict`` is called and "finish" is printed.
     """
     # tier label -> (attribute holding the date dict, tag passed downstream)
     tier_targets = {
         'C9': ('c9_company_date_dict', "c9"),
         '985': ('p985_company_date_dict', "p985"),
         '211': ('p211_company_date_dict', "p211"),
         '一本': ('top_company_date_dict', "top"),
         '二本': ('basic_company_date_dict', "basic"),
     }
     suffix_len = len('_company_info')
     for table in self.get_university_list():
         info = UNIVERSITY_INFO[table[:-suffix_len]]
         records = self.re.lrange(table, 0, -1)
         target = tier_targets.get(info[1])
         if target is None:
             util.format_err(info)
             continue
         attr_name, tag = target
         # The attribute is resolved only for the matching tier.
         self.get_company_num_in_diff_date(records,
                                           getattr(self, attr_name),
                                           tag)
     self.save_date_dict()
     print("finish")
コード例 #2
0
def get_ouc_recruit():
    """Crawl the Ocean University of China campus-recruitment pages.

    The index page is fetched once to read the total number of entries
    (extracted with the module-level ``pattern``), the redis list
    ``table_name`` is cleared, and every listing page is then downloaded
    and passed to ``parse_info``. A failing page is reported through
    ``util.format_err`` and the crawl continues with the next page.
    """
    print("开始获取中国海洋大学数据=====================")
    base_url = "http://career.ouc.edu.cn/html/zp_info/campus/"
    index_url = base_url + "index.html"
    host = 'career.ouc.edu.cn'
    headers = util.get_header(host)
    req = requests.Session()
    res = req.get(url=index_url, headers=headers).content.decode('gbk')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    soup = BeautifulSoup(res, 'html5lib')
    total_infos = int(re.findall(pattern, str(soup))[0][14:])
    # Ceiling division: the previous "// 20 + 1" requested one page too many
    # whenever the total was an exact multiple of the page size. At least the
    # index page is always parsed.
    page_size = 20
    page_num = max(1, -(-total_infos // page_size))
    for i in range(1, page_num + 1):
        # Page 1 lives at index.html, later pages at "<n>.html".
        page_url = index_url if i == 1 else base_url + str(i) + ".html"
        try:
            content = req.get(url=page_url,
                              headers=headers).content.decode('gbk')
            parse_info(content, redis)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit still
            # stop the crawl.
            util.format_err(e)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
    print("获取中国海洋大学数据完成=====================")
コード例 #3
0
    def print_and_save_result2(self, result_dict, filename):
        """Print per-timestamp counts and save those from the cutoff onward.

        ``result_dict`` maps a timestamp (anything ``float()`` accepts) to a
        value. All pairs are stored in ``self.data_array`` sorted by
        timestamp. Entries at or after the cutoff get their ``name``
        replaced by ``util.get_standard_time_from_mktime`` and are appended
        to ``self.data_array_range``, which is then passed to
        ``save_result2`` together with ``filename``.

        NOTE(review): ``self.data_array_range`` is not reset here, so
        repeated calls accumulate entries — confirm this is intended.
        """
        # Entries before this timestamp are dropped (2013-01-01 according to
        # the original author's note).
        cutoff = 1356969600
        self.data_array = []
        for key, values in result_dict.items():
            print('--------------------------------------')
            try:
                print(key, values)
                self.data_array.append(dict(name=key, value=values))
            except Exception as e:
                # Narrowed from BaseException so interrupts are not swallowed.
                util.format_err(e)
        self.data_array = sorted(self.data_array,
                                 key=lambda entry: float(entry['name']))
        for item in self.data_array:
            timestamp = int(float(item['name']))
            if timestamp < cutoff:
                continue
            item['name'] = util.get_standard_time_from_mktime(timestamp)
            self.data_array_range.append(item)

        self.save_result2(filename, self.data_array_range)
コード例 #4
0
def get_scut_recuit():
    """Crawl South China University of Technology recruitment listings.

    Clears the redis list ``table_name``, then POSTs the listing form for
    pages 1 to 60 and passes each decoded response to ``parse_info``. The
    first failing page is reported through ``util.format_err`` and ends
    the loop; whatever was collected is still registered afterwards.
    """
    print("开始获取华南理工大学数据=====================")
    url = "http://jyzx.6ihnep7.cas.scut.edu.cn/jyzx/xs/zpxx/xyxj/"

    req = requests.Session()

    headers = util.get_header(host='jyzx.6ihnep7.cas.scut.edu.cn')
    redis = jedis.jedis()
    redis.clear_list(table_name)
    for i in range(1, 61):
        # NOTE(review): 'pageNo' and 'pageNO' are two distinct form fields;
        # presumably the former is the page count and the latter the current
        # page — confirm against the site.
        data = {
            'pageNo': '60',
            'daoxv1': '0',
            'entName': '',
            'time': '-1',
            'pageNO': str(i),
        }
        try:
            content = req.post(url=url, headers=headers,
                               data=data).content.decode('utf-8')
            parse_info(redis, content)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit propagate.
            util.format_err(e)
            break
    redis.add_to_file(table_name)
    redis.add_university(table_name)
    print("获取华南理工大学数据完成=======================")
コード例 #5
0
def parse_info(html, redis):
    """Store every record of a decoded JSON listing in redis.

    ``html`` is a mapping whose ``'data'`` entry is a sequence of records
    with ``'startTime'`` (divided by 1000 before being turned into a local
    date) and ``'name'``. Each record is printed and saved under
    ``table_name``; a record that fails is reported through
    ``util.format_err`` and skipped.
    """
    for record in html['data']:
        try:
            timestamp = record['startTime'] / 1000
            date = str(datetime.fromtimestamp(timestamp).date())
            company_name = record['name']
            print(date, company_name)
            redis.save_info(table_name, date, company_name)
        except Exception as e:
            # Narrowed from BaseException so interrupts are not swallowed.
            util.format_err(e)
コード例 #6
0
def parse_recruit_info(redis, content, date, id):
    """Extract company names from a job-fair page and store them in redis.

    All anchors whose href matches ``/recruitment/company`` are saved under
    ``table_name`` with ``date``. Three fair ids need extra scraping because
    their company names sit in styled elements instead:

    * 62 and 69: every second element inside a fixed index window of the
      elements carrying the shared inline style below;
    * 68: every second element styled ``font-size: 19px``.

    Prints "failed" when the last element list looked at is empty.
    """
    soup = BeautifulSoup(content, "html5lib")
    company_list = soup.find_all(href=re.compile("/recruitment/company"))
    print(date)
    for item in company_list:
        redis.save_info(table_name, date, item.text.strip())
    print("获取双选会完成")

    fair_id = int(id)
    # Index windows of the two large fairs (62 is the 2018 one per the
    # original comment); the company name is the element after each index.
    styled_windows = {
        62: range(13, 211, 2),
        69: range(14, 179, 2),
    }
    if fair_id in styled_windows:
        company_list = soup.find_all(
            attrs={
                'style':
                'font-size:14px;font-family:\'微软雅黑\',sans-serif;color:#666666'
            })
        for i in styled_windows[fair_id]:
            try:
                company_name = company_list[i + 1].text.strip()
                print(company_name)
                redis.save_info(table_name, date, company_name)
            except Exception as e:
                # Typically the list is shorter than the window: report once
                # and stop. Exception (not BaseException) keeps Ctrl-C
                # working.
                util.format_err(e)
                break
    elif fair_id == 68:
        # State-owned-enterprise fair per the original comment.
        company_list = soup.find_all(attrs={'style': 'font-size: 19px'})
        for element in company_list[1::2]:
            redis.save_info(table_name, date, element.text.strip())

    if not company_list:
        print("failed")
コード例 #7
0
 def print_and_save_result(self, result_dict, filename):
     """Print each university's values and save the collected summary.

     ``result_dict`` maps a ``<short_name>_company_info`` table name to a
     sequence of strings. For every table the ``UNIVERSITY_INFO`` entry of
     the short name is stored in ``self.data_array`` together with the
     values and the table's total from
     ``self.university_company_list_length_dict``. A table that fails is
     reported through ``util.format_err`` and skipped. ``save_result`` is
     called with ``filename`` at the end.
     """
     suffix_len = len('_company_info')
     self.data_array = []
     for key, values in result_dict.items():
         print('--------------------------------------')
         print(key + ":" + str(len(values)) + " ".join(values))
         university_short_name = key[:-suffix_len]
         try:
             info = UNIVERSITY_INFO[university_short_name]
             print(info[0] + ":" + str(len(values)))
             self.data_array.append(
                 dict(name=info, data=values,
                      total_num=self.university_company_list_length_dict[key]))
         except Exception as e:
             # Narrowed from BaseException so interrupts are not swallowed.
             util.format_err(e)
     self.save_result(filename)
コード例 #8
0
def parse_info(html, redis):
    """Pair dates with company links from a listing page and store them.

    Looks inside the ``div.r_list1`` containers, takes every ``<span>`` as
    a date and every anchor whose href matches the article-detail pattern
    as a company, and pairs them by position. Entries whose name contains
    '取消' or whose date is empty are skipped; a position that fails is
    reported through ``util.format_err``.
    """
    bf = BeautifulSoup(html, 'lxml')
    bf1 = bf.find_all('div', class_='r_list1')
    bf2 = BeautifulSoup(str(bf1), 'lxml')
    date_list = bf2.find_all('span')
    # Raw string: "\?" in a normal literal is an invalid escape sequence.
    # The compiled pattern is unchanged.
    company_list = bf2.find_all(href=re.compile(r'articledetail\?t.PostId='))
    for i, date_tag in enumerate(date_list):
        try:
            date = date_tag.text.replace('/', '-')
            company_name = company_list[i].text.strip()
            if '取消' not in company_name and date != '':
                redis.save_info(table_name, date, company_name)
        except Exception as e:
            # Narrowed from BaseException so interrupts are not swallowed.
            util.format_err(e)
コード例 #9
0
def parse_info(redis, content):
    """Store (date, company) pairs found on a notice-listing page.

    Anchors whose href matches the notice-detail pattern are paired by
    position with the elements selected by ``.date``. The first ten
    characters of the date text must match the module-level ``pattern``
    for the pair to be printed and saved under ``table_name``. A position
    that fails is reported through ``util.format_err``.
    """
    soup = BeautifulSoup(content, 'html5lib')
    # NOTE(review): the trailing "?" acts as a regex quantifier (optional
    # "p") and "." matches any character; this is looser than a literal
    # "noticeDetail.jsp?" but left unchanged — confirm before tightening.
    company_list = soup.find_all(
        href=re.compile('/jyzx/newSystem/noticeDetail.jsp?'))
    date_list = soup.select('.date')
    for i, anchor in enumerate(company_list):
        try:
            date = date_list[i].text.strip()[:10]
            company_name = anchor.text.strip()
            if pattern.match(date):
                print(date, company_name)
                redis.save_info(table_name, date, company_name)
                print("=====")
        except Exception as e:
            # Narrowed from BaseException so interrupts are not swallowed.
            util.format_err(e)
コード例 #10
0
 def get_2017_company_list(self, university_table_name):
     """Return the records of a table whose date mentions 2017.

     Every entry of the redis list ``university_table_name`` is a dict
     written with single quotes; the quotes are swapped for double quotes
     so it can be parsed as JSON. An entry that cannot be parsed is
     reported through ``util.format_err`` and skipped. ``self.count`` is
     incremented once per call.

     Returns:
         A tuple ``(matching_records, number_of_matching_records)``.
     """
     company_list_2017 = []
     for item in self.re.lrange(university_table_name, 0, -1):
         try:
             # "item" is rebound step by step so the error report below
             # shows the most recent form reached.
             item = item.replace('\'', '"')
             item = json.loads(item)
             if '2017' in item['date']:
                 company_list_2017.append(item)
         except Exception as e:
             # Narrowed from BaseException so interrupts are not swallowed.
             util.format_err(e, university_table_name, item)
     print("Finish to find 2017 Recruitment-->" + university_table_name)
     self.count += 1
     print(self.count)
     return company_list_2017, len(company_list_2017)
コード例 #11
0
def get_bnu_recuit():
    """Crawl Beijing Normal University recruitment listings into redis.

    Clears the redis list ``table_name``, then requests pages 1 to 81 of
    the JSON query endpoint and passes each decoded payload to
    ``parse_info``. A failing page is reported through ``util.format_err``
    and the crawl continues; the progress line is printed for every page
    either way.
    """
    print("开始获取北京师范大学数据=====================")
    url = "http://career.bnu.edu.cn/front/zp_query/zphQuery.jspa?"
    host = "career.bnu.edu.cn"
    headers = util.get_header(host=host)
    redis = jedis.jedis()
    redis.clear_list(table_name)
    total_pages = 81
    for i in range(1, total_pages + 1):
        try:
            params = {'paramMap.xxlx': '1', 'page.curPage': '%d' % i}
            html = requests.get(url=url, headers=headers,
                                params=params).json()  # decoded JSON payload
            parse_info(html, redis)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit propagate.
            util.format_err(e)
        finally:
            print('获取北京师范大学第 %d 页(共81页)数据完成' % i)
    redis.add_university(table_name)  # original note: add the school to github
    redis.add_to_file(table_name)  # original note: add the table to the file
コード例 #12
0
def parse_info(content, redis):
    """Store every career event of a JSON calendar payload in redis.

    ``content`` is a JSON string; ``result.trList`` holds rows whose
    ``tdList`` cells may be null or lack a ``careerList``. Each event's
    ``mergeStartTime`` and ``mergeTitle`` are printed and saved under
    ``table_name``. An error inside a cell is reported through
    ``util.format_err`` and ends processing of that cell only.
    """
    content = json.loads(content)
    trList = content['result']['trList']
    print(trList)
    for item in trList:
        for day in item['tdList']:
            if day is None or 'careerList' not in day:
                continue
            careerList = day['careerList']
            if careerList is None:
                continue
            try:
                for data in careerList:
                    date = data['mergeStartTime']
                    company = data['mergeTitle']
                    print(date, company)
                    redis.save_info(table_name, date, company)
            except Exception as e:
                # Narrowed from BaseException so interrupts are not
                # swallowed.
                util.format_err(e)
コード例 #13
0
def get_hnu_recuit():
    """Crawl Hunan University recruitment listings into redis.

    Clears the redis list ``table_name``, then requests listing pages 1 to
    310 and passes each page's text to ``parse_info``. The first failing
    page is reported through ``util.format_err`` and ends the loop; the
    progress line is printed for every attempted page.
    """
    print("开始获取湖南大学数据=====================")
    url = "http://scc.hnu.edu.cn/newsjob!getMore.action?"
    host = "scc.hnu.edu.cn"
    headers = util.get_header(host=host)
    redis = jedis.jedis()
    redis.clear_list(table_name)
    # 310 pages in total; pages up to 102 are all from 2017 (original note).
    # The previous range(1, 310) stopped at page 309 and never fetched the
    # last page.
    total_pages = 310
    for i in range(1, total_pages + 1):
        try:
            params = {'p.currentPage': '%d' % i, 'Lb': '1'}
            html = requests.get(url=url, headers=headers, params=params).text
            parse_info(html, redis)
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit propagate.
            util.format_err(e)
            break
        finally:
            print('获取湖南大学第 %d 页(共310页)数据完成' % i)
    redis.add_university(table_name)
    redis.add_to_file(table_name)
0
    def get_company_num_in_diff_date(self, company_list, result_dict, type):
        """Count recruitment records per date for one university tier.

        Each entry of ``company_list`` is a single-quoted dict string; it
        is parsed as JSON after swapping the quotes. Its ``date`` is
        converted with ``util.get_mktime`` and the matching counter in
        ``result_dict`` (keyed by ``str`` of that value) is incremented.
        When the integer timestamp is already a key of
        ``self.data_array_each_day_dict``, that day's ``type`` counter is
        incremented too. An entry that fails is reported through
        ``util.format_err`` and skipped.
        """
        for company in company_list:
            try:
                company = json.loads(company.replace('\'', '\"'))
                date_time = company['date']
                print(date_time)
                # Convert the standard time string into a timestamp.
                date_time = util.get_mktime(date_time)
                key = str(date_time)
                result_dict[key] = result_dict.get(key, 0) + 1
                day_key = str(int(float(date_time)))
                if day_key in self.data_array_each_day_dict:
                    self.data_array_each_day_dict[day_key][type] += 1
            except Exception as e:
                # Narrowed from BaseException so interrupts are not
                # swallowed.
                util.format_err(e)
コード例 #15
0
 def get_top_500_list(self):
     """Sort the stored top-500 companies into the three ranking lists.

     Entries of the redis list ``company_info`` are single-quoted dict
     strings in which ``==`` stands for an apostrophe and ``++`` for a
     double quote inside the company name. Each entry is decoded and its
     name appended to ``USA_company_list``, ``China_company_list`` or
     ``World_company_list`` according to ``company_type``; other types are
     ignored. An entry that fails is reported through ``util.format_err``
     and skipped.
     """
     # company_type -> name of the list attribute that collects it
     buckets = {
         "USATop500": 'USA_company_list',
         "ChinaTop500": 'China_company_list',
         "WorldTop500": 'World_company_list',
     }
     for item in self.re.lrange("company_info", 0, -1):
         try:
             # Quotes first, then the "==" placeholder, so the restored
             # apostrophes are not turned into double quotes.
             item = item.replace('\'', '"').replace('==', '\'')
             item = json.loads(item)
             company_name = item['company_name'].replace('++', '\"')
             list_attr = buckets.get(item['company_type'])
             if list_attr is not None:
                 getattr(self, list_attr).append(company_name)
         except Exception as e:
             # Narrowed from BaseException so interrupts are not swallowed.
             util.format_err(e, item)
コード例 #16
0
def parse_info(content, redis, page):
    """Store (date, company) pairs of a job-listing page in redis.

    Anchors whose href matches ``/gzujobs/client/jobsinfor/`` are paired by
    position with the ``.time`` elements. The listing shows no year, so a
    two-digit year is derived from a number embedded in the link text:

    * ``page < 128``: the match of ``pattern2`` (first and last character
      stripped) maps to '17' when above 624, otherwise '16'; without a
      match, the match of ``pattern3`` is used as-is.
    * otherwise: the first two characters of an issue marker such as
      "<digits>第<digits>期" (or the same without "第"); '44' is mapped to
      '09'.

    Links for which no year can be derived are reported through
    ``util.format_err`` and skipped.

    NOTE(review): the thresholds 128, 624 and '44' are site-specific values
    whose origin is not visible here — confirm before changing.
    """
    soup = BeautifulSoup(content, 'html5lib')
    company_list = soup.find_all(href=re.compile('/gzujobs/client/jobsinfor/'))
    date_list = soup.select('.time')
    # Compiled once instead of on every loop iteration.
    issue_with_marker = re.compile('[0-9]+-?-?第[0-9]+期')
    issue_plain = re.compile('[0-9]+-?-?[0-9]+期')
    for i, anchor in enumerate(company_list):
        company_name = anchor.text.strip()
        if page < 128:
            try:
                year = re.findall(pattern2, company_name)[0][1:-1]
                year = '17' if int(year) > 624 else '16'
            except IndexError:
                try:
                    year = re.findall(pattern3, company_name)[0][1:-1]
                except Exception as e:
                    # Narrowed from BaseException so interrupts propagate.
                    util.format_err(e)
                    continue
        else:
            try:
                year = issue_with_marker.findall(company_name)[0][0:2]
            except IndexError:
                try:
                    year = issue_plain.findall(company_name)[0][0:2]
                except Exception as e:
                    util.format_err(e)
                    continue
            if year == '44':
                year = '09'
        # The listing date is wrapped in one character on each side.
        date = '20' + str(year) + '-' + date_list[i].text[1:-1]

        company_name = company_name.split('(')[0].strip()
        print(company_name, date)
        redis.save_info(table_name, date, company_name)
コード例 #17
0
def get_top_public_infos():
    """Run every first-tier ("一本") university crawler in turn.

    A crawler that raises is reported through ``util.format_err`` together
    with its short label, and the remaining crawlers still run.
    """
    print("开始获取一本数据=====================")
    # The lambdas defer each name lookup into the try block, so a crawler
    # that is missing from the module is reported like any other failure
    # instead of aborting the whole run.
    crawlers = (
        ("ncepu", lambda: get_ncepu_recruit()),
        ("ncut", lambda: get_ncut_recuitment()),
        ("njupt", lambda: get_njupt_recruitment()),
        ("ysu", lambda: get_ysu_recruitment()),
        ("hqu", lambda: get_hqu_recruitment()),
        ("hznu", lambda: get_hznu_recruitment()),
        # Previously logged as "hznu" (copy-paste slip).
        ("cueb", lambda: get_cueb_recuitment()),
        ("wust", lambda: get_wust_recruitment()),
        ("hbu", lambda: get_hbu_recruitment()),
        ("sxu", lambda: get_sxu_recuit()),
        ("anu", lambda: get_anu_recruitment()),
        ("gdut", lambda: get_gdut_recruitment()),
    )
    for label, crawl in crawlers:
        try:
            crawl()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit stop the
            # run.
            util.format_err(e, label)

    print("获取一本数据完成=====================")
コード例 #18
0
def get_basic_public_info():
    """Run every second-tier university crawler in turn.

    A crawler that raises is reported through ``util.format_err`` together
    with its short label, and the remaining crawlers still run.
    """
    # The lambdas defer each name lookup into the try block, so a crawler
    # that is missing from the module is reported like any other failure
    # instead of aborting the whole run.
    crawlers = (
        ("bipt", lambda: get_bipt_recruitment()),
        ("cuit", lambda: get_cuit_recruit()),
        ("jhu", lambda: get_jhu_recruitment()),
        ("jcxy", lambda: get_jincheng_recruit()),
        ("scc", lambda: get_scc_recuit()),
        ("tjpu", lambda: get_tjpu_recruitment()),
        ("wzu", lambda: get_wzu_recruitment()),
        ("ytu", lambda: get_ytu_recruitment()),
        ("yangtzeu", lambda: get_yangtzeu_recruitment()),
        # Previously logged as "yangtzeu" (copy-paste slip).
        ("lut", lambda: get_lut_recruitment()),
    )
    for label, crawl in crawlers:
        try:
            crawl()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit stop the
            # run.
            util.format_err(e, label)
コード例 #19
0
def get_211_infos():
    """Run every "211" university crawler in turn.

    A crawler that raises is reported through ``util.format_err`` together
    with its short label, and the remaining crawlers still run. The
    completion message is emitted through ``util.format_err`` as well, as
    in the original code.
    """
    # The lambdas defer each name lookup into the try block, so a crawler
    # that is missing from the module is reported like any other failure
    # instead of aborting the whole run.
    crawlers = (
        ("cufe", lambda: get_cufe_rescruit()),
        ("sufe", lambda: get_sufe_recruit()),
        ("ustb", lambda: get_ustbr_recuitment()),
        ("swu", lambda: get_swu_recruitment()),
        ("zzu", lambda: get_zzu_recruit()),
        ("shzu", lambda: get_shzu_recruitment()),
        ("gzu", lambda: get_gzu_recruit()),
        # Previously logged as "gzu" (copy-paste slip).
        ("hnu", lambda: get_hnu_recruitment()),
        # Previously logged as "cnu", which does not match the crawler name.
        ("cau", lambda: get_cau_recruitment()),
        ("lmu", lambda: get_lmu_recruitment()),
        ("lnu", lambda: get_lnu_recruitment()),
        ("ccnu", lambda: get_ccnu_recruitment()),
        ("tyut", lambda: get_tyut_recruitment()),
        ("xju", lambda: get_xju_recruitment()),
        ("ynu", lambda: get_ynu_recruitment()),
    )
    for label, crawl in crawlers:
        try:
            crawl()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit stop the
            # run.
            util.format_err(e, label)

    # NOTE(review): the completion notice goes through the error formatter;
    # kept as-is because that helper's output channel is not visible here.
    util.format_err("获取211数据完成")
コード例 #20
0
def get_985_infos():
    """Run every "985" university crawler in turn.

    A crawler that raises is reported through ``util.format_err`` together
    with its short label (the same convention the other tier runners use),
    and the remaining crawlers still run.
    """
    # The lambdas defer each name lookup into the try block, so a crawler
    # that is missing from the module is reported like any other failure
    # instead of aborting the whole run.
    crawlers = (
        # A failure here used to be swallowed silently because its logging
        # was commented out.
        ("scu", lambda: get_scu_recruit()),
        ("csu", lambda: get_csu_recruit()),
        ("cqu", lambda: get_cqu_recruit()),
        ("hust", lambda: get_hust_recruit()),
        ("lzu", lambda: get_lzu_rescruit()),
        ("uestc", lambda: get_uestc_recruit()),
        ("nku", lambda: get_nku_recruit()),
        ("scut", lambda: get_scut_recuit()),
        ("ouc", lambda: get_ouc_recruit()),
        ("bhu", lambda: get_bhu_recruitment()),
        ("jlu", lambda: get_jlu_recruitment()),
        ("nwafu", lambda: get_nwafu_recruitment()),
        ("hnu", lambda: get_hnu_recuit()),
        ("muc", lambda: get_muc_recuitment()),
        ("dlut", lambda: get_dlut_recruitment()),
        ("bnu", lambda: get_bnu_recuit()),
        ("ecnu", lambda: get_ecnu_recruitment()),
        ("tju", lambda: get_tju_recruitment()),
        ("cau", lambda: get_cau_recruitment()),
        ("ruc", lambda: get_ruc_recruitment()),
    )
    for label, crawl in crawlers:
        try:
            crawl()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit stop the
            # run.
            util.format_err(e, label)
コード例 #21
0
def get_c9_info():
    """Run every C9 university crawler in turn.

    A crawler that raises is reported through ``util.format_err`` together
    with its short label, and the remaining crawlers still run.
    """
    print("Begin to collect c9's information")
    # The lambdas defer each name lookup into the try block, so a crawler
    # that is missing from the module is reported like any other failure
    # instead of aborting the whole run.
    crawlers = (
        ("sjtu", lambda: get_sjtu_rescruit()),
        ("thu", lambda: get_tsinghua_recruit()),
        ("fdu", lambda: get_fdu_rescruit()),
        ("ustc", lambda: get_ustc_recruit()),
        ("hit", lambda: get_hit_rescruit()),
        ("zju", lambda: get_zju_rescruit()),
        ("xjtu", lambda: get_XJTU_recruit()),
        ("nju", lambda: get_nju_rescruit()),
        # Original note: the PKU crawler needs its cookie refreshed.
        ("pku", lambda: get_pku_recruit()),
    )
    for label, crawl in crawlers:
        try:
            crawl()
        except Exception as e:
            # Exception (not BaseException) so Ctrl-C / SystemExit stop the
            # run.
            util.format_err(e, label)