def statistic_company(df, mysql_url):
    """
    Run the company-dimension analyses and write the results to MySQL.

    Rows whose ``company_name`` or ``position_category`` is null are dropped
    before any analysis runs. Each ``statistic_company_*`` helper produces
    one result table; all tables are handed to ``write_mysql`` in one call.

    :param df: input DataFrame (presumably a Spark DataFrame) exposing at
        least the ``company_name`` and ``position_category`` columns
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    df = df.filter(df.company_name.isNotNull()).filter(df.position_category.isNotNull())

    # Cache the filtered data: it is the input of every analysis below.
    df.persist()
    try:
        # Analysis result tables, keyed by destination table name
        result_tables = dict()
        result_tables["company__rank"] = statistic_company_rank(df)
        result_tables["company__salary"] = statistic_company_salary(df)
        result_tables["company__gender"] = statistic_company_gender(df)
        result_tables["company__age"] = statistic_company_age(df)
        result_tables["company__address"] = statistic_company_address(df)
        # The work-year analysis (company__work_year) is currently disabled.
        result_tables["company__school"] = statistic_company_school(df)
        result_tables["company__major"] = statistic_company_major(df)
        result_tables["company__degree"] = statistic_company_degree(df)
        result_tables["company__prev_company"] = statistic_company_prev_company(df)
        result_tables["company__next_company"] = statistic_company_next_company(df)
        result_tables["company__duration"] = statistic_company_duration(df)

        # Write the analysis results to the database
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the cache only after the write. If the helpers return
        # lazily evaluated DataFrames, the real work happens inside
        # write_mysql, so unpersisting earlier would discard the cache
        # before it was ever used.
        df.unpersist()
def statistic_school_major(df, mysql_url):
    """
    Run the school + major analyses and write the results to MySQL.

    Only rows with ``avg_salary`` above ``MIN_SALARY`` and non-null
    ``school_name``, ``major`` and ``degree`` are analysed. The ranking table
    is computed from the rows where ``work_index == work_len``; every other
    table uses the full filtered data.

    :param df: input DataFrame (presumably a Spark DataFrame) exposing the
        columns ``avg_salary``, ``school_name``, ``major``, ``degree``,
        ``work_index`` and ``work_len``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    df = df.filter(df.avg_salary > MIN_SALARY)
    df = df.filter(df.school_name.isNotNull()).filter(
        df.major.isNotNull()).filter(df.degree.isNotNull())
    latest_work_df = df.filter(df.work_index == df.work_len)

    # Cache both inputs; each feeds at least one analysis below.
    df.persist()
    latest_work_df.persist()
    try:
        # Analysis result tables, keyed by destination table name
        result_tables = dict()
        result_tables["school__major__rank"] = statistic_school_major_rank(
            latest_work_df)
        result_tables["school__major__position"] = statistic_school_major_position(df)
        # The work-year analysis (school__major__work_year) is currently disabled.
        result_tables["school__major__industry"] = statistic_school_major_industry(df)
        result_tables["school__major__company"] = statistic_school_major_company(df)
        result_tables["school__major__flow"] = statistic_school_major_flow(df)
        result_tables["school__major__position__rank"] = \
            statistic_school_major_position_rank(df)

        # Write the analysis results to the database
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the caches only after the write. If the helpers return
        # lazily evaluated DataFrames, the computation runs inside
        # write_mysql, so unpersisting earlier would throw the cache away
        # before it was used.
        latest_work_df.unpersist()
        df.unpersist()
def statistic_major(df, mysql_url):
    """
    Run the major-dimension analyses and write the results to MySQL.

    Rows must have non-null ``major`` and ``degree``, ``salary_min`` above
    ``MIN_SALARY`` and ``salary_max >= salary_min``. All analyses run on the
    subset where ``work_index == work_len`` and ``edu_end_year`` is within
    the last ``LATEST_YEAR`` years of the current year.

    :param df: input DataFrame (presumably a Spark DataFrame) exposing the
        columns ``major``, ``degree``, ``salary_min``, ``salary_max``,
        ``work_index``, ``work_len`` and ``edu_end_year``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    df = df.filter(df.major.isNotNull()).filter(df.degree.isNotNull())
    df = df.filter((df.salary_min > MIN_SALARY)).filter(df.salary_max >= df.salary_min)

    # Where graduates of the last LATEST_YEAR years went
    earliest_year = datetime.datetime.now().year - LATEST_YEAR
    latest_work_df = df.filter(df.work_index == df.work_len) \
        .filter(df.edu_end_year >= earliest_year)

    # Only latest_work_df is read by the analyses, so it is the only
    # DataFrame worth caching; caching the intermediate df as well would
    # hold data that nothing reuses.
    latest_work_df.persist()
    try:
        # Analysis result tables, keyed by destination table name
        result_tables = dict()
        result_tables["major__rank"] = statistic_major_rank(latest_work_df)
        result_tables["major__gender"] = statistic_major_gender(latest_work_df)
        result_tables["major__address"] = statistic_major_address(latest_work_df)
        result_tables["major__company"] = statistic_major_company(latest_work_df)
        result_tables["major__industry"] = statistic_major_industry(latest_work_df)
        result_tables["major__position"] = statistic_major_position(latest_work_df)
        # The work-year analysis (major__work_year) is currently disabled.

        # Write the analysis results to the database
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the cache only after the write. If the helpers return
        # lazily evaluated DataFrames, the computation runs inside
        # write_mysql, so unpersisting earlier would defeat the cache.
        latest_work_df.unpersist()
def statistic_position_address(df, mysql_url):
    """
    Position + address dimension analysis.

    Keeps only rows with a non-null ``position_name`` and ``address``, runs
    every position/address analysis on them and writes the resulting tables
    with ``write_mysql``.

    :param df: input DataFrame exposing ``position_name`` and ``address``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    located_df = df.filter(df.position_name.isNotNull()).filter(df.address.isNotNull())
    located_df.persist()

    # (destination table, analysis function) pairs, run in this order
    analyzers = (
        ("position__address__compare", statistic_position_address_compare),
        ("position__address__flow", statistic_position_address_flow),
        ("position__address__salary_range", statistic_position_address_salary_range),
        ("position__address__age_range", statistic_position_address_age_range),
        ("position__address__gender", statistic_position_address_gender),
        ("position__address__work_year_range", statistic_position_address_work_year_range),
        ("position__address__degree", statistic_position_address_degree),
        ("position__address__duration", statistic_position_address_duration),
        ("position__address__industry", statistic_position_address_industry),
        ("position__address__salary_work_year", statistic_position_address_salary_work_year),
        ("position__address__prev_position", statistic_position_address_prev_position),
        ("position__address__next_position", statistic_position_address_next_position),
        ("position__address__change", statistic_position_address_change),
    )
    result_tables = {table: analyze(located_df) for table, analyze in analyzers}

    # Write the analysis results to the database, then drop the cache
    write_mysql(mysql_url, result_tables)
    located_df.unpersist()
def statistic_dimension(df, mysql_url):
    """
    Run the table-driven dimension analyses and write the results to MySQL.

    ``parameters`` is a list of ``(groups, values)`` entries. ``groups`` are
    the grouping columns shared by a family of tables; every row with a null
    in any of them is dropped. ``values`` lists ``(table, result_field,
    func)`` triples: ``result_field`` is an optional extra grouping column
    (rows where it is null are dropped for that table only) and ``func`` is
    called as ``func(result_df, fields, is_rank)``, where ``is_rank`` is true
    when the table name contains ``"rank"``.

    NOTE(review): a later definition of ``statistic_dimension`` in this
    module takes ``(df, mysql_url, parameters, func)`` and rebinds the name,
    so this version may be unreachable at runtime — confirm before relying
    on it.

    :param df: input DataFrame exposing every column named in ``parameters``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Salary aggregation used by every table below
    median = statistic_median_salary

    parameters = [
        # Major-dimension analyses
        (["major", "degree"], [
            ("major__rank", None, median),
            ("major__gender", "gender", median),
            ("major__address", "address", median),
            ("major__company", "company_name", median),
            ("major__industry", "industry", median),
            ("major__position", "position_name", median),
        ]),
        # The school and school + major families were previously listed here
        # in an older tuple format and are currently disabled.
    ]

    for groups, values in parameters:
        result_tables = dict()

        # Apply the null filters cumulatively. Filtering from the original
        # df on every pass would keep only the last field's filter.
        mdf = df
        for field in groups:
            mdf = mdf.filter(mdf[field].isNotNull())

        mdf.persist()
        try:
            for table, result_field, func in values:
                if result_field:
                    # Extra grouping column: keep it a plain column name so
                    # it can select a single column and extend the field list.
                    result_df = mdf.filter(mdf[result_field].isNotNull())
                    fields = groups + [result_field]
                else:
                    # No extra column: group by the shared fields only.
                    result_df = mdf
                    fields = list(groups)
                result_tables[table] = func(result_df, fields, "rank" in table)

            write_mysql(mysql_url, result_tables)
        finally:
            # Drop the cache once this family of tables has been written.
            mdf.unpersist()
def statistic_dimension(df, mysql_url, parameters, func):
    """
    Analyse resume data along each configured dimension.

    Each entry of ``parameters`` is ``(groups, na_field, values)``;
    ``values[0]`` is the destination table name and the remaining items are
    forwarded to ``func`` as extra positional arguments.

    :param df: input DataFrame exposing ``salary_min``, ``salary_max`` and
        ``avg_salary``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :param parameters: iterable of ``(groups, na_field, values)`` entries
    :param func: callable invoked as ``func(df, groups, na_field, *extra)``
    :return: None
    """
    # The minimum salary must exceed MIN_SALARY, the range must not be
    # inverted, and an average salary must be present.
    above_floor = df.salary_min > MIN_SALARY
    range_is_valid = df.salary_max >= df.salary_min
    has_average = df.avg_salary.isNotNull()
    valid_df = df.filter(above_floor).filter(range_is_valid).filter(has_average)

    valid_df.persist()
    result_tables = {
        values[0]: func(valid_df, groups, na_field, *values[1:])
        for groups, na_field, values in parameters
    }

    write_mysql(mysql_url, result_tables)
    valid_df.unpersist()
def postgraduate(profile_df, education_df, work_df, mysql_url):
    """
    Analyse studying abroad, postgraduate study and employment.

    Produces two tables: ``school__major__postgraduate__ratio__v1`` from the
    prepared education records, and
    ``school__major__postgraduate__work_year__v1`` from those records joined
    with the work history on ``resume_id``.

    :param profile_df: basic resume information (not used by this function)
    :param education_df: education history of the resumes
    :param work_df: work history of the resumes
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Keep complete education records and only the columns needed below
    education_df = education_df.filter(education_df.school_name.isNotNull()) \
        .filter(education_df.major.isNotNull()) \
        .filter(education_df.degree.isNotNull())
    education_df = education_df.select(
        "resume_id", "school_name", "school_area", "major", "degree",
        "edu_index", "edu_end_date")
    education_df = education_df.groupby("resume_id").apply(cal_postgraduate)
    education_df = education_df.filter(
        education_df.postgraduate.isNotNull()).filter(
        education_df.flag.isNull())

    # education_df feeds both result tables, so cache it.
    education_df.persist()
    try:
        # Average salary of every job
        work_df = work_df.withColumn(
            "avg_salary",
            F.udf(cal_avg_salary, FloatType())(work_df.salary_min, work_df.salary_max))
        work_df = work_df.filter(work_df.avg_salary > MIN_SALARY)

        # Combine education and work history
        df = education_df.join(work_df, "resume_id")
        df = df.withColumn(
            "work_month", F.months_between(df.work_end_date, df.edu_end_date))
        df = df.withColumn(
            "work_year", F.udf(get_years, IntegerType())(df.work_month))
        # Keep work-year values within 0..40 inclusive
        df = df.filter(df.work_year >= 0).filter(df.work_year <= 40)

        # Analysis result tables, keyed by destination table name
        result_tables = dict()
        result_tables["school__major__postgraduate__ratio__v1"] = \
            postgraduate_ratio(education_df)
        result_tables["school__major__postgraduate__work_year__v1"] = \
            postgraduate_work_year(df)

        # Write the analysis results to the database
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the cache only after the write. If the result tables are
        # lazily evaluated, they are computed inside write_mysql, so
        # unpersisting earlier would discard the cache before it was used.
        education_df.unpersist()
def statistic_person(df, mysql_url):
    """
    Person-dimension analysis.

    Keeps rows whose ``avg_salary`` exceeds ``MIN_SALARY``, builds the
    ``person__rank`` table from them and writes it with ``write_mysql``.

    :param df: input DataFrame exposing ``avg_salary``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    salaried_df = df.filter(df.avg_salary > MIN_SALARY)
    salaried_df.persist()

    # Single result table for this dimension
    result_tables = {"person__rank": statistic_person_rank(salaried_df)}

    # Write the analysis results to the database, then drop the cache
    write_mysql(mysql_url, result_tables)
    salaried_df.unpersist()
def statistic_industry(df, mysql_url):
    """
    Industry-dimension analysis.

    Keeps rows with a non-null ``industry``, builds the ``industry__rank``
    and ``industry__address`` tables and writes them with ``write_mysql``.

    :param df: input DataFrame exposing ``industry``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    industry_df = df.filter(df.industry.isNotNull())
    industry_df.persist()

    # NOTE(review): statistic_industry_address is called with one argument
    # here, while a function of that name visible in this module takes
    # (df, mysql_url) — confirm which callable is bound at runtime.
    result_tables = {
        "industry__rank": statistic_industry_rank(industry_df),
        "industry__address": statistic_industry_address(industry_df),
    }

    # Write the analysis results to the database, then drop the cache
    write_mysql(mysql_url, result_tables)
    industry_df.unpersist()
def statistic_special(df, mysql_url):
    """
    Run the special-case analyses and write the results to MySQL.

    ``config`` maps an analysis type to a list of
    ``(groups, na_field, values)`` entries, where ``values[0]`` is the
    destination table name and ``values[1:]`` are extra positional arguments
    for analyses that accept them. The tables of one analysis type are
    written together in a single ``write_mysql`` call.

    :param df: input DataFrame exposing ``salary_min`` and ``salary_max``
        plus whatever columns the individual analyses read
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    :raises ValueError: if ``config`` contains an analysis type that has no
        handler
    """
    config = {
        "company__age": [
            (["company_name"], "position_category", ("company__age", None, False, True)),
        ],
        "company__duration": [
            (["company_name"], "position_category", ("company__duration", None)),
        ],
        "position__address__duration": [
            (["position_name"], "address", ("position__address__duration", None, False, True))
        ],
        "address__flow": [
            (["industry"], None, ("industry__address__flow", "address,expect_area")),
            (["position_name"], None, ("position__address__flow", "address,expect_area"))
        ],
        "address_change": [
            (["position_name"], "address", ("position__address__change", None)),
        ]
    }

    # The minimum salary must exceed MIN_SALARY and the range must not be inverted
    df = df.filter((df.salary_min > MIN_SALARY)).filter(df.salary_max >= df.salary_min)

    # The same filtered data feeds every analysis type, so cache it once
    # for the whole run instead of re-caching it per type.
    df.persist()
    try:
        for analysis_type, parameters in config.items():
            result_tables = dict()
            for groups, na_field, values in parameters:
                if analysis_type == 'company__age':
                    result_df = statistic_company_age(df)
                elif analysis_type == 'company__duration':
                    result_df = statistic_company_duration(df)
                elif analysis_type == 'position__address__duration':
                    result_df = statistic_position_address_duration(df)
                elif analysis_type == 'address__flow':
                    result_df = statistic_address_flow(df, groups, na_field, *values[1:])
                elif analysis_type == 'address_change':
                    # Without this branch the previous type's result_df was
                    # silently reused and written to the change table. The
                    # call mirrors statistic_neighbour_special.
                    # NOTE(review): confirm statistic_position_address_change
                    # accepts (df, groups, na_field, *extra).
                    result_df = statistic_position_address_change(
                        df, groups, na_field, *values[1:])
                else:
                    # Fail loudly rather than write a stale or undefined result.
                    raise ValueError(
                        "unsupported analysis type: {!r}".format(analysis_type))

                # Filter out invalid values in certain fields
                result_tables[values[0]] = filter_invalid_value(result_df)

            write_mysql(mysql_url, result_tables)
    finally:
        df.unpersist()
def statistic_industry_position_address(df, mysql_url):
    """
    Industry + position + address dimension analysis.

    Keeps rows where ``industry``, ``position_name`` and ``address`` are all
    non-null, builds the ``industry__position__address__work_year`` table and
    writes it with ``write_mysql``.

    :param df: input DataFrame exposing ``industry``, ``position_name`` and
        ``address``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering: one null check per grouping column
    has_industry = df.industry.isNotNull()
    has_position = df.position_name.isNotNull()
    has_address = df.address.isNotNull()
    complete_df = df.filter(has_industry).filter(has_position).filter(has_address)
    complete_df.persist()

    # Single result table for this dimension
    result_tables = {
        "industry__position__address__work_year":
            statistic_industry_position_address_work_year(complete_df),
    }

    # Write the analysis results to the database, then drop the cache
    write_mysql(mysql_url, result_tables)
    complete_df.unpersist()
def statistic_neighbour_special(df, mysql_url):
    """
    Run the position/address change analysis and write it to MySQL.

    ``config`` maps an analysis type to ``(groups, na_field, values)``
    entries; ``values[0]`` is the destination table name and the remaining
    items are forwarded to ``statistic_position_address_change``.

    :param df: input DataFrame exposing ``salary_min`` and ``salary_max``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    config = {
        "address_change": [
            (["position_name"], "address", ("position__address__change", None)),
        ]
    }

    # The minimum salary must exceed MIN_SALARY and the range must not be inverted
    above_floor = df.salary_min > MIN_SALARY
    range_is_valid = df.salary_max >= df.salary_min
    df = df.filter(above_floor).filter(range_is_valid)

    # Only the parameter lists are needed; the type key itself is not read.
    for parameters in config.values():
        result_tables = {}
        df.persist()
        for groups, na_field, values in parameters:
            table_name, *extra_args = values
            changes = statistic_position_address_change(
                df, groups, na_field, *extra_args)
            # Filter out invalid values in certain fields
            result_tables[table_name] = filter_invalid_value(changes)
        write_mysql(mysql_url, result_tables)
        df.unpersist()
def statistic_job(position_df, job_df, mysql_url):
    """
    Analyse job postings and write the results to MySQL.

    Only postings with a non-null ``position_name`` are analysed. The result
    is the ``position__jd`` table produced by ``statistic_job_jd``.

    :param position_df: position data; currently unused because the
        job/position relation table (``job__position__relation``) is disabled
    :param job_df: job-posting DataFrame exposing ``position_name``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    job_df = job_df.filter(job_df.position_name.isNotNull())

    # Always start from a fresh mapping so the assignment below cannot hit
    # an undefined name.
    result_tables = dict()

    # Detail table
    result_tables["position__jd"] = statistic_job_jd(job_df)

    # Write the analysis results to the database
    write_mysql(mysql_url, result_tables)
def statistic_industry_address(df, mysql_url):
    """
    Industry + address dimension analysis.

    Keeps rows with a non-null ``industry`` and ``address``, runs every
    industry/address analysis on them and writes the resulting tables with
    ``write_mysql``.

    :param df: input DataFrame exposing ``industry`` and ``address``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering
    has_industry = df.industry.isNotNull()
    has_address = df.address.isNotNull()
    located_df = df.filter(has_industry).filter(has_address)
    located_df.persist()

    # Result tables keyed by destination table name, built in this order.
    # The work-year range analysis (industry__address__work_year) is
    # currently disabled.
    result_tables = {
        "industry__address__compare": statistic_industry_address_compare(located_df),
        "industry__address__flow": statistic_industry_address_flow(located_df),
        "industry__address__salary": statistic_industry_address_salary_range(located_df),
        "industry__address__age": statistic_industry_address_age_range(located_df),
        "industry__address__gender": statistic_industry_address_gender(located_df),
        "industry__address__degree": statistic_industry_address_degree(located_df),
        "industry__address__salary_work_year": statistic_industry_address_salary_work_year(located_df),
        "industry__address__rank": statistic_industry_address_rank(located_df),
    }

    # Write the analysis results to the database, then drop the cache
    write_mysql(mysql_url, result_tables)
    located_df.unpersist()
def statistic_work_year(df, mysql_url):
    """
    Work-year analysis across several dimensions.

    The input is first passed through ``flat_work_year``. Each configured
    dimension is then analysed with ``statistic_work_year_dimension`` using
    its grouping columns, and all tables are written with one ``write_mysql``
    call.

    :param df: input DataFrame handed to ``flat_work_year``
    :param mysql_url: database URL passed unchanged to ``write_mysql``
    :return: None
    """
    flat_df = flat_work_year(df)
    flat_df.persist()

    # (destination table, grouping columns) for every dimension
    dimensions = (
        ("major__work_year", ["major", "work_year", "degree"]),
        ("school__work_year", ["school_name", "work_year", "degree"]),
        ("company__work_year", ["company_name", "work_year", "position_category"]),
        ("industry__address__work_year", ["industry", "work_year", "address"]),
        ("position__address__work_year", ["position_name", "work_year", "address"]),
        ("school__major__work_year", ["school_name", "major", "work_year", "degree"]),
    )
    result_tables = {
        table: statistic_work_year_dimension(flat_df, group_columns)
        for table, group_columns in dimensions
    }

    write_mysql(mysql_url, result_tables)
    flat_df.unpersist()