def statistic_work_year_dimension(df, groups):
    """
    Work-year distribution and salary figures per dimension; the salary is
    meant to be the median salary.

    :param df: input DataFrame containing every column named in ``groups``.
    :param groups: ordered list or tuple of grouping column names. The last
        entry is the column that is collapsed to ``NA`` in the roll-up rows.
    :return: DataFrame with the ``groups`` columns, ``person_num`` and
        ``avg_salary``: the detailed rows with more than ``MIN_NUM`` people,
        plus roll-up rows grouped by ``groups[:-1]`` only.
    """
    # Drop rows with a null in any leading dimension (every field except the
    # last two) or in the last field.
    for field in groups[:-2]:
        df = df.filter(df[field].isNotNull())
    df = df.filter(df[groups[-1]].isNotNull())

    # Attach the median salary (helper defined elsewhere in the project).
    df = add_median_salary(df, groups, sort_field="avg_salary")

    # Detailed aggregation. The names are unpacked because DataFrame.groupby
    # only expands a single *list* argument; a tuple passed as one argument
    # is rejected, so unpacking makes both lists and tuples work.
    median_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        # NOTE(review): first() presumably relies on avg_salary being constant
        # within a group after add_median_salary - confirm against the helper.
        F.first("avg_salary").alias("avg_salary"))
    median_df = median_df.filter(median_df.person_num > MIN_NUM)

    # Roll-up with the last dimension unrestricted: head-counts are summed and
    # the salary is the unweighted mean of the detailed rows' salaries.
    median_one_df = median_df.groupby(*groups[:-1]).agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    median_one_df = median_one_df.withColumn(groups[-1], F.lit(NA))

    return median_df.unionByName(median_one_df)
def statistic_median_salary(df, groups, is_rank=False):
    """
    Head-count and salary per group, plus roll-up rows where the last
    grouping column is unrestricted.

    :param df: input DataFrame containing every column named in ``groups``.
    :param groups: ordered list or tuple of grouping column names. The last
        entry is the column that is set to ``NA`` in the roll-up rows.
    :param is_rank: forwarded unchanged to ``add_rank``.
    :return: whatever ``add_rank`` returns for the union of the detailed and
        roll-up aggregates.
    """
    # Attach the median salary (helper defined elsewhere in the project).
    df = add_median_salary(df, groups, sort_field="avg_salary")

    # Detailed aggregation. The names are unpacked because DataFrame.groupby
    # only expands a single *list* argument; a tuple passed as one argument
    # is rejected, so unpacking makes both lists and tuples work.
    median_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))

    # Roll-up with the last dimension unrestricted. Unlike the detailed rows
    # this is computed straight from the row-level data, so avg_salary here is
    # the mean of the row-level values.
    median_one_df = df.groupby(*groups[:-1]).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    median_one_df = median_one_df.withColumn(groups[-1], F.lit(NA))

    median_df = median_df.unionByName(median_one_df)
    return add_rank(median_df, groups, is_rank)
def statistic_position_industry_rank(df):
    """
    Head-count ranking of industries for each position.

    :param df: input DataFrame; reads ``position_name``, ``industry`` and
        ``avg_salary``.
    :return: result of applying ``position_industry_rank`` to every
        ``position_name`` group of the (position, industry) aggregates that
        contain more than ``MIN_NUM`` people.
    """
    keys = ("position_name", "industry")

    # Keep only rows that name an industry and clear the salary floor.
    has_industry = df.industry.isNotNull()
    above_floor = df.avg_salary > MIN_SALARY
    df = df.filter(has_industry).filter(above_floor)

    df = add_median_salary(df, keys)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    per_pair = df.groupby(*keys).agg(head_count, salary)
    per_pair = per_pair.filter(per_pair.person_num > MIN_NUM)

    # Ranking itself is delegated to the grouped function defined elsewhere.
    return per_pair.groupby("position_name").apply(position_industry_rank)
def statistic_industry_address_compare(df):
    """
    Industry versus region comparison.

    :param df: input DataFrame; reads ``industry``, ``address`` and
        ``avg_salary``.
    :return: cube over (industry, address) with ``person_num`` and
        ``avg_salary``; the nulls in the two key columns (including the
        cube's subtotal rows) are replaced by ``NA``.
    """
    keys = ("industry", "address")

    # Only rows with a known salary take part.
    df = df.filter(df.avg_salary.isNotNull())
    df = add_median_salary(df, keys)
    df = filter_cube_num(df, *keys)

    head_count = F.count("*").alias("person_num")
    salary = F.avg("avg_salary").alias("avg_salary")
    cubed = df.cube(*keys).agg(head_count, salary)

    # Fill the null key columns with the placeholder value.
    return cubed.fillna(dict.fromkeys(keys, NA))
def statistic_position_address_compare(df):
    """
    Position versus region comparison.

    :param df: input DataFrame; reads ``position_name``, ``address`` and
        ``avg_salary``.
    :return: cube over (position_name, address) with ``person_num`` and
        ``avg_salary``; the nulls in the two key columns (including the
        cube's subtotal rows) are replaced by ``NA``.
    """
    df = df.filter(df.avg_salary > MIN_SALARY)
    groups = ("position_name", "address")

    # Pass the column names as separate positional arguments, the way every
    # other call to filter_cube_num in this module does; handing over the
    # tuple as a single argument was inconsistent with those call sites.
    df = filter_cube_num(df, *groups)
    df = add_median_salary(df, groups)

    mdf = df.cube(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))

    # Fill the null key columns with the placeholder value.
    return mdf.fillna({"position_name": NA, "address": NA})
def statistic_company_rank(df):
    """
    Company ranking.

    :param df: input DataFrame; reads ``company_name`` and ``avg_salary``.
    :return: result of ``add_rank`` on the per-company aggregates
        (``person_num``, ``avg_salary``) that contain more than ``MIN_NUM``
        people.
    """
    above_floor = df.avg_salary > MIN_SALARY
    df = add_median_salary(df.filter(above_floor), ("company_name",))

    head_count = F.count("*").alias("person_num")
    salary = F.first("avg_salary").alias("avg_salary")
    per_company = df.groupby("company_name").agg(head_count, salary)

    # Discard companies with too few people before ranking.
    per_company = per_company.filter(per_company.person_num > MIN_NUM)
    return add_rank(per_company)
def statistic_industry_rank(df):
    """
    Industry ranking.

    :param df: input DataFrame; reads ``industry`` and ``avg_salary``.
    :return: result of ``add_rank`` on the per-industry aggregates
        (``person_num``, ``avg_salary``) that contain more than ``MIN_NUM``
        people.
    """
    above_floor = df.avg_salary > MIN_SALARY
    df = add_median_salary(df.filter(above_floor), ("industry", ))

    head_count = F.count("*").alias("person_num")
    salary = F.first(df.avg_salary).alias("avg_salary")
    per_industry = df.groupby("industry").agg(head_count, salary)

    # Discard industries with too few people before ranking.
    per_industry = per_industry.filter(per_industry.person_num > MIN_NUM)
    return add_rank(per_industry)
def statistic_industry_address_work_year_range(df):
    """
    Work-year-range distribution per industry (and region).

    :param df: input DataFrame; reads ``industry``, ``address``,
        ``work_year_range`` and ``avg_salary``.
    :return: union of the (industry, address, work_year_range) aggregates
        with more than ``MIN_NUM`` people and their roll-up over address,
        where ``address`` is set to ``NA``.
    """
    keys = ("industry", "address", "work_year_range")

    has_range = df.work_year_range.isNotNull()
    above_floor = df.avg_salary > MIN_SALARY
    df = df.filter(has_range).filter(above_floor)
    df = add_median_salary(df, keys)

    detailed = df.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Roll the surviving rows up over address: sum the head-counts and take
    # the unweighted mean of the per-address salaries.
    rolled_up = detailed.groupby("industry", "work_year_range").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    rolled_up = rolled_up.withColumn("address", F.lit(NA))

    return detailed.unionByName(rolled_up)
def statistic_position_address_degree(df):
    """
    Degree distribution per position and region.

    :param df: input DataFrame; reads ``position_name``, ``address``,
        ``degree`` and ``avg_salary``.
    :return: union of the (position_name, address, degree) aggregates with
        more than ``MIN_NUM`` people and their roll-up over address, where
        ``address`` is set to ``NA``.
    """
    keys = ("position_name", "address", "degree")

    has_degree = df.degree.isNotNull()
    above_floor = df.avg_salary > MIN_SALARY
    df = df.filter(has_degree).filter(above_floor)
    df = add_median_salary(df, keys)

    detailed = df.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Roll the surviving rows up over address: sum the head-counts and take
    # the unweighted mean of the per-address salaries.
    rolled_up = detailed.groupby("position_name", "degree").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    rolled_up = rolled_up.withColumn("address", F.lit(NA))

    return detailed.unionByName(rolled_up)
def statistic_school_major_rank(df):
    """
    Ranking of schools per major (and degree).

    :param df: input DataFrame; reads ``school_name``, ``major``, ``degree``
        and ``avg_salary``.
    :return: result of applying ``school_major_rank`` to the
        (major, degree) groups of the cubed aggregates.
    """
    keys = ("school_name", "major", "degree")

    df = filter_cube_num(df, *keys)
    df = add_median_salary(df, keys)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    cubed = df.cube(*keys).agg(head_count, salary)

    # Fill the null key columns (including the cube's subtotal rows) with
    # the placeholder value.
    cubed = cubed.fillna(dict.fromkeys(keys, NA))

    # NOTE(review): the grouping below is built with cube() rather than
    # groupby(); kept exactly as in the original - confirm it is intended.
    by_major_degree = cubed.cube("major", "degree")
    return by_major_degree.apply(school_major_rank)
def statistic_major_address(df):
    """
    Major versus work-location analysis.

    :param df: input DataFrame; reads ``major``, ``degree``, ``address`` and
        ``avg_salary``.
    :return: union of the (major, degree, address) aggregates with more than
        ``MIN_NUM`` people and their roll-up over degree, where ``degree``
        is set to ``NA``.
    """
    keys = ("major", "degree", "address")

    df = add_median_salary(df.filter(df.address.isNotNull()), keys)

    head_count = F.count("*").alias("person_num")
    salary = F.first(df.avg_salary).alias("avg_salary")
    detailed = df.groupby(*keys).agg(head_count, salary)
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Degree-unrestricted roll-up of the surviving rows.
    any_degree = detailed.groupby("major", "address").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detailed.unionByName(any_degree)
def statistic_major_company(df):
    """
    Distribution of the companies that graduates of each major work for.

    :param df: input DataFrame; reads ``major``, ``degree``,
        ``company_name`` and ``avg_salary``.
    :return: union of the (major, degree, company_name) aggregates with more
        than ``MIN_NUM`` people and their roll-up over degree, where
        ``degree`` is set to ``NA``.
    """
    keys = ("major", "degree", "company_name")

    df = add_median_salary(df.filter(df.company_name.isNotNull()), keys)

    head_count = F.count("*").alias("person_num")
    salary = F.first(df.avg_salary).alias("avg_salary")
    detailed = df.groupby(*keys).agg(head_count, salary)
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Degree-unrestricted roll-up of the surviving rows.
    any_degree = detailed.groupby("major", "company_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detailed.unionByName(any_degree)
def statistic_major_gender(df):
    """
    Gender breakdown per major.

    :param df: input DataFrame; reads ``major``, ``degree``, ``gender`` and
        ``avg_salary``.
    :return: union of the (major, degree, gender) aggregates with more than
        ``MIN_NUM`` people and their roll-up over degree, where ``degree``
        is set to ``NA``.
    """
    keys = ("major", "degree", "gender")

    df = add_median_salary(df.filter(df.gender.isNotNull()), keys)

    head_count = F.count("*").alias("person_num")
    salary = F.first(df.avg_salary).alias("avg_salary")
    detailed = df.groupby(*keys).agg(head_count, salary)
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Degree-unrestricted roll-up of the surviving rows.
    any_degree = detailed.groupby("major", "gender").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detailed.unionByName(any_degree)
def statistic_industry_address_rank(df):
    """
    Industry and region ranking.

    :param df: input DataFrame; reads ``industry``, ``address`` and
        ``avg_salary``.
    :return: result of ``add_rank(..., "address")`` on the union of the
        (industry, address) aggregates with more than ``MIN_NUM`` people and
        their per-industry roll-up, where ``address`` is set to ``NA``.
    """
    keys = ("industry", "address")

    df = df.filter(df.avg_salary > MIN_SALARY)
    df = add_median_salary(df, keys)

    detailed = df.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Per-industry roll-up of the surviving rows, regardless of address.
    any_address = detailed.groupby("industry").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_address = any_address.withColumn("address", F.lit(NA))

    combined = detailed.unionByName(any_address)
    return add_rank(combined, "address")
def statistic_school_major_position(df):
    """
    Head-count and salary per position within the school / major dimensions.

    :param df: input DataFrame; reads ``school_name``, ``major``,
        ``degree``, ``position_name`` and ``avg_salary``.
    :return: cubed aggregates that keep a concrete ``position_name`` and at
        least one non-null value among school, major and degree; the
        remaining nulls in those three columns are replaced by ``NA``.
    """
    dimension_cols = ["school_name", "major", "degree"]
    keys = ("school_name", "major", "degree", "position_name")

    df = df.filter(df.position_name.isNotNull())
    df = filter_cube_num(df, *keys)
    df = add_median_salary(df, keys)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    cubed = df.cube(*keys).agg(head_count, salary)

    # Drop the rows where position_name is null after cubing.
    cubed = cubed.filter(cubed.position_name.isNotNull())
    # Drop the rows where school, major and degree are all null.
    cubed = cubed.dropna(how="all", subset=dimension_cols)

    return cubed.fillna(dict.fromkeys(dimension_cols, NA))
def statistic_person_rank(df):
    """
    People ranking by region, age and gender.

    :param df: input DataFrame; reads ``address``, ``age``, ``gender`` and
        ``avg_salary``.
    :return: cube over (address, age, gender) with ``person_num`` and
        ``avg_salary``, restricted to rows with a concrete ``age``; nulls in
        ``address`` and ``gender`` are replaced by ``NA``.
    """
    keys = ("address", "age", "gender")

    for required in (df.address, df.gender):
        df = df.filter(required.isNotNull())

    df = filter_cube_num(df, *keys)
    df = add_median_salary(df, keys)

    # Convert age to text with a Python UDF. The column expression is built
    # from the frame as it is *before* filter_age runs, as in the original.
    age_as_text = F.udf(str)(df.age)
    df = filter_age(df).withColumn("age", age_as_text)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    cubed = df.cube(*keys).agg(head_count, salary)

    # Keep only the rows that carry a concrete age.
    cubed = cubed.filter(cubed.age.isNotNull())
    return cubed.fillna({"address": NA, "gender": NA})
def statistic_school_position(df):
    """
    School versus position analysis, enriched with position aliases and the
    industries associated with each position.

    :param df: input DataFrame; reads ``school_name``, ``degree``,
        ``position_name``, ``position_title``, ``industry`` and
        ``avg_salary``.
    :return: (school_name, degree, position_name) aggregates with more than
        ``MIN_NUM`` people plus their degree-unrestricted roll-up
        (``degree`` set to ``NA``), inner-joined on ``position_name`` with
        the ``position_alias`` and ``industry_alias`` columns.
    """
    keys = ("school_name", "degree", "position_name")
    # Joins the elements of a collected set into one "/"-separated string.
    slash_join = F.udf(lambda x: "/".join(x))

    df = df.filter(df.position_name.isNotNull())
    df = add_median_salary(df, keys)

    # --- Position aliases: normalised titles seen for each position -------
    normalised_title = F.lower(F.trim(df.position_title))
    df = df.withColumn("position_title", normalised_title)
    titles = df.groupby("position_name", "position_title").agg(
        F.count("*").alias("total"))
    titles = titles.groupby("position_name").apply(filter_position)
    titles = titles.groupby("position_name").agg(
        F.collect_set("position_title").alias("position_set"))
    titles = titles.withColumn("position_alias", slash_join(titles.position_set))
    titles = titles.select("position_name", "position_alias")

    # --- Industries associated with each position --------------------------
    industries = df.groupby("position_name", "industry").agg(
        F.count("*").alias("total"))
    industries = industries.groupby("position_name").apply(filter_industry)
    industries = industries.groupby("position_name").agg(
        F.collect_set("industry").alias("industry_set"))
    industries = industries.withColumn(
        "industry_alias", slash_join(industries.industry_set))
    industries = industries.select("position_name", "industry_alias")

    # --- Degree-specific aggregates ----------------------------------------
    detailed = df.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # --- Degree-unrestricted roll-up ----------------------------------------
    any_degree = detailed.groupby("school_name", "position_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    combined = detailed.unionByName(any_degree)

    # Attach the aliases first, then the industries, as in the original.
    combined = combined.join(titles, "position_name")
    return combined.join(industries, "position_name")
def statistic_major_rank(df):
    """
    Major ranking.

    :param df: input DataFrame; reads ``major``, ``degree`` and
        ``avg_salary``.
    :return: result of ``add_rank(..., "degree")`` on the (major, degree)
        aggregates with more than ``MIN_NUM`` people together with their
        per-major roll-up, where ``degree`` is set to ``NA``.
    """
    keys = ("major", "degree")
    df = add_median_salary(df, keys)

    head_count = F.count("*").alias("person_num")
    salary = F.first("avg_salary").alias("avg_salary")
    detailed = df.groupby(*keys).agg(head_count, salary)
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Degree-unrestricted roll-up of the surviving rows.
    any_degree = detailed.groupby("major").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    combined = detailed.unionByName(any_degree)
    # The size threshold is applied once more to the combined rows.
    combined = combined.filter(combined.person_num > MIN_NUM)
    return add_rank(combined, "degree")
def statistic_school_gender(df):
    """
    School versus gender analysis.

    :param df: input DataFrame; reads ``school_name``, ``degree``,
        ``gender`` and ``avg_salary``.
    :return: union of the (school_name, degree, gender) aggregates with more
        than ``MIN_NUM`` people and their roll-up over degree, where
        ``degree`` is set to ``NA``.
    """
    keys = ("school_name", "degree", "gender")

    df = add_median_salary(df.filter(df.gender.isNotNull()), keys)

    head_count = F.count("*").alias("person_num")
    salary = F.first("avg_salary").alias("avg_salary")
    detailed = df.groupby(*keys).agg(head_count, salary)
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Degree-unrestricted roll-up of the surviving rows.
    any_degree = detailed.groupby("school_name", "gender").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detailed.unionByName(any_degree)
def statistic_school_major_position_rank(df):
    """
    Position distribution per school / major / degree.

    :param df: input DataFrame; reads ``school_name``, ``major``,
        ``degree``, ``position_name`` and ``avg_salary``.
    :return: result of applying ``school_major_position_rank`` to each
        (school_name, major, degree) group of the cubed aggregates that
        contain more than ``MIN_NUM`` people.
    """
    keys = ("school_name", "major", "degree", "position_name")

    df = df.filter(df.position_name.isNotNull())
    df = filter_cube_num(df, *keys)
    df = add_median_salary(df, keys)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    cubed = df.cube(*keys).agg(head_count, salary)

    # Drop the rows where position_name is null after cubing ...
    cubed = cubed.dropna(how="all", subset=["position_name"])
    # ... and the rows where both school and major are null.
    cubed = cubed.dropna(how="all", subset=["school_name", "major"])

    cubed = cubed.fillna({"school_name": NA, "major": NA, "degree": NA})
    cubed = cubed.filter(cubed.person_num > MIN_NUM)

    per_dimension = cubed.groupby("school_name", "major", "degree")
    return per_dimension.apply(school_major_position_rank)
def statistic_school_rank(df):
    """
    School ranking.

    :param df: input DataFrame; reads ``school_name``, ``degree`` and
        ``avg_salary``.
    :return: result of ``add_rank(..., "degree")`` on the
        (school_name, degree) aggregates with more than ``MIN_NUM`` people
        together with their per-school roll-up, where ``degree`` is set to
        ``NA``.
    """
    keys = ("school_name", "degree")
    df = add_median_salary(df, keys)

    head_count = F.count("*").alias("person_num")
    salary = F.first("avg_salary").alias("avg_salary")
    detailed = df.groupby(*keys).agg(head_count, salary)
    detailed = detailed.filter(detailed.person_num > MIN_NUM)

    # Degree-unrestricted roll-up of the surviving rows.
    any_degree = detailed.groupby("school_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    combined = detailed.unionByName(any_degree)
    # The size threshold is applied once more to the combined rows.
    combined = combined.filter(combined.person_num > MIN_NUM)
    return add_rank(combined, "degree")