예제 #1
0
def statistic_work_year_dimension(df, groups):
    """
    Head count and salary per ``groups`` combination, plus a roll-up over the
    last grouping column.

    Per the original notes the reported salary is the median salary; it is
    produced by ``add_median_salary``, which is defined elsewhere.

    :param df: input Spark DataFrame holding the ``groups`` columns and
        ``avg_salary``
    :param groups: ordered grouping column names; the last one is replaced by
        ``NA`` in the roll-up rows
    :return: DataFrame with the ``groups`` columns, ``person_num`` and
        ``avg_salary`` (detail rows unioned with the roll-up rows)
    """
    # Keep only rows that are non-null in every grouping column except the
    # second-to-last one.
    # NOTE(review): groups[-2] is never null-checked -- confirm this is intended.
    required = list(groups[:-2]) + [groups[-1]]
    for name in required:
        df = df.filter(df[name].isNotNull())

    # Original note: "add median salary".
    df = add_median_salary(df, groups, sort_field="avg_salary")

    # Detail level: one row per full combination of grouping columns.
    detail = df.groupby(groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Roll-up level: built from the surviving detail rows, with the last
    # grouping column set to the NA placeholder.
    overall = detail.groupby(groups[:-1]).agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    overall = overall.withColumn(groups[-1], F.lit(NA))

    return detail.unionByName(overall)
예제 #2
0
def statistic_median_salary(df, groups, is_rank=False):
    """
    Head count and salary per ``groups`` combination, plus a roll-up over the
    last grouping column, passed through ``add_rank``.

    :param df: input Spark DataFrame holding the ``groups`` columns and
        ``avg_salary``
    :param groups: ordered grouping column names; the last one is replaced by
        ``NA`` in the roll-up rows
    :param is_rank: forwarded unchanged to ``add_rank``
    :return: result of ``add_rank`` on the unioned detail and roll-up rows
    """
    df = add_median_salary(df, groups, sort_field="avg_salary")

    # Detail level: one row per full combination of grouping columns.
    detail = df.groupby(groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))

    # Roll-up level: aggregated directly from the person-level rows (not from
    # `detail`), with the last grouping column set to the NA placeholder.
    overall = df.groupby(groups[:-1]).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    overall = overall.withColumn(groups[-1], F.lit(NA))

    combined = detail.unionByName(overall)
    return add_rank(combined, groups, is_rank)
예제 #3
0
def statistic_position_industry_rank(df):
    """
    Head count and salary per position and industry, ranked within each
    position by the ``position_industry_rank`` grouped function.

    :param df: input Spark DataFrame with ``position_name``, ``industry`` and
        ``avg_salary`` columns
    :return: output of applying ``position_industry_rank`` to each
        ``position_name`` group
    """
    groups = ("position_name", "industry")

    # Both conditions are built on the incoming frame before any filtering.
    has_industry = df.industry.isNotNull()
    above_floor = (df.avg_salary > MIN_SALARY)
    df = df.filter(has_industry).filter(above_floor)

    df = add_median_salary(df, groups)
    per_pair = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg(df.avg_salary).alias("avg_salary"))
    per_pair = per_pair.filter(per_pair.person_num > MIN_NUM)

    return per_pair.groupby("position_name").apply(position_industry_rank)
예제 #4
0
def statistic_industry_address_compare(df):
    """
    Industry versus region comparison.

    :param df: input Spark DataFrame with ``industry``, ``address`` and
        ``avg_salary`` columns
    :return: cube over ``industry`` and ``address`` with ``person_num`` and
        ``avg_salary``; null grouping cells are filled with ``NA``
    """
    groups = ("industry", "address")

    with_salary = df.filter(df.avg_salary.isNotNull())
    df = add_median_salary(with_salary, groups)
    df = filter_cube_num(df, "industry", "address")

    cubed = df.cube("industry", "address").agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))

    # Null grouping cells (including the cube's subtotal rows) get the NA
    # placeholder.
    placeholders = {"industry": NA, "address": NA}
    return cubed.fillna(placeholders)
예제 #5
0
def statistic_position_address_compare(df):
    """
    Position versus region comparison.

    :param df: input Spark DataFrame with ``position_name``, ``address`` and
        ``avg_salary`` columns
    :return: cube over ``position_name`` and ``address`` with ``person_num``
        and ``avg_salary``; null grouping cells are filled with ``NA``
    """
    df = df.filter(df.avg_salary > MIN_SALARY)
    groups = ("position_name", "address")
    # Pass the column names as separate positional arguments, the same way
    # they are handed to df.cube() below.
    df = filter_cube_num(df, *groups)
    df = add_median_salary(df, groups)
    mdf = df.cube(*groups).agg(F.count("*").alias("person_num"),
                               F.avg("avg_salary").alias("avg_salary"))
    # Null grouping cells (including the cube's subtotal rows) get the NA
    # placeholder.
    return mdf.fillna({"position_name": NA, "address": NA})
예제 #6
0
파일: company.py 프로젝트: reganzm/ai
def statistic_company_rank(df):
    """
    Company ranking.

    :param df: input Spark DataFrame with ``company_name`` and ``avg_salary``
        columns
    :return: result of ``add_rank`` on the per-company ``person_num`` and
        ``avg_salary`` rows that exceed ``MIN_NUM`` people
    """
    above_floor = (df.avg_salary > MIN_SALARY)
    df = add_median_salary(df.filter(above_floor), ("company_name",))

    per_company = df.groupby("company_name").agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    per_company = per_company.filter(per_company.person_num > MIN_NUM)

    return add_rank(per_company)
예제 #7
0
파일: industry.py 프로젝트: reganzm/ai
def statistic_industry_rank(df):
    """
    Industry ranking.

    :param df: input Spark DataFrame with ``industry`` and ``avg_salary``
        columns
    :return: result of ``add_rank`` on the per-industry ``person_num`` and
        ``avg_salary`` rows that exceed ``MIN_NUM`` people
    """
    above_floor = (df.avg_salary > MIN_SALARY)
    df = add_median_salary(df.filter(above_floor), ("industry", ))

    head_count = F.count("*").alias("person_num")
    salary = F.first(df.avg_salary).alias("avg_salary")
    per_industry = df.groupby("industry").agg(head_count, salary)
    per_industry = per_industry.filter(per_industry.person_num > MIN_NUM)

    return add_rank(per_industry)
예제 #8
0
def statistic_industry_address_work_year_range(df):
    """
    Work-year-range distribution per industry and region.

    :param df: input Spark DataFrame with ``industry``, ``address``,
        ``work_year_range`` and ``avg_salary`` columns
    :return: per (industry, address, work_year_range) rows with ``person_num``
        and ``avg_salary``, unioned with per (industry, work_year_range) rows
        whose ``address`` is ``NA``
    """
    groups = ("industry", "address", "work_year_range")

    has_range = df.work_year_range.isNotNull()
    above_floor = df.avg_salary > MIN_SALARY
    df = df.filter(has_range).filter(above_floor)
    df = add_median_salary(df, groups)

    # Detail level, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Region-independent level, built from the surviving detail rows.
    any_address = detail.groupby("industry", "work_year_range").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_address = any_address.withColumn("address", F.lit(NA))

    return detail.unionByName(any_address)
예제 #9
0
def statistic_position_address_degree(df):
    """
    Degree distribution per position and region.

    :param df: input Spark DataFrame with ``position_name``, ``address``,
        ``degree`` and ``avg_salary`` columns
    :return: per (position_name, address, degree) rows with ``person_num`` and
        ``avg_salary``, unioned with per (position_name, degree) rows whose
        ``address`` is ``NA``
    """
    groups = ("position_name", "address", "degree")

    has_degree = df.degree.isNotNull()
    above_floor = (df.avg_salary > MIN_SALARY)
    df = df.filter(has_degree).filter(above_floor)
    df = add_median_salary(df, groups)

    # Detail level, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Region-independent level, built from the surviving detail rows.
    any_address = detail.groupby("position_name", "degree").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_address = any_address.withColumn("address", F.lit(NA))

    return detail.unionByName(any_address)
예제 #10
0
def statistic_school_major_rank(df):
    """
    Ranking of majors within schools.

    :param df: input Spark DataFrame with ``school_name``, ``major``,
        ``degree`` and ``avg_salary`` columns
    :return: output of applying ``school_major_rank`` to the cubed statistics
        grouped on ``major`` and ``degree``
    """
    groups = ("school_name", "major", "degree")
    df = filter_cube_num(df, *groups)
    df = add_median_salary(df, groups)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    cubed = df.cube(*groups).agg(head_count, salary)

    # Null grouping cells (including the cube's subtotal rows) get the NA
    # placeholder.
    cubed = cubed.fillna({"school_name": NA, "major": NA, "degree": NA})

    # NOTE(review): the grouped apply is issued on cube(), not groupby() --
    # confirm this is intended.
    ranked = cubed.cube("major", "degree").apply(school_major_rank)
    return ranked
예제 #11
0
def statistic_major_address(df):
    """
    Major versus work-location analysis.

    :param df: input Spark DataFrame with ``major``, ``degree``, ``address``
        and ``avg_salary`` columns
    :return: per (major, degree, address) rows with ``person_num`` and
        ``avg_salary``, unioned with per (major, address) rows whose
        ``degree`` is ``NA``
    """
    groups = ("major", "degree", "address")
    located = df.filter(df.address.isNotNull())
    df = add_median_salary(located, groups)

    # Detail level, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first(df.avg_salary).alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Degree-independent level, built from the surviving detail rows.
    any_degree = detail.groupby("major", "address").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detail.unionByName(any_degree)
예제 #12
0
def statistic_major_company(df):
    """
    Company destinations of graduates per major.

    The original note mentions "the last five years"; no time filter is
    applied inside this function.

    :param df: input Spark DataFrame with ``major``, ``degree``,
        ``company_name`` and ``avg_salary`` columns
    :return: per (major, degree, company_name) rows with ``person_num`` and
        ``avg_salary``, unioned with per (major, company_name) rows whose
        ``degree`` is ``NA``
    """
    groups = ("major", "degree", "company_name")
    employed = df.filter(df.company_name.isNotNull())
    df = add_median_salary(employed, groups)

    # Detail level, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first(df.avg_salary).alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Degree-independent level, built from the surviving detail rows.
    any_degree = detail.groupby("major", "company_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detail.unionByName(any_degree)
예제 #13
0
def statistic_major_gender(df):
    """
    Head count and salary per major, degree and gender.

    :param df: input Spark DataFrame with ``major``, ``degree``, ``gender``
        and ``avg_salary`` columns
    :return: per (major, degree, gender) rows with ``person_num`` and
        ``avg_salary``, unioned with per (major, gender) rows whose
        ``degree`` is ``NA``
    """
    groups = ("major", "degree", "gender")
    with_gender = df.filter(df.gender.isNotNull())
    df = add_median_salary(with_gender, groups)

    # Detail level, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first(df.avg_salary).alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Degree-independent level, built from the surviving detail rows.
    any_degree = detail.groupby("major", "gender").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detail.unionByName(any_degree)
예제 #14
0
def statistic_industry_address_rank(df):
    """
    Industry and region ranking.

    :param df: input Spark DataFrame with ``industry``, ``address`` and
        ``avg_salary`` columns
    :return: result of ``add_rank(..., "address")`` on the per
        (industry, address) rows unioned with per-industry rows whose
        ``address`` is ``NA``
    """
    groups = ("industry", "address")
    above_floor = (df.avg_salary > MIN_SALARY)
    df = add_median_salary(df.filter(above_floor), groups)

    # Detail level, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Region-independent level, built from the surviving detail rows.
    any_address = detail.groupby("industry").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_address = any_address.withColumn("address", F.lit(NA))

    combined = detail.unionByName(any_address)
    return add_rank(combined, "address")
예제 #15
0
def statistic_school_major_position(df):
    """
    Head count and salary per position across the school / major / degree
    dimensions.

    :param df: input Spark DataFrame with ``school_name``, ``major``,
        ``degree``, ``position_name`` and ``avg_salary`` columns
    :return: cubed statistics with a non-null ``position_name`` and at least
        one of ``school_name`` / ``major`` / ``degree`` set; remaining null
        cells in those three columns are filled with ``NA``
    """
    groups = ("school_name", "major", "degree", "position_name")
    dimension_cols = ["school_name", "major", "degree"]

    df = df.filter(df.position_name.isNotNull())
    df = filter_cube_num(df, *groups)
    df = add_median_salary(df, groups)

    cubed = df.cube(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg(df.avg_salary).alias("avg_salary"))

    # Discard the cube rows that aggregate across positions.
    cubed = cubed.filter(cubed.position_name.isNotNull())
    # Discard rows where all three dimension columns are null.
    cubed = cubed.dropna(how="all", subset=dimension_cols)

    return cubed.fillna({"school_name": NA, "major": NA, "degree": NA})
예제 #16
0
def statistic_person_rank(df):
    """
    Head count and salary by region, age and gender.

    :param df: input Spark DataFrame with ``address``, ``age``, ``gender``
        and ``avg_salary`` columns
    :return: cubed statistics restricted to rows with a non-null ``age``;
        null ``address`` / ``gender`` cells are filled with ``NA``
    """
    groups = ("address", "age", "gender")

    for required in ("address", "gender"):
        df = df.filter(df[required].isNotNull())

    df = filter_cube_num(df, *groups)
    df = add_median_salary(df, groups)

    # Run the age filter, then replace `age` with its str() rendering; the
    # column reference is taken from the frame as it was before filter_age.
    age_filtered = filter_age(df)
    df = age_filtered.withColumn("age", F.udf(str)(df.age))

    cubed = df.cube(*groups).agg(
        F.count("*").alias("person_num"),
        F.avg(df.avg_salary).alias("avg_salary"))

    # Discard the cube rows that aggregate across ages.
    cubed = cubed.filter(cubed.age.isNotNull())
    return cubed.fillna({"address": NA, "gender": NA})
예제 #17
0
def statistic_school_position(df):
    """
    School versus position analysis, enriched with position aliases and the
    industries associated with each position.

    :param df: input Spark DataFrame with ``school_name``, ``degree``,
        ``position_name``, ``position_title``, ``industry`` and
        ``avg_salary`` columns
    :return: per (school_name, degree, position_name) rows with
        ``person_num`` and ``avg_salary``, unioned with per
        (school_name, position_name) rows whose ``degree`` is ``NA``, then
        inner-joined on ``position_name`` with the ``position_alias`` and
        ``industry_alias`` columns
    """
    df = df.filter(df.position_name.isNotNull())
    groups = ("school_name", "degree", "position_name")
    df = add_median_salary(df, groups)

    # Position aliases: titles are normalised (trimmed, lower-cased) first.
    df = df.withColumn("position_title", F.lower(F.trim(df.position_title)))
    pdf = _position_value_alias(df, "position_title", filter_position,
                                "position_alias")
    # Industries associated with each position.
    idf = _position_value_alias(df, "industry", filter_industry,
                                "industry_alias")

    # Statistics per degree, keeping only sufficiently populated combinations.
    sdp_df = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    sdp_df = sdp_df.filter(sdp_df.person_num > MIN_NUM)

    # Degree-independent statistics, built from the surviving detail rows.
    sp_df = sdp_df.groupby("school_name", "position_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    sp_df = sp_df.withColumn("degree", F.lit(NA))
    sdp_df = sdp_df.unionByName(sp_df)

    # Inner joins: positions missing from either alias table are dropped.
    sdp_df = sdp_df.join(pdf, "position_name")
    sdp_df = sdp_df.join(idf, "position_name")
    return sdp_df


def _position_value_alias(df, value_col, group_filter, alias_col):
    """
    Build a (position_name, alias_col) table joining the kept ``value_col``
    values of each position with "/".

    :param df: Spark DataFrame with ``position_name`` and ``value_col``
    :param value_col: column whose distinct values are collected per position
    :param group_filter: grouped function applied per ``position_name`` to the
        (position_name, value_col, total) counts to decide which rows to keep
    :param alias_col: name of the resulting "/"-joined string column
    :return: DataFrame with exactly ``position_name`` and ``alias_col``
    """
    set_col = value_col + "_set"
    counts = df.groupby("position_name",
                        value_col).agg(F.count("*").alias("total"))
    kept = counts.groupby("position_name").apply(group_filter)
    collected = kept.groupby("position_name").agg(
        F.collect_set(value_col).alias(set_col))
    collected = collected.withColumn(
        alias_col, F.udf(lambda x: "/".join(x))(collected[set_col]))
    return collected.select("position_name", alias_col)
예제 #18
0
def statistic_major_rank(df):
    """
    Major ranking.

    :param df: input Spark DataFrame with ``major``, ``degree`` and
        ``avg_salary`` columns
    :return: result of ``add_rank(..., "degree")`` on the per (major, degree)
        rows unioned with per-major rows whose ``degree`` is ``NA``
    """
    groups = ("major", "degree")
    df = add_median_salary(df, groups)

    # Statistics per degree, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Degree-independent statistics, built from the surviving detail rows.
    any_degree = detail.groupby("major").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    # The head-count threshold is applied once more to the unioned rows.
    combined = detail.unionByName(any_degree)
    combined = combined.filter(combined.person_num > MIN_NUM)

    return add_rank(combined, "degree")
예제 #19
0
def statistic_school_gender(df):
    """
    School versus gender analysis.

    :param df: input Spark DataFrame with ``school_name``, ``degree``,
        ``gender`` and ``avg_salary`` columns
    :return: per (school_name, degree, gender) rows with ``person_num`` and
        ``avg_salary``, unioned with per (school_name, gender) rows whose
        ``degree`` is ``NA``
    """
    groups = ("school_name", "degree", "gender")
    with_gender = df.filter(df.gender.isNotNull())
    df = add_median_salary(with_gender, groups)

    # Statistics per degree, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Degree-independent statistics, built from the surviving detail rows.
    any_degree = detail.groupby("school_name", "gender").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    return detail.unionByName(any_degree)
예제 #20
0
def statistic_school_major_position_rank(df):
    """
    Position distribution per school / major / degree, ranked by the
    ``school_major_position_rank`` grouped function.

    :param df: input Spark DataFrame with ``school_name``, ``major``,
        ``degree``, ``position_name`` and ``avg_salary`` columns
    :return: output of applying ``school_major_position_rank`` to each
        (school_name, major, degree) group of the cubed statistics
    """
    groups = ("school_name", "major", "degree", "position_name")

    df = df.filter(df.position_name.isNotNull())
    df = filter_cube_num(df, *groups)
    df = add_median_salary(df, groups)

    head_count = F.count("*").alias("person_num")
    salary = F.avg(df.avg_salary).alias("avg_salary")
    cubed = df.cube(*groups).agg(head_count, salary)

    # Discard rows with a null position, then rows where both school and
    # major are null.
    cubed = cubed.dropna(how="all", subset=["position_name"])
    cubed = cubed.dropna(how="all", subset=["school_name", "major"])

    # Remaining null dimension cells get the NA placeholder.
    cubed = cubed.fillna({"school_name": NA, "major": NA, "degree": NA})
    cubed = cubed.filter(cubed.person_num > MIN_NUM)

    by_dimension = cubed.groupby("school_name", "major", "degree")
    return by_dimension.apply(school_major_position_rank)
예제 #21
0
def statistic_school_rank(df):
    """
    School ranking.

    :param df: input Spark DataFrame with ``school_name``, ``degree`` and
        ``avg_salary`` columns
    :return: result of ``add_rank(..., "degree")`` on the per
        (school_name, degree) rows unioned with per-school rows whose
        ``degree`` is ``NA``
    """
    groups = ("school_name", "degree")
    df = add_median_salary(df, groups)

    # Statistics per degree, keeping only sufficiently populated combinations.
    detail = df.groupby(*groups).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"))
    detail = detail.filter(detail.person_num > MIN_NUM)

    # Degree-independent statistics, built from the surviving detail rows.
    any_degree = detail.groupby("school_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"))
    any_degree = any_degree.withColumn("degree", F.lit(NA))

    # The head-count threshold is applied once more to the unioned rows.
    combined = detail.unionByName(any_degree)
    combined = combined.filter(combined.person_num > MIN_NUM)

    return add_rank(combined, "degree")