Ejemplo n.º 1
0
Archivo: rank.py Proyecto: reganzm/ai
def statistic_rank_dimension(df, groups, na_field, fields, is_sa=True, is_cube=False, sort_field='-avg_salary',
                             need_total=False):
    """
    Aggregate ``df`` over the requested dimensions and rank the result.

    :param df: input DataFrame; rows holding a null in any of the columns used are dropped first
    :param groups: list of grouping column names (it is concatenated with lists below)
    :param na_field: optional single column name; wrapped into a one-element list when truthy
    :param fields: optional single extra column name; wrapped into a one-element list when truthy
    :param is_sa: flag forwarded unchanged to the statistic helper
    :param is_cube: use ``statistic_cube`` when true, otherwise ``statistic_groups``
    :param sort_field: sort key forwarded to ``add_rank``
    :param need_total: when true, the aggregated result is passed through ``add_total``
    :return: whatever ``add_rank`` returns for the aggregated DataFrame
    """
    na_columns = [na_field] if na_field else []
    extra_columns = [fields] if fields else []

    # Drop every row that has a null in any column taking part in the statistic.
    for column in groups + na_columns + extra_columns:
        df = df.filter(df[column].isNotNull())

    aggregate = statistic_cube if is_cube else statistic_groups
    result = aggregate(df, groups, na_columns, extra_columns, is_sa)

    if need_total:
        result = add_total(result, *(groups + na_columns))

    # Without an extra field, the first group column is left out of the rank keys.
    leading = groups if extra_columns else groups[1:]
    return add_rank(result, *(leading + na_columns), sort_field=sort_field)
Ejemplo n.º 2
0
def statistic_company_rank(df):
    """
    Company ranking.

    Keeps rows whose ``avg_salary`` exceeds ``MIN_SALARY``, passes them through
    ``add_median_salary`` keyed by ``company_name``, aggregates per company and
    ranks the companies that have more than ``MIN_NUM`` rows.

    :param df: input DataFrame with ``avg_salary`` and ``company_name`` columns
    :return: the per-company aggregate after ``add_rank``
    """
    well_paid = df.filter(df["avg_salary"] > MIN_SALARY)
    with_median = add_median_salary(well_paid, ("company_name",))

    # One row per company: its row count and the first avg_salary value of the group.
    head_count = F.count("*").alias("person_num")
    salary = F.first("avg_salary").alias("avg_salary")
    per_company = with_median.groupby("company_name").agg(head_count, salary)

    large_enough = per_company.filter(per_company["person_num"] > MIN_NUM)
    return add_rank(large_enough)
Ejemplo n.º 3
0
def statistic_industry_rank(df):
    """
    Industry ranking.

    Keeps rows whose ``avg_salary`` exceeds ``MIN_SALARY``, passes them through
    ``add_median_salary`` keyed by ``industry``, aggregates per industry and
    ranks the industries that have more than ``MIN_NUM`` rows.

    :param df: input DataFrame with ``avg_salary`` and ``industry`` columns
    :return: the per-industry aggregate after ``add_rank``
    """
    well_paid = df.filter(df["avg_salary"] > MIN_SALARY)
    with_median = add_median_salary(well_paid, ("industry", ))

    # One row per industry: its row count and the first avg_salary value of the group.
    head_count = F.count("*").alias("person_num")
    salary = F.first(with_median["avg_salary"]).alias("avg_salary")
    per_industry = with_median.groupby("industry").agg(head_count, salary)

    large_enough = per_industry.filter(per_industry["person_num"] > MIN_NUM)
    return add_rank(large_enough)
Ejemplo n.º 4
0
def statistic_industry_address_rank(df):
    """
    Industry and address ranking.

    Builds one row per (industry, address) pair with more than ``MIN_NUM`` rows,
    appends one roll-up row per industry whose ``address`` is the ``NA``
    placeholder, and ranks the combined rows with ``address`` as the rank key.

    :param df: input DataFrame with ``avg_salary``, ``industry`` and ``address`` columns
    :return: the combined DataFrame after ``add_rank``
    """
    keys = ("industry", "address")
    well_paid = df.filter(df["avg_salary"] > MIN_SALARY)
    with_median = add_median_salary(well_paid, keys)

    per_pair = with_median.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    per_pair = per_pair.filter(per_pair["person_num"] > MIN_NUM)

    # Industry-wide roll-up computed from the surviving (industry, address) rows.
    per_industry = per_pair.groupby("industry").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    per_industry = per_industry.withColumn("address", F.lit(NA))

    combined = per_pair.unionByName(per_industry)
    return add_rank(combined, "address")
Ejemplo n.º 5
0
def statistic_position_address_duration(df):
    """
    Duration of a single employment per position and address.

    Keeps rows with ``0 < work_duration_month < 240`` and non-null
    ``position_name`` / ``address``, cubes over both columns and ranks the
    result by descending ``avg_duration`` with ``address`` as the rank key.

    :param df: input DataFrame with ``work_duration_month``, ``position_name`` and ``address`` columns
    :return: the cubed aggregate after ``add_rank``
    """
    months = df.work_duration_month
    conditions = (
        months.isNotNull(),
        months > 0,
        months < 240,
        df.position_name.isNotNull(),
        df.address.isNotNull(),
    )
    # Apply the filters one after another, in the same order as listed.
    for condition in conditions:
        df = df.filter(condition)
    df = df.withColumn('avg_duration', df.work_duration_month)

    df = filter_cube_num(df, "position_name", "address")
    person_num = F.count("*").alias("person_num")
    avg_duration = F.avg(df.avg_duration).alias("avg_duration")
    cubed = df.cube("position_name", "address").agg(person_num, avg_duration)

    # Discard the cube rows that have no position, then replace the remaining nulls with NA.
    cubed = cubed.filter(cubed["position_name"].isNotNull())
    cubed = cubed.fillna(NA)

    return add_rank(cubed, "address", sort_field='-avg_duration')
Ejemplo n.º 6
0
def statistic_major_rank(df):
    """
    Major ranking.

    Builds one row per (major, degree) pair with more than ``MIN_NUM`` rows,
    appends one roll-up row per major whose ``degree`` is the ``NA``
    placeholder, and ranks the combined rows with ``degree`` as the rank key.

    :param df: input DataFrame with ``major``, ``degree`` and ``avg_salary`` columns
    :return: the combined DataFrame after ``add_rank``
    """
    keys = ("major", "degree")
    with_median = add_median_salary(df, keys)

    per_pair = with_median.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"),
    )
    per_pair = per_pair.filter(per_pair["person_num"] > MIN_NUM)

    # Roll-up across all degrees, computed from the surviving (major, degree) rows.
    per_major = per_pair.groupby("major").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    per_major = per_major.withColumn("degree", F.lit(NA))

    combined = per_pair.unionByName(per_major)
    # Re-apply the row-count threshold to the combined rows.
    combined = combined.filter(combined["person_num"] > MIN_NUM)
    return add_rank(combined, "degree")
Ejemplo n.º 7
0
def statistic_position_address_change(df,
                                      groups,
                                      na_field,
                                      fields,
                                      is_sa=False):
    """
    Position continuity analysis.

    :param df: input DataFrame containing ``resume_id``, ``work_index`` and the selected columns
    :param groups: list of grouping column names; ``groups[0]`` is handed to ``next_same``
    :param na_field: optional single column name; wrapped into a one-element list when truthy
    :param fields: optional single extra column name; wrapped into a one-element list when truthy
    :param is_sa: flag forwarded unchanged to ``statistic_cube``
    :return: the cubed statistic after ``add_rank``, sorted by descending ``person_num``
    """
    na_columns = [na_field] if na_field else []
    extra_columns = [fields] if fields else []
    columns = groups + na_columns + extra_columns

    # Only the first two configured columns plus the resume bookkeeping columns are kept.
    trimmed = df.select(columns[0], columns[1], "resume_id", "work_index")
    trimmed = next_same(trimmed, groups[0])

    cubed = statistic_cube(trimmed, groups, na_columns, extra_columns, is_sa)
    rank_keys = groups[1:] + na_columns
    return add_rank(cubed, *rank_keys, sort_field='-person_num')
Ejemplo n.º 8
0
def statistic_school_rank(df):
    """
    School ranking.

    Builds one row per (school_name, degree) pair with more than ``MIN_NUM``
    rows, appends one roll-up row per school whose ``degree`` is the ``NA``
    placeholder, and ranks the combined rows with ``degree`` as the rank key.

    :param df: input DataFrame with ``school_name``, ``degree`` and ``avg_salary`` columns
    :return: the combined DataFrame after ``add_rank``
    """
    keys = ("school_name", "degree")
    with_median = add_median_salary(df, keys)

    per_pair = with_median.groupby(*keys).agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"),
    )
    per_pair = per_pair.filter(per_pair["person_num"] > MIN_NUM)

    # Roll-up across all degrees, computed from the surviving (school_name, degree) rows.
    per_school = per_pair.groupby("school_name").agg(
        F.sum("person_num").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    per_school = per_school.withColumn("degree", F.lit(NA))

    combined = per_pair.unionByName(per_school)
    # Re-apply the row-count threshold to the combined rows.
    combined = combined.filter(combined["person_num"] > MIN_NUM)
    return add_rank(combined, "degree")