def statistic_rank_dimension(df, groups, na_field, fields, is_sa=True, is_cube=False,
                             sort_field='-avg_salary', need_total=False):
    """
    Ranking over a set of dimensions.

    Rows with a null in any of the ``groups``, ``na_field`` or ``fields``
    columns are dropped, the data is aggregated with ``statistic_cube`` or
    ``statistic_groups``, optionally passed through ``add_total``, and finally
    handed to ``add_rank``.

    :param df: input DataFrame
    :param groups: list of grouping column names (concatenated with lists,
        so it must itself be a list)
    :param na_field: a single column name, or a falsy value for none
    :param fields: a single column name, or a falsy value for none
    :param is_sa: forwarded unchanged to the aggregation helper
    :param is_cube: use ``statistic_cube`` when true, else ``statistic_groups``
    :param sort_field: forwarded to ``add_rank`` as ``sort_field``
    :param need_total: when true, call ``add_total`` with ``groups + na_field``
    :return: the DataFrame returned by ``add_rank``
    """
    # Normalise the two scalar-or-empty arguments into lists.
    na_field = [na_field] if na_field else []
    fields = [fields] if fields else []
    dimension_keys = groups + na_field

    # Every participating column must be non-null.
    for column_name in dimension_keys + fields:
        df = df.filter(df[column_name].isNotNull())

    aggregate = statistic_cube if is_cube else statistic_groups
    result = aggregate(df, groups, na_field, fields, is_sa)

    if need_total:
        result = add_total(result, *dimension_keys)

    # Without an extra field, the first group column is left out of the
    # columns passed to add_rank.
    rank_keys = dimension_keys if fields else groups[1:] + na_field
    return add_rank(result, *rank_keys, sort_field=sort_field)
def statistic_company_rank(df):
    """
    Company ranking.

    Keeps rows whose ``avg_salary`` is above ``MIN_SALARY``, applies
    ``add_median_salary`` keyed on ``company_name``, collapses to one row per
    company with ``person_num`` (row count) and ``avg_salary`` (first value in
    the group), drops companies whose ``person_num`` is not above ``MIN_NUM``
    and passes the result to ``add_rank``.

    :param df: input DataFrame with ``avg_salary`` and ``company_name`` columns
    :return: the DataFrame returned by ``add_rank``
    """
    salaried = df.filter(df.avg_salary > MIN_SALARY)
    salaried = add_median_salary(salaried, ("company_name",))

    # NOTE(review): F.first presumably relies on add_median_salary leaving a
    # single avg_salary value per company -- confirm against that helper.
    per_company = salaried.groupby("company_name").agg(
        F.count("*").alias("person_num"),
        F.first("avg_salary").alias("avg_salary"),
    )
    sizable = per_company.filter(per_company.person_num > MIN_NUM)
    return add_rank(sizable)
def statistic_industry_rank(df):
    """
    Industry ranking.

    Keeps rows whose ``avg_salary`` is above ``MIN_SALARY``, applies
    ``add_median_salary`` keyed on ``industry``, collapses to one row per
    industry with ``person_num`` (row count) and ``avg_salary`` (first value in
    the group), drops industries whose ``person_num`` is not above ``MIN_NUM``
    and passes the result to ``add_rank``.

    :param df: input DataFrame with ``avg_salary`` and ``industry`` columns
    :return: the DataFrame returned by ``add_rank``
    """
    salaried = df.filter(df.avg_salary > MIN_SALARY)
    salaried = add_median_salary(salaried, ("industry", ))

    person_count = F.count("*").alias("person_num")
    # NOTE(review): F.first presumably relies on add_median_salary leaving a
    # single avg_salary value per industry -- confirm against that helper.
    salary = F.first(salaried.avg_salary).alias("avg_salary")
    per_industry = salaried.groupby("industry").agg(person_count, salary)

    sizable = per_industry.filter(per_industry.person_num > MIN_NUM)
    return add_rank(sizable)
def statistic_industry_address_rank(df):
    """
    Industry and address ranking.

    Rows with ``avg_salary`` above ``MIN_SALARY`` go through
    ``add_median_salary`` keyed on (industry, address) and are aggregated per
    (industry, address) into ``person_num`` and the mean ``avg_salary``. Pairs
    whose ``person_num`` is not above ``MIN_NUM`` are removed. A per-industry
    roll-up of the remaining pairs (summed ``person_num``, mean of the pair-level
    ``avg_salary``) is appended with ``address`` set to ``NA``, and the union is
    passed to ``add_rank`` together with ``"address"``.

    :param df: input DataFrame with ``avg_salary``, ``industry`` and
        ``address`` columns
    :return: the DataFrame returned by ``add_rank``
    """
    group_keys = ("industry", "address")
    salaried = add_median_salary(df.filter(df.avg_salary > MIN_SALARY), group_keys)

    per_pair = salaried.groupby(*group_keys).agg(
        F.count("*").alias("person_num"),
        F.avg("avg_salary").alias("avg_salary"),
    )
    per_pair = per_pair.filter(per_pair.person_num > MIN_NUM)

    # Industry-wide rows: address is replaced by the NA placeholder.
    per_industry = (
        per_pair.groupby("industry")
        .agg(
            F.sum("person_num").alias("person_num"),
            F.avg("avg_salary").alias("avg_salary"),
        )
        .withColumn("address", F.lit(NA))
    )

    combined = per_pair.unionByName(per_industry)
    return add_rank(combined, "address")
def statistic_position_address_duration(df):
    """
    Duration of a single job stint by position and address.

    Keeps rows where ``work_duration_month`` is non-null and strictly between
    0 and 240 and where ``position_name`` and ``address`` are non-null. The
    duration is copied into ``avg_duration``, the rows go through
    ``filter_cube_num``, and a cube over (position_name, address) yields
    ``person_num`` and the mean ``avg_duration``. Cube rows with a null
    ``position_name`` are discarded, remaining nulls are filled with ``NA``,
    and the result is passed to ``add_rank`` with ``"address"`` and
    ``sort_field='-avg_duration'``.

    :param df: input DataFrame with ``work_duration_month``,
        ``position_name`` and ``address`` columns
    :return: the DataFrame returned by ``add_rank``
    """
    months = df.work_duration_month
    is_usable = (
        months.isNotNull()
        & (months > 0)
        & (months < 240)
        & df.position_name.isNotNull()
        & df.address.isNotNull()
    )
    stints = df.filter(is_usable).withColumn('avg_duration', months)
    stints = filter_cube_num(stints, "position_name", "address")

    cubed = stints.cube("position_name", "address").agg(
        F.count("*").alias("person_num"),
        F.avg(stints.avg_duration).alias("avg_duration"),
    )
    # Drop the cube's "all positions" rows, then label the remaining
    # null cells with NA.
    cubed = cubed.filter(cubed["position_name"].isNotNull()).fillna(NA)
    return add_rank(cubed, "address", sort_field='-avg_duration')
def statistic_major_rank(df):
    """
    Major ranking.

    After ``add_median_salary`` keyed on (major, degree), rows are aggregated
    per (major, degree) into ``person_num`` (row count) and ``avg_salary``
    (first value in the group), keeping only groups with ``person_num`` above
    ``MIN_NUM``. A per-major roll-up (summed ``person_num``, mean
    ``avg_salary``) with ``degree`` set to ``NA`` is appended, the
    ``person_num`` threshold is applied again to the union, and the result is
    passed to ``add_rank`` together with ``"degree"``.

    :param df: input DataFrame with ``major``, ``degree`` and ``avg_salary``
        columns
    :return: the DataFrame returned by ``add_rank``
    """
    keys = ("major", "degree")
    enriched = add_median_salary(df, keys)

    per_major_degree = (
        enriched.groupby(*keys)
        .agg(
            F.count("*").alias("person_num"),
            F.first("avg_salary").alias("avg_salary"),
        )
        .filter(F.col("person_num") > MIN_NUM)
    )

    # Roll-up that does not restrict by degree.
    per_major = (
        per_major_degree.groupby("major")
        .agg(
            F.sum("person_num").alias("person_num"),
            F.avg("avg_salary").alias("avg_salary"),
        )
        .withColumn("degree", F.lit(NA))
    )

    combined = per_major_degree.unionByName(per_major)
    combined = combined.filter(combined.person_num > MIN_NUM)
    return add_rank(combined, "degree")
def statistic_position_address_change(df, groups, na_field, fields, is_sa=False):
    """
    Position continuity analysis.

    Selects the first two of the combined ``groups + na_field + fields``
    columns plus ``resume_id`` and ``work_index``, runs ``next_same`` on the
    first group column, aggregates with ``statistic_cube`` and passes the
    result to ``add_rank`` with ``groups[1:] + na_field`` and
    ``sort_field='-person_num'``.

    :param df: input DataFrame containing the selected columns
    :param groups: list of grouping column names (concatenated with lists,
        so it must itself be a list)
    :param na_field: a single column name, or a falsy value for none
    :param fields: a single column name, or a falsy value for none
    :param is_sa: forwarded unchanged to ``statistic_cube``
    :return: the DataFrame returned by ``add_rank``
    """
    na_field = [na_field] if na_field else []
    fields = [fields] if fields else []

    # Only the first two combined columns are carried forward.
    candidates = groups + na_field + fields
    narrowed = df.select(candidates[0], candidates[1], "resume_id", "work_index")
    narrowed = next_same(narrowed, groups[0])

    cubed = statistic_cube(narrowed, groups, na_field, fields, is_sa)
    rank_keys = groups[1:] + na_field
    return add_rank(cubed, *rank_keys, sort_field='-person_num')
def statistic_school_rank(df):
    """
    School ranking.

    After ``add_median_salary`` keyed on (school_name, degree), rows are
    aggregated per (school_name, degree) into ``person_num`` (row count) and
    ``avg_salary`` (first value in the group), keeping only groups with
    ``person_num`` above ``MIN_NUM``. A per-school roll-up (summed
    ``person_num``, mean ``avg_salary``) with ``degree`` set to ``NA`` is
    appended, the ``person_num`` threshold is applied again to the union, and
    the result is passed to ``add_rank`` together with ``"degree"``.

    :param df: input DataFrame with ``school_name``, ``degree`` and
        ``avg_salary`` columns
    :return: the DataFrame returned by ``add_rank``
    """
    keys = ("school_name", "degree")
    enriched = add_median_salary(df, keys)

    per_school_degree = (
        enriched.groupby(*keys)
        .agg(
            F.count("*").alias("person_num"),
            F.first("avg_salary").alias("avg_salary"),
        )
        .filter(F.col("person_num") > MIN_NUM)
    )

    # Roll-up that does not restrict by degree.
    per_school = (
        per_school_degree.groupby("school_name")
        .agg(
            F.sum("person_num").alias("person_num"),
            F.avg("avg_salary").alias("avg_salary"),
        )
        .withColumn("degree", F.lit(NA))
    )

    combined = per_school_degree.unionByName(per_school)
    combined = combined.filter(combined.person_num > MIN_NUM)
    return add_rank(combined, "degree")